{ "best_global_step": 30784, "best_metric": 0.23655234277248383, "best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_cola_1757340184/checkpoint-30784", "epoch": 20.0, "eval_steps": 3848, "global_step": 76960, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012993762993762994, "grad_norm": 217.3270721435547, "learning_rate": 2.598752598752599e-08, "loss": 12.8151, "num_input_tokens_seen": 432, "step": 5 }, { "epoch": 0.002598752598752599, "grad_norm": 117.97624206542969, "learning_rate": 5.8471933471933477e-08, "loss": 11.7776, "num_input_tokens_seen": 848, "step": 10 }, { "epoch": 0.0038981288981288983, "grad_norm": 212.2732391357422, "learning_rate": 9.095634095634096e-08, "loss": 12.5996, "num_input_tokens_seen": 1280, "step": 15 }, { "epoch": 0.005197505197505198, "grad_norm": 151.02342224121094, "learning_rate": 1.2344074844074844e-07, "loss": 10.3592, "num_input_tokens_seen": 1744, "step": 20 }, { "epoch": 0.006496881496881497, "grad_norm": 124.02364349365234, "learning_rate": 1.5592515592515593e-07, "loss": 11.4489, "num_input_tokens_seen": 2176, "step": 25 }, { "epoch": 0.007796257796257797, "grad_norm": 211.38796997070312, "learning_rate": 1.8840956340956344e-07, "loss": 12.0689, "num_input_tokens_seen": 2592, "step": 30 }, { "epoch": 0.009095634095634096, "grad_norm": 214.40615844726562, "learning_rate": 2.2089397089397092e-07, "loss": 11.2658, "num_input_tokens_seen": 3040, "step": 35 }, { "epoch": 0.010395010395010396, "grad_norm": 205.7491912841797, "learning_rate": 2.533783783783784e-07, "loss": 10.6036, "num_input_tokens_seen": 3472, "step": 40 }, { "epoch": 0.011694386694386695, "grad_norm": 232.00997924804688, "learning_rate": 2.858627858627859e-07, "loss": 10.4079, "num_input_tokens_seen": 3920, "step": 45 }, { "epoch": 0.012993762993762994, "grad_norm": 205.06314086914062, "learning_rate": 3.183471933471934e-07, "loss": 10.5629, "num_input_tokens_seen": 4368, "step": 50 }, { "epoch": 0.014293139293139294, "grad_norm": 206.07904052734375, "learning_rate": 3.5083160083160086e-07, "loss": 9.256, "num_input_tokens_seen": 4816, "step": 55 }, { "epoch": 0.015592515592515593, "grad_norm": 197.94979858398438, "learning_rate": 3.8331600831600834e-07, "loss": 8.486, "num_input_tokens_seen": 5280, "step": 60 }, { "epoch": 0.016891891891891893, "grad_norm": 154.30702209472656, "learning_rate": 4.1580041580041583e-07, "loss": 7.4865, "num_input_tokens_seen": 5712, "step": 65 }, { "epoch": 0.018191268191268192, "grad_norm": 165.2001953125, "learning_rate": 4.482848232848233e-07, "loss": 7.5211, "num_input_tokens_seen": 6160, "step": 70 }, { "epoch": 0.01949064449064449, "grad_norm": 148.34776306152344, "learning_rate": 4.807692307692308e-07, "loss": 7.2602, "num_input_tokens_seen": 6608, "step": 75 }, { "epoch": 0.02079002079002079, "grad_norm": 133.42678833007812, "learning_rate": 5.132536382536383e-07, "loss": 6.8226, "num_input_tokens_seen": 7120, "step": 80 }, { "epoch": 0.02208939708939709, "grad_norm": 108.57149505615234, "learning_rate": 5.457380457380458e-07, "loss": 5.8078, "num_input_tokens_seen": 7568, "step": 85 }, { "epoch": 0.02338877338877339, "grad_norm": 108.9032974243164, "learning_rate": 5.782224532224532e-07, "loss": 5.6128, "num_input_tokens_seen": 8000, "step": 90 }, { "epoch": 0.02468814968814969, "grad_norm": 106.33738708496094, "learning_rate": 6.107068607068608e-07, "loss": 5.3646, "num_input_tokens_seen": 8464, "step": 95 }, { "epoch": 0.02598752598752599, "grad_norm": 74.63994598388672, "learning_rate": 6.431912681912682e-07, "loss": 4.6696, "num_input_tokens_seen": 8912, "step": 100 }, { "epoch": 0.02728690228690229, "grad_norm": 83.71345520019531, "learning_rate": 6.756756756756758e-07, "loss": 4.2801, "num_input_tokens_seen": 9376, "step": 105 }, { "epoch": 0.028586278586278588, "grad_norm": 83.3984603881836, "learning_rate": 7.081600831600832e-07, "loss": 3.6646, "num_input_tokens_seen": 9808, "step": 110 }, { "epoch": 0.029885654885654887, "grad_norm": 84.65328979492188, "learning_rate": 7.406444906444907e-07, "loss": 3.167, "num_input_tokens_seen": 10208, "step": 115 }, { "epoch": 0.031185031185031187, "grad_norm": 82.24077606201172, "learning_rate": 7.731288981288983e-07, "loss": 2.8749, "num_input_tokens_seen": 10640, "step": 120 }, { "epoch": 0.032484407484407486, "grad_norm": 78.59873962402344, "learning_rate": 8.056133056133057e-07, "loss": 2.3385, "num_input_tokens_seen": 11088, "step": 125 }, { "epoch": 0.033783783783783786, "grad_norm": 60.01283645629883, "learning_rate": 8.380977130977132e-07, "loss": 1.8462, "num_input_tokens_seen": 11536, "step": 130 }, { "epoch": 0.035083160083160085, "grad_norm": 64.38899230957031, "learning_rate": 8.705821205821207e-07, "loss": 1.671, "num_input_tokens_seen": 12000, "step": 135 }, { "epoch": 0.036382536382536385, "grad_norm": 39.60575485229492, "learning_rate": 9.030665280665282e-07, "loss": 1.1814, "num_input_tokens_seen": 12480, "step": 140 }, { "epoch": 0.037681912681912684, "grad_norm": 40.35724639892578, "learning_rate": 9.355509355509356e-07, "loss": 0.8406, "num_input_tokens_seen": 12960, "step": 145 }, { "epoch": 0.03898128898128898, "grad_norm": 26.11021614074707, "learning_rate": 9.680353430353432e-07, "loss": 0.7208, "num_input_tokens_seen": 13440, "step": 150 }, { "epoch": 0.04028066528066528, "grad_norm": 46.03295135498047, "learning_rate": 1.0005197505197507e-06, "loss": 0.8895, "num_input_tokens_seen": 13856, "step": 155 }, { "epoch": 0.04158004158004158, "grad_norm": 37.76517105102539, "learning_rate": 1.033004158004158e-06, "loss": 0.548, "num_input_tokens_seen": 14304, "step": 160 }, { "epoch": 0.04287941787941788, "grad_norm": 56.906776428222656, "learning_rate": 1.0654885654885656e-06, "loss": 0.3001, "num_input_tokens_seen": 14720, "step": 165 }, { "epoch": 0.04417879417879418, "grad_norm": 18.87135887145996, "learning_rate": 1.097972972972973e-06, "loss": 0.3482, "num_input_tokens_seen": 15168, "step": 170 }, { "epoch": 0.04547817047817048, "grad_norm": 11.732749938964844, "learning_rate": 1.1304573804573806e-06, "loss": 0.2253, "num_input_tokens_seen": 15632, "step": 175 }, { "epoch": 0.04677754677754678, "grad_norm": 7.656787395477295, "learning_rate": 1.162941787941788e-06, "loss": 0.5573, "num_input_tokens_seen": 16080, "step": 180 }, { "epoch": 0.04807692307692308, "grad_norm": 70.41670989990234, "learning_rate": 1.1954261954261955e-06, "loss": 0.7184, "num_input_tokens_seen": 16544, "step": 185 }, { "epoch": 0.04937629937629938, "grad_norm": 23.4609317779541, "learning_rate": 1.227910602910603e-06, "loss": 0.5268, "num_input_tokens_seen": 16960, "step": 190 }, { "epoch": 0.05067567567567568, "grad_norm": 36.3455696105957, "learning_rate": 1.2603950103950106e-06, "loss": 0.3229, "num_input_tokens_seen": 17408, "step": 195 }, { "epoch": 0.05197505197505198, "grad_norm": 54.14461135864258, "learning_rate": 1.2928794178794181e-06, "loss": 0.2941, "num_input_tokens_seen": 17872, "step": 200 }, { "epoch": 0.05327442827442828, "grad_norm": 28.191268920898438, "learning_rate": 1.3253638253638254e-06, "loss": 0.334, "num_input_tokens_seen": 18336, "step": 205 }, { "epoch": 0.05457380457380458, "grad_norm": 60.84302520751953, "learning_rate": 1.357848232848233e-06, "loss": 0.3404, "num_input_tokens_seen": 18784, "step": 210 }, { "epoch": 0.055873180873180876, "grad_norm": 104.69641876220703, "learning_rate": 1.3903326403326405e-06, "loss": 0.3284, "num_input_tokens_seen": 19200, "step": 215 }, { "epoch": 0.057172557172557176, "grad_norm": 33.998863220214844, "learning_rate": 1.422817047817048e-06, "loss": 0.3493, "num_input_tokens_seen": 19680, "step": 220 }, { "epoch": 0.058471933471933475, "grad_norm": 17.4316463470459, "learning_rate": 1.4553014553014554e-06, "loss": 0.1682, "num_input_tokens_seen": 20112, "step": 225 }, { "epoch": 0.059771309771309775, "grad_norm": 4.784578323364258, "learning_rate": 1.487785862785863e-06, "loss": 0.3047, "num_input_tokens_seen": 20560, "step": 230 }, { "epoch": 0.061070686070686074, "grad_norm": 2.2853472232818604, "learning_rate": 1.5202702702702704e-06, "loss": 0.3932, "num_input_tokens_seen": 21008, "step": 235 }, { "epoch": 0.062370062370062374, "grad_norm": 0.5865190029144287, "learning_rate": 1.552754677754678e-06, "loss": 0.579, "num_input_tokens_seen": 21424, "step": 240 }, { "epoch": 0.06366943866943867, "grad_norm": 175.4270782470703, "learning_rate": 1.5852390852390853e-06, "loss": 1.2588, "num_input_tokens_seen": 21872, "step": 245 }, { "epoch": 0.06496881496881497, "grad_norm": 2.6170225143432617, "learning_rate": 1.6177234927234926e-06, "loss": 0.3716, "num_input_tokens_seen": 22352, "step": 250 }, { "epoch": 0.06626819126819126, "grad_norm": 0.3893991708755493, "learning_rate": 1.6502079002079004e-06, "loss": 0.7009, "num_input_tokens_seen": 22816, "step": 255 }, { "epoch": 0.06756756756756757, "grad_norm": 0.5933239459991455, "learning_rate": 1.6826923076923077e-06, "loss": 0.9143, "num_input_tokens_seen": 23280, "step": 260 }, { "epoch": 0.06886694386694386, "grad_norm": 268.55242919921875, "learning_rate": 1.7151767151767155e-06, "loss": 0.5462, "num_input_tokens_seen": 23680, "step": 265 }, { "epoch": 0.07016632016632017, "grad_norm": 0.3882233500480652, "learning_rate": 1.7476611226611226e-06, "loss": 0.8815, "num_input_tokens_seen": 24112, "step": 270 }, { "epoch": 0.07146569646569646, "grad_norm": 0.4812813699245453, "learning_rate": 1.7801455301455303e-06, "loss": 0.288, "num_input_tokens_seen": 24512, "step": 275 }, { "epoch": 0.07276507276507277, "grad_norm": 95.09648895263672, "learning_rate": 1.8126299376299376e-06, "loss": 0.6454, "num_input_tokens_seen": 25008, "step": 280 }, { "epoch": 0.07406444906444906, "grad_norm": 3.0603537559509277, "learning_rate": 1.8451143451143454e-06, "loss": 0.3728, "num_input_tokens_seen": 25456, "step": 285 }, { "epoch": 0.07536382536382537, "grad_norm": 5.000244140625, "learning_rate": 1.8775987525987527e-06, "loss": 0.4746, "num_input_tokens_seen": 25920, "step": 290 }, { "epoch": 0.07666320166320166, "grad_norm": 10.000635147094727, "learning_rate": 1.9100831600831605e-06, "loss": 0.5626, "num_input_tokens_seen": 26368, "step": 295 }, { "epoch": 0.07796257796257797, "grad_norm": 171.50401306152344, "learning_rate": 1.9425675675675676e-06, "loss": 0.7306, "num_input_tokens_seen": 26816, "step": 300 }, { "epoch": 0.07926195426195426, "grad_norm": 7.4310078620910645, "learning_rate": 1.975051975051975e-06, "loss": 0.5197, "num_input_tokens_seen": 27280, "step": 305 }, { "epoch": 0.08056133056133057, "grad_norm": 66.99835205078125, "learning_rate": 2.0075363825363827e-06, "loss": 0.3768, "num_input_tokens_seen": 27728, "step": 310 }, { "epoch": 0.08186070686070686, "grad_norm": 29.43844985961914, "learning_rate": 2.04002079002079e-06, "loss": 0.376, "num_input_tokens_seen": 28160, "step": 315 }, { "epoch": 0.08316008316008316, "grad_norm": 147.89427185058594, "learning_rate": 2.0725051975051977e-06, "loss": 0.3699, "num_input_tokens_seen": 28576, "step": 320 }, { "epoch": 0.08445945945945946, "grad_norm": 47.27925491333008, "learning_rate": 2.1049896049896053e-06, "loss": 0.2854, "num_input_tokens_seen": 29040, "step": 325 }, { "epoch": 0.08575883575883576, "grad_norm": 72.90226745605469, "learning_rate": 2.1374740124740124e-06, "loss": 0.3273, "num_input_tokens_seen": 29504, "step": 330 }, { "epoch": 0.08705821205821206, "grad_norm": 44.39766311645508, "learning_rate": 2.1699584199584203e-06, "loss": 0.2331, "num_input_tokens_seen": 29952, "step": 335 }, { "epoch": 0.08835758835758836, "grad_norm": 96.43351745605469, "learning_rate": 2.2024428274428275e-06, "loss": 0.3314, "num_input_tokens_seen": 30432, "step": 340 }, { "epoch": 0.08965696465696466, "grad_norm": 80.67144012451172, "learning_rate": 2.234927234927235e-06, "loss": 0.2369, "num_input_tokens_seen": 30896, "step": 345 }, { "epoch": 0.09095634095634096, "grad_norm": 54.7521858215332, "learning_rate": 2.2674116424116425e-06, "loss": 0.2514, "num_input_tokens_seen": 31360, "step": 350 }, { "epoch": 0.09225571725571725, "grad_norm": 150.3870391845703, "learning_rate": 2.29989604989605e-06, "loss": 0.3967, "num_input_tokens_seen": 31776, "step": 355 }, { "epoch": 0.09355509355509356, "grad_norm": 0.9152669906616211, "learning_rate": 2.3323804573804576e-06, "loss": 0.0192, "num_input_tokens_seen": 32224, "step": 360 }, { "epoch": 0.09485446985446985, "grad_norm": 0.2661440074443817, "learning_rate": 2.364864864864865e-06, "loss": 0.7292, "num_input_tokens_seen": 32656, "step": 365 }, { "epoch": 0.09615384615384616, "grad_norm": 82.03950500488281, "learning_rate": 2.3973492723492723e-06, "loss": 1.5391, "num_input_tokens_seen": 33088, "step": 370 }, { "epoch": 0.09745322245322245, "grad_norm": 1.8947633504867554, "learning_rate": 2.4298336798336802e-06, "loss": 0.1892, "num_input_tokens_seen": 33520, "step": 375 }, { "epoch": 0.09875259875259876, "grad_norm": 55.96096420288086, "learning_rate": 2.4623180873180873e-06, "loss": 0.5899, "num_input_tokens_seen": 33968, "step": 380 }, { "epoch": 0.10005197505197505, "grad_norm": 1.2453818321228027, "learning_rate": 2.494802494802495e-06, "loss": 0.2018, "num_input_tokens_seen": 34400, "step": 385 }, { "epoch": 0.10135135135135136, "grad_norm": 72.07263946533203, "learning_rate": 2.5272869022869024e-06, "loss": 0.3079, "num_input_tokens_seen": 34864, "step": 390 }, { "epoch": 0.10265072765072765, "grad_norm": 69.0866928100586, "learning_rate": 2.55977130977131e-06, "loss": 1.3881, "num_input_tokens_seen": 35280, "step": 395 }, { "epoch": 0.10395010395010396, "grad_norm": 62.748538970947266, "learning_rate": 2.5922557172557175e-06, "loss": 0.6543, "num_input_tokens_seen": 35728, "step": 400 }, { "epoch": 0.10524948024948025, "grad_norm": 47.33015441894531, "learning_rate": 2.624740124740125e-06, "loss": 0.7843, "num_input_tokens_seen": 36192, "step": 405 }, { "epoch": 0.10654885654885655, "grad_norm": 51.111507415771484, "learning_rate": 2.657224532224532e-06, "loss": 0.3982, "num_input_tokens_seen": 36656, "step": 410 }, { "epoch": 0.10784823284823285, "grad_norm": 13.874995231628418, "learning_rate": 2.68970893970894e-06, "loss": 0.4756, "num_input_tokens_seen": 37072, "step": 415 }, { "epoch": 0.10914760914760915, "grad_norm": 26.480627059936523, "learning_rate": 2.722193347193347e-06, "loss": 0.3814, "num_input_tokens_seen": 37552, "step": 420 }, { "epoch": 0.11044698544698545, "grad_norm": 19.51970672607422, "learning_rate": 2.754677754677755e-06, "loss": 0.2489, "num_input_tokens_seen": 38032, "step": 425 }, { "epoch": 0.11174636174636175, "grad_norm": 20.115943908691406, "learning_rate": 2.7871621621621623e-06, "loss": 0.3363, "num_input_tokens_seen": 38448, "step": 430 }, { "epoch": 0.11304573804573805, "grad_norm": 11.848551750183105, "learning_rate": 2.81964656964657e-06, "loss": 0.2996, "num_input_tokens_seen": 38960, "step": 435 }, { "epoch": 0.11434511434511435, "grad_norm": 70.38330078125, "learning_rate": 2.8521309771309773e-06, "loss": 0.3007, "num_input_tokens_seen": 39408, "step": 440 }, { "epoch": 0.11564449064449064, "grad_norm": 11.464641571044922, "learning_rate": 2.884615384615385e-06, "loss": 0.1994, "num_input_tokens_seen": 39840, "step": 445 }, { "epoch": 0.11694386694386695, "grad_norm": 10.946282386779785, "learning_rate": 2.9170997920997924e-06, "loss": 0.4265, "num_input_tokens_seen": 40320, "step": 450 }, { "epoch": 0.11824324324324324, "grad_norm": 42.357147216796875, "learning_rate": 2.9495841995842e-06, "loss": 0.4072, "num_input_tokens_seen": 40768, "step": 455 }, { "epoch": 0.11954261954261955, "grad_norm": 42.804962158203125, "learning_rate": 2.982068607068607e-06, "loss": 0.1751, "num_input_tokens_seen": 41232, "step": 460 }, { "epoch": 0.12084199584199584, "grad_norm": 71.0899887084961, "learning_rate": 3.014553014553015e-06, "loss": 0.9312, "num_input_tokens_seen": 41680, "step": 465 }, { "epoch": 0.12214137214137215, "grad_norm": 6.630216598510742, "learning_rate": 3.047037422037422e-06, "loss": 0.2745, "num_input_tokens_seen": 42128, "step": 470 }, { "epoch": 0.12344074844074844, "grad_norm": 7.1825480461120605, "learning_rate": 3.0795218295218297e-06, "loss": 0.2449, "num_input_tokens_seen": 42560, "step": 475 }, { "epoch": 0.12474012474012475, "grad_norm": 37.09231948852539, "learning_rate": 3.1120062370062372e-06, "loss": 0.2718, "num_input_tokens_seen": 43008, "step": 480 }, { "epoch": 0.12603950103950104, "grad_norm": 80.85445404052734, "learning_rate": 3.1444906444906448e-06, "loss": 0.6545, "num_input_tokens_seen": 43520, "step": 485 }, { "epoch": 0.12733887733887733, "grad_norm": 35.461158752441406, "learning_rate": 3.1769750519750523e-06, "loss": 0.7367, "num_input_tokens_seen": 43968, "step": 490 }, { "epoch": 0.12863825363825362, "grad_norm": 28.829334259033203, "learning_rate": 3.2094594594594594e-06, "loss": 0.3003, "num_input_tokens_seen": 44400, "step": 495 }, { "epoch": 0.12993762993762994, "grad_norm": 52.47704315185547, "learning_rate": 3.241943866943867e-06, "loss": 0.2641, "num_input_tokens_seen": 44880, "step": 500 }, { "epoch": 0.13123700623700624, "grad_norm": 44.309078216552734, "learning_rate": 3.274428274428275e-06, "loss": 0.2329, "num_input_tokens_seen": 45312, "step": 505 }, { "epoch": 0.13253638253638253, "grad_norm": 0.40625202655792236, "learning_rate": 3.306912681912682e-06, "loss": 0.5172, "num_input_tokens_seen": 45728, "step": 510 }, { "epoch": 0.13383575883575882, "grad_norm": 1.2600537538528442, "learning_rate": 3.3393970893970896e-06, "loss": 0.7481, "num_input_tokens_seen": 46176, "step": 515 }, { "epoch": 0.13513513513513514, "grad_norm": 14.297417640686035, "learning_rate": 3.3718814968814967e-06, "loss": 0.4908, "num_input_tokens_seen": 46640, "step": 520 }, { "epoch": 0.13643451143451144, "grad_norm": 63.23566436767578, "learning_rate": 3.4043659043659046e-06, "loss": 0.5315, "num_input_tokens_seen": 47088, "step": 525 }, { "epoch": 0.13773388773388773, "grad_norm": 59.27616882324219, "learning_rate": 3.436850311850312e-06, "loss": 0.4366, "num_input_tokens_seen": 47536, "step": 530 }, { "epoch": 0.13903326403326402, "grad_norm": 45.20004653930664, "learning_rate": 3.4693347193347193e-06, "loss": 0.4517, "num_input_tokens_seen": 47984, "step": 535 }, { "epoch": 0.14033264033264034, "grad_norm": 88.41967010498047, "learning_rate": 3.501819126819127e-06, "loss": 0.2912, "num_input_tokens_seen": 48400, "step": 540 }, { "epoch": 0.14163201663201663, "grad_norm": 117.27122497558594, "learning_rate": 3.5343035343035348e-06, "loss": 0.6241, "num_input_tokens_seen": 48848, "step": 545 }, { "epoch": 0.14293139293139293, "grad_norm": 98.1905746459961, "learning_rate": 3.566787941787942e-06, "loss": 0.2579, "num_input_tokens_seen": 49296, "step": 550 }, { "epoch": 0.14423076923076922, "grad_norm": 24.4013614654541, "learning_rate": 3.5992723492723494e-06, "loss": 0.4002, "num_input_tokens_seen": 49760, "step": 555 }, { "epoch": 0.14553014553014554, "grad_norm": 11.510665893554688, "learning_rate": 3.6317567567567565e-06, "loss": 0.2637, "num_input_tokens_seen": 50208, "step": 560 }, { "epoch": 0.14682952182952183, "grad_norm": 60.65883255004883, "learning_rate": 3.6642411642411645e-06, "loss": 0.3172, "num_input_tokens_seen": 50640, "step": 565 }, { "epoch": 0.14812889812889812, "grad_norm": 18.869993209838867, "learning_rate": 3.696725571725572e-06, "loss": 0.3157, "num_input_tokens_seen": 51056, "step": 570 }, { "epoch": 0.14942827442827442, "grad_norm": 42.07174301147461, "learning_rate": 3.729209979209979e-06, "loss": 0.2728, "num_input_tokens_seen": 51504, "step": 575 }, { "epoch": 0.15072765072765074, "grad_norm": 26.77210235595703, "learning_rate": 3.7616943866943867e-06, "loss": 0.42, "num_input_tokens_seen": 52016, "step": 580 }, { "epoch": 0.15202702702702703, "grad_norm": 55.31374740600586, "learning_rate": 3.7941787941787947e-06, "loss": 0.257, "num_input_tokens_seen": 52464, "step": 585 }, { "epoch": 0.15332640332640332, "grad_norm": 73.0823745727539, "learning_rate": 3.826663201663202e-06, "loss": 0.3028, "num_input_tokens_seen": 52912, "step": 590 }, { "epoch": 0.1546257796257796, "grad_norm": 9.104533195495605, "learning_rate": 3.859147609147609e-06, "loss": 0.2497, "num_input_tokens_seen": 53376, "step": 595 }, { "epoch": 0.15592515592515593, "grad_norm": 22.04579734802246, "learning_rate": 3.891632016632016e-06, "loss": 0.3422, "num_input_tokens_seen": 53856, "step": 600 }, { "epoch": 0.15722453222453223, "grad_norm": 18.22458839416504, "learning_rate": 3.924116424116424e-06, "loss": 0.1665, "num_input_tokens_seen": 54336, "step": 605 }, { "epoch": 0.15852390852390852, "grad_norm": 42.97499465942383, "learning_rate": 3.9566008316008315e-06, "loss": 0.6221, "num_input_tokens_seen": 54800, "step": 610 }, { "epoch": 0.1598232848232848, "grad_norm": 16.06820297241211, "learning_rate": 3.9890852390852394e-06, "loss": 0.281, "num_input_tokens_seen": 55264, "step": 615 }, { "epoch": 0.16112266112266113, "grad_norm": 17.470619201660156, "learning_rate": 4.0215696465696466e-06, "loss": 0.4267, "num_input_tokens_seen": 55728, "step": 620 }, { "epoch": 0.16242203742203742, "grad_norm": 22.049142837524414, "learning_rate": 4.0540540540540545e-06, "loss": 0.2341, "num_input_tokens_seen": 56176, "step": 625 }, { "epoch": 0.16372141372141372, "grad_norm": 25.779682159423828, "learning_rate": 4.086538461538462e-06, "loss": 0.2439, "num_input_tokens_seen": 56624, "step": 630 }, { "epoch": 0.16502079002079, "grad_norm": 8.266478538513184, "learning_rate": 4.119022869022869e-06, "loss": 0.2524, "num_input_tokens_seen": 57056, "step": 635 }, { "epoch": 0.16632016632016633, "grad_norm": 7.276123046875, "learning_rate": 4.151507276507277e-06, "loss": 0.2135, "num_input_tokens_seen": 57520, "step": 640 }, { "epoch": 0.16761954261954262, "grad_norm": 1.1707391738891602, "learning_rate": 4.183991683991685e-06, "loss": 0.1677, "num_input_tokens_seen": 58000, "step": 645 }, { "epoch": 0.16891891891891891, "grad_norm": 0.1741313934326172, "learning_rate": 4.216476091476092e-06, "loss": 0.0035, "num_input_tokens_seen": 58448, "step": 650 }, { "epoch": 0.1702182952182952, "grad_norm": 45.30925750732422, "learning_rate": 4.248960498960499e-06, "loss": 1.381, "num_input_tokens_seen": 58912, "step": 655 }, { "epoch": 0.17151767151767153, "grad_norm": 58.714420318603516, "learning_rate": 4.281444906444906e-06, "loss": 0.8743, "num_input_tokens_seen": 59328, "step": 660 }, { "epoch": 0.17281704781704782, "grad_norm": 41.478759765625, "learning_rate": 4.313929313929314e-06, "loss": 0.3708, "num_input_tokens_seen": 59744, "step": 665 }, { "epoch": 0.1741164241164241, "grad_norm": 28.031553268432617, "learning_rate": 4.346413721413722e-06, "loss": 0.708, "num_input_tokens_seen": 60160, "step": 670 }, { "epoch": 0.1754158004158004, "grad_norm": 114.1934814453125, "learning_rate": 4.378898128898129e-06, "loss": 0.4418, "num_input_tokens_seen": 60624, "step": 675 }, { "epoch": 0.17671517671517672, "grad_norm": 37.54389190673828, "learning_rate": 4.411382536382536e-06, "loss": 0.3896, "num_input_tokens_seen": 61088, "step": 680 }, { "epoch": 0.17801455301455302, "grad_norm": 60.02890396118164, "learning_rate": 4.443866943866944e-06, "loss": 0.322, "num_input_tokens_seen": 61504, "step": 685 }, { "epoch": 0.1793139293139293, "grad_norm": 31.810081481933594, "learning_rate": 4.476351351351351e-06, "loss": 0.2966, "num_input_tokens_seen": 61936, "step": 690 }, { "epoch": 0.1806133056133056, "grad_norm": 28.408050537109375, "learning_rate": 4.508835758835759e-06, "loss": 0.4891, "num_input_tokens_seen": 62416, "step": 695 }, { "epoch": 0.18191268191268192, "grad_norm": 1.8305028676986694, "learning_rate": 4.541320166320166e-06, "loss": 0.1663, "num_input_tokens_seen": 62864, "step": 700 }, { "epoch": 0.18321205821205822, "grad_norm": 32.065303802490234, "learning_rate": 4.573804573804574e-06, "loss": 0.2123, "num_input_tokens_seen": 63328, "step": 705 }, { "epoch": 0.1845114345114345, "grad_norm": 38.77202224731445, "learning_rate": 4.606288981288981e-06, "loss": 0.4818, "num_input_tokens_seen": 63760, "step": 710 }, { "epoch": 0.1858108108108108, "grad_norm": 0.5831516981124878, "learning_rate": 4.6387733887733885e-06, "loss": 0.4421, "num_input_tokens_seen": 64240, "step": 715 }, { "epoch": 0.18711018711018712, "grad_norm": 2.3156375885009766, "learning_rate": 4.6712577962577965e-06, "loss": 0.5733, "num_input_tokens_seen": 64672, "step": 720 }, { "epoch": 0.1884095634095634, "grad_norm": 5.638915061950684, "learning_rate": 4.703742203742204e-06, "loss": 0.4139, "num_input_tokens_seen": 65120, "step": 725 }, { "epoch": 0.1897089397089397, "grad_norm": 17.654186248779297, "learning_rate": 4.7362266112266115e-06, "loss": 0.5009, "num_input_tokens_seen": 65584, "step": 730 }, { "epoch": 0.191008316008316, "grad_norm": 49.01826095581055, "learning_rate": 4.768711018711019e-06, "loss": 0.2901, "num_input_tokens_seen": 66032, "step": 735 }, { "epoch": 0.19230769230769232, "grad_norm": 61.58161926269531, "learning_rate": 4.801195426195426e-06, "loss": 0.3939, "num_input_tokens_seen": 66512, "step": 740 }, { "epoch": 0.1936070686070686, "grad_norm": 96.95549011230469, "learning_rate": 4.8336798336798346e-06, "loss": 0.4844, "num_input_tokens_seen": 66960, "step": 745 }, { "epoch": 0.1949064449064449, "grad_norm": 50.068119049072266, "learning_rate": 4.866164241164242e-06, "loss": 0.3656, "num_input_tokens_seen": 67408, "step": 750 }, { "epoch": 0.1962058212058212, "grad_norm": 25.655302047729492, "learning_rate": 4.898648648648649e-06, "loss": 0.2991, "num_input_tokens_seen": 67808, "step": 755 }, { "epoch": 0.19750519750519752, "grad_norm": 10.581441879272461, "learning_rate": 4.931133056133056e-06, "loss": 0.2255, "num_input_tokens_seen": 68272, "step": 760 }, { "epoch": 0.1988045738045738, "grad_norm": 1.144434928894043, "learning_rate": 4.963617463617464e-06, "loss": 0.2025, "num_input_tokens_seen": 68704, "step": 765 }, { "epoch": 0.2001039501039501, "grad_norm": 96.55215454101562, "learning_rate": 4.996101871101872e-06, "loss": 0.9298, "num_input_tokens_seen": 69168, "step": 770 }, { "epoch": 0.2014033264033264, "grad_norm": 67.91202545166016, "learning_rate": 5.028586278586279e-06, "loss": 0.4523, "num_input_tokens_seen": 69648, "step": 775 }, { "epoch": 0.20270270270270271, "grad_norm": 0.48043668270111084, "learning_rate": 5.061070686070686e-06, "loss": 0.4008, "num_input_tokens_seen": 70112, "step": 780 }, { "epoch": 0.204002079002079, "grad_norm": 0.6539661288261414, "learning_rate": 5.093555093555094e-06, "loss": 0.7618, "num_input_tokens_seen": 70560, "step": 785 }, { "epoch": 0.2053014553014553, "grad_norm": 1.311177134513855, "learning_rate": 5.126039501039501e-06, "loss": 0.6668, "num_input_tokens_seen": 71040, "step": 790 }, { "epoch": 0.2066008316008316, "grad_norm": 26.01776123046875, "learning_rate": 5.158523908523909e-06, "loss": 0.508, "num_input_tokens_seen": 71472, "step": 795 }, { "epoch": 0.2079002079002079, "grad_norm": 5.097243785858154, "learning_rate": 5.191008316008316e-06, "loss": 0.2602, "num_input_tokens_seen": 71920, "step": 800 }, { "epoch": 0.2091995841995842, "grad_norm": 1.2109040021896362, "learning_rate": 5.223492723492724e-06, "loss": 0.2711, "num_input_tokens_seen": 72336, "step": 805 }, { "epoch": 0.2104989604989605, "grad_norm": 19.28411293029785, "learning_rate": 5.255977130977131e-06, "loss": 0.708, "num_input_tokens_seen": 72768, "step": 810 }, { "epoch": 0.2117983367983368, "grad_norm": 17.908855438232422, "learning_rate": 5.288461538461538e-06, "loss": 0.5968, "num_input_tokens_seen": 73216, "step": 815 }, { "epoch": 0.2130977130977131, "grad_norm": 7.869619369506836, "learning_rate": 5.320945945945946e-06, "loss": 0.3205, "num_input_tokens_seen": 73696, "step": 820 }, { "epoch": 0.2143970893970894, "grad_norm": 39.229026794433594, "learning_rate": 5.353430353430354e-06, "loss": 0.19, "num_input_tokens_seen": 74160, "step": 825 }, { "epoch": 0.2156964656964657, "grad_norm": 62.70517349243164, "learning_rate": 5.3859147609147614e-06, "loss": 0.6299, "num_input_tokens_seen": 74608, "step": 830 }, { "epoch": 0.216995841995842, "grad_norm": 22.660409927368164, "learning_rate": 5.4183991683991685e-06, "loss": 0.6357, "num_input_tokens_seen": 75040, "step": 835 }, { "epoch": 0.2182952182952183, "grad_norm": 5.770066738128662, "learning_rate": 5.450883575883576e-06, "loss": 0.2671, "num_input_tokens_seen": 75520, "step": 840 }, { "epoch": 0.2195945945945946, "grad_norm": 2.080104351043701, "learning_rate": 5.483367983367984e-06, "loss": 0.1522, "num_input_tokens_seen": 75984, "step": 845 }, { "epoch": 0.2208939708939709, "grad_norm": 1.055938959121704, "learning_rate": 5.5158523908523916e-06, "loss": 0.5454, "num_input_tokens_seen": 76432, "step": 850 }, { "epoch": 0.22219334719334718, "grad_norm": 19.473657608032227, "learning_rate": 5.548336798336799e-06, "loss": 0.3526, "num_input_tokens_seen": 76896, "step": 855 }, { "epoch": 0.2234927234927235, "grad_norm": 0.36662107706069946, "learning_rate": 5.580821205821206e-06, "loss": 0.1899, "num_input_tokens_seen": 77344, "step": 860 }, { "epoch": 0.2247920997920998, "grad_norm": 29.77851676940918, "learning_rate": 5.613305613305614e-06, "loss": 0.7414, "num_input_tokens_seen": 77808, "step": 865 }, { "epoch": 0.2260914760914761, "grad_norm": 28.050506591796875, "learning_rate": 5.645790020790021e-06, "loss": 0.608, "num_input_tokens_seen": 78272, "step": 870 }, { "epoch": 0.22739085239085238, "grad_norm": 3.529449224472046, "learning_rate": 5.678274428274429e-06, "loss": 0.3063, "num_input_tokens_seen": 78752, "step": 875 }, { "epoch": 0.2286902286902287, "grad_norm": 39.749366760253906, "learning_rate": 5.710758835758836e-06, "loss": 0.2151, "num_input_tokens_seen": 79216, "step": 880 }, { "epoch": 0.229989604989605, "grad_norm": 50.39662551879883, "learning_rate": 5.743243243243244e-06, "loss": 1.2484, "num_input_tokens_seen": 79696, "step": 885 }, { "epoch": 0.2312889812889813, "grad_norm": 3.402251720428467, "learning_rate": 5.775727650727651e-06, "loss": 0.2694, "num_input_tokens_seen": 80160, "step": 890 }, { "epoch": 0.23258835758835758, "grad_norm": 1.8595067262649536, "learning_rate": 5.808212058212058e-06, "loss": 0.2226, "num_input_tokens_seen": 80592, "step": 895 }, { "epoch": 0.2338877338877339, "grad_norm": 0.4623396098613739, "learning_rate": 5.840696465696466e-06, "loss": 0.3326, "num_input_tokens_seen": 81040, "step": 900 }, { "epoch": 0.2351871101871102, "grad_norm": 21.58821678161621, "learning_rate": 5.873180873180874e-06, "loss": 0.9515, "num_input_tokens_seen": 81472, "step": 905 }, { "epoch": 0.23648648648648649, "grad_norm": 18.96805191040039, "learning_rate": 5.905665280665281e-06, "loss": 0.3746, "num_input_tokens_seen": 81936, "step": 910 }, { "epoch": 0.23778586278586278, "grad_norm": 13.781149864196777, "learning_rate": 5.938149688149688e-06, "loss": 0.8797, "num_input_tokens_seen": 82368, "step": 915 }, { "epoch": 0.2390852390852391, "grad_norm": 13.807232856750488, "learning_rate": 5.970634095634095e-06, "loss": 0.237, "num_input_tokens_seen": 82816, "step": 920 }, { "epoch": 0.2403846153846154, "grad_norm": 27.904123306274414, "learning_rate": 6.003118503118503e-06, "loss": 0.3303, "num_input_tokens_seen": 83264, "step": 925 }, { "epoch": 0.24168399168399168, "grad_norm": 9.622493743896484, "learning_rate": 6.035602910602911e-06, "loss": 0.3002, "num_input_tokens_seen": 83728, "step": 930 }, { "epoch": 0.24298336798336798, "grad_norm": 33.582603454589844, "learning_rate": 6.0680873180873184e-06, "loss": 0.2804, "num_input_tokens_seen": 84160, "step": 935 }, { "epoch": 0.2442827442827443, "grad_norm": 25.02117919921875, "learning_rate": 6.1005717255717255e-06, "loss": 0.3103, "num_input_tokens_seen": 84640, "step": 940 }, { "epoch": 0.2455821205821206, "grad_norm": 9.195063591003418, "learning_rate": 6.1330561330561335e-06, "loss": 0.3041, "num_input_tokens_seen": 85104, "step": 945 }, { "epoch": 0.24688149688149688, "grad_norm": 2.5172007083892822, "learning_rate": 6.165540540540541e-06, "loss": 0.3037, "num_input_tokens_seen": 85552, "step": 950 }, { "epoch": 0.24818087318087317, "grad_norm": 18.22745704650879, "learning_rate": 6.198024948024949e-06, "loss": 0.3463, "num_input_tokens_seen": 86016, "step": 955 }, { "epoch": 0.2494802494802495, "grad_norm": 23.433265686035156, "learning_rate": 6.230509355509356e-06, "loss": 0.2979, "num_input_tokens_seen": 86480, "step": 960 }, { "epoch": 0.25077962577962576, "grad_norm": 9.372118949890137, "learning_rate": 6.262993762993763e-06, "loss": 0.248, "num_input_tokens_seen": 86928, "step": 965 }, { "epoch": 0.2520790020790021, "grad_norm": 9.122040748596191, "learning_rate": 6.295478170478171e-06, "loss": 0.3838, "num_input_tokens_seen": 87376, "step": 970 }, { "epoch": 0.2533783783783784, "grad_norm": 9.051695823669434, "learning_rate": 6.327962577962578e-06, "loss": 0.2414, "num_input_tokens_seen": 87856, "step": 975 }, { "epoch": 0.25467775467775466, "grad_norm": 2.7229011058807373, "learning_rate": 6.360446985446986e-06, "loss": 0.3444, "num_input_tokens_seen": 88336, "step": 980 }, { "epoch": 0.255977130977131, "grad_norm": 18.769319534301758, "learning_rate": 6.392931392931394e-06, "loss": 0.4116, "num_input_tokens_seen": 88848, "step": 985 }, { "epoch": 0.25727650727650725, "grad_norm": 13.33142375946045, "learning_rate": 6.4254158004158e-06, "loss": 0.3022, "num_input_tokens_seen": 89312, "step": 990 }, { "epoch": 0.25857588357588357, "grad_norm": 18.016887664794922, "learning_rate": 6.457900207900208e-06, "loss": 0.24, "num_input_tokens_seen": 89760, "step": 995 }, { "epoch": 0.2598752598752599, "grad_norm": 1.4639091491699219, "learning_rate": 6.490384615384616e-06, "loss": 0.1437, "num_input_tokens_seen": 90176, "step": 1000 }, { "epoch": 0.26117463617463615, "grad_norm": 36.42005920410156, "learning_rate": 6.522869022869023e-06, "loss": 0.8435, "num_input_tokens_seen": 90608, "step": 1005 }, { "epoch": 0.2624740124740125, "grad_norm": 76.91634368896484, "learning_rate": 6.555353430353431e-06, "loss": 0.8849, "num_input_tokens_seen": 91024, "step": 1010 }, { "epoch": 0.2637733887733888, "grad_norm": 6.7747344970703125, "learning_rate": 6.587837837837837e-06, "loss": 0.3326, "num_input_tokens_seen": 91456, "step": 1015 }, { "epoch": 0.26507276507276506, "grad_norm": 48.24112319946289, "learning_rate": 6.620322245322245e-06, "loss": 0.6236, "num_input_tokens_seen": 91872, "step": 1020 }, { "epoch": 0.2663721413721414, "grad_norm": 34.30349349975586, "learning_rate": 6.652806652806653e-06, "loss": 0.9367, "num_input_tokens_seen": 92304, "step": 1025 }, { "epoch": 0.26767151767151764, "grad_norm": 16.990406036376953, "learning_rate": 6.68529106029106e-06, "loss": 0.6496, "num_input_tokens_seen": 92752, "step": 1030 }, { "epoch": 0.26897089397089397, "grad_norm": 59.837398529052734, "learning_rate": 6.717775467775468e-06, "loss": 0.5908, "num_input_tokens_seen": 93216, "step": 1035 }, { "epoch": 0.2702702702702703, "grad_norm": 5.9570817947387695, "learning_rate": 6.750259875259876e-06, "loss": 0.3757, "num_input_tokens_seen": 93744, "step": 1040 }, { "epoch": 0.27156964656964655, "grad_norm": 10.61559772491455, "learning_rate": 6.7827442827442826e-06, "loss": 0.2507, "num_input_tokens_seen": 94208, "step": 1045 }, { "epoch": 0.27286902286902287, "grad_norm": 7.134811878204346, "learning_rate": 6.8152286902286905e-06, "loss": 0.3862, "num_input_tokens_seen": 94720, "step": 1050 }, { "epoch": 0.2741683991683992, "grad_norm": 4.707842826843262, "learning_rate": 6.847713097713098e-06, "loss": 0.2777, "num_input_tokens_seen": 95184, "step": 1055 }, { "epoch": 0.27546777546777546, "grad_norm": 10.981966972351074, "learning_rate": 6.880197505197506e-06, "loss": 0.2769, "num_input_tokens_seen": 95648, "step": 1060 }, { "epoch": 0.2767671517671518, "grad_norm": 6.268673896789551, "learning_rate": 6.9126819126819135e-06, "loss": 0.28, "num_input_tokens_seen": 96096, "step": 1065 }, { "epoch": 0.27806652806652804, "grad_norm": 8.95946216583252, "learning_rate": 6.94516632016632e-06, "loss": 0.2845, "num_input_tokens_seen": 96544, "step": 1070 }, { "epoch": 0.27936590436590436, "grad_norm": 8.994393348693848, "learning_rate": 6.977650727650728e-06, "loss": 0.2942, "num_input_tokens_seen": 97008, "step": 1075 }, { "epoch": 0.2806652806652807, "grad_norm": 3.966276168823242, "learning_rate": 7.010135135135136e-06, "loss": 0.2714, "num_input_tokens_seen": 97488, "step": 1080 }, { "epoch": 0.28196465696465695, "grad_norm": 7.000974178314209, "learning_rate": 7.042619542619543e-06, "loss": 0.2585, "num_input_tokens_seen": 97936, "step": 1085 }, { "epoch": 0.28326403326403327, "grad_norm": 21.57143783569336, "learning_rate": 7.075103950103951e-06, "loss": 0.3446, "num_input_tokens_seen": 98400, "step": 1090 }, { "epoch": 0.2845634095634096, "grad_norm": 18.062440872192383, "learning_rate": 7.107588357588357e-06, "loss": 0.2876, "num_input_tokens_seen": 98832, "step": 1095 }, { "epoch": 0.28586278586278585, "grad_norm": 100.11751556396484, "learning_rate": 7.140072765072765e-06, "loss": 0.3803, "num_input_tokens_seen": 99296, "step": 1100 }, { "epoch": 0.28716216216216217, "grad_norm": 0.0853123813867569, "learning_rate": 7.172557172557173e-06, "loss": 0.3615, "num_input_tokens_seen": 99728, "step": 1105 }, { "epoch": 0.28846153846153844, "grad_norm": 86.53138732910156, "learning_rate": 7.20504158004158e-06, "loss": 1.1691, "num_input_tokens_seen": 100192, "step": 1110 }, { "epoch": 0.28976091476091476, "grad_norm": 66.39452362060547, "learning_rate": 7.237525987525988e-06, "loss": 1.2237, "num_input_tokens_seen": 100640, "step": 1115 }, { "epoch": 0.2910602910602911, "grad_norm": 9.955547332763672, "learning_rate": 7.270010395010396e-06, "loss": 0.3541, "num_input_tokens_seen": 101072, "step": 1120 }, { "epoch": 0.29235966735966734, "grad_norm": 6.920104503631592, "learning_rate": 7.302494802494802e-06, "loss": 0.2586, "num_input_tokens_seen": 101536, "step": 1125 }, { "epoch": 0.29365904365904366, "grad_norm": 5.588209629058838, "learning_rate": 7.33497920997921e-06, "loss": 0.2367, "num_input_tokens_seen": 102000, "step": 1130 }, { "epoch": 0.29495841995842, "grad_norm": 12.965239524841309, "learning_rate": 7.367463617463617e-06, "loss": 0.456, "num_input_tokens_seen": 102528, "step": 1135 }, { "epoch": 0.29625779625779625, "grad_norm": 5.530828475952148, "learning_rate": 7.399948024948025e-06, "loss": 0.1665, "num_input_tokens_seen": 102944, "step": 1140 }, { "epoch": 0.29755717255717257, "grad_norm": 7.029217720031738, "learning_rate": 7.432432432432433e-06, "loss": 0.3392, "num_input_tokens_seen": 103392, "step": 1145 }, { "epoch": 0.29885654885654883, "grad_norm": 3.537116527557373, "learning_rate": 7.4649168399168396e-06, "loss": 0.2933, "num_input_tokens_seen": 103856, "step": 1150 }, { "epoch": 0.30015592515592515, "grad_norm": 12.73840045928955, "learning_rate": 7.4974012474012475e-06, "loss": 0.2889, "num_input_tokens_seen": 104304, "step": 1155 }, { "epoch": 0.30145530145530147, "grad_norm": 3.163538694381714, "learning_rate": 7.5298856548856555e-06, "loss": 0.3154, "num_input_tokens_seen": 104784, "step": 1160 }, { "epoch": 0.30275467775467774, "grad_norm": 9.474032402038574, "learning_rate": 7.562370062370063e-06, "loss": 0.2627, "num_input_tokens_seen": 105264, "step": 1165 }, { "epoch": 0.30405405405405406, "grad_norm": 6.081691741943359, "learning_rate": 7.5948544698544706e-06, "loss": 0.2394, "num_input_tokens_seen": 105696, "step": 1170 }, { "epoch": 0.3053534303534304, "grad_norm": 3.0535545349121094, "learning_rate": 7.627338877338877e-06, "loss": 0.2799, "num_input_tokens_seen": 106192, "step": 1175 }, { "epoch": 0.30665280665280664, "grad_norm": 1.9830788373947144, "learning_rate": 7.659823284823286e-06, "loss": 0.22, "num_input_tokens_seen": 106624, "step": 1180 }, { "epoch": 0.30795218295218296, "grad_norm": 10.62048053741455, "learning_rate": 7.692307692307694e-06, "loss": 0.3004, "num_input_tokens_seen": 107040, "step": 1185 }, { "epoch": 0.3092515592515592, "grad_norm": 12.567427635192871, "learning_rate": 7.7247920997921e-06, "loss": 0.4758, "num_input_tokens_seen": 107488, "step": 1190 }, { "epoch": 0.31055093555093555, "grad_norm": 0.9195358157157898, "learning_rate": 7.757276507276508e-06, "loss": 0.2583, "num_input_tokens_seen": 107936, "step": 1195 }, { "epoch": 0.31185031185031187, "grad_norm": 13.969161033630371, "learning_rate": 7.789760914760916e-06, "loss": 0.3869, "num_input_tokens_seen": 108400, "step": 1200 }, { "epoch": 0.31314968814968813, "grad_norm": 16.736230850219727, "learning_rate": 7.822245322245322e-06, "loss": 0.8237, "num_input_tokens_seen": 108816, "step": 1205 }, { "epoch": 0.31444906444906445, "grad_norm": 0.9401370286941528, "learning_rate": 7.85472972972973e-06, "loss": 0.1527, "num_input_tokens_seen": 109296, "step": 1210 }, { "epoch": 0.3157484407484408, "grad_norm": 7.7446608543396, "learning_rate": 7.887214137214136e-06, "loss": 0.4133, "num_input_tokens_seen": 109744, "step": 1215 }, { "epoch": 0.31704781704781704, "grad_norm": 12.61561393737793, "learning_rate": 7.919698544698544e-06, "loss": 0.5694, "num_input_tokens_seen": 110224, "step": 1220 }, { "epoch": 0.31834719334719336, "grad_norm": 6.735221862792969, "learning_rate": 7.952182952182952e-06, "loss": 0.1444, "num_input_tokens_seen": 110672, "step": 1225 }, { "epoch": 0.3196465696465696, "grad_norm": 1.5893070697784424, "learning_rate": 7.98466735966736e-06, "loss": 0.2214, "num_input_tokens_seen": 111120, "step": 1230 }, { "epoch": 0.32094594594594594, "grad_norm": 0.9574958682060242, "learning_rate": 8.017151767151768e-06, "loss": 0.3206, "num_input_tokens_seen": 111600, "step": 1235 }, { "epoch": 0.32224532224532226, "grad_norm": 0.47650134563446045, "learning_rate": 8.049636174636176e-06, "loss": 0.2914, "num_input_tokens_seen": 112112, "step": 1240 }, { "epoch": 0.32354469854469853, "grad_norm": 0.4807547628879547, "learning_rate": 8.082120582120582e-06, "loss": 0.5559, "num_input_tokens_seen": 112608, "step": 1245 }, { "epoch": 0.32484407484407485, "grad_norm": 0.54386967420578, "learning_rate": 8.11460498960499e-06, "loss": 0.1572, "num_input_tokens_seen": 113024, "step": 1250 }, { "epoch": 0.32614345114345117, "grad_norm": 6.469844341278076, "learning_rate": 8.147089397089397e-06, "loss": 0.5412, "num_input_tokens_seen": 113488, "step": 1255 }, { "epoch": 0.32744282744282743, "grad_norm": 5.983775615692139, "learning_rate": 8.179573804573805e-06, "loss": 0.347, "num_input_tokens_seen": 113936, "step": 1260 }, { "epoch": 0.32874220374220375, "grad_norm": 4.37260627746582, "learning_rate": 8.212058212058212e-06, "loss": 0.4625, "num_input_tokens_seen": 114432, "step": 1265 }, { "epoch": 0.33004158004158, "grad_norm": 6.1467742919921875, "learning_rate": 8.244542619542619e-06, "loss": 0.2708, "num_input_tokens_seen": 114944, "step": 1270 }, { "epoch": 0.33134095634095634, "grad_norm": 4.4578118324279785, "learning_rate": 8.277027027027027e-06, "loss": 0.5073, "num_input_tokens_seen": 115408, "step": 1275 }, { "epoch": 0.33264033264033266, "grad_norm": 6.779780387878418, "learning_rate": 8.309511434511436e-06, "loss": 0.2534, "num_input_tokens_seen": 115840, "step": 1280 }, { "epoch": 0.3339397089397089, "grad_norm": 0.2562643587589264, "learning_rate": 8.341995841995843e-06, "loss": 0.2028, "num_input_tokens_seen": 116304, "step": 1285 }, { "epoch": 0.33523908523908524, "grad_norm": 1.3789492845535278, "learning_rate": 8.37448024948025e-06, "loss": 1.1493, "num_input_tokens_seen": 116752, "step": 1290 }, { "epoch": 0.33653846153846156, "grad_norm": 50.952850341796875, "learning_rate": 8.406964656964657e-06, "loss": 0.8717, "num_input_tokens_seen": 117232, "step": 1295 }, { "epoch": 0.33783783783783783, "grad_norm": 19.79963493347168, "learning_rate": 8.439449064449065e-06, "loss": 0.3119, "num_input_tokens_seen": 117712, "step": 1300 }, { "epoch": 0.33913721413721415, "grad_norm": 0.7255949974060059, "learning_rate": 8.471933471933473e-06, "loss": 0.3574, "num_input_tokens_seen": 118144, "step": 1305 }, { "epoch": 0.3404365904365904, "grad_norm": 0.8811672925949097, "learning_rate": 8.504417879417879e-06, "loss": 0.6423, "num_input_tokens_seen": 118576, "step": 1310 }, { "epoch": 0.34173596673596673, "grad_norm": 3.643005609512329, "learning_rate": 8.536902286902287e-06, "loss": 0.9245, "num_input_tokens_seen": 119024, "step": 1315 }, { "epoch": 0.34303534303534305, "grad_norm": 2.8282644748687744, "learning_rate": 8.569386694386695e-06, "loss": 0.4702, "num_input_tokens_seen": 119456, "step": 1320 }, { "epoch": 0.3443347193347193, "grad_norm": 0.28392037749290466, "learning_rate": 8.601871101871101e-06, "loss": 0.2245, "num_input_tokens_seen": 119920, "step": 1325 }, { "epoch": 0.34563409563409564, "grad_norm": 0.6397088766098022, "learning_rate": 8.634355509355511e-06, "loss": 0.7473, "num_input_tokens_seen": 120336, "step": 1330 }, { "epoch": 0.34693347193347196, "grad_norm": 27.01479148864746, "learning_rate": 8.666839916839917e-06, "loss": 0.4736, "num_input_tokens_seen": 120784, "step": 1335 }, { "epoch": 0.3482328482328482, "grad_norm": 12.552734375, "learning_rate": 8.699324324324325e-06, "loss": 0.4615, "num_input_tokens_seen": 121216, "step": 1340 }, { "epoch": 0.34953222453222454, "grad_norm": 12.01845932006836, "learning_rate": 8.731808731808733e-06, "loss": 0.2166, "num_input_tokens_seen": 121632, "step": 1345 }, { "epoch": 0.3508316008316008, "grad_norm": 3.275070905685425, "learning_rate": 8.76429313929314e-06, "loss": 0.5177, "num_input_tokens_seen": 122064, "step": 1350 }, { "epoch": 0.35213097713097713, "grad_norm": 58.5206413269043, "learning_rate": 8.796777546777547e-06, "loss": 0.5488, "num_input_tokens_seen": 122512, "step": 1355 }, { "epoch": 0.35343035343035345, "grad_norm": 4.489483833312988, "learning_rate": 8.829261954261955e-06, "loss": 0.426, "num_input_tokens_seen": 122944, "step": 1360 }, { "epoch": 0.3547297297297297, "grad_norm": 19.180728912353516, "learning_rate": 8.861746361746362e-06, "loss": 0.3313, "num_input_tokens_seen": 123360, "step": 1365 }, { "epoch": 0.35602910602910603, "grad_norm": 6.395532608032227, "learning_rate": 8.89423076923077e-06, "loss": 0.2855, "num_input_tokens_seen": 123808, "step": 1370 }, { "epoch": 0.35732848232848236, "grad_norm": 6.822404384613037, "learning_rate": 8.926715176715176e-06, "loss": 0.2169, "num_input_tokens_seen": 124240, "step": 1375 }, { "epoch": 0.3586278586278586, "grad_norm": 2.0576117038726807, "learning_rate": 8.959199584199585e-06, "loss": 0.2462, "num_input_tokens_seen": 124720, "step": 1380 }, { "epoch": 0.35992723492723494, "grad_norm": 0.24725136160850525, "learning_rate": 8.991683991683993e-06, "loss": 0.018, "num_input_tokens_seen": 125168, "step": 1385 }, { "epoch": 0.3612266112266112, "grad_norm": 33.67279815673828, "learning_rate": 9.0241683991684e-06, "loss": 0.9528, "num_input_tokens_seen": 125648, "step": 1390 }, { "epoch": 0.3625259875259875, "grad_norm": 21.955867767333984, "learning_rate": 9.056652806652808e-06, "loss": 0.5186, "num_input_tokens_seen": 126160, "step": 1395 }, { "epoch": 0.36382536382536385, "grad_norm": 48.04606246948242, "learning_rate": 9.089137214137216e-06, "loss": 0.4338, "num_input_tokens_seen": 126608, "step": 1400 }, { "epoch": 0.3651247401247401, "grad_norm": 60.993465423583984, "learning_rate": 9.121621621621622e-06, "loss": 0.2978, "num_input_tokens_seen": 127056, "step": 1405 }, { "epoch": 0.36642411642411643, "grad_norm": 2.4093360900878906, "learning_rate": 9.15410602910603e-06, "loss": 0.1989, "num_input_tokens_seen": 127504, "step": 1410 }, { "epoch": 0.36772349272349275, "grad_norm": 0.3145374357700348, "learning_rate": 9.186590436590436e-06, "loss": 0.0043, "num_input_tokens_seen": 127936, "step": 1415 }, { "epoch": 0.369022869022869, "grad_norm": 22.641023635864258, "learning_rate": 9.219074844074844e-06, "loss": 0.576, "num_input_tokens_seen": 128368, "step": 1420 }, { "epoch": 0.37032224532224534, "grad_norm": 0.14763346314430237, "learning_rate": 9.251559251559252e-06, "loss": 0.0015, "num_input_tokens_seen": 128816, "step": 1425 }, { "epoch": 0.3716216216216216, "grad_norm": 17.89459800720215, "learning_rate": 9.28404365904366e-06, "loss": 0.4519, "num_input_tokens_seen": 129264, "step": 1430 }, { "epoch": 0.3729209979209979, "grad_norm": 14.312708854675293, "learning_rate": 9.316528066528068e-06, "loss": 0.3652, "num_input_tokens_seen": 129744, "step": 1435 }, { "epoch": 0.37422037422037424, "grad_norm": 1.8839209079742432, "learning_rate": 9.349012474012476e-06, "loss": 0.4234, "num_input_tokens_seen": 130208, "step": 1440 }, { "epoch": 0.3755197505197505, "grad_norm": 11.094046592712402, "learning_rate": 9.381496881496882e-06, "loss": 0.4155, "num_input_tokens_seen": 130640, "step": 1445 }, { "epoch": 0.3768191268191268, "grad_norm": 3.858970880508423, "learning_rate": 9.41398128898129e-06, "loss": 0.2927, "num_input_tokens_seen": 131088, "step": 1450 }, { "epoch": 0.3781185031185031, "grad_norm": 2.784972667694092, "learning_rate": 9.446465696465696e-06, "loss": 0.2669, "num_input_tokens_seen": 131504, "step": 1455 }, { "epoch": 0.3794178794178794, "grad_norm": 6.821401596069336, "learning_rate": 9.478950103950104e-06, "loss": 0.2483, "num_input_tokens_seen": 131968, "step": 1460 }, { "epoch": 0.38071725571725573, "grad_norm": 11.747190475463867, "learning_rate": 9.511434511434512e-06, "loss": 0.2152, "num_input_tokens_seen": 132400, "step": 1465 }, { "epoch": 0.382016632016632, "grad_norm": 15.113099098205566, "learning_rate": 9.543918918918919e-06, "loss": 0.689, "num_input_tokens_seen": 132864, "step": 1470 }, { "epoch": 0.3833160083160083, "grad_norm": 0.8919235467910767, "learning_rate": 9.576403326403327e-06, "loss": 0.3742, "num_input_tokens_seen": 133376, "step": 1475 }, { "epoch": 0.38461538461538464, "grad_norm": 22.717350006103516, "learning_rate": 9.608887733887734e-06, "loss": 0.4327, "num_input_tokens_seen": 133792, "step": 1480 }, { "epoch": 0.3859147609147609, "grad_norm": 0.35171082615852356, "learning_rate": 9.641372141372142e-06, "loss": 0.1998, "num_input_tokens_seen": 134224, "step": 1485 }, { "epoch": 0.3872141372141372, "grad_norm": 0.3527545928955078, "learning_rate": 9.67385654885655e-06, "loss": 0.5326, "num_input_tokens_seen": 134688, "step": 1490 }, { "epoch": 0.3885135135135135, "grad_norm": 18.06315040588379, "learning_rate": 9.706340956340957e-06, "loss": 0.7441, "num_input_tokens_seen": 135120, "step": 1495 }, { "epoch": 0.3898128898128898, "grad_norm": 60.6504020690918, "learning_rate": 9.738825363825365e-06, "loss": 0.4481, "num_input_tokens_seen": 135552, "step": 1500 }, { "epoch": 0.3911122661122661, "grad_norm": 10.532940864562988, "learning_rate": 9.771309771309773e-06, "loss": 0.7465, "num_input_tokens_seen": 136016, "step": 1505 }, { "epoch": 0.3924116424116424, "grad_norm": 3.328155279159546, "learning_rate": 9.803794178794179e-06, "loss": 0.3186, "num_input_tokens_seen": 136448, "step": 1510 }, { "epoch": 0.3937110187110187, "grad_norm": 4.449581623077393, "learning_rate": 9.836278586278587e-06, "loss": 0.3949, "num_input_tokens_seen": 136896, "step": 1515 }, { "epoch": 0.39501039501039503, "grad_norm": 3.3305206298828125, "learning_rate": 9.868762993762993e-06, "loss": 0.2186, "num_input_tokens_seen": 137376, "step": 1520 }, { "epoch": 0.3963097713097713, "grad_norm": 6.510509014129639, "learning_rate": 9.901247401247401e-06, "loss": 0.5653, "num_input_tokens_seen": 137792, "step": 1525 }, { "epoch": 0.3976091476091476, "grad_norm": 1.23641037940979, "learning_rate": 9.933731808731809e-06, "loss": 0.2772, "num_input_tokens_seen": 138224, "step": 1530 }, { "epoch": 0.3989085239085239, "grad_norm": 4.106539249420166, "learning_rate": 9.966216216216217e-06, "loss": 0.4003, "num_input_tokens_seen": 138656, "step": 1535 }, { "epoch": 0.4002079002079002, "grad_norm": 1.7889118194580078, "learning_rate": 9.998700623700625e-06, "loss": 0.3198, "num_input_tokens_seen": 139120, "step": 1540 }, { "epoch": 0.4015072765072765, "grad_norm": 10.929959297180176, "learning_rate": 1.0031185031185033e-05, "loss": 0.3247, "num_input_tokens_seen": 139568, "step": 1545 }, { "epoch": 0.4028066528066528, "grad_norm": 0.8253903388977051, "learning_rate": 1.0063669438669439e-05, "loss": 0.3817, "num_input_tokens_seen": 140032, "step": 1550 }, { "epoch": 0.4041060291060291, "grad_norm": 1.4060564041137695, "learning_rate": 1.0096153846153847e-05, "loss": 0.286, "num_input_tokens_seen": 140480, "step": 1555 }, { "epoch": 0.40540540540540543, "grad_norm": 1.84172523021698, "learning_rate": 1.0128638253638253e-05, "loss": 0.279, "num_input_tokens_seen": 140896, "step": 1560 }, { "epoch": 0.4067047817047817, "grad_norm": 4.252024173736572, "learning_rate": 1.0161122661122661e-05, "loss": 0.2592, "num_input_tokens_seen": 141328, "step": 1565 }, { "epoch": 0.408004158004158, "grad_norm": 3.864443063735962, "learning_rate": 1.019360706860707e-05, "loss": 0.2813, "num_input_tokens_seen": 141792, "step": 1570 }, { "epoch": 0.4093035343035343, "grad_norm": 0.8663985729217529, "learning_rate": 1.0226091476091476e-05, "loss": 0.3268, "num_input_tokens_seen": 142272, "step": 1575 }, { "epoch": 0.4106029106029106, "grad_norm": 4.250802993774414, "learning_rate": 1.0258575883575884e-05, "loss": 0.2892, "num_input_tokens_seen": 142752, "step": 1580 }, { "epoch": 0.4119022869022869, "grad_norm": 3.355073928833008, "learning_rate": 1.0291060291060291e-05, "loss": 0.3351, "num_input_tokens_seen": 143200, "step": 1585 }, { "epoch": 0.4132016632016632, "grad_norm": 3.1847009658813477, "learning_rate": 1.03235446985447e-05, "loss": 0.271, "num_input_tokens_seen": 143648, "step": 1590 }, { "epoch": 0.4145010395010395, "grad_norm": 2.5425467491149902, "learning_rate": 1.0356029106029107e-05, "loss": 0.2133, "num_input_tokens_seen": 144128, "step": 1595 }, { "epoch": 0.4158004158004158, "grad_norm": 3.281196117401123, "learning_rate": 1.0388513513513514e-05, "loss": 0.2743, "num_input_tokens_seen": 144576, "step": 1600 }, { "epoch": 0.4170997920997921, "grad_norm": 3.568016529083252, "learning_rate": 1.0420997920997922e-05, "loss": 0.2322, "num_input_tokens_seen": 145008, "step": 1605 }, { "epoch": 0.4183991683991684, "grad_norm": 0.8200395703315735, "learning_rate": 1.045348232848233e-05, "loss": 0.1371, "num_input_tokens_seen": 145440, "step": 1610 }, { "epoch": 0.4196985446985447, "grad_norm": 0.26280081272125244, "learning_rate": 1.0485966735966736e-05, "loss": 0.1703, "num_input_tokens_seen": 145904, "step": 1615 }, { "epoch": 0.420997920997921, "grad_norm": 0.1150948777794838, "learning_rate": 1.0518451143451144e-05, "loss": 0.1999, "num_input_tokens_seen": 146400, "step": 1620 }, { "epoch": 0.4222972972972973, "grad_norm": 0.1409635841846466, "learning_rate": 1.0550935550935552e-05, "loss": 0.6232, "num_input_tokens_seen": 146864, "step": 1625 }, { "epoch": 0.4235966735966736, "grad_norm": 0.23007076978683472, "learning_rate": 1.0583419958419958e-05, "loss": 0.1925, "num_input_tokens_seen": 147312, "step": 1630 }, { "epoch": 0.4248960498960499, "grad_norm": 4.452457427978516, "learning_rate": 1.0615904365904366e-05, "loss": 0.5416, "num_input_tokens_seen": 147744, "step": 1635 }, { "epoch": 0.4261954261954262, "grad_norm": 3.271984100341797, "learning_rate": 1.0648388773388774e-05, "loss": 0.4471, "num_input_tokens_seen": 148240, "step": 1640 }, { "epoch": 0.4274948024948025, "grad_norm": 1.7360289096832275, "learning_rate": 1.0680873180873182e-05, "loss": 0.2304, "num_input_tokens_seen": 148656, "step": 1645 }, { "epoch": 0.4287941787941788, "grad_norm": 7.370316982269287, "learning_rate": 1.071335758835759e-05, "loss": 0.227, "num_input_tokens_seen": 149088, "step": 1650 }, { "epoch": 0.43009355509355507, "grad_norm": 0.8776435256004333, "learning_rate": 1.0745841995841996e-05, "loss": 0.2969, "num_input_tokens_seen": 149536, "step": 1655 }, { "epoch": 0.4313929313929314, "grad_norm": 6.4300079345703125, "learning_rate": 1.0778326403326404e-05, "loss": 0.4131, "num_input_tokens_seen": 149968, "step": 1660 }, { "epoch": 0.4326923076923077, "grad_norm": 0.5566754937171936, "learning_rate": 1.0810810810810812e-05, "loss": 0.1475, "num_input_tokens_seen": 150448, "step": 1665 }, { "epoch": 0.433991683991684, "grad_norm": 7.300588607788086, "learning_rate": 1.0843295218295218e-05, "loss": 0.4126, "num_input_tokens_seen": 150896, "step": 1670 }, { "epoch": 0.4352910602910603, "grad_norm": 0.5475958585739136, "learning_rate": 1.0875779625779626e-05, "loss": 0.4592, "num_input_tokens_seen": 151344, "step": 1675 }, { "epoch": 0.4365904365904366, "grad_norm": 3.4077744483947754, "learning_rate": 1.0908264033264033e-05, "loss": 0.5658, "num_input_tokens_seen": 151808, "step": 1680 }, { "epoch": 0.4378898128898129, "grad_norm": 1.866682767868042, "learning_rate": 1.094074844074844e-05, "loss": 0.4772, "num_input_tokens_seen": 152304, "step": 1685 }, { "epoch": 0.4391891891891892, "grad_norm": 1.4640748500823975, "learning_rate": 1.0973232848232848e-05, "loss": 0.2322, "num_input_tokens_seen": 152752, "step": 1690 }, { "epoch": 0.44048856548856546, "grad_norm": 3.398298501968384, "learning_rate": 1.1005717255717256e-05, "loss": 0.2961, "num_input_tokens_seen": 153216, "step": 1695 }, { "epoch": 0.4417879417879418, "grad_norm": 0.8414958119392395, "learning_rate": 1.1038201663201664e-05, "loss": 0.2933, "num_input_tokens_seen": 153632, "step": 1700 }, { "epoch": 0.4430873180873181, "grad_norm": 3.7914459705352783, "learning_rate": 1.1070686070686072e-05, "loss": 0.3081, "num_input_tokens_seen": 154048, "step": 1705 }, { "epoch": 0.44438669438669437, "grad_norm": 3.4012668132781982, "learning_rate": 1.1103170478170479e-05, "loss": 0.302, "num_input_tokens_seen": 154496, "step": 1710 }, { "epoch": 0.4456860706860707, "grad_norm": 3.2816267013549805, "learning_rate": 1.1135654885654887e-05, "loss": 0.2831, "num_input_tokens_seen": 154896, "step": 1715 }, { "epoch": 0.446985446985447, "grad_norm": 1.097791075706482, "learning_rate": 1.1168139293139293e-05, "loss": 0.282, "num_input_tokens_seen": 155392, "step": 1720 }, { "epoch": 0.4482848232848233, "grad_norm": 3.9340875148773193, "learning_rate": 1.12006237006237e-05, "loss": 0.2661, "num_input_tokens_seen": 155856, "step": 1725 }, { "epoch": 0.4495841995841996, "grad_norm": 2.592378854751587, "learning_rate": 1.1233108108108109e-05, "loss": 0.277, "num_input_tokens_seen": 156320, "step": 1730 }, { "epoch": 0.45088357588357586, "grad_norm": 2.2185897827148438, "learning_rate": 1.1265592515592515e-05, "loss": 0.2806, "num_input_tokens_seen": 156800, "step": 1735 }, { "epoch": 0.4521829521829522, "grad_norm": 5.698373317718506, "learning_rate": 1.1298076923076923e-05, "loss": 0.2616, "num_input_tokens_seen": 157264, "step": 1740 }, { "epoch": 0.4534823284823285, "grad_norm": 5.226992130279541, "learning_rate": 1.1330561330561331e-05, "loss": 0.2893, "num_input_tokens_seen": 157712, "step": 1745 }, { "epoch": 0.45478170478170477, "grad_norm": 1.1887578964233398, "learning_rate": 1.1363045738045739e-05, "loss": 0.3529, "num_input_tokens_seen": 158192, "step": 1750 }, { "epoch": 0.4560810810810811, "grad_norm": 4.3782525062561035, "learning_rate": 1.1395530145530147e-05, "loss": 0.4121, "num_input_tokens_seen": 158624, "step": 1755 }, { "epoch": 0.4573804573804574, "grad_norm": 3.3087217807769775, "learning_rate": 1.1428014553014553e-05, "loss": 0.3055, "num_input_tokens_seen": 159104, "step": 1760 }, { "epoch": 0.45867983367983367, "grad_norm": 2.3408849239349365, "learning_rate": 1.1460498960498961e-05, "loss": 0.415, "num_input_tokens_seen": 159536, "step": 1765 }, { "epoch": 0.45997920997921, "grad_norm": 2.477445125579834, "learning_rate": 1.1492983367983369e-05, "loss": 0.2133, "num_input_tokens_seen": 160000, "step": 1770 }, { "epoch": 0.46127858627858626, "grad_norm": 2.4187729358673096, "learning_rate": 1.1525467775467775e-05, "loss": 0.2307, "num_input_tokens_seen": 160448, "step": 1775 }, { "epoch": 0.4625779625779626, "grad_norm": 1.5579341650009155, "learning_rate": 1.1557952182952183e-05, "loss": 0.2754, "num_input_tokens_seen": 160896, "step": 1780 }, { "epoch": 0.4638773388773389, "grad_norm": 3.796355724334717, "learning_rate": 1.1590436590436591e-05, "loss": 0.299, "num_input_tokens_seen": 161360, "step": 1785 }, { "epoch": 0.46517671517671516, "grad_norm": 0.5514821410179138, "learning_rate": 1.1622920997920998e-05, "loss": 0.3227, "num_input_tokens_seen": 161824, "step": 1790 }, { "epoch": 0.4664760914760915, "grad_norm": 4.132238388061523, "learning_rate": 1.1655405405405405e-05, "loss": 0.3732, "num_input_tokens_seen": 162288, "step": 1795 }, { "epoch": 0.4677754677754678, "grad_norm": 3.082573652267456, "learning_rate": 1.1687889812889813e-05, "loss": 0.2413, "num_input_tokens_seen": 162768, "step": 1800 }, { "epoch": 0.46907484407484407, "grad_norm": 3.605388879776001, "learning_rate": 1.1720374220374221e-05, "loss": 0.1531, "num_input_tokens_seen": 163248, "step": 1805 }, { "epoch": 0.4703742203742204, "grad_norm": 0.5648653507232666, "learning_rate": 1.175285862785863e-05, "loss": 0.4559, "num_input_tokens_seen": 163712, "step": 1810 }, { "epoch": 0.47167359667359665, "grad_norm": 3.202906847000122, "learning_rate": 1.1785343035343036e-05, "loss": 0.3159, "num_input_tokens_seen": 164128, "step": 1815 }, { "epoch": 0.47297297297297297, "grad_norm": 0.7738898396492004, "learning_rate": 1.1817827442827444e-05, "loss": 0.3276, "num_input_tokens_seen": 164576, "step": 1820 }, { "epoch": 0.4742723492723493, "grad_norm": 2.1587607860565186, "learning_rate": 1.1850311850311852e-05, "loss": 0.3361, "num_input_tokens_seen": 165008, "step": 1825 }, { "epoch": 0.47557172557172556, "grad_norm": 1.6063964366912842, "learning_rate": 1.1882796257796258e-05, "loss": 0.2706, "num_input_tokens_seen": 165456, "step": 1830 }, { "epoch": 0.4768711018711019, "grad_norm": 2.364854335784912, "learning_rate": 1.1915280665280666e-05, "loss": 0.2236, "num_input_tokens_seen": 165888, "step": 1835 }, { "epoch": 0.4781704781704782, "grad_norm": 1.3248852491378784, "learning_rate": 1.1947765072765072e-05, "loss": 0.3084, "num_input_tokens_seen": 166320, "step": 1840 }, { "epoch": 0.47946985446985446, "grad_norm": 3.195805788040161, "learning_rate": 1.198024948024948e-05, "loss": 0.2556, "num_input_tokens_seen": 166800, "step": 1845 }, { "epoch": 0.4807692307692308, "grad_norm": 1.8197885751724243, "learning_rate": 1.2012733887733888e-05, "loss": 0.2024, "num_input_tokens_seen": 167248, "step": 1850 }, { "epoch": 0.48206860706860705, "grad_norm": 1.3366034030914307, "learning_rate": 1.2045218295218296e-05, "loss": 0.329, "num_input_tokens_seen": 167680, "step": 1855 }, { "epoch": 0.48336798336798337, "grad_norm": 1.4058109521865845, "learning_rate": 1.2077702702702704e-05, "loss": 0.1667, "num_input_tokens_seen": 168112, "step": 1860 }, { "epoch": 0.4846673596673597, "grad_norm": 1.3531534671783447, "learning_rate": 1.2110187110187112e-05, "loss": 0.3772, "num_input_tokens_seen": 168576, "step": 1865 }, { "epoch": 0.48596673596673595, "grad_norm": 3.456672430038452, "learning_rate": 1.2142671517671518e-05, "loss": 0.3344, "num_input_tokens_seen": 169024, "step": 1870 }, { "epoch": 0.4872661122661123, "grad_norm": 2.2562272548675537, "learning_rate": 1.2175155925155926e-05, "loss": 0.2381, "num_input_tokens_seen": 169440, "step": 1875 }, { "epoch": 0.4885654885654886, "grad_norm": 3.3289785385131836, "learning_rate": 1.2207640332640332e-05, "loss": 0.3043, "num_input_tokens_seen": 169888, "step": 1880 }, { "epoch": 0.48986486486486486, "grad_norm": 3.258950710296631, "learning_rate": 1.224012474012474e-05, "loss": 0.3004, "num_input_tokens_seen": 170336, "step": 1885 }, { "epoch": 0.4911642411642412, "grad_norm": 1.0701372623443604, "learning_rate": 1.2272609147609148e-05, "loss": 0.2659, "num_input_tokens_seen": 170800, "step": 1890 }, { "epoch": 0.49246361746361744, "grad_norm": 1.7519508600234985, "learning_rate": 1.2305093555093555e-05, "loss": 0.249, "num_input_tokens_seen": 171264, "step": 1895 }, { "epoch": 0.49376299376299376, "grad_norm": 6.552335739135742, "learning_rate": 1.2337577962577962e-05, "loss": 0.2863, "num_input_tokens_seen": 171728, "step": 1900 }, { "epoch": 0.4950623700623701, "grad_norm": 0.7573915719985962, "learning_rate": 1.2370062370062372e-05, "loss": 0.3674, "num_input_tokens_seen": 172176, "step": 1905 }, { "epoch": 0.49636174636174635, "grad_norm": 0.3436911404132843, "learning_rate": 1.2402546777546778e-05, "loss": 0.3861, "num_input_tokens_seen": 172592, "step": 1910 }, { "epoch": 0.49766112266112267, "grad_norm": 9.358720779418945, "learning_rate": 1.2435031185031186e-05, "loss": 0.8701, "num_input_tokens_seen": 173056, "step": 1915 }, { "epoch": 0.498960498960499, "grad_norm": 5.514165878295898, "learning_rate": 1.2467515592515593e-05, "loss": 0.2721, "num_input_tokens_seen": 173520, "step": 1920 }, { "epoch": 0.5002598752598753, "grad_norm": 0.9608751535415649, "learning_rate": 1.25e-05, "loss": 0.323, "num_input_tokens_seen": 173968, "step": 1925 }, { "epoch": 0.5015592515592515, "grad_norm": 0.8306057453155518, "learning_rate": 1.2532484407484407e-05, "loss": 0.3638, "num_input_tokens_seen": 174432, "step": 1930 }, { "epoch": 0.5028586278586279, "grad_norm": 2.379908561706543, "learning_rate": 1.2564968814968817e-05, "loss": 0.2279, "num_input_tokens_seen": 174896, "step": 1935 }, { "epoch": 0.5041580041580042, "grad_norm": 1.3962714672088623, "learning_rate": 1.2597453222453223e-05, "loss": 0.4306, "num_input_tokens_seen": 175328, "step": 1940 }, { "epoch": 0.5054573804573804, "grad_norm": 1.7856347560882568, "learning_rate": 1.2629937629937629e-05, "loss": 0.2341, "num_input_tokens_seen": 175792, "step": 1945 }, { "epoch": 0.5067567567567568, "grad_norm": 1.463149070739746, "learning_rate": 1.2662422037422039e-05, "loss": 0.2839, "num_input_tokens_seen": 176240, "step": 1950 }, { "epoch": 0.5080561330561331, "grad_norm": 1.3701974153518677, "learning_rate": 1.2694906444906447e-05, "loss": 0.1848, "num_input_tokens_seen": 176672, "step": 1955 }, { "epoch": 0.5093555093555093, "grad_norm": 2.1785619258880615, "learning_rate": 1.2727390852390853e-05, "loss": 0.2749, "num_input_tokens_seen": 177152, "step": 1960 }, { "epoch": 0.5106548856548857, "grad_norm": 0.5812851190567017, "learning_rate": 1.275987525987526e-05, "loss": 0.1397, "num_input_tokens_seen": 177568, "step": 1965 }, { "epoch": 0.511954261954262, "grad_norm": 0.3913922607898712, "learning_rate": 1.2792359667359669e-05, "loss": 0.2573, "num_input_tokens_seen": 178000, "step": 1970 }, { "epoch": 0.5132536382536382, "grad_norm": 2.995216131210327, "learning_rate": 1.2824844074844075e-05, "loss": 0.2861, "num_input_tokens_seen": 178448, "step": 1975 }, { "epoch": 0.5145530145530145, "grad_norm": 0.41884151101112366, "learning_rate": 1.2857328482328481e-05, "loss": 0.2399, "num_input_tokens_seen": 178896, "step": 1980 }, { "epoch": 0.5158523908523909, "grad_norm": 2.558621883392334, "learning_rate": 1.2889812889812891e-05, "loss": 0.3686, "num_input_tokens_seen": 179344, "step": 1985 }, { "epoch": 0.5171517671517671, "grad_norm": 2.2179276943206787, "learning_rate": 1.2922297297297297e-05, "loss": 0.3409, "num_input_tokens_seen": 179840, "step": 1990 }, { "epoch": 0.5184511434511434, "grad_norm": 2.0841856002807617, "learning_rate": 1.2954781704781704e-05, "loss": 0.1412, "num_input_tokens_seen": 180272, "step": 1995 }, { "epoch": 0.5197505197505198, "grad_norm": 3.858668565750122, "learning_rate": 1.2987266112266113e-05, "loss": 0.4739, "num_input_tokens_seen": 180736, "step": 2000 }, { "epoch": 0.521049896049896, "grad_norm": 1.1919540166854858, "learning_rate": 1.3019750519750521e-05, "loss": 0.3564, "num_input_tokens_seen": 181184, "step": 2005 }, { "epoch": 0.5223492723492723, "grad_norm": 0.5664909482002258, "learning_rate": 1.3052234927234927e-05, "loss": 0.2737, "num_input_tokens_seen": 181648, "step": 2010 }, { "epoch": 0.5236486486486487, "grad_norm": 3.9449596405029297, "learning_rate": 1.3084719334719337e-05, "loss": 0.3817, "num_input_tokens_seen": 182112, "step": 2015 }, { "epoch": 0.524948024948025, "grad_norm": 3.0238089561462402, "learning_rate": 1.3117203742203743e-05, "loss": 0.34, "num_input_tokens_seen": 182544, "step": 2020 }, { "epoch": 0.5262474012474012, "grad_norm": 3.94118595123291, "learning_rate": 1.314968814968815e-05, "loss": 0.2985, "num_input_tokens_seen": 182960, "step": 2025 }, { "epoch": 0.5275467775467776, "grad_norm": 1.5913641452789307, "learning_rate": 1.318217255717256e-05, "loss": 0.2667, "num_input_tokens_seen": 183392, "step": 2030 }, { "epoch": 0.5288461538461539, "grad_norm": 1.8363103866577148, "learning_rate": 1.3214656964656966e-05, "loss": 0.2259, "num_input_tokens_seen": 183856, "step": 2035 }, { "epoch": 0.5301455301455301, "grad_norm": 1.6571433544158936, "learning_rate": 1.3247141372141372e-05, "loss": 0.2233, "num_input_tokens_seen": 184320, "step": 2040 }, { "epoch": 0.5314449064449065, "grad_norm": 1.7741318941116333, "learning_rate": 1.3279625779625778e-05, "loss": 0.3215, "num_input_tokens_seen": 184752, "step": 2045 }, { "epoch": 0.5327442827442828, "grad_norm": 1.760062336921692, "learning_rate": 1.3312110187110188e-05, "loss": 0.1922, "num_input_tokens_seen": 185200, "step": 2050 }, { "epoch": 0.534043659043659, "grad_norm": 1.5278565883636475, "learning_rate": 1.3344594594594596e-05, "loss": 0.3431, "num_input_tokens_seen": 185632, "step": 2055 }, { "epoch": 0.5353430353430353, "grad_norm": 3.046398878097534, "learning_rate": 1.3377079002079002e-05, "loss": 0.2792, "num_input_tokens_seen": 186064, "step": 2060 }, { "epoch": 0.5366424116424117, "grad_norm": 1.0690884590148926, "learning_rate": 1.3409563409563412e-05, "loss": 0.2746, "num_input_tokens_seen": 186480, "step": 2065 }, { "epoch": 0.5379417879417879, "grad_norm": 0.5375344157218933, "learning_rate": 1.3442047817047818e-05, "loss": 0.2795, "num_input_tokens_seen": 186928, "step": 2070 }, { "epoch": 0.5392411642411642, "grad_norm": 0.9343839883804321, "learning_rate": 1.3474532224532224e-05, "loss": 0.2861, "num_input_tokens_seen": 187392, "step": 2075 }, { "epoch": 0.5405405405405406, "grad_norm": 3.7156429290771484, "learning_rate": 1.3507016632016634e-05, "loss": 0.405, "num_input_tokens_seen": 187840, "step": 2080 }, { "epoch": 0.5418399168399168, "grad_norm": 2.323223114013672, "learning_rate": 1.353950103950104e-05, "loss": 0.3071, "num_input_tokens_seen": 188320, "step": 2085 }, { "epoch": 0.5431392931392931, "grad_norm": 1.3156801462173462, "learning_rate": 1.3571985446985446e-05, "loss": 0.1696, "num_input_tokens_seen": 188784, "step": 2090 }, { "epoch": 0.5444386694386695, "grad_norm": 4.619382858276367, "learning_rate": 1.3604469854469856e-05, "loss": 0.2452, "num_input_tokens_seen": 189216, "step": 2095 }, { "epoch": 0.5457380457380457, "grad_norm": 0.25099891424179077, "learning_rate": 1.3636954261954262e-05, "loss": 0.321, "num_input_tokens_seen": 189648, "step": 2100 }, { "epoch": 0.547037422037422, "grad_norm": 0.2536044716835022, "learning_rate": 1.366943866943867e-05, "loss": 0.498, "num_input_tokens_seen": 190096, "step": 2105 }, { "epoch": 0.5483367983367984, "grad_norm": 2.9614765644073486, "learning_rate": 1.3701923076923078e-05, "loss": 0.5702, "num_input_tokens_seen": 190592, "step": 2110 }, { "epoch": 0.5496361746361746, "grad_norm": 0.5112137198448181, "learning_rate": 1.3734407484407486e-05, "loss": 0.0351, "num_input_tokens_seen": 191024, "step": 2115 }, { "epoch": 0.5509355509355509, "grad_norm": 2.768092632293701, "learning_rate": 1.3766891891891892e-05, "loss": 0.4396, "num_input_tokens_seen": 191456, "step": 2120 }, { "epoch": 0.5522349272349273, "grad_norm": 2.586953639984131, "learning_rate": 1.3799376299376299e-05, "loss": 0.2458, "num_input_tokens_seen": 191920, "step": 2125 }, { "epoch": 0.5535343035343036, "grad_norm": 2.37839937210083, "learning_rate": 1.3831860706860708e-05, "loss": 0.3172, "num_input_tokens_seen": 192384, "step": 2130 }, { "epoch": 0.5548336798336798, "grad_norm": 2.1151585578918457, "learning_rate": 1.3864345114345115e-05, "loss": 0.3236, "num_input_tokens_seen": 192848, "step": 2135 }, { "epoch": 0.5561330561330561, "grad_norm": 1.9594858884811401, "learning_rate": 1.3896829521829521e-05, "loss": 0.2624, "num_input_tokens_seen": 193360, "step": 2140 }, { "epoch": 0.5574324324324325, "grad_norm": 0.9003175497055054, "learning_rate": 1.392931392931393e-05, "loss": 0.2816, "num_input_tokens_seen": 193792, "step": 2145 }, { "epoch": 0.5587318087318087, "grad_norm": 1.2594473361968994, "learning_rate": 1.3961798336798337e-05, "loss": 0.2969, "num_input_tokens_seen": 194256, "step": 2150 }, { "epoch": 0.560031185031185, "grad_norm": 0.5138112306594849, "learning_rate": 1.3994282744282745e-05, "loss": 0.2895, "num_input_tokens_seen": 194736, "step": 2155 }, { "epoch": 0.5613305613305614, "grad_norm": 1.424521565437317, "learning_rate": 1.4026767151767153e-05, "loss": 0.307, "num_input_tokens_seen": 195184, "step": 2160 }, { "epoch": 0.5626299376299376, "grad_norm": 0.7587670087814331, "learning_rate": 1.405925155925156e-05, "loss": 0.3648, "num_input_tokens_seen": 195632, "step": 2165 }, { "epoch": 0.5639293139293139, "grad_norm": 2.9391040802001953, "learning_rate": 1.4091735966735967e-05, "loss": 0.2859, "num_input_tokens_seen": 196048, "step": 2170 }, { "epoch": 0.5652286902286903, "grad_norm": 2.014838457107544, "learning_rate": 1.4124220374220377e-05, "loss": 0.2661, "num_input_tokens_seen": 196480, "step": 2175 }, { "epoch": 0.5665280665280665, "grad_norm": 2.515575408935547, "learning_rate": 1.4156704781704783e-05, "loss": 0.296, "num_input_tokens_seen": 196912, "step": 2180 }, { "epoch": 0.5678274428274428, "grad_norm": 1.0873377323150635, "learning_rate": 1.4189189189189189e-05, "loss": 0.26, "num_input_tokens_seen": 197376, "step": 2185 }, { "epoch": 0.5691268191268192, "grad_norm": 1.4830100536346436, "learning_rate": 1.4221673596673599e-05, "loss": 0.2552, "num_input_tokens_seen": 197808, "step": 2190 }, { "epoch": 0.5704261954261954, "grad_norm": 1.8480886220932007, "learning_rate": 1.4254158004158005e-05, "loss": 0.2951, "num_input_tokens_seen": 198256, "step": 2195 }, { "epoch": 0.5717255717255717, "grad_norm": 1.29442298412323, "learning_rate": 1.4286642411642411e-05, "loss": 0.2793, "num_input_tokens_seen": 198704, "step": 2200 }, { "epoch": 0.5730249480249481, "grad_norm": 2.2351458072662354, "learning_rate": 1.431912681912682e-05, "loss": 0.2825, "num_input_tokens_seen": 199136, "step": 2205 }, { "epoch": 0.5743243243243243, "grad_norm": 2.762031078338623, "learning_rate": 1.4351611226611227e-05, "loss": 0.2904, "num_input_tokens_seen": 199568, "step": 2210 }, { "epoch": 0.5756237006237006, "grad_norm": 2.4770679473876953, "learning_rate": 1.4384095634095635e-05, "loss": 0.2855, "num_input_tokens_seen": 200016, "step": 2215 }, { "epoch": 0.5769230769230769, "grad_norm": 1.91924250125885, "learning_rate": 1.4416580041580041e-05, "loss": 0.2436, "num_input_tokens_seen": 200480, "step": 2220 }, { "epoch": 0.5782224532224532, "grad_norm": 1.6884247064590454, "learning_rate": 1.4449064449064451e-05, "loss": 0.2525, "num_input_tokens_seen": 200960, "step": 2225 }, { "epoch": 0.5795218295218295, "grad_norm": 1.6499378681182861, "learning_rate": 1.4481548856548857e-05, "loss": 0.2779, "num_input_tokens_seen": 201392, "step": 2230 }, { "epoch": 0.5808212058212058, "grad_norm": 1.5450466871261597, "learning_rate": 1.4514033264033264e-05, "loss": 0.3055, "num_input_tokens_seen": 201840, "step": 2235 }, { "epoch": 0.5821205821205822, "grad_norm": 1.3945363759994507, "learning_rate": 1.4546517671517673e-05, "loss": 0.2275, "num_input_tokens_seen": 202336, "step": 2240 }, { "epoch": 0.5834199584199584, "grad_norm": 1.3867671489715576, "learning_rate": 1.457900207900208e-05, "loss": 0.2669, "num_input_tokens_seen": 202768, "step": 2245 }, { "epoch": 0.5847193347193347, "grad_norm": 1.3427993059158325, "learning_rate": 1.4611486486486486e-05, "loss": 0.1707, "num_input_tokens_seen": 203200, "step": 2250 }, { "epoch": 0.5860187110187111, "grad_norm": 1.6780028343200684, "learning_rate": 1.4643970893970896e-05, "loss": 0.3244, "num_input_tokens_seen": 203632, "step": 2255 }, { "epoch": 0.5873180873180873, "grad_norm": 1.3340959548950195, "learning_rate": 1.4676455301455302e-05, "loss": 0.2726, "num_input_tokens_seen": 204096, "step": 2260 }, { "epoch": 0.5886174636174636, "grad_norm": 1.381375789642334, "learning_rate": 1.470893970893971e-05, "loss": 0.3612, "num_input_tokens_seen": 204512, "step": 2265 }, { "epoch": 0.58991683991684, "grad_norm": 1.810633897781372, "learning_rate": 1.4741424116424118e-05, "loss": 0.2707, "num_input_tokens_seen": 204960, "step": 2270 }, { "epoch": 0.5912162162162162, "grad_norm": 2.347827911376953, "learning_rate": 1.4773908523908526e-05, "loss": 0.2752, "num_input_tokens_seen": 205392, "step": 2275 }, { "epoch": 0.5925155925155925, "grad_norm": 2.00293231010437, "learning_rate": 1.4806392931392932e-05, "loss": 0.281, "num_input_tokens_seen": 205824, "step": 2280 }, { "epoch": 0.5938149688149689, "grad_norm": 1.171217918395996, "learning_rate": 1.4838877338877338e-05, "loss": 0.2309, "num_input_tokens_seen": 206256, "step": 2285 }, { "epoch": 0.5951143451143451, "grad_norm": 1.1778905391693115, "learning_rate": 1.4871361746361748e-05, "loss": 0.2677, "num_input_tokens_seen": 206688, "step": 2290 }, { "epoch": 0.5964137214137214, "grad_norm": 1.954673409461975, "learning_rate": 1.4903846153846154e-05, "loss": 0.1679, "num_input_tokens_seen": 207136, "step": 2295 }, { "epoch": 0.5977130977130977, "grad_norm": 2.219686985015869, "learning_rate": 1.493633056133056e-05, "loss": 0.3869, "num_input_tokens_seen": 207584, "step": 2300 }, { "epoch": 0.599012474012474, "grad_norm": 1.7662094831466675, "learning_rate": 1.496881496881497e-05, "loss": 0.4048, "num_input_tokens_seen": 208080, "step": 2305 }, { "epoch": 0.6003118503118503, "grad_norm": 2.742873191833496, "learning_rate": 1.5001299376299376e-05, "loss": 0.3918, "num_input_tokens_seen": 208528, "step": 2310 }, { "epoch": 0.6016112266112266, "grad_norm": 1.622961401939392, "learning_rate": 1.5033783783783784e-05, "loss": 0.1838, "num_input_tokens_seen": 208976, "step": 2315 }, { "epoch": 0.6029106029106029, "grad_norm": 0.6184095740318298, "learning_rate": 1.5066268191268192e-05, "loss": 0.3114, "num_input_tokens_seen": 209408, "step": 2320 }, { "epoch": 0.6042099792099792, "grad_norm": 1.9576281309127808, "learning_rate": 1.50987525987526e-05, "loss": 0.2641, "num_input_tokens_seen": 209856, "step": 2325 }, { "epoch": 0.6055093555093555, "grad_norm": 1.7926819324493408, "learning_rate": 1.5131237006237006e-05, "loss": 0.2606, "num_input_tokens_seen": 210320, "step": 2330 }, { "epoch": 0.6068087318087318, "grad_norm": 1.0943502187728882, "learning_rate": 1.5163721413721416e-05, "loss": 0.1884, "num_input_tokens_seen": 210736, "step": 2335 }, { "epoch": 0.6081081081081081, "grad_norm": 2.578436851501465, "learning_rate": 1.5196205821205822e-05, "loss": 0.3068, "num_input_tokens_seen": 211168, "step": 2340 }, { "epoch": 0.6094074844074844, "grad_norm": 2.2546892166137695, "learning_rate": 1.5228690228690229e-05, "loss": 0.4991, "num_input_tokens_seen": 211616, "step": 2345 }, { "epoch": 0.6107068607068608, "grad_norm": 1.2917931079864502, "learning_rate": 1.5261174636174637e-05, "loss": 0.3685, "num_input_tokens_seen": 212032, "step": 2350 }, { "epoch": 0.612006237006237, "grad_norm": 2.315732002258301, "learning_rate": 1.5293659043659046e-05, "loss": 0.3447, "num_input_tokens_seen": 212512, "step": 2355 }, { "epoch": 0.6133056133056133, "grad_norm": 0.44890138506889343, "learning_rate": 1.5326143451143453e-05, "loss": 0.2675, "num_input_tokens_seen": 212976, "step": 2360 }, { "epoch": 0.6146049896049897, "grad_norm": 2.5283079147338867, "learning_rate": 1.535862785862786e-05, "loss": 0.2879, "num_input_tokens_seen": 213424, "step": 2365 }, { "epoch": 0.6159043659043659, "grad_norm": 2.484156608581543, "learning_rate": 1.539111226611227e-05, "loss": 0.2646, "num_input_tokens_seen": 213872, "step": 2370 }, { "epoch": 0.6172037422037422, "grad_norm": 0.7549534440040588, "learning_rate": 1.5423596673596675e-05, "loss": 0.3186, "num_input_tokens_seen": 214304, "step": 2375 }, { "epoch": 0.6185031185031185, "grad_norm": 0.23168422281742096, "learning_rate": 1.545608108108108e-05, "loss": 0.313, "num_input_tokens_seen": 214752, "step": 2380 }, { "epoch": 0.6198024948024948, "grad_norm": 1.8640373945236206, "learning_rate": 1.548856548856549e-05, "loss": 0.3159, "num_input_tokens_seen": 215168, "step": 2385 }, { "epoch": 0.6211018711018711, "grad_norm": 1.7857081890106201, "learning_rate": 1.5521049896049897e-05, "loss": 0.2671, "num_input_tokens_seen": 215600, "step": 2390 }, { "epoch": 0.6224012474012474, "grad_norm": 0.9801259636878967, "learning_rate": 1.5553534303534303e-05, "loss": 0.3111, "num_input_tokens_seen": 216048, "step": 2395 }, { "epoch": 0.6237006237006237, "grad_norm": 0.4931375980377197, "learning_rate": 1.5586018711018713e-05, "loss": 0.2897, "num_input_tokens_seen": 216512, "step": 2400 }, { "epoch": 0.625, "grad_norm": 1.7556933164596558, "learning_rate": 1.561850311850312e-05, "loss": 0.2516, "num_input_tokens_seen": 216976, "step": 2405 }, { "epoch": 0.6262993762993763, "grad_norm": 2.632594347000122, "learning_rate": 1.5650987525987525e-05, "loss": 0.297, "num_input_tokens_seen": 217440, "step": 2410 }, { "epoch": 0.6275987525987526, "grad_norm": 0.8514929413795471, "learning_rate": 1.5683471933471935e-05, "loss": 0.2932, "num_input_tokens_seen": 217920, "step": 2415 }, { "epoch": 0.6288981288981289, "grad_norm": 1.291745901107788, "learning_rate": 1.571595634095634e-05, "loss": 0.2659, "num_input_tokens_seen": 218368, "step": 2420 }, { "epoch": 0.6301975051975052, "grad_norm": 0.8954877853393555, "learning_rate": 1.5748440748440748e-05, "loss": 0.2178, "num_input_tokens_seen": 218832, "step": 2425 }, { "epoch": 0.6314968814968815, "grad_norm": 2.533785104751587, "learning_rate": 1.5780925155925154e-05, "loss": 0.3335, "num_input_tokens_seen": 219312, "step": 2430 }, { "epoch": 0.6327962577962578, "grad_norm": 0.9203861951828003, "learning_rate": 1.5813409563409563e-05, "loss": 0.251, "num_input_tokens_seen": 219776, "step": 2435 }, { "epoch": 0.6340956340956341, "grad_norm": 1.2218533754348755, "learning_rate": 1.584589397089397e-05, "loss": 0.1823, "num_input_tokens_seen": 220224, "step": 2440 }, { "epoch": 0.6353950103950103, "grad_norm": 0.9075462222099304, "learning_rate": 1.587837837837838e-05, "loss": 0.2453, "num_input_tokens_seen": 220672, "step": 2445 }, { "epoch": 0.6366943866943867, "grad_norm": 0.8233855366706848, "learning_rate": 1.591086278586279e-05, "loss": 0.2864, "num_input_tokens_seen": 221120, "step": 2450 }, { "epoch": 0.637993762993763, "grad_norm": 1.5151569843292236, "learning_rate": 1.5943347193347195e-05, "loss": 0.2455, "num_input_tokens_seen": 221536, "step": 2455 }, { "epoch": 0.6392931392931392, "grad_norm": 0.5659878253936768, "learning_rate": 1.59758316008316e-05, "loss": 0.0644, "num_input_tokens_seen": 221984, "step": 2460 }, { "epoch": 0.6405925155925156, "grad_norm": 0.2985662519931793, "learning_rate": 1.600831600831601e-05, "loss": 0.1486, "num_input_tokens_seen": 222432, "step": 2465 }, { "epoch": 0.6418918918918919, "grad_norm": 0.2325480580329895, "learning_rate": 1.6040800415800417e-05, "loss": 0.269, "num_input_tokens_seen": 222848, "step": 2470 }, { "epoch": 0.6431912681912682, "grad_norm": 0.25578829646110535, "learning_rate": 1.6073284823284824e-05, "loss": 0.5511, "num_input_tokens_seen": 223280, "step": 2475 }, { "epoch": 0.6444906444906445, "grad_norm": 2.003007411956787, "learning_rate": 1.6105769230769233e-05, "loss": 0.4877, "num_input_tokens_seen": 223728, "step": 2480 }, { "epoch": 0.6457900207900208, "grad_norm": 1.17902672290802, "learning_rate": 1.613825363825364e-05, "loss": 0.4574, "num_input_tokens_seen": 224160, "step": 2485 }, { "epoch": 0.6470893970893971, "grad_norm": 0.49367570877075195, "learning_rate": 1.6170738045738046e-05, "loss": 0.3178, "num_input_tokens_seen": 224608, "step": 2490 }, { "epoch": 0.6483887733887734, "grad_norm": 0.5113803148269653, "learning_rate": 1.6203222453222456e-05, "loss": 0.3177, "num_input_tokens_seen": 225104, "step": 2495 }, { "epoch": 0.6496881496881497, "grad_norm": 0.41911444067955017, "learning_rate": 1.6235706860706862e-05, "loss": 0.3131, "num_input_tokens_seen": 225552, "step": 2500 }, { "epoch": 0.650987525987526, "grad_norm": 0.5396031141281128, "learning_rate": 1.6268191268191268e-05, "loss": 0.2897, "num_input_tokens_seen": 226016, "step": 2505 }, { "epoch": 0.6522869022869023, "grad_norm": 1.9035283327102661, "learning_rate": 1.6300675675675674e-05, "loss": 0.2886, "num_input_tokens_seen": 226480, "step": 2510 }, { "epoch": 0.6535862785862786, "grad_norm": 1.4847476482391357, "learning_rate": 1.6333160083160084e-05, "loss": 0.2637, "num_input_tokens_seen": 226944, "step": 2515 }, { "epoch": 0.6548856548856549, "grad_norm": 1.4273273944854736, "learning_rate": 1.636564449064449e-05, "loss": 0.25, "num_input_tokens_seen": 227392, "step": 2520 }, { "epoch": 0.6561850311850311, "grad_norm": 2.5359036922454834, "learning_rate": 1.6398128898128897e-05, "loss": 0.3504, "num_input_tokens_seen": 227840, "step": 2525 }, { "epoch": 0.6574844074844075, "grad_norm": 1.4790616035461426, "learning_rate": 1.6430613305613306e-05, "loss": 0.234, "num_input_tokens_seen": 228272, "step": 2530 }, { "epoch": 0.6587837837837838, "grad_norm": 1.429358720779419, "learning_rate": 1.6463097713097712e-05, "loss": 0.2303, "num_input_tokens_seen": 228720, "step": 2535 }, { "epoch": 0.66008316008316, "grad_norm": 2.444161891937256, "learning_rate": 1.649558212058212e-05, "loss": 0.3079, "num_input_tokens_seen": 229168, "step": 2540 }, { "epoch": 0.6613825363825364, "grad_norm": 0.508370041847229, "learning_rate": 1.652806652806653e-05, "loss": 0.3265, "num_input_tokens_seen": 229632, "step": 2545 }, { "epoch": 0.6626819126819127, "grad_norm": 2.514033079147339, "learning_rate": 1.6560550935550938e-05, "loss": 0.2976, "num_input_tokens_seen": 230048, "step": 2550 }, { "epoch": 0.6639812889812889, "grad_norm": 2.3108139038085938, "learning_rate": 1.6593035343035344e-05, "loss": 0.3384, "num_input_tokens_seen": 230528, "step": 2555 }, { "epoch": 0.6652806652806653, "grad_norm": 1.630780577659607, "learning_rate": 1.6625519750519754e-05, "loss": 0.3049, "num_input_tokens_seen": 230976, "step": 2560 }, { "epoch": 0.6665800415800416, "grad_norm": 1.1430549621582031, "learning_rate": 1.665800415800416e-05, "loss": 0.2221, "num_input_tokens_seen": 231488, "step": 2565 }, { "epoch": 0.6678794178794178, "grad_norm": 2.475811004638672, "learning_rate": 1.6690488565488567e-05, "loss": 0.2649, "num_input_tokens_seen": 231984, "step": 2570 }, { "epoch": 0.6691787941787942, "grad_norm": 0.5402740240097046, "learning_rate": 1.6722972972972976e-05, "loss": 0.388, "num_input_tokens_seen": 232448, "step": 2575 }, { "epoch": 0.6704781704781705, "grad_norm": 0.568192183971405, "learning_rate": 1.6755457380457382e-05, "loss": 0.3654, "num_input_tokens_seen": 232960, "step": 2580 }, { "epoch": 0.6717775467775468, "grad_norm": 2.656228542327881, "learning_rate": 1.678794178794179e-05, "loss": 0.3333, "num_input_tokens_seen": 233424, "step": 2585 }, { "epoch": 0.6730769230769231, "grad_norm": 1.0595232248306274, "learning_rate": 1.6820426195426195e-05, "loss": 0.2411, "num_input_tokens_seen": 233856, "step": 2590 }, { "epoch": 0.6743762993762994, "grad_norm": 2.5870134830474854, "learning_rate": 1.6852910602910605e-05, "loss": 0.3625, "num_input_tokens_seen": 234304, "step": 2595 }, { "epoch": 0.6756756756756757, "grad_norm": 0.7548781633377075, "learning_rate": 1.688539501039501e-05, "loss": 0.2624, "num_input_tokens_seen": 234784, "step": 2600 }, { "epoch": 0.6769750519750519, "grad_norm": 2.491814613342285, "learning_rate": 1.6917879417879417e-05, "loss": 0.2722, "num_input_tokens_seen": 235216, "step": 2605 }, { "epoch": 0.6782744282744283, "grad_norm": 1.752241611480713, "learning_rate": 1.6950363825363827e-05, "loss": 0.2495, "num_input_tokens_seen": 235664, "step": 2610 }, { "epoch": 0.6795738045738046, "grad_norm": 1.1649837493896484, "learning_rate": 1.6982848232848233e-05, "loss": 0.2657, "num_input_tokens_seen": 236128, "step": 2615 }, { "epoch": 0.6808731808731808, "grad_norm": 1.0088857412338257, "learning_rate": 1.701533264033264e-05, "loss": 0.3102, "num_input_tokens_seen": 236592, "step": 2620 }, { "epoch": 0.6821725571725572, "grad_norm": 1.0575181245803833, "learning_rate": 1.704781704781705e-05, "loss": 0.2256, "num_input_tokens_seen": 237056, "step": 2625 }, { "epoch": 0.6834719334719335, "grad_norm": 1.321672797203064, "learning_rate": 1.7080301455301455e-05, "loss": 0.1945, "num_input_tokens_seen": 237536, "step": 2630 }, { "epoch": 0.6847713097713097, "grad_norm": 0.6625785231590271, "learning_rate": 1.711278586278586e-05, "loss": 0.3316, "num_input_tokens_seen": 237968, "step": 2635 }, { "epoch": 0.6860706860706861, "grad_norm": 0.6095259785652161, "learning_rate": 1.714527027027027e-05, "loss": 0.2234, "num_input_tokens_seen": 238416, "step": 2640 }, { "epoch": 0.6873700623700624, "grad_norm": 1.2678464651107788, "learning_rate": 1.7177754677754677e-05, "loss": 0.2172, "num_input_tokens_seen": 238864, "step": 2645 }, { "epoch": 0.6886694386694386, "grad_norm": 2.2795538902282715, "learning_rate": 1.7210239085239087e-05, "loss": 0.3273, "num_input_tokens_seen": 239328, "step": 2650 }, { "epoch": 0.689968814968815, "grad_norm": 1.1235836744308472, "learning_rate": 1.7242723492723493e-05, "loss": 0.2027, "num_input_tokens_seen": 239808, "step": 2655 }, { "epoch": 0.6912681912681913, "grad_norm": 0.8389131426811218, "learning_rate": 1.7275207900207903e-05, "loss": 0.262, "num_input_tokens_seen": 240256, "step": 2660 }, { "epoch": 0.6925675675675675, "grad_norm": 0.9015601277351379, "learning_rate": 1.730769230769231e-05, "loss": 0.1786, "num_input_tokens_seen": 240688, "step": 2665 }, { "epoch": 0.6938669438669439, "grad_norm": 0.6059492826461792, "learning_rate": 1.7340176715176716e-05, "loss": 0.1528, "num_input_tokens_seen": 241104, "step": 2670 }, { "epoch": 0.6951663201663202, "grad_norm": 0.5542104840278625, "learning_rate": 1.7372661122661125e-05, "loss": 0.3362, "num_input_tokens_seen": 241552, "step": 2675 }, { "epoch": 0.6964656964656964, "grad_norm": 1.188051700592041, "learning_rate": 1.740514553014553e-05, "loss": 0.2792, "num_input_tokens_seen": 242016, "step": 2680 }, { "epoch": 0.6977650727650727, "grad_norm": 2.253455877304077, "learning_rate": 1.7437629937629938e-05, "loss": 0.3014, "num_input_tokens_seen": 242496, "step": 2685 }, { "epoch": 0.6990644490644491, "grad_norm": 0.6970899105072021, "learning_rate": 1.7470114345114347e-05, "loss": 0.2691, "num_input_tokens_seen": 242960, "step": 2690 }, { "epoch": 0.7003638253638254, "grad_norm": 0.806122899055481, "learning_rate": 1.7502598752598754e-05, "loss": 0.1735, "num_input_tokens_seen": 243456, "step": 2695 }, { "epoch": 0.7016632016632016, "grad_norm": 0.42699798941612244, "learning_rate": 1.753508316008316e-05, "loss": 0.0736, "num_input_tokens_seen": 243888, "step": 2700 }, { "epoch": 0.702962577962578, "grad_norm": 0.28691160678863525, "learning_rate": 1.756756756756757e-05, "loss": 0.209, "num_input_tokens_seen": 244336, "step": 2705 }, { "epoch": 0.7042619542619543, "grad_norm": 0.2348431795835495, "learning_rate": 1.7600051975051976e-05, "loss": 0.2268, "num_input_tokens_seen": 244768, "step": 2710 }, { "epoch": 0.7055613305613305, "grad_norm": 0.28159359097480774, "learning_rate": 1.7632536382536382e-05, "loss": 0.5021, "num_input_tokens_seen": 245216, "step": 2715 }, { "epoch": 0.7068607068607069, "grad_norm": 1.1598429679870605, "learning_rate": 1.7665020790020792e-05, "loss": 0.3927, "num_input_tokens_seen": 245648, "step": 2720 }, { "epoch": 0.7081600831600832, "grad_norm": 0.7158223986625671, "learning_rate": 1.7697505197505198e-05, "loss": 0.1482, "num_input_tokens_seen": 246064, "step": 2725 }, { "epoch": 0.7094594594594594, "grad_norm": 1.1398886442184448, "learning_rate": 1.7729989604989604e-05, "loss": 0.3141, "num_input_tokens_seen": 246512, "step": 2730 }, { "epoch": 0.7107588357588358, "grad_norm": 1.2907837629318237, "learning_rate": 1.7762474012474014e-05, "loss": 0.2613, "num_input_tokens_seen": 246976, "step": 2735 }, { "epoch": 0.7120582120582121, "grad_norm": 0.5951778888702393, "learning_rate": 1.779495841995842e-05, "loss": 0.2668, "num_input_tokens_seen": 247408, "step": 2740 }, { "epoch": 0.7133575883575883, "grad_norm": 0.6553027629852295, "learning_rate": 1.7827442827442827e-05, "loss": 0.2972, "num_input_tokens_seen": 247888, "step": 2745 }, { "epoch": 0.7146569646569647, "grad_norm": 0.8023022413253784, "learning_rate": 1.7859927234927236e-05, "loss": 0.2253, "num_input_tokens_seen": 248352, "step": 2750 }, { "epoch": 0.715956340956341, "grad_norm": 0.9066344499588013, "learning_rate": 1.7892411642411642e-05, "loss": 0.2343, "num_input_tokens_seen": 248832, "step": 2755 }, { "epoch": 0.7172557172557172, "grad_norm": 2.2007830142974854, "learning_rate": 1.7924896049896052e-05, "loss": 0.2805, "num_input_tokens_seen": 249280, "step": 2760 }, { "epoch": 0.7185550935550935, "grad_norm": 0.7284269332885742, "learning_rate": 1.795738045738046e-05, "loss": 0.3285, "num_input_tokens_seen": 249760, "step": 2765 }, { "epoch": 0.7198544698544699, "grad_norm": 1.8397806882858276, "learning_rate": 1.7989864864864868e-05, "loss": 0.2911, "num_input_tokens_seen": 250224, "step": 2770 }, { "epoch": 0.7211538461538461, "grad_norm": 1.2060942649841309, "learning_rate": 1.8022349272349274e-05, "loss": 0.283, "num_input_tokens_seen": 250656, "step": 2775 }, { "epoch": 0.7224532224532224, "grad_norm": 0.45333942770957947, "learning_rate": 1.805483367983368e-05, "loss": 0.3065, "num_input_tokens_seen": 251072, "step": 2780 }, { "epoch": 0.7237525987525988, "grad_norm": 1.8029295206069946, "learning_rate": 1.808731808731809e-05, "loss": 0.2556, "num_input_tokens_seen": 251520, "step": 2785 }, { "epoch": 0.725051975051975, "grad_norm": 0.6019675731658936, "learning_rate": 1.8119802494802496e-05, "loss": 0.2816, "num_input_tokens_seen": 251984, "step": 2790 }, { "epoch": 0.7263513513513513, "grad_norm": 1.3647534847259521, "learning_rate": 1.8152286902286903e-05, "loss": 0.2547, "num_input_tokens_seen": 252416, "step": 2795 }, { "epoch": 0.7276507276507277, "grad_norm": 1.5671380758285522, "learning_rate": 1.8184771309771312e-05, "loss": 0.2933, "num_input_tokens_seen": 252864, "step": 2800 }, { "epoch": 0.728950103950104, "grad_norm": 2.0136160850524902, "learning_rate": 1.821725571725572e-05, "loss": 0.2712, "num_input_tokens_seen": 253328, "step": 2805 }, { "epoch": 0.7302494802494802, "grad_norm": 0.9895743131637573, "learning_rate": 1.8249740124740125e-05, "loss": 0.2355, "num_input_tokens_seen": 253744, "step": 2810 }, { "epoch": 0.7315488565488566, "grad_norm": 0.8228417038917542, "learning_rate": 1.8282224532224535e-05, "loss": 0.1787, "num_input_tokens_seen": 254160, "step": 2815 }, { "epoch": 0.7328482328482329, "grad_norm": 2.6144745349884033, "learning_rate": 1.831470893970894e-05, "loss": 0.316, "num_input_tokens_seen": 254592, "step": 2820 }, { "epoch": 0.7341476091476091, "grad_norm": 1.2673567533493042, "learning_rate": 1.8347193347193347e-05, "loss": 0.3617, "num_input_tokens_seen": 255008, "step": 2825 }, { "epoch": 0.7354469854469855, "grad_norm": 1.3274040222167969, "learning_rate": 1.8379677754677753e-05, "loss": 0.2134, "num_input_tokens_seen": 255440, "step": 2830 }, { "epoch": 0.7367463617463618, "grad_norm": 0.9121501445770264, "learning_rate": 1.8412162162162163e-05, "loss": 0.3154, "num_input_tokens_seen": 255904, "step": 2835 }, { "epoch": 0.738045738045738, "grad_norm": 0.9694650769233704, "learning_rate": 1.844464656964657e-05, "loss": 0.2149, "num_input_tokens_seen": 256352, "step": 2840 }, { "epoch": 0.7393451143451143, "grad_norm": 0.8390766382217407, "learning_rate": 1.8477130977130976e-05, "loss": 0.287, "num_input_tokens_seen": 256816, "step": 2845 }, { "epoch": 0.7406444906444907, "grad_norm": 0.8059400916099548, "learning_rate": 1.8509615384615385e-05, "loss": 0.2294, "num_input_tokens_seen": 257264, "step": 2850 }, { "epoch": 0.7419438669438669, "grad_norm": 1.1564360857009888, "learning_rate": 1.854209979209979e-05, "loss": 0.2197, "num_input_tokens_seen": 257712, "step": 2855 }, { "epoch": 0.7432432432432432, "grad_norm": 0.6771439909934998, "learning_rate": 1.85745841995842e-05, "loss": 0.2226, "num_input_tokens_seen": 258192, "step": 2860 }, { "epoch": 0.7445426195426196, "grad_norm": 2.2015271186828613, "learning_rate": 1.8607068607068607e-05, "loss": 0.2468, "num_input_tokens_seen": 258624, "step": 2865 }, { "epoch": 0.7458419958419958, "grad_norm": 1.1471641063690186, "learning_rate": 1.8639553014553017e-05, "loss": 0.2087, "num_input_tokens_seen": 259088, "step": 2870 }, { "epoch": 0.7471413721413721, "grad_norm": 0.6978226900100708, "learning_rate": 1.8672037422037423e-05, "loss": 0.2543, "num_input_tokens_seen": 259584, "step": 2875 }, { "epoch": 0.7484407484407485, "grad_norm": 2.1293952465057373, "learning_rate": 1.8704521829521833e-05, "loss": 0.3658, "num_input_tokens_seen": 260000, "step": 2880 }, { "epoch": 0.7497401247401247, "grad_norm": 1.02323579788208, "learning_rate": 1.873700623700624e-05, "loss": 0.2338, "num_input_tokens_seen": 260464, "step": 2885 }, { "epoch": 0.751039501039501, "grad_norm": 0.8211590647697449, "learning_rate": 1.8769490644490646e-05, "loss": 0.2234, "num_input_tokens_seen": 260896, "step": 2890 }, { "epoch": 0.7523388773388774, "grad_norm": 1.0027040243148804, "learning_rate": 1.8801975051975052e-05, "loss": 0.127, "num_input_tokens_seen": 261344, "step": 2895 }, { "epoch": 0.7536382536382537, "grad_norm": 0.569628119468689, "learning_rate": 1.883445945945946e-05, "loss": 0.3835, "num_input_tokens_seen": 261808, "step": 2900 }, { "epoch": 0.7549376299376299, "grad_norm": 1.1051156520843506, "learning_rate": 1.8866943866943868e-05, "loss": 0.3565, "num_input_tokens_seen": 262288, "step": 2905 }, { "epoch": 0.7562370062370062, "grad_norm": 0.9950494170188904, "learning_rate": 1.8899428274428274e-05, "loss": 0.2935, "num_input_tokens_seen": 262704, "step": 2910 }, { "epoch": 0.7575363825363826, "grad_norm": 0.8895951509475708, "learning_rate": 1.8931912681912684e-05, "loss": 0.2326, "num_input_tokens_seen": 263104, "step": 2915 }, { "epoch": 0.7588357588357588, "grad_norm": 0.6304056644439697, "learning_rate": 1.896439708939709e-05, "loss": 0.2668, "num_input_tokens_seen": 263520, "step": 2920 }, { "epoch": 0.7601351351351351, "grad_norm": 0.877174973487854, "learning_rate": 1.8996881496881496e-05, "loss": 0.1773, "num_input_tokens_seen": 263952, "step": 2925 }, { "epoch": 0.7614345114345115, "grad_norm": 0.6035802364349365, "learning_rate": 1.9029365904365906e-05, "loss": 0.197, "num_input_tokens_seen": 264384, "step": 2930 }, { "epoch": 0.7627338877338877, "grad_norm": 1.0632818937301636, "learning_rate": 1.9061850311850312e-05, "loss": 0.3629, "num_input_tokens_seen": 264816, "step": 2935 }, { "epoch": 0.764033264033264, "grad_norm": 1.9242866039276123, "learning_rate": 1.909433471933472e-05, "loss": 0.308, "num_input_tokens_seen": 265248, "step": 2940 }, { "epoch": 0.7653326403326404, "grad_norm": 0.9367808699607849, "learning_rate": 1.9126819126819128e-05, "loss": 0.1857, "num_input_tokens_seen": 265696, "step": 2945 }, { "epoch": 0.7666320166320166, "grad_norm": 0.7454084157943726, "learning_rate": 1.9159303534303534e-05, "loss": 0.2956, "num_input_tokens_seen": 266144, "step": 2950 }, { "epoch": 0.7679313929313929, "grad_norm": 1.1538443565368652, "learning_rate": 1.919178794178794e-05, "loss": 0.3214, "num_input_tokens_seen": 266576, "step": 2955 }, { "epoch": 0.7692307692307693, "grad_norm": 1.617733359336853, "learning_rate": 1.922427234927235e-05, "loss": 0.2826, "num_input_tokens_seen": 267008, "step": 2960 }, { "epoch": 0.7705301455301455, "grad_norm": 0.4068509638309479, "learning_rate": 1.9256756756756756e-05, "loss": 0.274, "num_input_tokens_seen": 267472, "step": 2965 }, { "epoch": 0.7718295218295218, "grad_norm": 1.1033257246017456, "learning_rate": 1.9289241164241166e-05, "loss": 0.2532, "num_input_tokens_seen": 267888, "step": 2970 }, { "epoch": 0.7731288981288982, "grad_norm": 1.63666570186615, "learning_rate": 1.9321725571725572e-05, "loss": 0.3785, "num_input_tokens_seen": 268336, "step": 2975 }, { "epoch": 0.7744282744282744, "grad_norm": 1.0141950845718384, "learning_rate": 1.9354209979209982e-05, "loss": 0.2007, "num_input_tokens_seen": 268784, "step": 2980 }, { "epoch": 0.7757276507276507, "grad_norm": 0.596834659576416, "learning_rate": 1.9386694386694388e-05, "loss": 0.1205, "num_input_tokens_seen": 269216, "step": 2985 }, { "epoch": 0.777027027027027, "grad_norm": 0.5764458179473877, "learning_rate": 1.9419178794178795e-05, "loss": 0.2918, "num_input_tokens_seen": 269616, "step": 2990 }, { "epoch": 0.7783264033264033, "grad_norm": 1.1990091800689697, "learning_rate": 1.9451663201663204e-05, "loss": 0.2949, "num_input_tokens_seen": 270048, "step": 2995 }, { "epoch": 0.7796257796257796, "grad_norm": 0.5460131168365479, "learning_rate": 1.948414760914761e-05, "loss": 0.297, "num_input_tokens_seen": 270496, "step": 3000 }, { "epoch": 0.7809251559251559, "grad_norm": 0.8223181962966919, "learning_rate": 1.9516632016632017e-05, "loss": 0.2336, "num_input_tokens_seen": 270944, "step": 3005 }, { "epoch": 0.7822245322245323, "grad_norm": 0.7083848118782043, "learning_rate": 1.9549116424116426e-05, "loss": 0.2706, "num_input_tokens_seen": 271408, "step": 3010 }, { "epoch": 0.7835239085239085, "grad_norm": 2.119354486465454, "learning_rate": 1.9581600831600833e-05, "loss": 0.3264, "num_input_tokens_seen": 271856, "step": 3015 }, { "epoch": 0.7848232848232848, "grad_norm": 1.0586234331130981, "learning_rate": 1.961408523908524e-05, "loss": 0.3284, "num_input_tokens_seen": 272320, "step": 3020 }, { "epoch": 0.7861226611226612, "grad_norm": 0.487371563911438, "learning_rate": 1.964656964656965e-05, "loss": 0.2372, "num_input_tokens_seen": 272784, "step": 3025 }, { "epoch": 0.7874220374220374, "grad_norm": 0.670344352722168, "learning_rate": 1.9679054054054055e-05, "loss": 0.2561, "num_input_tokens_seen": 273232, "step": 3030 }, { "epoch": 0.7887214137214137, "grad_norm": 0.7999149560928345, "learning_rate": 1.971153846153846e-05, "loss": 0.2564, "num_input_tokens_seen": 273696, "step": 3035 }, { "epoch": 0.7900207900207901, "grad_norm": 0.6705266237258911, "learning_rate": 1.974402286902287e-05, "loss": 0.3011, "num_input_tokens_seen": 274160, "step": 3040 }, { "epoch": 0.7913201663201663, "grad_norm": 0.8856589794158936, "learning_rate": 1.9776507276507277e-05, "loss": 0.2239, "num_input_tokens_seen": 274576, "step": 3045 }, { "epoch": 0.7926195426195426, "grad_norm": 0.7791913151741028, "learning_rate": 1.9808991683991683e-05, "loss": 0.2096, "num_input_tokens_seen": 275024, "step": 3050 }, { "epoch": 0.793918918918919, "grad_norm": 2.321260452270508, "learning_rate": 1.984147609147609e-05, "loss": 0.2809, "num_input_tokens_seen": 275456, "step": 3055 }, { "epoch": 0.7952182952182952, "grad_norm": 0.9906023144721985, "learning_rate": 1.98739604989605e-05, "loss": 0.3673, "num_input_tokens_seen": 275888, "step": 3060 }, { "epoch": 0.7965176715176715, "grad_norm": 0.6556845307350159, "learning_rate": 1.9906444906444905e-05, "loss": 0.2166, "num_input_tokens_seen": 276352, "step": 3065 }, { "epoch": 0.7978170478170478, "grad_norm": 0.6475791931152344, "learning_rate": 1.9938929313929315e-05, "loss": 0.3329, "num_input_tokens_seen": 276816, "step": 3070 }, { "epoch": 0.7991164241164241, "grad_norm": 1.4653081893920898, "learning_rate": 1.9971413721413725e-05, "loss": 0.2713, "num_input_tokens_seen": 277280, "step": 3075 }, { "epoch": 0.8004158004158004, "grad_norm": 0.5043230056762695, "learning_rate": 2.000389812889813e-05, "loss": 0.2571, "num_input_tokens_seen": 277728, "step": 3080 }, { "epoch": 0.8017151767151767, "grad_norm": 1.0457721948623657, "learning_rate": 2.0036382536382537e-05, "loss": 0.2178, "num_input_tokens_seen": 278208, "step": 3085 }, { "epoch": 0.803014553014553, "grad_norm": 0.751908004283905, "learning_rate": 2.0068866943866947e-05, "loss": 0.3397, "num_input_tokens_seen": 278672, "step": 3090 }, { "epoch": 0.8043139293139293, "grad_norm": 0.9987584352493286, "learning_rate": 2.0101351351351353e-05, "loss": 0.2808, "num_input_tokens_seen": 279120, "step": 3095 }, { "epoch": 0.8056133056133056, "grad_norm": 0.9050061702728271, "learning_rate": 2.013383575883576e-05, "loss": 0.1708, "num_input_tokens_seen": 279536, "step": 3100 }, { "epoch": 0.806912681912682, "grad_norm": 0.6488624215126038, "learning_rate": 2.016632016632017e-05, "loss": 0.2772, "num_input_tokens_seen": 279968, "step": 3105 }, { "epoch": 0.8082120582120582, "grad_norm": 1.8340122699737549, "learning_rate": 2.0198804573804575e-05, "loss": 0.307, "num_input_tokens_seen": 280448, "step": 3110 }, { "epoch": 0.8095114345114345, "grad_norm": 0.5901659727096558, "learning_rate": 2.0231288981288982e-05, "loss": 0.3163, "num_input_tokens_seen": 280864, "step": 3115 }, { "epoch": 0.8108108108108109, "grad_norm": 0.6736297011375427, "learning_rate": 2.026377338877339e-05, "loss": 0.1749, "num_input_tokens_seen": 281312, "step": 3120 }, { "epoch": 0.8121101871101871, "grad_norm": 0.6851815581321716, "learning_rate": 2.0296257796257798e-05, "loss": 0.3363, "num_input_tokens_seen": 281760, "step": 3125 }, { "epoch": 0.8134095634095634, "grad_norm": 0.4474368393421173, "learning_rate": 2.0328742203742204e-05, "loss": 0.2545, "num_input_tokens_seen": 282224, "step": 3130 }, { "epoch": 0.8147089397089398, "grad_norm": 0.44778063893318176, "learning_rate": 2.036122661122661e-05, "loss": 0.2516, "num_input_tokens_seen": 282656, "step": 3135 }, { "epoch": 0.816008316008316, "grad_norm": 0.9270078539848328, "learning_rate": 2.039371101871102e-05, "loss": 0.2361, "num_input_tokens_seen": 283072, "step": 3140 }, { "epoch": 0.8173076923076923, "grad_norm": 0.6582227349281311, "learning_rate": 2.0426195426195426e-05, "loss": 0.1798, "num_input_tokens_seen": 283504, "step": 3145 }, { "epoch": 0.8186070686070686, "grad_norm": 1.0286258459091187, "learning_rate": 2.0458679833679832e-05, "loss": 0.2205, "num_input_tokens_seen": 283984, "step": 3150 }, { "epoch": 0.8199064449064449, "grad_norm": 0.42343106865882874, "learning_rate": 2.0491164241164242e-05, "loss": 0.2964, "num_input_tokens_seen": 284432, "step": 3155 }, { "epoch": 0.8212058212058212, "grad_norm": 0.5603933334350586, "learning_rate": 2.0523648648648648e-05, "loss": 0.2723, "num_input_tokens_seen": 284864, "step": 3160 }, { "epoch": 0.8225051975051975, "grad_norm": 0.6395183205604553, "learning_rate": 2.0556133056133055e-05, "loss": 0.1551, "num_input_tokens_seen": 285328, "step": 3165 }, { "epoch": 0.8238045738045738, "grad_norm": 0.5850247144699097, "learning_rate": 2.0588617463617464e-05, "loss": 0.2109, "num_input_tokens_seen": 285792, "step": 3170 }, { "epoch": 0.8251039501039501, "grad_norm": 0.44333839416503906, "learning_rate": 2.0621101871101874e-05, "loss": 0.221, "num_input_tokens_seen": 286256, "step": 3175 }, { "epoch": 0.8264033264033264, "grad_norm": 0.49961474537849426, "learning_rate": 2.065358627858628e-05, "loss": 0.1535, "num_input_tokens_seen": 286656, "step": 3180 }, { "epoch": 0.8277027027027027, "grad_norm": 1.0088493824005127, "learning_rate": 2.068607068607069e-05, "loss": 0.2762, "num_input_tokens_seen": 287088, "step": 3185 }, { "epoch": 0.829002079002079, "grad_norm": 0.499686062335968, "learning_rate": 2.0718555093555096e-05, "loss": 0.2546, "num_input_tokens_seen": 287520, "step": 3190 }, { "epoch": 0.8303014553014553, "grad_norm": 0.912488579750061, "learning_rate": 2.0751039501039502e-05, "loss": 0.2139, "num_input_tokens_seen": 287968, "step": 3195 }, { "epoch": 0.8316008316008316, "grad_norm": 0.746367335319519, "learning_rate": 2.0783523908523912e-05, "loss": 0.3135, "num_input_tokens_seen": 288400, "step": 3200 }, { "epoch": 0.8329002079002079, "grad_norm": 1.0392967462539673, "learning_rate": 2.0816008316008318e-05, "loss": 0.331, "num_input_tokens_seen": 288848, "step": 3205 }, { "epoch": 0.8341995841995842, "grad_norm": 1.4873493909835815, "learning_rate": 2.0848492723492724e-05, "loss": 0.2927, "num_input_tokens_seen": 289264, "step": 3210 }, { "epoch": 0.8354989604989606, "grad_norm": 1.174919843673706, "learning_rate": 2.088097713097713e-05, "loss": 0.3089, "num_input_tokens_seen": 289744, "step": 3215 }, { "epoch": 0.8367983367983368, "grad_norm": 0.6052835583686829, "learning_rate": 2.091346153846154e-05, "loss": 0.2972, "num_input_tokens_seen": 290192, "step": 3220 }, { "epoch": 0.8380977130977131, "grad_norm": 0.6853124499320984, "learning_rate": 2.0945945945945947e-05, "loss": 0.2616, "num_input_tokens_seen": 290608, "step": 3225 }, { "epoch": 0.8393970893970893, "grad_norm": 0.7823435068130493, "learning_rate": 2.0978430353430353e-05, "loss": 0.266, "num_input_tokens_seen": 291072, "step": 3230 }, { "epoch": 0.8406964656964657, "grad_norm": 0.7654406428337097, "learning_rate": 2.1010914760914763e-05, "loss": 0.2802, "num_input_tokens_seen": 291536, "step": 3235 }, { "epoch": 0.841995841995842, "grad_norm": 0.6969149112701416, "learning_rate": 2.104339916839917e-05, "loss": 0.2448, "num_input_tokens_seen": 292016, "step": 3240 }, { "epoch": 0.8432952182952183, "grad_norm": 1.06014084815979, "learning_rate": 2.1075883575883575e-05, "loss": 0.2302, "num_input_tokens_seen": 292448, "step": 3245 }, { "epoch": 0.8445945945945946, "grad_norm": 0.7736815214157104, "learning_rate": 2.1108367983367985e-05, "loss": 0.2259, "num_input_tokens_seen": 292912, "step": 3250 }, { "epoch": 0.8458939708939709, "grad_norm": 0.8977036476135254, "learning_rate": 2.114085239085239e-05, "loss": 0.2649, "num_input_tokens_seen": 293360, "step": 3255 }, { "epoch": 0.8471933471933472, "grad_norm": 0.900132954120636, "learning_rate": 2.1173336798336797e-05, "loss": 0.2184, "num_input_tokens_seen": 293808, "step": 3260 }, { "epoch": 0.8484927234927235, "grad_norm": 0.6870909333229065, "learning_rate": 2.1205821205821207e-05, "loss": 0.0923, "num_input_tokens_seen": 294256, "step": 3265 }, { "epoch": 0.8497920997920998, "grad_norm": 3.3512320518493652, "learning_rate": 2.1238305613305613e-05, "loss": 0.2919, "num_input_tokens_seen": 294768, "step": 3270 }, { "epoch": 0.8510914760914761, "grad_norm": 5.951888561248779, "learning_rate": 2.1270790020790023e-05, "loss": 0.5244, "num_input_tokens_seen": 295232, "step": 3275 }, { "epoch": 0.8523908523908524, "grad_norm": 3.2377560138702393, "learning_rate": 2.130327442827443e-05, "loss": 0.6269, "num_input_tokens_seen": 295680, "step": 3280 }, { "epoch": 0.8536902286902287, "grad_norm": 0.6453222036361694, "learning_rate": 2.133575883575884e-05, "loss": 0.1905, "num_input_tokens_seen": 296176, "step": 3285 }, { "epoch": 0.854989604989605, "grad_norm": 1.8082829713821411, "learning_rate": 2.1368243243243245e-05, "loss": 0.3013, "num_input_tokens_seen": 296624, "step": 3290 }, { "epoch": 0.8562889812889813, "grad_norm": 0.6724337935447693, "learning_rate": 2.140072765072765e-05, "loss": 0.3414, "num_input_tokens_seen": 297088, "step": 3295 }, { "epoch": 0.8575883575883576, "grad_norm": 2.0306828022003174, "learning_rate": 2.143321205821206e-05, "loss": 0.2927, "num_input_tokens_seen": 297536, "step": 3300 }, { "epoch": 0.8588877338877339, "grad_norm": 2.1534743309020996, "learning_rate": 2.1465696465696467e-05, "loss": 0.3233, "num_input_tokens_seen": 297968, "step": 3305 }, { "epoch": 0.8601871101871101, "grad_norm": 1.1049643754959106, "learning_rate": 2.1498180873180874e-05, "loss": 0.2588, "num_input_tokens_seen": 298400, "step": 3310 }, { "epoch": 0.8614864864864865, "grad_norm": 14.03341007232666, "learning_rate": 2.1530665280665283e-05, "loss": 0.4847, "num_input_tokens_seen": 298848, "step": 3315 }, { "epoch": 0.8627858627858628, "grad_norm": 1.922985315322876, "learning_rate": 2.156314968814969e-05, "loss": 0.3894, "num_input_tokens_seen": 299296, "step": 3320 }, { "epoch": 0.864085239085239, "grad_norm": 1.770250916481018, "learning_rate": 2.1595634095634096e-05, "loss": 0.2775, "num_input_tokens_seen": 299728, "step": 3325 }, { "epoch": 0.8653846153846154, "grad_norm": 2.042508125305176, "learning_rate": 2.1628118503118505e-05, "loss": 0.2826, "num_input_tokens_seen": 300144, "step": 3330 }, { "epoch": 0.8666839916839917, "grad_norm": 0.6064192652702332, "learning_rate": 2.166060291060291e-05, "loss": 0.2859, "num_input_tokens_seen": 300624, "step": 3335 }, { "epoch": 0.867983367983368, "grad_norm": 2.9941909313201904, "learning_rate": 2.1693087318087318e-05, "loss": 0.2705, "num_input_tokens_seen": 301072, "step": 3340 }, { "epoch": 0.8692827442827443, "grad_norm": 1.2237114906311035, "learning_rate": 2.1725571725571728e-05, "loss": 0.1608, "num_input_tokens_seen": 301536, "step": 3345 }, { "epoch": 0.8705821205821206, "grad_norm": 0.6225523948669434, "learning_rate": 2.1758056133056134e-05, "loss": 0.3349, "num_input_tokens_seen": 302000, "step": 3350 }, { "epoch": 0.8718814968814969, "grad_norm": 11.182257652282715, "learning_rate": 2.179054054054054e-05, "loss": 0.8814, "num_input_tokens_seen": 302432, "step": 3355 }, { "epoch": 0.8731808731808732, "grad_norm": 1.390172004699707, "learning_rate": 2.182302494802495e-05, "loss": 0.5668, "num_input_tokens_seen": 302864, "step": 3360 }, { "epoch": 0.8744802494802495, "grad_norm": 0.7459644675254822, "learning_rate": 2.1855509355509356e-05, "loss": 0.2571, "num_input_tokens_seen": 303312, "step": 3365 }, { "epoch": 0.8757796257796258, "grad_norm": 1.8990484476089478, "learning_rate": 2.1887993762993762e-05, "loss": 0.2506, "num_input_tokens_seen": 303776, "step": 3370 }, { "epoch": 0.877079002079002, "grad_norm": 1.3473306894302368, "learning_rate": 2.1920478170478172e-05, "loss": 0.3093, "num_input_tokens_seen": 304224, "step": 3375 }, { "epoch": 0.8783783783783784, "grad_norm": 1.4128385782241821, "learning_rate": 2.1952962577962578e-05, "loss": 0.1953, "num_input_tokens_seen": 304672, "step": 3380 }, { "epoch": 0.8796777546777547, "grad_norm": 0.9868000745773315, "learning_rate": 2.1985446985446988e-05, "loss": 0.2708, "num_input_tokens_seen": 305104, "step": 3385 }, { "epoch": 0.8809771309771309, "grad_norm": 1.755475640296936, "learning_rate": 2.2017931392931394e-05, "loss": 0.4255, "num_input_tokens_seen": 305584, "step": 3390 }, { "epoch": 0.8822765072765073, "grad_norm": 1.3360148668289185, "learning_rate": 2.2050415800415804e-05, "loss": 0.2312, "num_input_tokens_seen": 306048, "step": 3395 }, { "epoch": 0.8835758835758836, "grad_norm": 1.0469084978103638, "learning_rate": 2.208290020790021e-05, "loss": 0.3106, "num_input_tokens_seen": 306480, "step": 3400 }, { "epoch": 0.8848752598752598, "grad_norm": 1.189652681350708, "learning_rate": 2.2115384615384616e-05, "loss": 0.2871, "num_input_tokens_seen": 306944, "step": 3405 }, { "epoch": 0.8861746361746362, "grad_norm": 1.1388484239578247, "learning_rate": 2.2147869022869026e-05, "loss": 0.2745, "num_input_tokens_seen": 307392, "step": 3410 }, { "epoch": 0.8874740124740125, "grad_norm": 1.578480839729309, "learning_rate": 2.2180353430353432e-05, "loss": 0.3078, "num_input_tokens_seen": 307856, "step": 3415 }, { "epoch": 0.8887733887733887, "grad_norm": 1.0248075723648071, "learning_rate": 2.221283783783784e-05, "loss": 0.2442, "num_input_tokens_seen": 308288, "step": 3420 }, { "epoch": 0.8900727650727651, "grad_norm": 0.8600338101387024, "learning_rate": 2.2245322245322248e-05, "loss": 0.2251, "num_input_tokens_seen": 308704, "step": 3425 }, { "epoch": 0.8913721413721414, "grad_norm": 0.9039783477783203, "learning_rate": 2.2277806652806654e-05, "loss": 0.2622, "num_input_tokens_seen": 309168, "step": 3430 }, { "epoch": 0.8926715176715176, "grad_norm": 0.5433151721954346, "learning_rate": 2.231029106029106e-05, "loss": 0.0923, "num_input_tokens_seen": 309648, "step": 3435 }, { "epoch": 0.893970893970894, "grad_norm": 0.3526660203933716, "learning_rate": 2.2342775467775467e-05, "loss": 0.2287, "num_input_tokens_seen": 310128, "step": 3440 }, { "epoch": 0.8952702702702703, "grad_norm": 1.0589499473571777, "learning_rate": 2.2375259875259877e-05, "loss": 0.4678, "num_input_tokens_seen": 310576, "step": 3445 }, { "epoch": 0.8965696465696466, "grad_norm": 0.8281912207603455, "learning_rate": 2.2407744282744283e-05, "loss": 0.2791, "num_input_tokens_seen": 311040, "step": 3450 }, { "epoch": 0.8978690228690228, "grad_norm": 0.48778557777404785, "learning_rate": 2.244022869022869e-05, "loss": 0.299, "num_input_tokens_seen": 311488, "step": 3455 }, { "epoch": 0.8991683991683992, "grad_norm": 0.34911760687828064, "learning_rate": 2.24727130977131e-05, "loss": 0.27, "num_input_tokens_seen": 311936, "step": 3460 }, { "epoch": 0.9004677754677755, "grad_norm": 0.2722511887550354, "learning_rate": 2.2505197505197505e-05, "loss": 0.2766, "num_input_tokens_seen": 312368, "step": 3465 }, { "epoch": 0.9017671517671517, "grad_norm": 1.181038737297058, "learning_rate": 2.253768191268191e-05, "loss": 0.2977, "num_input_tokens_seen": 312880, "step": 3470 }, { "epoch": 0.9030665280665281, "grad_norm": 1.100883960723877, "learning_rate": 2.257016632016632e-05, "loss": 0.2735, "num_input_tokens_seen": 313344, "step": 3475 }, { "epoch": 0.9043659043659044, "grad_norm": 0.67288738489151, "learning_rate": 2.2602650727650727e-05, "loss": 0.228, "num_input_tokens_seen": 313744, "step": 3480 }, { "epoch": 0.9056652806652806, "grad_norm": 0.8222284913063049, "learning_rate": 2.2635135135135137e-05, "loss": 0.3226, "num_input_tokens_seen": 314176, "step": 3485 }, { "epoch": 0.906964656964657, "grad_norm": 0.8283339142799377, "learning_rate": 2.2667619542619543e-05, "loss": 0.2728, "num_input_tokens_seen": 314624, "step": 3490 }, { "epoch": 0.9082640332640333, "grad_norm": 0.7380000352859497, "learning_rate": 2.2700103950103953e-05, "loss": 0.2262, "num_input_tokens_seen": 315088, "step": 3495 }, { "epoch": 0.9095634095634095, "grad_norm": 1.5229315757751465, "learning_rate": 2.273258835758836e-05, "loss": 0.2946, "num_input_tokens_seen": 315520, "step": 3500 }, { "epoch": 0.9108627858627859, "grad_norm": 0.4742961823940277, "learning_rate": 2.276507276507277e-05, "loss": 0.2958, "num_input_tokens_seen": 315952, "step": 3505 }, { "epoch": 0.9121621621621622, "grad_norm": 0.5072182416915894, "learning_rate": 2.2797557172557175e-05, "loss": 0.2548, "num_input_tokens_seen": 316384, "step": 3510 }, { "epoch": 0.9134615384615384, "grad_norm": 0.7686716914176941, "learning_rate": 2.283004158004158e-05, "loss": 0.206, "num_input_tokens_seen": 316832, "step": 3515 }, { "epoch": 0.9147609147609148, "grad_norm": 0.6879532337188721, "learning_rate": 2.2862525987525988e-05, "loss": 0.2743, "num_input_tokens_seen": 317280, "step": 3520 }, { "epoch": 0.9160602910602911, "grad_norm": 0.4881362020969391, "learning_rate": 2.2895010395010397e-05, "loss": 0.1572, "num_input_tokens_seen": 317728, "step": 3525 }, { "epoch": 0.9173596673596673, "grad_norm": 1.8034288883209229, "learning_rate": 2.2927494802494803e-05, "loss": 0.3265, "num_input_tokens_seen": 318192, "step": 3530 }, { "epoch": 0.9186590436590436, "grad_norm": 0.8790608048439026, "learning_rate": 2.295997920997921e-05, "loss": 0.3673, "num_input_tokens_seen": 318672, "step": 3535 }, { "epoch": 0.91995841995842, "grad_norm": 0.6702221632003784, "learning_rate": 2.299246361746362e-05, "loss": 0.2735, "num_input_tokens_seen": 319104, "step": 3540 }, { "epoch": 0.9212577962577962, "grad_norm": 0.4340205192565918, "learning_rate": 2.3024948024948026e-05, "loss": 0.2906, "num_input_tokens_seen": 319584, "step": 3545 }, { "epoch": 0.9225571725571725, "grad_norm": 0.2214689403772354, "learning_rate": 2.3057432432432432e-05, "loss": 0.2939, "num_input_tokens_seen": 320048, "step": 3550 }, { "epoch": 0.9238565488565489, "grad_norm": 1.7518254518508911, "learning_rate": 2.308991683991684e-05, "loss": 0.3249, "num_input_tokens_seen": 320528, "step": 3555 }, { "epoch": 0.9251559251559252, "grad_norm": 0.21442332863807678, "learning_rate": 2.3122401247401248e-05, "loss": 0.288, "num_input_tokens_seen": 320928, "step": 3560 }, { "epoch": 0.9264553014553014, "grad_norm": 0.8403407335281372, "learning_rate": 2.3154885654885654e-05, "loss": 0.2691, "num_input_tokens_seen": 321344, "step": 3565 }, { "epoch": 0.9277546777546778, "grad_norm": 0.47403454780578613, "learning_rate": 2.3187370062370064e-05, "loss": 0.3027, "num_input_tokens_seen": 321824, "step": 3570 }, { "epoch": 0.9290540540540541, "grad_norm": 0.9364228248596191, "learning_rate": 2.321985446985447e-05, "loss": 0.2503, "num_input_tokens_seen": 322256, "step": 3575 }, { "epoch": 0.9303534303534303, "grad_norm": 0.698004961013794, "learning_rate": 2.3252338877338876e-05, "loss": 0.2139, "num_input_tokens_seen": 322672, "step": 3580 }, { "epoch": 0.9316528066528067, "grad_norm": 0.7979891896247864, "learning_rate": 2.3284823284823286e-05, "loss": 0.272, "num_input_tokens_seen": 323120, "step": 3585 }, { "epoch": 0.932952182952183, "grad_norm": 0.44595658779144287, "learning_rate": 2.3317307692307692e-05, "loss": 0.2562, "num_input_tokens_seen": 323600, "step": 3590 }, { "epoch": 0.9342515592515592, "grad_norm": 0.5159906148910522, "learning_rate": 2.3349792099792102e-05, "loss": 0.2211, "num_input_tokens_seen": 324048, "step": 3595 }, { "epoch": 0.9355509355509356, "grad_norm": 0.41699278354644775, "learning_rate": 2.3382276507276508e-05, "loss": 0.1482, "num_input_tokens_seen": 324512, "step": 3600 }, { "epoch": 0.9368503118503119, "grad_norm": 0.540618360042572, "learning_rate": 2.3414760914760918e-05, "loss": 0.3235, "num_input_tokens_seen": 325024, "step": 3605 }, { "epoch": 0.9381496881496881, "grad_norm": 0.6581080555915833, "learning_rate": 2.3447245322245324e-05, "loss": 0.3255, "num_input_tokens_seen": 325456, "step": 3610 }, { "epoch": 0.9394490644490644, "grad_norm": 0.7900993227958679, "learning_rate": 2.347972972972973e-05, "loss": 0.2684, "num_input_tokens_seen": 325920, "step": 3615 }, { "epoch": 0.9407484407484408, "grad_norm": 0.9577163457870483, "learning_rate": 2.351221413721414e-05, "loss": 0.2964, "num_input_tokens_seen": 326352, "step": 3620 }, { "epoch": 0.942047817047817, "grad_norm": 0.8883885145187378, "learning_rate": 2.3544698544698546e-05, "loss": 0.2189, "num_input_tokens_seen": 326816, "step": 3625 }, { "epoch": 0.9433471933471933, "grad_norm": 0.690438985824585, "learning_rate": 2.3577182952182953e-05, "loss": 0.2858, "num_input_tokens_seen": 327296, "step": 3630 }, { "epoch": 0.9446465696465697, "grad_norm": 0.6451578736305237, "learning_rate": 2.3609667359667362e-05, "loss": 0.3241, "num_input_tokens_seen": 327776, "step": 3635 }, { "epoch": 0.9459459459459459, "grad_norm": 0.7343765497207642, "learning_rate": 2.364215176715177e-05, "loss": 0.1677, "num_input_tokens_seen": 328224, "step": 3640 }, { "epoch": 0.9472453222453222, "grad_norm": 0.6266632676124573, "learning_rate": 2.3674636174636175e-05, "loss": 0.2443, "num_input_tokens_seen": 328656, "step": 3645 }, { "epoch": 0.9485446985446986, "grad_norm": 0.7936698794364929, "learning_rate": 2.3707120582120584e-05, "loss": 0.3226, "num_input_tokens_seen": 329104, "step": 3650 }, { "epoch": 0.9498440748440748, "grad_norm": 0.6928353905677795, "learning_rate": 2.373960498960499e-05, "loss": 0.2662, "num_input_tokens_seen": 329584, "step": 3655 }, { "epoch": 0.9511434511434511, "grad_norm": 0.6839269399642944, "learning_rate": 2.3772089397089397e-05, "loss": 0.2212, "num_input_tokens_seen": 330080, "step": 3660 }, { "epoch": 0.9524428274428275, "grad_norm": 1.4666272401809692, "learning_rate": 2.3804573804573807e-05, "loss": 0.2586, "num_input_tokens_seen": 330496, "step": 3665 }, { "epoch": 0.9537422037422038, "grad_norm": 0.7405213117599487, "learning_rate": 2.3837058212058213e-05, "loss": 0.243, "num_input_tokens_seen": 330944, "step": 3670 }, { "epoch": 0.95504158004158, "grad_norm": 0.6016635298728943, "learning_rate": 2.386954261954262e-05, "loss": 0.1934, "num_input_tokens_seen": 331440, "step": 3675 }, { "epoch": 0.9563409563409564, "grad_norm": 0.7390451431274414, "learning_rate": 2.3902027027027025e-05, "loss": 0.406, "num_input_tokens_seen": 331888, "step": 3680 }, { "epoch": 0.9576403326403327, "grad_norm": 0.8035585284233093, "learning_rate": 2.3934511434511435e-05, "loss": 0.1846, "num_input_tokens_seen": 332352, "step": 3685 }, { "epoch": 0.9589397089397089, "grad_norm": 0.6649120450019836, "learning_rate": 2.396699584199584e-05, "loss": 0.2223, "num_input_tokens_seen": 332768, "step": 3690 }, { "epoch": 0.9602390852390852, "grad_norm": 0.6157417893409729, "learning_rate": 2.399948024948025e-05, "loss": 0.3355, "num_input_tokens_seen": 333232, "step": 3695 }, { "epoch": 0.9615384615384616, "grad_norm": 0.6223859190940857, "learning_rate": 2.403196465696466e-05, "loss": 0.2948, "num_input_tokens_seen": 333680, "step": 3700 }, { "epoch": 0.9628378378378378, "grad_norm": 0.9395202398300171, "learning_rate": 2.4064449064449067e-05, "loss": 0.2973, "num_input_tokens_seen": 334144, "step": 3705 }, { "epoch": 0.9641372141372141, "grad_norm": 1.1697152853012085, "learning_rate": 2.4096933471933473e-05, "loss": 0.2871, "num_input_tokens_seen": 334592, "step": 3710 }, { "epoch": 0.9654365904365905, "grad_norm": 0.16049166023731232, "learning_rate": 2.4129417879417883e-05, "loss": 0.2912, "num_input_tokens_seen": 335008, "step": 3715 }, { "epoch": 0.9667359667359667, "grad_norm": 0.15806108713150024, "learning_rate": 2.416190228690229e-05, "loss": 0.28, "num_input_tokens_seen": 335456, "step": 3720 }, { "epoch": 0.968035343035343, "grad_norm": 1.2130588293075562, "learning_rate": 2.4194386694386695e-05, "loss": 0.2782, "num_input_tokens_seen": 335888, "step": 3725 }, { "epoch": 0.9693347193347194, "grad_norm": 1.480756402015686, "learning_rate": 2.4226871101871105e-05, "loss": 0.2964, "num_input_tokens_seen": 336336, "step": 3730 }, { "epoch": 0.9706340956340956, "grad_norm": 0.748284637928009, "learning_rate": 2.425935550935551e-05, "loss": 0.2335, "num_input_tokens_seen": 336784, "step": 3735 }, { "epoch": 0.9719334719334719, "grad_norm": 0.9545630216598511, "learning_rate": 2.4291839916839917e-05, "loss": 0.2201, "num_input_tokens_seen": 337264, "step": 3740 }, { "epoch": 0.9732328482328483, "grad_norm": 0.7169840931892395, "learning_rate": 2.4324324324324327e-05, "loss": 0.2689, "num_input_tokens_seen": 337680, "step": 3745 }, { "epoch": 0.9745322245322245, "grad_norm": 0.7248729467391968, "learning_rate": 2.4356808731808733e-05, "loss": 0.1906, "num_input_tokens_seen": 338112, "step": 3750 }, { "epoch": 0.9758316008316008, "grad_norm": 1.3176848888397217, "learning_rate": 2.438929313929314e-05, "loss": 0.2836, "num_input_tokens_seen": 338560, "step": 3755 }, { "epoch": 0.9771309771309772, "grad_norm": 0.7007032036781311, "learning_rate": 2.4421777546777546e-05, "loss": 0.2805, "num_input_tokens_seen": 339040, "step": 3760 }, { "epoch": 0.9784303534303534, "grad_norm": 0.7681906223297119, "learning_rate": 2.4454261954261956e-05, "loss": 0.2475, "num_input_tokens_seen": 339504, "step": 3765 }, { "epoch": 0.9797297297297297, "grad_norm": 0.789909303188324, "learning_rate": 2.4486746361746362e-05, "loss": 0.2582, "num_input_tokens_seen": 339936, "step": 3770 }, { "epoch": 0.981029106029106, "grad_norm": 0.5161377191543579, "learning_rate": 2.4519230769230768e-05, "loss": 0.2666, "num_input_tokens_seen": 340352, "step": 3775 }, { "epoch": 0.9823284823284824, "grad_norm": 0.791709303855896, "learning_rate": 2.4551715176715178e-05, "loss": 0.2573, "num_input_tokens_seen": 340832, "step": 3780 }, { "epoch": 0.9836278586278586, "grad_norm": 0.8512579798698425, "learning_rate": 2.4584199584199584e-05, "loss": 0.2612, "num_input_tokens_seen": 341296, "step": 3785 }, { "epoch": 0.9849272349272349, "grad_norm": 0.7171605229377747, "learning_rate": 2.461668399168399e-05, "loss": 0.3082, "num_input_tokens_seen": 341712, "step": 3790 }, { "epoch": 0.9862266112266113, "grad_norm": 0.4524569809436798, "learning_rate": 2.46491683991684e-05, "loss": 0.2575, "num_input_tokens_seen": 342144, "step": 3795 }, { "epoch": 0.9875259875259875, "grad_norm": 0.6195245981216431, "learning_rate": 2.468165280665281e-05, "loss": 0.1976, "num_input_tokens_seen": 342576, "step": 3800 }, { "epoch": 0.9888253638253638, "grad_norm": 0.7501808404922485, "learning_rate": 2.4714137214137216e-05, "loss": 0.2573, "num_input_tokens_seen": 343024, "step": 3805 }, { "epoch": 0.9901247401247402, "grad_norm": 0.4323696792125702, "learning_rate": 2.4746621621621626e-05, "loss": 0.1887, "num_input_tokens_seen": 343456, "step": 3810 }, { "epoch": 0.9914241164241164, "grad_norm": 0.762767493724823, "learning_rate": 2.4779106029106032e-05, "loss": 0.3847, "num_input_tokens_seen": 343904, "step": 3815 }, { "epoch": 0.9927234927234927, "grad_norm": 0.6421389579772949, "learning_rate": 2.4811590436590438e-05, "loss": 0.2484, "num_input_tokens_seen": 344416, "step": 3820 }, { "epoch": 0.9940228690228691, "grad_norm": 0.7141616940498352, "learning_rate": 2.4844074844074848e-05, "loss": 0.2193, "num_input_tokens_seen": 344848, "step": 3825 }, { "epoch": 0.9953222453222453, "grad_norm": 0.6817874908447266, "learning_rate": 2.4876559251559254e-05, "loss": 0.1902, "num_input_tokens_seen": 345312, "step": 3830 }, { "epoch": 0.9966216216216216, "grad_norm": 0.6230705380439758, "learning_rate": 2.490904365904366e-05, "loss": 0.266, "num_input_tokens_seen": 345744, "step": 3835 }, { "epoch": 0.997920997920998, "grad_norm": 0.43257391452789307, "learning_rate": 2.4941528066528067e-05, "loss": 0.0978, "num_input_tokens_seen": 346176, "step": 3840 }, { "epoch": 0.9992203742203742, "grad_norm": 0.3623557388782501, "learning_rate": 2.4974012474012476e-05, "loss": 0.1268, "num_input_tokens_seen": 346672, "step": 3845 }, { "epoch": 1.0, "eval_loss": 0.28198137879371643, "eval_runtime": 13.1515, "eval_samples_per_second": 65.088, "eval_steps_per_second": 32.544, "num_input_tokens_seen": 346872, "step": 3848 }, { "epoch": 1.0005197505197505, "grad_norm": 1.1175973415374756, "learning_rate": 2.5006496881496882e-05, "loss": 0.3316, "num_input_tokens_seen": 347064, "step": 3850 }, { "epoch": 1.0018191268191268, "grad_norm": 0.3623558580875397, "learning_rate": 2.503898128898129e-05, "loss": 0.3127, "num_input_tokens_seen": 347512, "step": 3855 }, { "epoch": 1.003118503118503, "grad_norm": 0.6952615976333618, "learning_rate": 2.5071465696465695e-05, "loss": 0.2969, "num_input_tokens_seen": 347976, "step": 3860 }, { "epoch": 1.0044178794178795, "grad_norm": 0.5214846730232239, "learning_rate": 2.51039501039501e-05, "loss": 0.3167, "num_input_tokens_seen": 348424, "step": 3865 }, { "epoch": 1.0057172557172558, "grad_norm": 0.20750373601913452, "learning_rate": 2.5136434511434514e-05, "loss": 0.2714, "num_input_tokens_seen": 348888, "step": 3870 }, { "epoch": 1.007016632016632, "grad_norm": 0.16052913665771484, "learning_rate": 2.516891891891892e-05, "loss": 0.2912, "num_input_tokens_seen": 349336, "step": 3875 }, { "epoch": 1.0083160083160083, "grad_norm": 1.0202120542526245, "learning_rate": 2.5201403326403327e-05, "loss": 0.2847, "num_input_tokens_seen": 349784, "step": 3880 }, { "epoch": 1.0096153846153846, "grad_norm": 0.7443398833274841, "learning_rate": 2.5233887733887733e-05, "loss": 0.24, "num_input_tokens_seen": 350216, "step": 3885 }, { "epoch": 1.0109147609147608, "grad_norm": 0.6774588823318481, "learning_rate": 2.526637214137214e-05, "loss": 0.2767, "num_input_tokens_seen": 350696, "step": 3890 }, { "epoch": 1.012214137214137, "grad_norm": 0.4690181612968445, "learning_rate": 2.529885654885655e-05, "loss": 0.3065, "num_input_tokens_seen": 351160, "step": 3895 }, { "epoch": 1.0135135135135136, "grad_norm": 1.357663631439209, "learning_rate": 2.533134095634096e-05, "loss": 0.3839, "num_input_tokens_seen": 351608, "step": 3900 }, { "epoch": 1.0148128898128899, "grad_norm": 0.4158093333244324, "learning_rate": 2.5363825363825365e-05, "loss": 0.2553, "num_input_tokens_seen": 352056, "step": 3905 }, { "epoch": 1.0161122661122661, "grad_norm": 0.3030758798122406, "learning_rate": 2.5396309771309775e-05, "loss": 0.2986, "num_input_tokens_seen": 352504, "step": 3910 }, { "epoch": 1.0174116424116424, "grad_norm": 1.8163589239120483, "learning_rate": 2.542879417879418e-05, "loss": 0.2913, "num_input_tokens_seen": 352936, "step": 3915 }, { "epoch": 1.0187110187110187, "grad_norm": 1.7820079326629639, "learning_rate": 2.5461278586278587e-05, "loss": 0.3258, "num_input_tokens_seen": 353384, "step": 3920 }, { "epoch": 1.020010395010395, "grad_norm": 1.2415109872817993, "learning_rate": 2.5493762993762993e-05, "loss": 0.2987, "num_input_tokens_seen": 353864, "step": 3925 }, { "epoch": 1.0213097713097714, "grad_norm": 0.3314462900161743, "learning_rate": 2.5526247401247406e-05, "loss": 0.27, "num_input_tokens_seen": 354328, "step": 3930 }, { "epoch": 1.0226091476091477, "grad_norm": 0.7290846705436707, "learning_rate": 2.5558731808731813e-05, "loss": 0.2499, "num_input_tokens_seen": 354776, "step": 3935 }, { "epoch": 1.023908523908524, "grad_norm": 0.7488200664520264, "learning_rate": 2.559121621621622e-05, "loss": 0.2133, "num_input_tokens_seen": 355256, "step": 3940 }, { "epoch": 1.0252079002079002, "grad_norm": 1.9271914958953857, "learning_rate": 2.5623700623700625e-05, "loss": 0.292, "num_input_tokens_seen": 355688, "step": 3945 }, { "epoch": 1.0265072765072765, "grad_norm": 0.4842210114002228, "learning_rate": 2.565618503118503e-05, "loss": 0.2222, "num_input_tokens_seen": 356120, "step": 3950 }, { "epoch": 1.0278066528066527, "grad_norm": 1.9215142726898193, "learning_rate": 2.5688669438669438e-05, "loss": 0.2655, "num_input_tokens_seen": 356600, "step": 3955 }, { "epoch": 1.0291060291060292, "grad_norm": 0.4538833498954773, "learning_rate": 2.5721153846153844e-05, "loss": 0.2102, "num_input_tokens_seen": 357016, "step": 3960 }, { "epoch": 1.0304054054054055, "grad_norm": 0.7765560746192932, "learning_rate": 2.5753638253638257e-05, "loss": 0.2035, "num_input_tokens_seen": 357464, "step": 3965 }, { "epoch": 1.0317047817047817, "grad_norm": 0.6736892461776733, "learning_rate": 2.5786122661122663e-05, "loss": 0.3703, "num_input_tokens_seen": 357880, "step": 3970 }, { "epoch": 1.033004158004158, "grad_norm": 0.5352579355239868, "learning_rate": 2.581860706860707e-05, "loss": 0.2644, "num_input_tokens_seen": 358312, "step": 3975 }, { "epoch": 1.0343035343035343, "grad_norm": 0.4248576760292053, "learning_rate": 2.5851091476091476e-05, "loss": 0.2386, "num_input_tokens_seen": 358776, "step": 3980 }, { "epoch": 1.0356029106029105, "grad_norm": 1.2052738666534424, "learning_rate": 2.5883575883575882e-05, "loss": 0.2951, "num_input_tokens_seen": 359224, "step": 3985 }, { "epoch": 1.0369022869022868, "grad_norm": 1.028624176979065, "learning_rate": 2.591606029106029e-05, "loss": 0.2782, "num_input_tokens_seen": 359688, "step": 3990 }, { "epoch": 1.0382016632016633, "grad_norm": 1.0160475969314575, "learning_rate": 2.59485446985447e-05, "loss": 0.2713, "num_input_tokens_seen": 360136, "step": 3995 }, { "epoch": 1.0395010395010396, "grad_norm": 0.7812900543212891, "learning_rate": 2.5981029106029108e-05, "loss": 0.2276, "num_input_tokens_seen": 360600, "step": 4000 }, { "epoch": 1.0408004158004158, "grad_norm": 0.8038558959960938, "learning_rate": 2.6013513513513514e-05, "loss": 0.3163, "num_input_tokens_seen": 361064, "step": 4005 }, { "epoch": 1.042099792099792, "grad_norm": 0.6834136843681335, "learning_rate": 2.6045997920997924e-05, "loss": 0.193, "num_input_tokens_seen": 361480, "step": 4010 }, { "epoch": 1.0433991683991684, "grad_norm": 0.7178271412849426, "learning_rate": 2.607848232848233e-05, "loss": 0.2601, "num_input_tokens_seen": 361944, "step": 4015 }, { "epoch": 1.0446985446985446, "grad_norm": 0.6119319200515747, "learning_rate": 2.6110966735966736e-05, "loss": 0.3448, "num_input_tokens_seen": 362408, "step": 4020 }, { "epoch": 1.045997920997921, "grad_norm": 0.5310854911804199, "learning_rate": 2.6143451143451142e-05, "loss": 0.2211, "num_input_tokens_seen": 362856, "step": 4025 }, { "epoch": 1.0472972972972974, "grad_norm": 0.5187208652496338, "learning_rate": 2.6175935550935555e-05, "loss": 0.2673, "num_input_tokens_seen": 363336, "step": 4030 }, { "epoch": 1.0485966735966736, "grad_norm": 0.5800952315330505, "learning_rate": 2.6208419958419962e-05, "loss": 0.2269, "num_input_tokens_seen": 363800, "step": 4035 }, { "epoch": 1.04989604989605, "grad_norm": 0.6748794317245483, "learning_rate": 2.6240904365904368e-05, "loss": 0.2845, "num_input_tokens_seen": 364232, "step": 4040 }, { "epoch": 1.0511954261954262, "grad_norm": 0.5697217583656311, "learning_rate": 2.6273388773388774e-05, "loss": 0.2148, "num_input_tokens_seen": 364712, "step": 4045 }, { "epoch": 1.0524948024948024, "grad_norm": 0.5242099165916443, "learning_rate": 2.630587318087318e-05, "loss": 0.2147, "num_input_tokens_seen": 365176, "step": 4050 }, { "epoch": 1.0537941787941787, "grad_norm": 0.4996689558029175, "learning_rate": 2.6338357588357587e-05, "loss": 0.1555, "num_input_tokens_seen": 365656, "step": 4055 }, { "epoch": 1.0550935550935552, "grad_norm": 0.8486701846122742, "learning_rate": 2.6370841995842e-05, "loss": 0.3733, "num_input_tokens_seen": 366136, "step": 4060 }, { "epoch": 1.0563929313929314, "grad_norm": 0.61484295129776, "learning_rate": 2.6403326403326406e-05, "loss": 0.1949, "num_input_tokens_seen": 366600, "step": 4065 }, { "epoch": 1.0576923076923077, "grad_norm": 0.6104258298873901, "learning_rate": 2.6435810810810812e-05, "loss": 0.1996, "num_input_tokens_seen": 367064, "step": 4070 }, { "epoch": 1.058991683991684, "grad_norm": 1.0766972303390503, "learning_rate": 2.646829521829522e-05, "loss": 0.2949, "num_input_tokens_seen": 367528, "step": 4075 }, { "epoch": 1.0602910602910602, "grad_norm": 1.5139309167861938, "learning_rate": 2.6500779625779625e-05, "loss": 0.2706, "num_input_tokens_seen": 367960, "step": 4080 }, { "epoch": 1.0615904365904365, "grad_norm": 0.5734085440635681, "learning_rate": 2.653326403326403e-05, "loss": 0.4016, "num_input_tokens_seen": 368408, "step": 4085 }, { "epoch": 1.062889812889813, "grad_norm": 0.441510945558548, "learning_rate": 2.6565748440748444e-05, "loss": 0.296, "num_input_tokens_seen": 368872, "step": 4090 }, { "epoch": 1.0641891891891893, "grad_norm": 1.241613507270813, "learning_rate": 2.659823284823285e-05, "loss": 0.3181, "num_input_tokens_seen": 369288, "step": 4095 }, { "epoch": 1.0654885654885655, "grad_norm": 0.20478306710720062, "learning_rate": 2.6630717255717257e-05, "loss": 0.2979, "num_input_tokens_seen": 369736, "step": 4100 }, { "epoch": 1.0667879417879418, "grad_norm": 1.5790977478027344, "learning_rate": 2.6663201663201663e-05, "loss": 0.3449, "num_input_tokens_seen": 370216, "step": 4105 }, { "epoch": 1.068087318087318, "grad_norm": 1.1136276721954346, "learning_rate": 2.6695686070686073e-05, "loss": 0.3188, "num_input_tokens_seen": 370664, "step": 4110 }, { "epoch": 1.0693866943866943, "grad_norm": 1.5963430404663086, "learning_rate": 2.672817047817048e-05, "loss": 0.2693, "num_input_tokens_seen": 371112, "step": 4115 }, { "epoch": 1.0706860706860706, "grad_norm": 0.4814929664134979, "learning_rate": 2.6760654885654885e-05, "loss": 0.2364, "num_input_tokens_seen": 371560, "step": 4120 }, { "epoch": 1.071985446985447, "grad_norm": 1.6378793716430664, "learning_rate": 2.6793139293139298e-05, "loss": 0.499, "num_input_tokens_seen": 372040, "step": 4125 }, { "epoch": 1.0732848232848233, "grad_norm": 0.8552508354187012, "learning_rate": 2.6825623700623705e-05, "loss": 0.1734, "num_input_tokens_seen": 372504, "step": 4130 }, { "epoch": 1.0745841995841996, "grad_norm": 2.522719621658325, "learning_rate": 2.685810810810811e-05, "loss": 0.373, "num_input_tokens_seen": 372936, "step": 4135 }, { "epoch": 1.0758835758835759, "grad_norm": 1.1273139715194702, "learning_rate": 2.6890592515592517e-05, "loss": 0.2816, "num_input_tokens_seen": 373464, "step": 4140 }, { "epoch": 1.0771829521829521, "grad_norm": 0.6090481877326965, "learning_rate": 2.6923076923076923e-05, "loss": 0.1593, "num_input_tokens_seen": 373912, "step": 4145 }, { "epoch": 1.0784823284823284, "grad_norm": 0.5643723607063293, "learning_rate": 2.695556133056133e-05, "loss": 0.2522, "num_input_tokens_seen": 374376, "step": 4150 }, { "epoch": 1.0797817047817049, "grad_norm": 0.38096630573272705, "learning_rate": 2.6988045738045743e-05, "loss": 0.1968, "num_input_tokens_seen": 374856, "step": 4155 }, { "epoch": 1.0810810810810811, "grad_norm": 0.3919920027256012, "learning_rate": 2.702053014553015e-05, "loss": 0.2671, "num_input_tokens_seen": 375288, "step": 4160 }, { "epoch": 1.0823804573804574, "grad_norm": 0.9494787454605103, "learning_rate": 2.7053014553014555e-05, "loss": 0.1427, "num_input_tokens_seen": 375736, "step": 4165 }, { "epoch": 1.0836798336798337, "grad_norm": 0.360011488199234, "learning_rate": 2.708549896049896e-05, "loss": 0.3121, "num_input_tokens_seen": 376184, "step": 4170 }, { "epoch": 1.08497920997921, "grad_norm": 0.5792566537857056, "learning_rate": 2.7117983367983368e-05, "loss": 0.335, "num_input_tokens_seen": 376616, "step": 4175 }, { "epoch": 1.0862785862785862, "grad_norm": 0.6527485847473145, "learning_rate": 2.7150467775467774e-05, "loss": 0.2228, "num_input_tokens_seen": 377048, "step": 4180 }, { "epoch": 1.0875779625779627, "grad_norm": 1.4165326356887817, "learning_rate": 2.718295218295218e-05, "loss": 0.3321, "num_input_tokens_seen": 377480, "step": 4185 }, { "epoch": 1.088877338877339, "grad_norm": 1.831849455833435, "learning_rate": 2.7215436590436593e-05, "loss": 0.3094, "num_input_tokens_seen": 377912, "step": 4190 }, { "epoch": 1.0901767151767152, "grad_norm": 0.2768891453742981, "learning_rate": 2.7247920997921e-05, "loss": 0.288, "num_input_tokens_seen": 378360, "step": 4195 }, { "epoch": 1.0914760914760915, "grad_norm": 0.3330693542957306, "learning_rate": 2.7280405405405406e-05, "loss": 0.302, "num_input_tokens_seen": 378776, "step": 4200 }, { "epoch": 1.0927754677754677, "grad_norm": 0.8962578773498535, "learning_rate": 2.7312889812889812e-05, "loss": 0.2419, "num_input_tokens_seen": 379208, "step": 4205 }, { "epoch": 1.094074844074844, "grad_norm": 0.5938984155654907, "learning_rate": 2.7345374220374222e-05, "loss": 0.3265, "num_input_tokens_seen": 379640, "step": 4210 }, { "epoch": 1.0953742203742203, "grad_norm": 0.6827664375305176, "learning_rate": 2.7377858627858628e-05, "loss": 0.258, "num_input_tokens_seen": 380104, "step": 4215 }, { "epoch": 1.0966735966735968, "grad_norm": 0.6171044111251831, "learning_rate": 2.7410343035343038e-05, "loss": 0.1928, "num_input_tokens_seen": 380568, "step": 4220 }, { "epoch": 1.097972972972973, "grad_norm": 0.48248517513275146, "learning_rate": 2.7442827442827447e-05, "loss": 0.291, "num_input_tokens_seen": 381016, "step": 4225 }, { "epoch": 1.0992723492723493, "grad_norm": 0.5474457740783691, "learning_rate": 2.7475311850311854e-05, "loss": 0.3303, "num_input_tokens_seen": 381480, "step": 4230 }, { "epoch": 1.1005717255717256, "grad_norm": 0.6615641713142395, "learning_rate": 2.750779625779626e-05, "loss": 0.1827, "num_input_tokens_seen": 381944, "step": 4235 }, { "epoch": 1.1018711018711018, "grad_norm": 0.5418712496757507, "learning_rate": 2.7540280665280666e-05, "loss": 0.2641, "num_input_tokens_seen": 382408, "step": 4240 }, { "epoch": 1.103170478170478, "grad_norm": 0.5531789660453796, "learning_rate": 2.7572765072765072e-05, "loss": 0.2145, "num_input_tokens_seen": 382872, "step": 4245 }, { "epoch": 1.1044698544698546, "grad_norm": 0.5497608780860901, "learning_rate": 2.7605249480249485e-05, "loss": 0.2747, "num_input_tokens_seen": 383320, "step": 4250 }, { "epoch": 1.1057692307692308, "grad_norm": 0.5985249280929565, "learning_rate": 2.763773388773389e-05, "loss": 0.2187, "num_input_tokens_seen": 383768, "step": 4255 }, { "epoch": 1.107068607068607, "grad_norm": 0.6274611353874207, "learning_rate": 2.7670218295218298e-05, "loss": 0.2201, "num_input_tokens_seen": 384248, "step": 4260 }, { "epoch": 1.1083679833679834, "grad_norm": 0.5590729713439941, "learning_rate": 2.7702702702702704e-05, "loss": 0.3144, "num_input_tokens_seen": 384664, "step": 4265 }, { "epoch": 1.1096673596673596, "grad_norm": 0.5145828127861023, "learning_rate": 2.773518711018711e-05, "loss": 0.2599, "num_input_tokens_seen": 385144, "step": 4270 }, { "epoch": 1.110966735966736, "grad_norm": 0.41526591777801514, "learning_rate": 2.7767671517671517e-05, "loss": 0.3007, "num_input_tokens_seen": 385576, "step": 4275 }, { "epoch": 1.1122661122661124, "grad_norm": 0.33397039771080017, "learning_rate": 2.7800155925155923e-05, "loss": 0.2986, "num_input_tokens_seen": 386008, "step": 4280 }, { "epoch": 1.1135654885654886, "grad_norm": 1.0201936960220337, "learning_rate": 2.7832640332640336e-05, "loss": 0.2878, "num_input_tokens_seen": 386472, "step": 4285 }, { "epoch": 1.114864864864865, "grad_norm": 1.0422983169555664, "learning_rate": 2.7865124740124742e-05, "loss": 0.2769, "num_input_tokens_seen": 386920, "step": 4290 }, { "epoch": 1.1161642411642412, "grad_norm": 1.0355554819107056, "learning_rate": 2.789760914760915e-05, "loss": 0.2951, "num_input_tokens_seen": 387368, "step": 4295 }, { "epoch": 1.1174636174636174, "grad_norm": 0.2626696228981018, "learning_rate": 2.7930093555093555e-05, "loss": 0.2477, "num_input_tokens_seen": 387800, "step": 4300 }, { "epoch": 1.1187629937629937, "grad_norm": 0.49221479892730713, "learning_rate": 2.796257796257796e-05, "loss": 0.2643, "num_input_tokens_seen": 388264, "step": 4305 }, { "epoch": 1.12006237006237, "grad_norm": 0.562303364276886, "learning_rate": 2.799506237006237e-05, "loss": 0.2236, "num_input_tokens_seen": 388728, "step": 4310 }, { "epoch": 1.1213617463617465, "grad_norm": 0.533210277557373, "learning_rate": 2.802754677754678e-05, "loss": 0.2938, "num_input_tokens_seen": 389144, "step": 4315 }, { "epoch": 1.1226611226611227, "grad_norm": 0.5439296364784241, "learning_rate": 2.8060031185031187e-05, "loss": 0.1694, "num_input_tokens_seen": 389592, "step": 4320 }, { "epoch": 1.123960498960499, "grad_norm": 0.6483762264251709, "learning_rate": 2.8092515592515596e-05, "loss": 0.2792, "num_input_tokens_seen": 390040, "step": 4325 }, { "epoch": 1.1252598752598753, "grad_norm": 0.5782018303871155, "learning_rate": 2.8125000000000003e-05, "loss": 0.3526, "num_input_tokens_seen": 390488, "step": 4330 }, { "epoch": 1.1265592515592515, "grad_norm": 1.1814717054367065, "learning_rate": 2.815748440748441e-05, "loss": 0.2364, "num_input_tokens_seen": 390936, "step": 4335 }, { "epoch": 1.1278586278586278, "grad_norm": 0.33026427030563354, "learning_rate": 2.8189968814968815e-05, "loss": 0.2852, "num_input_tokens_seen": 391368, "step": 4340 }, { "epoch": 1.129158004158004, "grad_norm": 0.3902978003025055, "learning_rate": 2.822245322245322e-05, "loss": 0.2592, "num_input_tokens_seen": 391800, "step": 4345 }, { "epoch": 1.1304573804573805, "grad_norm": 0.8683748245239258, "learning_rate": 2.8254937629937634e-05, "loss": 0.2695, "num_input_tokens_seen": 392232, "step": 4350 }, { "epoch": 1.1317567567567568, "grad_norm": 0.4963666498661041, "learning_rate": 2.828742203742204e-05, "loss": 0.226, "num_input_tokens_seen": 392664, "step": 4355 }, { "epoch": 1.133056133056133, "grad_norm": 0.45515233278274536, "learning_rate": 2.8319906444906447e-05, "loss": 0.1616, "num_input_tokens_seen": 393096, "step": 4360 }, { "epoch": 1.1343555093555093, "grad_norm": 0.7294480800628662, "learning_rate": 2.8352390852390853e-05, "loss": 0.2646, "num_input_tokens_seen": 393592, "step": 4365 }, { "epoch": 1.1356548856548856, "grad_norm": 0.7851203680038452, "learning_rate": 2.838487525987526e-05, "loss": 0.3458, "num_input_tokens_seen": 394104, "step": 4370 }, { "epoch": 1.1369542619542619, "grad_norm": 1.3502060174942017, "learning_rate": 2.8417359667359666e-05, "loss": 0.3494, "num_input_tokens_seen": 394552, "step": 4375 }, { "epoch": 1.1382536382536383, "grad_norm": 0.42374187707901, "learning_rate": 2.844984407484408e-05, "loss": 0.2906, "num_input_tokens_seen": 395016, "step": 4380 }, { "epoch": 1.1395530145530146, "grad_norm": 0.23817437887191772, "learning_rate": 2.8482328482328485e-05, "loss": 0.2861, "num_input_tokens_seen": 395416, "step": 4385 }, { "epoch": 1.1408523908523909, "grad_norm": 0.3217451572418213, "learning_rate": 2.851481288981289e-05, "loss": 0.2803, "num_input_tokens_seen": 395864, "step": 4390 }, { "epoch": 1.1421517671517671, "grad_norm": 1.106561303138733, "learning_rate": 2.8547297297297298e-05, "loss": 0.3196, "num_input_tokens_seen": 396312, "step": 4395 }, { "epoch": 1.1434511434511434, "grad_norm": 0.3906770944595337, "learning_rate": 2.8579781704781704e-05, "loss": 0.2432, "num_input_tokens_seen": 396744, "step": 4400 }, { "epoch": 1.1447505197505197, "grad_norm": 0.7623933553695679, "learning_rate": 2.861226611226611e-05, "loss": 0.1879, "num_input_tokens_seen": 397160, "step": 4405 }, { "epoch": 1.1460498960498962, "grad_norm": 1.3094485998153687, "learning_rate": 2.864475051975052e-05, "loss": 0.3014, "num_input_tokens_seen": 397608, "step": 4410 }, { "epoch": 1.1473492723492724, "grad_norm": 0.4655569791793823, "learning_rate": 2.867723492723493e-05, "loss": 0.3056, "num_input_tokens_seen": 398056, "step": 4415 }, { "epoch": 1.1486486486486487, "grad_norm": 0.863553524017334, "learning_rate": 2.8709719334719336e-05, "loss": 0.2526, "num_input_tokens_seen": 398568, "step": 4420 }, { "epoch": 1.149948024948025, "grad_norm": 1.2350990772247314, "learning_rate": 2.8742203742203745e-05, "loss": 0.2704, "num_input_tokens_seen": 399064, "step": 4425 }, { "epoch": 1.1512474012474012, "grad_norm": 0.7656514644622803, "learning_rate": 2.877468814968815e-05, "loss": 0.1857, "num_input_tokens_seen": 399496, "step": 4430 }, { "epoch": 1.1525467775467775, "grad_norm": 1.5840016603469849, "learning_rate": 2.8807172557172558e-05, "loss": 0.2631, "num_input_tokens_seen": 399928, "step": 4435 }, { "epoch": 1.1538461538461537, "grad_norm": 2.0615475177764893, "learning_rate": 2.8839656964656964e-05, "loss": 0.439, "num_input_tokens_seen": 400376, "step": 4440 }, { "epoch": 1.1551455301455302, "grad_norm": 0.6542909145355225, "learning_rate": 2.8872141372141377e-05, "loss": 0.2331, "num_input_tokens_seen": 400856, "step": 4445 }, { "epoch": 1.1564449064449065, "grad_norm": 0.953460693359375, "learning_rate": 2.8904625779625784e-05, "loss": 0.2763, "num_input_tokens_seen": 401368, "step": 4450 }, { "epoch": 1.1577442827442828, "grad_norm": 2.4818127155303955, "learning_rate": 2.893711018711019e-05, "loss": 0.269, "num_input_tokens_seen": 401816, "step": 4455 }, { "epoch": 1.159043659043659, "grad_norm": 0.5746747851371765, "learning_rate": 2.8969594594594596e-05, "loss": 0.1462, "num_input_tokens_seen": 402296, "step": 4460 }, { "epoch": 1.1603430353430353, "grad_norm": 0.3866441547870636, "learning_rate": 2.9002079002079002e-05, "loss": 0.2123, "num_input_tokens_seen": 402744, "step": 4465 }, { "epoch": 1.1616424116424116, "grad_norm": 0.3913857042789459, "learning_rate": 2.903456340956341e-05, "loss": 0.2152, "num_input_tokens_seen": 403192, "step": 4470 }, { "epoch": 1.1629417879417878, "grad_norm": 1.4885311126708984, "learning_rate": 2.906704781704782e-05, "loss": 0.3885, "num_input_tokens_seen": 403672, "step": 4475 }, { "epoch": 1.1642411642411643, "grad_norm": 1.3835670948028564, "learning_rate": 2.9099532224532228e-05, "loss": 0.2988, "num_input_tokens_seen": 404152, "step": 4480 }, { "epoch": 1.1655405405405406, "grad_norm": 0.9146977066993713, "learning_rate": 2.9132016632016634e-05, "loss": 0.2196, "num_input_tokens_seen": 404584, "step": 4485 }, { "epoch": 1.1668399168399168, "grad_norm": 0.8192022442817688, "learning_rate": 2.916450103950104e-05, "loss": 0.2004, "num_input_tokens_seen": 405064, "step": 4490 }, { "epoch": 1.168139293139293, "grad_norm": 0.4266611933708191, "learning_rate": 2.9196985446985447e-05, "loss": 0.2267, "num_input_tokens_seen": 405528, "step": 4495 }, { "epoch": 1.1694386694386694, "grad_norm": 1.530479073524475, "learning_rate": 2.9229469854469853e-05, "loss": 0.336, "num_input_tokens_seen": 405976, "step": 4500 }, { "epoch": 1.1707380457380459, "grad_norm": 0.6004845499992371, "learning_rate": 2.926195426195426e-05, "loss": 0.346, "num_input_tokens_seen": 406408, "step": 4505 }, { "epoch": 1.1720374220374221, "grad_norm": 0.7252393364906311, "learning_rate": 2.9294438669438672e-05, "loss": 0.2644, "num_input_tokens_seen": 406840, "step": 4510 }, { "epoch": 1.1733367983367984, "grad_norm": 0.39252156019210815, "learning_rate": 2.932692307692308e-05, "loss": 0.2422, "num_input_tokens_seen": 407272, "step": 4515 }, { "epoch": 1.1746361746361746, "grad_norm": 1.3050298690795898, "learning_rate": 2.9359407484407485e-05, "loss": 0.3009, "num_input_tokens_seen": 407720, "step": 4520 }, { "epoch": 1.175935550935551, "grad_norm": 0.7103352546691895, "learning_rate": 2.9391891891891894e-05, "loss": 0.1512, "num_input_tokens_seen": 408152, "step": 4525 }, { "epoch": 1.1772349272349272, "grad_norm": 1.5249428749084473, "learning_rate": 2.94243762993763e-05, "loss": 0.346, "num_input_tokens_seen": 408600, "step": 4530 }, { "epoch": 1.1785343035343034, "grad_norm": 0.9014526605606079, "learning_rate": 2.9456860706860707e-05, "loss": 0.3058, "num_input_tokens_seen": 409048, "step": 4535 }, { "epoch": 1.17983367983368, "grad_norm": 0.5868172645568848, "learning_rate": 2.9489345114345117e-05, "loss": 0.2708, "num_input_tokens_seen": 409480, "step": 4540 }, { "epoch": 1.1811330561330562, "grad_norm": 0.4988204836845398, "learning_rate": 2.9521829521829526e-05, "loss": 0.2262, "num_input_tokens_seen": 409928, "step": 4545 }, { "epoch": 1.1824324324324325, "grad_norm": 0.4023747742176056, "learning_rate": 2.9554313929313933e-05, "loss": 0.3204, "num_input_tokens_seen": 410392, "step": 4550 }, { "epoch": 1.1837318087318087, "grad_norm": 0.8529812693595886, "learning_rate": 2.958679833679834e-05, "loss": 0.2596, "num_input_tokens_seen": 410840, "step": 4555 }, { "epoch": 1.185031185031185, "grad_norm": 1.03570556640625, "learning_rate": 2.9619282744282745e-05, "loss": 0.312, "num_input_tokens_seen": 411288, "step": 4560 }, { "epoch": 1.1863305613305613, "grad_norm": 0.08040257543325424, "learning_rate": 2.965176715176715e-05, "loss": 0.299, "num_input_tokens_seen": 411736, "step": 4565 }, { "epoch": 1.1876299376299375, "grad_norm": 1.6013987064361572, "learning_rate": 2.9684251559251558e-05, "loss": 0.3002, "num_input_tokens_seen": 412184, "step": 4570 }, { "epoch": 1.188929313929314, "grad_norm": 1.3445866107940674, "learning_rate": 2.971673596673597e-05, "loss": 0.319, "num_input_tokens_seen": 412632, "step": 4575 }, { "epoch": 1.1902286902286903, "grad_norm": 0.9755032658576965, "learning_rate": 2.9749220374220377e-05, "loss": 0.2749, "num_input_tokens_seen": 413064, "step": 4580 }, { "epoch": 1.1915280665280665, "grad_norm": 0.7417953610420227, "learning_rate": 2.9781704781704783e-05, "loss": 0.2506, "num_input_tokens_seen": 413480, "step": 4585 }, { "epoch": 1.1928274428274428, "grad_norm": 0.6250741481781006, "learning_rate": 2.981418918918919e-05, "loss": 0.2764, "num_input_tokens_seen": 413960, "step": 4590 }, { "epoch": 1.194126819126819, "grad_norm": 0.5471602082252502, "learning_rate": 2.9846673596673596e-05, "loss": 0.2359, "num_input_tokens_seen": 414472, "step": 4595 }, { "epoch": 1.1954261954261955, "grad_norm": 1.1040467023849487, "learning_rate": 2.9879158004158002e-05, "loss": 0.3689, "num_input_tokens_seen": 414968, "step": 4600 }, { "epoch": 1.1967255717255718, "grad_norm": 0.2852615714073181, "learning_rate": 2.9911642411642415e-05, "loss": 0.2926, "num_input_tokens_seen": 415416, "step": 4605 }, { "epoch": 1.198024948024948, "grad_norm": 0.16983476281166077, "learning_rate": 2.994412681912682e-05, "loss": 0.2753, "num_input_tokens_seen": 415880, "step": 4610 }, { "epoch": 1.1993243243243243, "grad_norm": 0.8794458508491516, "learning_rate": 2.9976611226611228e-05, "loss": 0.2661, "num_input_tokens_seen": 416328, "step": 4615 }, { "epoch": 1.2006237006237006, "grad_norm": 0.34041452407836914, "learning_rate": 3.0009095634095634e-05, "loss": 0.269, "num_input_tokens_seen": 416744, "step": 4620 }, { "epoch": 1.2019230769230769, "grad_norm": 0.3844810426235199, "learning_rate": 3.0041580041580043e-05, "loss": 0.304, "num_input_tokens_seen": 417208, "step": 4625 }, { "epoch": 1.2032224532224531, "grad_norm": 0.4391044080257416, "learning_rate": 3.007406444906445e-05, "loss": 0.2616, "num_input_tokens_seen": 417640, "step": 4630 }, { "epoch": 1.2045218295218296, "grad_norm": 0.31830060482025146, "learning_rate": 3.010654885654886e-05, "loss": 0.3582, "num_input_tokens_seen": 418136, "step": 4635 }, { "epoch": 1.2058212058212059, "grad_norm": 0.8414891362190247, "learning_rate": 3.0139033264033266e-05, "loss": 0.2902, "num_input_tokens_seen": 418616, "step": 4640 }, { "epoch": 1.2071205821205822, "grad_norm": 0.07259811460971832, "learning_rate": 3.0171517671517675e-05, "loss": 0.2947, "num_input_tokens_seen": 419096, "step": 4645 }, { "epoch": 1.2084199584199584, "grad_norm": 1.1062567234039307, "learning_rate": 3.020400207900208e-05, "loss": 0.3405, "num_input_tokens_seen": 419544, "step": 4650 }, { "epoch": 1.2097193347193347, "grad_norm": 0.3144121766090393, "learning_rate": 3.0236486486486488e-05, "loss": 0.2953, "num_input_tokens_seen": 419976, "step": 4655 }, { "epoch": 1.211018711018711, "grad_norm": 0.6437689661979675, "learning_rate": 3.0268970893970894e-05, "loss": 0.2286, "num_input_tokens_seen": 420424, "step": 4660 }, { "epoch": 1.2123180873180872, "grad_norm": 0.6962844133377075, "learning_rate": 3.03014553014553e-05, "loss": 0.2127, "num_input_tokens_seen": 420872, "step": 4665 }, { "epoch": 1.2136174636174637, "grad_norm": 0.36616480350494385, "learning_rate": 3.0333939708939713e-05, "loss": 0.2222, "num_input_tokens_seen": 421336, "step": 4670 }, { "epoch": 1.21491683991684, "grad_norm": 0.3588239848613739, "learning_rate": 3.036642411642412e-05, "loss": 0.1869, "num_input_tokens_seen": 421800, "step": 4675 }, { "epoch": 1.2162162162162162, "grad_norm": 0.2670489549636841, "learning_rate": 3.0398908523908526e-05, "loss": 0.0567, "num_input_tokens_seen": 422200, "step": 4680 }, { "epoch": 1.2175155925155925, "grad_norm": 1.1515281200408936, "learning_rate": 3.0431392931392932e-05, "loss": 0.3446, "num_input_tokens_seen": 422600, "step": 4685 }, { "epoch": 1.2188149688149688, "grad_norm": 0.30639225244522095, "learning_rate": 3.046387733887734e-05, "loss": 0.2948, "num_input_tokens_seen": 423064, "step": 4690 }, { "epoch": 1.220114345114345, "grad_norm": 0.3457757830619812, "learning_rate": 3.0496361746361745e-05, "loss": 0.3316, "num_input_tokens_seen": 423528, "step": 4695 }, { "epoch": 1.2214137214137215, "grad_norm": 0.5450494289398193, "learning_rate": 3.052884615384616e-05, "loss": 0.316, "num_input_tokens_seen": 423992, "step": 4700 }, { "epoch": 1.2227130977130978, "grad_norm": 0.5775039792060852, "learning_rate": 3.056133056133057e-05, "loss": 0.1964, "num_input_tokens_seen": 424440, "step": 4705 }, { "epoch": 1.224012474012474, "grad_norm": 0.4772631824016571, "learning_rate": 3.059381496881497e-05, "loss": 0.2925, "num_input_tokens_seen": 424920, "step": 4710 }, { "epoch": 1.2253118503118503, "grad_norm": 0.5626017451286316, "learning_rate": 3.062629937629938e-05, "loss": 0.2621, "num_input_tokens_seen": 425352, "step": 4715 }, { "epoch": 1.2266112266112266, "grad_norm": 0.3591625988483429, "learning_rate": 3.065878378378378e-05, "loss": 0.3405, "num_input_tokens_seen": 425816, "step": 4720 }, { "epoch": 1.2279106029106028, "grad_norm": 0.21676753461360931, "learning_rate": 3.069126819126819e-05, "loss": 0.2684, "num_input_tokens_seen": 426296, "step": 4725 }, { "epoch": 1.2292099792099793, "grad_norm": 0.8733010292053223, "learning_rate": 3.0723752598752595e-05, "loss": 0.2846, "num_input_tokens_seen": 426696, "step": 4730 }, { "epoch": 1.2305093555093556, "grad_norm": 0.24175013601779938, "learning_rate": 3.075623700623701e-05, "loss": 0.2675, "num_input_tokens_seen": 427112, "step": 4735 }, { "epoch": 1.2318087318087318, "grad_norm": 0.6626141667366028, "learning_rate": 3.0788721413721415e-05, "loss": 0.2356, "num_input_tokens_seen": 427576, "step": 4740 }, { "epoch": 1.2331081081081081, "grad_norm": 0.6922448873519897, "learning_rate": 3.0821205821205824e-05, "loss": 0.2277, "num_input_tokens_seen": 428024, "step": 4745 }, { "epoch": 1.2344074844074844, "grad_norm": 0.7521610260009766, "learning_rate": 3.085369022869023e-05, "loss": 0.2906, "num_input_tokens_seen": 428504, "step": 4750 }, { "epoch": 1.2357068607068606, "grad_norm": 0.6271171569824219, "learning_rate": 3.088617463617464e-05, "loss": 0.3357, "num_input_tokens_seen": 428952, "step": 4755 }, { "epoch": 1.237006237006237, "grad_norm": 0.46049490571022034, "learning_rate": 3.091865904365904e-05, "loss": 0.322, "num_input_tokens_seen": 429368, "step": 4760 }, { "epoch": 1.2383056133056134, "grad_norm": 0.422737181186676, "learning_rate": 3.0951143451143456e-05, "loss": 0.1969, "num_input_tokens_seen": 429848, "step": 4765 }, { "epoch": 1.2396049896049897, "grad_norm": 0.3725098669528961, "learning_rate": 3.098362785862786e-05, "loss": 0.3236, "num_input_tokens_seen": 430296, "step": 4770 }, { "epoch": 1.240904365904366, "grad_norm": 0.7244720458984375, "learning_rate": 3.101611226611227e-05, "loss": 0.2543, "num_input_tokens_seen": 430776, "step": 4775 }, { "epoch": 1.2422037422037422, "grad_norm": 0.7035996317863464, "learning_rate": 3.104859667359667e-05, "loss": 0.2741, "num_input_tokens_seen": 431224, "step": 4780 }, { "epoch": 1.2435031185031185, "grad_norm": 0.3773230314254761, "learning_rate": 3.108108108108108e-05, "loss": 0.2727, "num_input_tokens_seen": 431656, "step": 4785 }, { "epoch": 1.2448024948024947, "grad_norm": 0.48482802510261536, "learning_rate": 3.111356548856549e-05, "loss": 0.2625, "num_input_tokens_seen": 432072, "step": 4790 }, { "epoch": 1.246101871101871, "grad_norm": 0.41987401247024536, "learning_rate": 3.11460498960499e-05, "loss": 0.3266, "num_input_tokens_seen": 432536, "step": 4795 }, { "epoch": 1.2474012474012475, "grad_norm": 0.7023742198944092, "learning_rate": 3.117853430353431e-05, "loss": 0.2068, "num_input_tokens_seen": 433032, "step": 4800 }, { "epoch": 1.2487006237006237, "grad_norm": 0.3496343195438385, "learning_rate": 3.121101871101871e-05, "loss": 0.3102, "num_input_tokens_seen": 433496, "step": 4805 }, { "epoch": 1.25, "grad_norm": 0.2905389666557312, "learning_rate": 3.124350311850312e-05, "loss": 0.3067, "num_input_tokens_seen": 433928, "step": 4810 }, { "epoch": 1.2512993762993763, "grad_norm": 0.7890183329582214, "learning_rate": 3.1275987525987526e-05, "loss": 0.2743, "num_input_tokens_seen": 434376, "step": 4815 }, { "epoch": 1.2525987525987525, "grad_norm": 0.6756892800331116, "learning_rate": 3.1308471933471935e-05, "loss": 0.2535, "num_input_tokens_seen": 434808, "step": 4820 }, { "epoch": 1.253898128898129, "grad_norm": 0.3768901824951172, "learning_rate": 3.134095634095634e-05, "loss": 0.2825, "num_input_tokens_seen": 435256, "step": 4825 }, { "epoch": 1.255197505197505, "grad_norm": 0.4641149640083313, "learning_rate": 3.1373440748440755e-05, "loss": 0.1504, "num_input_tokens_seen": 435704, "step": 4830 }, { "epoch": 1.2564968814968815, "grad_norm": 0.5117455720901489, "learning_rate": 3.140592515592516e-05, "loss": 0.2589, "num_input_tokens_seen": 436168, "step": 4835 }, { "epoch": 1.2577962577962578, "grad_norm": 0.39046812057495117, "learning_rate": 3.143840956340957e-05, "loss": 0.3343, "num_input_tokens_seen": 436568, "step": 4840 }, { "epoch": 1.259095634095634, "grad_norm": 0.4053306579589844, "learning_rate": 3.147089397089397e-05, "loss": 0.2161, "num_input_tokens_seen": 437000, "step": 4845 }, { "epoch": 1.2603950103950103, "grad_norm": 0.5512628555297852, "learning_rate": 3.150337837837838e-05, "loss": 0.2691, "num_input_tokens_seen": 437416, "step": 4850 }, { "epoch": 1.2616943866943866, "grad_norm": 0.48224976658821106, "learning_rate": 3.153586278586278e-05, "loss": 0.242, "num_input_tokens_seen": 437880, "step": 4855 }, { "epoch": 1.262993762993763, "grad_norm": 0.45644643902778625, "learning_rate": 3.15683471933472e-05, "loss": 0.1751, "num_input_tokens_seen": 438328, "step": 4860 }, { "epoch": 1.2642931392931394, "grad_norm": 0.37934380769729614, "learning_rate": 3.16008316008316e-05, "loss": 0.2486, "num_input_tokens_seen": 438792, "step": 4865 }, { "epoch": 1.2655925155925156, "grad_norm": 0.4828096926212311, "learning_rate": 3.163331600831601e-05, "loss": 0.3395, "num_input_tokens_seen": 439224, "step": 4870 }, { "epoch": 1.2668918918918919, "grad_norm": 0.41650643944740295, "learning_rate": 3.1665800415800414e-05, "loss": 0.2673, "num_input_tokens_seen": 439656, "step": 4875 }, { "epoch": 1.2681912681912682, "grad_norm": 0.6969903707504272, "learning_rate": 3.1698284823284824e-05, "loss": 0.2595, "num_input_tokens_seen": 440072, "step": 4880 }, { "epoch": 1.2694906444906444, "grad_norm": 0.3233267068862915, "learning_rate": 3.1730769230769234e-05, "loss": 0.307, "num_input_tokens_seen": 440552, "step": 4885 }, { "epoch": 1.2707900207900207, "grad_norm": 0.3233187198638916, "learning_rate": 3.1763253638253637e-05, "loss": 0.2284, "num_input_tokens_seen": 441000, "step": 4890 }, { "epoch": 1.2720893970893972, "grad_norm": 0.42250728607177734, "learning_rate": 3.1795738045738046e-05, "loss": 0.2345, "num_input_tokens_seen": 441432, "step": 4895 }, { "epoch": 1.2733887733887734, "grad_norm": 0.47313177585601807, "learning_rate": 3.1828222453222456e-05, "loss": 0.2211, "num_input_tokens_seen": 441880, "step": 4900 }, { "epoch": 1.2746881496881497, "grad_norm": 0.34324827790260315, "learning_rate": 3.1860706860706866e-05, "loss": 0.1554, "num_input_tokens_seen": 442312, "step": 4905 }, { "epoch": 1.275987525987526, "grad_norm": 0.7500205636024475, "learning_rate": 3.189319126819127e-05, "loss": 0.2177, "num_input_tokens_seen": 442776, "step": 4910 }, { "epoch": 1.2772869022869022, "grad_norm": 0.7361673712730408, "learning_rate": 3.192567567567568e-05, "loss": 0.1475, "num_input_tokens_seen": 443208, "step": 4915 }, { "epoch": 1.2785862785862787, "grad_norm": 0.3180796504020691, "learning_rate": 3.195816008316008e-05, "loss": 0.3021, "num_input_tokens_seen": 443656, "step": 4920 }, { "epoch": 1.2798856548856548, "grad_norm": 0.30797770619392395, "learning_rate": 3.19906444906445e-05, "loss": 0.1444, "num_input_tokens_seen": 444072, "step": 4925 }, { "epoch": 1.2811850311850312, "grad_norm": 0.35886749625205994, "learning_rate": 3.20231288981289e-05, "loss": 0.3297, "num_input_tokens_seen": 444568, "step": 4930 }, { "epoch": 1.2824844074844075, "grad_norm": 0.48359090089797974, "learning_rate": 3.205561330561331e-05, "loss": 0.3114, "num_input_tokens_seen": 445016, "step": 4935 }, { "epoch": 1.2837837837837838, "grad_norm": 0.4806548058986664, "learning_rate": 3.208809771309771e-05, "loss": 0.1781, "num_input_tokens_seen": 445448, "step": 4940 }, { "epoch": 1.28508316008316, "grad_norm": 0.4528852701187134, "learning_rate": 3.212058212058212e-05, "loss": 0.2113, "num_input_tokens_seen": 445912, "step": 4945 }, { "epoch": 1.2863825363825363, "grad_norm": 0.41852861642837524, "learning_rate": 3.2153066528066525e-05, "loss": 0.2478, "num_input_tokens_seen": 446360, "step": 4950 }, { "epoch": 1.2876819126819128, "grad_norm": 0.5138948559761047, "learning_rate": 3.2185550935550935e-05, "loss": 0.2194, "num_input_tokens_seen": 446792, "step": 4955 }, { "epoch": 1.288981288981289, "grad_norm": 0.39547446370124817, "learning_rate": 3.2218035343035345e-05, "loss": 0.2697, "num_input_tokens_seen": 447240, "step": 4960 }, { "epoch": 1.2902806652806653, "grad_norm": 0.4122556447982788, "learning_rate": 3.2250519750519754e-05, "loss": 0.2385, "num_input_tokens_seen": 447688, "step": 4965 }, { "epoch": 1.2915800415800416, "grad_norm": 0.5087078809738159, "learning_rate": 3.228300415800416e-05, "loss": 0.2569, "num_input_tokens_seen": 448152, "step": 4970 }, { "epoch": 1.2928794178794178, "grad_norm": 0.400431752204895, "learning_rate": 3.231548856548857e-05, "loss": 0.2554, "num_input_tokens_seen": 448632, "step": 4975 }, { "epoch": 1.2941787941787941, "grad_norm": 0.35291561484336853, "learning_rate": 3.234797297297297e-05, "loss": 0.3469, "num_input_tokens_seen": 449128, "step": 4980 }, { "epoch": 1.2954781704781704, "grad_norm": 1.0142507553100586, "learning_rate": 3.238045738045738e-05, "loss": 0.2775, "num_input_tokens_seen": 449592, "step": 4985 }, { "epoch": 1.2967775467775469, "grad_norm": 0.6704441905021667, "learning_rate": 3.241294178794179e-05, "loss": 0.2509, "num_input_tokens_seen": 450008, "step": 4990 }, { "epoch": 1.2980769230769231, "grad_norm": 1.0206211805343628, "learning_rate": 3.24454261954262e-05, "loss": 0.3305, "num_input_tokens_seen": 450472, "step": 4995 }, { "epoch": 1.2993762993762994, "grad_norm": 0.41535621881484985, "learning_rate": 3.247791060291061e-05, "loss": 0.2676, "num_input_tokens_seen": 450920, "step": 5000 }, { "epoch": 1.3006756756756757, "grad_norm": 0.5161663293838501, "learning_rate": 3.251039501039501e-05, "loss": 0.1786, "num_input_tokens_seen": 451384, "step": 5005 }, { "epoch": 1.301975051975052, "grad_norm": 0.4032212495803833, "learning_rate": 3.254287941787942e-05, "loss": 0.1618, "num_input_tokens_seen": 451800, "step": 5010 }, { "epoch": 1.3032744282744284, "grad_norm": 0.37806499004364014, "learning_rate": 3.2575363825363824e-05, "loss": 0.3312, "num_input_tokens_seen": 452264, "step": 5015 }, { "epoch": 1.3045738045738045, "grad_norm": 0.44037413597106934, "learning_rate": 3.260784823284824e-05, "loss": 0.2566, "num_input_tokens_seen": 452712, "step": 5020 }, { "epoch": 1.305873180873181, "grad_norm": 0.524634838104248, "learning_rate": 3.264033264033264e-05, "loss": 0.2135, "num_input_tokens_seen": 453160, "step": 5025 }, { "epoch": 1.3071725571725572, "grad_norm": 0.5169949531555176, "learning_rate": 3.267281704781705e-05, "loss": 0.2153, "num_input_tokens_seen": 453576, "step": 5030 }, { "epoch": 1.3084719334719335, "grad_norm": 1.11793851852417, "learning_rate": 3.2705301455301456e-05, "loss": 0.3808, "num_input_tokens_seen": 454024, "step": 5035 }, { "epoch": 1.3097713097713097, "grad_norm": 0.45323267579078674, "learning_rate": 3.2737785862785865e-05, "loss": 0.236, "num_input_tokens_seen": 454504, "step": 5040 }, { "epoch": 1.311070686070686, "grad_norm": 0.435091495513916, "learning_rate": 3.277027027027027e-05, "loss": 0.2622, "num_input_tokens_seen": 454984, "step": 5045 }, { "epoch": 1.3123700623700625, "grad_norm": 0.5746406316757202, "learning_rate": 3.280275467775468e-05, "loss": 0.2293, "num_input_tokens_seen": 455448, "step": 5050 }, { "epoch": 1.3136694386694387, "grad_norm": 0.38964933156967163, "learning_rate": 3.283523908523909e-05, "loss": 0.3113, "num_input_tokens_seen": 455960, "step": 5055 }, { "epoch": 1.314968814968815, "grad_norm": 0.4227403700351715, "learning_rate": 3.28677234927235e-05, "loss": 0.1873, "num_input_tokens_seen": 456392, "step": 5060 }, { "epoch": 1.3162681912681913, "grad_norm": 0.46397438645362854, "learning_rate": 3.29002079002079e-05, "loss": 0.2769, "num_input_tokens_seen": 456888, "step": 5065 }, { "epoch": 1.3175675675675675, "grad_norm": 0.48174676299095154, "learning_rate": 3.293269230769231e-05, "loss": 0.2108, "num_input_tokens_seen": 457352, "step": 5070 }, { "epoch": 1.3188669438669438, "grad_norm": 0.4930940568447113, "learning_rate": 3.296517671517671e-05, "loss": 0.2679, "num_input_tokens_seen": 457864, "step": 5075 }, { "epoch": 1.32016632016632, "grad_norm": 0.38217175006866455, "learning_rate": 3.299766112266112e-05, "loss": 0.1111, "num_input_tokens_seen": 458296, "step": 5080 }, { "epoch": 1.3214656964656966, "grad_norm": 0.6046826839447021, "learning_rate": 3.303014553014553e-05, "loss": 0.2773, "num_input_tokens_seen": 458728, "step": 5085 }, { "epoch": 1.3227650727650728, "grad_norm": 0.5460085272789001, "learning_rate": 3.306262993762994e-05, "loss": 0.3752, "num_input_tokens_seen": 459208, "step": 5090 }, { "epoch": 1.324064449064449, "grad_norm": 1.0170341730117798, "learning_rate": 3.3095114345114344e-05, "loss": 0.3144, "num_input_tokens_seen": 459688, "step": 5095 }, { "epoch": 1.3253638253638254, "grad_norm": 0.38555049896240234, "learning_rate": 3.3127598752598754e-05, "loss": 0.1923, "num_input_tokens_seen": 460184, "step": 5100 }, { "epoch": 1.3266632016632016, "grad_norm": 0.4132108688354492, "learning_rate": 3.3160083160083164e-05, "loss": 0.2684, "num_input_tokens_seen": 460600, "step": 5105 }, { "epoch": 1.3279625779625779, "grad_norm": 0.5855673551559448, "learning_rate": 3.3192567567567567e-05, "loss": 0.2688, "num_input_tokens_seen": 461000, "step": 5110 }, { "epoch": 1.3292619542619541, "grad_norm": 0.6435236930847168, "learning_rate": 3.3225051975051976e-05, "loss": 0.2853, "num_input_tokens_seen": 461432, "step": 5115 }, { "epoch": 1.3305613305613306, "grad_norm": 0.33783790469169617, "learning_rate": 3.3257536382536386e-05, "loss": 0.2634, "num_input_tokens_seen": 461880, "step": 5120 }, { "epoch": 1.331860706860707, "grad_norm": 0.9009680151939392, "learning_rate": 3.3290020790020796e-05, "loss": 0.2893, "num_input_tokens_seen": 462328, "step": 5125 }, { "epoch": 1.3331600831600832, "grad_norm": 0.6981120109558105, "learning_rate": 3.33225051975052e-05, "loss": 0.247, "num_input_tokens_seen": 462744, "step": 5130 }, { "epoch": 1.3344594594594594, "grad_norm": 0.41984614729881287, "learning_rate": 3.335498960498961e-05, "loss": 0.2029, "num_input_tokens_seen": 463208, "step": 5135 }, { "epoch": 1.3357588357588357, "grad_norm": 0.5717990398406982, "learning_rate": 3.338747401247401e-05, "loss": 0.2276, "num_input_tokens_seen": 463640, "step": 5140 }, { "epoch": 1.3370582120582122, "grad_norm": 0.6540979146957397, "learning_rate": 3.341995841995842e-05, "loss": 0.1598, "num_input_tokens_seen": 464072, "step": 5145 }, { "epoch": 1.3383575883575882, "grad_norm": 0.24814845621585846, "learning_rate": 3.345244282744283e-05, "loss": 0.2781, "num_input_tokens_seen": 464520, "step": 5150 }, { "epoch": 1.3396569646569647, "grad_norm": 0.6812940239906311, "learning_rate": 3.348492723492724e-05, "loss": 0.2288, "num_input_tokens_seen": 464984, "step": 5155 }, { "epoch": 1.340956340956341, "grad_norm": 0.34365662932395935, "learning_rate": 3.351741164241164e-05, "loss": 0.3495, "num_input_tokens_seen": 465400, "step": 5160 }, { "epoch": 1.3422557172557172, "grad_norm": 0.42115357518196106, "learning_rate": 3.354989604989605e-05, "loss": 0.2238, "num_input_tokens_seen": 465864, "step": 5165 }, { "epoch": 1.3435550935550935, "grad_norm": 0.547692596912384, "learning_rate": 3.3582380457380455e-05, "loss": 0.2678, "num_input_tokens_seen": 466296, "step": 5170 }, { "epoch": 1.3448544698544698, "grad_norm": 0.49207770824432373, "learning_rate": 3.3614864864864865e-05, "loss": 0.2248, "num_input_tokens_seen": 466744, "step": 5175 }, { "epoch": 1.3461538461538463, "grad_norm": 0.4272507429122925, "learning_rate": 3.3647349272349275e-05, "loss": 0.3252, "num_input_tokens_seen": 467176, "step": 5180 }, { "epoch": 1.3474532224532225, "grad_norm": 0.408268541097641, "learning_rate": 3.3679833679833684e-05, "loss": 0.2235, "num_input_tokens_seen": 467624, "step": 5185 }, { "epoch": 1.3487525987525988, "grad_norm": 0.9659750461578369, "learning_rate": 3.371231808731809e-05, "loss": 0.3018, "num_input_tokens_seen": 468088, "step": 5190 }, { "epoch": 1.350051975051975, "grad_norm": 0.30871936678886414, "learning_rate": 3.37448024948025e-05, "loss": 0.291, "num_input_tokens_seen": 468568, "step": 5195 }, { "epoch": 1.3513513513513513, "grad_norm": 0.7884722948074341, "learning_rate": 3.3777286902286906e-05, "loss": 0.2589, "num_input_tokens_seen": 469016, "step": 5200 }, { "epoch": 1.3526507276507276, "grad_norm": 0.9196504354476929, "learning_rate": 3.380977130977131e-05, "loss": 0.3073, "num_input_tokens_seen": 469448, "step": 5205 }, { "epoch": 1.3539501039501038, "grad_norm": 0.793314516544342, "learning_rate": 3.384225571725572e-05, "loss": 0.2682, "num_input_tokens_seen": 469896, "step": 5210 }, { "epoch": 1.3552494802494803, "grad_norm": 0.5942007303237915, "learning_rate": 3.387474012474013e-05, "loss": 0.2244, "num_input_tokens_seen": 470344, "step": 5215 }, { "epoch": 1.3565488565488566, "grad_norm": 0.43668365478515625, "learning_rate": 3.390722453222454e-05, "loss": 0.2655, "num_input_tokens_seen": 470776, "step": 5220 }, { "epoch": 1.3578482328482329, "grad_norm": 0.424261212348938, "learning_rate": 3.393970893970894e-05, "loss": 0.3456, "num_input_tokens_seen": 471192, "step": 5225 }, { "epoch": 1.3591476091476091, "grad_norm": 0.9148433208465576, "learning_rate": 3.397219334719335e-05, "loss": 0.3076, "num_input_tokens_seen": 471624, "step": 5230 }, { "epoch": 1.3604469854469854, "grad_norm": 0.302743524312973, "learning_rate": 3.4004677754677754e-05, "loss": 0.2427, "num_input_tokens_seen": 472104, "step": 5235 }, { "epoch": 1.3617463617463619, "grad_norm": 0.3353903889656067, "learning_rate": 3.403716216216216e-05, "loss": 0.2646, "num_input_tokens_seen": 472536, "step": 5240 }, { "epoch": 1.363045738045738, "grad_norm": 0.871562659740448, "learning_rate": 3.406964656964657e-05, "loss": 0.2677, "num_input_tokens_seen": 472984, "step": 5245 }, { "epoch": 1.3643451143451144, "grad_norm": 0.37591978907585144, "learning_rate": 3.410213097713098e-05, "loss": 0.2548, "num_input_tokens_seen": 473416, "step": 5250 }, { "epoch": 1.3656444906444907, "grad_norm": 0.3182419538497925, "learning_rate": 3.4134615384615386e-05, "loss": 0.3413, "num_input_tokens_seen": 473848, "step": 5255 }, { "epoch": 1.366943866943867, "grad_norm": 0.2908081114292145, "learning_rate": 3.4167099792099795e-05, "loss": 0.2464, "num_input_tokens_seen": 474264, "step": 5260 }, { "epoch": 1.3682432432432432, "grad_norm": 0.7415894269943237, "learning_rate": 3.41995841995842e-05, "loss": 0.2829, "num_input_tokens_seen": 474680, "step": 5265 }, { "epoch": 1.3695426195426195, "grad_norm": 0.2923737168312073, "learning_rate": 3.423206860706861e-05, "loss": 0.2774, "num_input_tokens_seen": 475144, "step": 5270 }, { "epoch": 1.370841995841996, "grad_norm": 0.6099298596382141, "learning_rate": 3.426455301455301e-05, "loss": 0.2489, "num_input_tokens_seen": 475576, "step": 5275 }, { "epoch": 1.3721413721413722, "grad_norm": 0.5018713474273682, "learning_rate": 3.429703742203743e-05, "loss": 0.2308, "num_input_tokens_seen": 476024, "step": 5280 }, { "epoch": 1.3734407484407485, "grad_norm": 0.41971200704574585, "learning_rate": 3.432952182952183e-05, "loss": 0.1734, "num_input_tokens_seen": 476456, "step": 5285 }, { "epoch": 1.3747401247401247, "grad_norm": 0.8940828442573547, "learning_rate": 3.436200623700624e-05, "loss": 0.2233, "num_input_tokens_seen": 476920, "step": 5290 }, { "epoch": 1.376039501039501, "grad_norm": 0.35602983832359314, "learning_rate": 3.439449064449064e-05, "loss": 0.3744, "num_input_tokens_seen": 477368, "step": 5295 }, { "epoch": 1.3773388773388773, "grad_norm": 0.3868422508239746, "learning_rate": 3.442697505197505e-05, "loss": 0.1502, "num_input_tokens_seen": 477832, "step": 5300 }, { "epoch": 1.3786382536382535, "grad_norm": 0.4014594256877899, "learning_rate": 3.445945945945946e-05, "loss": 0.1636, "num_input_tokens_seen": 478264, "step": 5305 }, { "epoch": 1.37993762993763, "grad_norm": 0.5872294902801514, "learning_rate": 3.449194386694387e-05, "loss": 0.2825, "num_input_tokens_seen": 478696, "step": 5310 }, { "epoch": 1.3812370062370063, "grad_norm": 0.4450837969779968, "learning_rate": 3.4524428274428274e-05, "loss": 0.3361, "num_input_tokens_seen": 479128, "step": 5315 }, { "epoch": 1.3825363825363826, "grad_norm": 0.43887028098106384, "learning_rate": 3.4556912681912684e-05, "loss": 0.2026, "num_input_tokens_seen": 479592, "step": 5320 }, { "epoch": 1.3838357588357588, "grad_norm": 0.4102727770805359, "learning_rate": 3.4589397089397094e-05, "loss": 0.1351, "num_input_tokens_seen": 480040, "step": 5325 }, { "epoch": 1.385135135135135, "grad_norm": 0.3734804093837738, "learning_rate": 3.4621881496881496e-05, "loss": 0.2492, "num_input_tokens_seen": 480456, "step": 5330 }, { "epoch": 1.3864345114345114, "grad_norm": 0.3599206805229187, "learning_rate": 3.4654365904365906e-05, "loss": 0.241, "num_input_tokens_seen": 480904, "step": 5335 }, { "epoch": 1.3877338877338876, "grad_norm": 0.3902381658554077, "learning_rate": 3.468685031185031e-05, "loss": 0.2753, "num_input_tokens_seen": 481336, "step": 5340 }, { "epoch": 1.389033264033264, "grad_norm": 0.3880919814109802, "learning_rate": 3.4719334719334725e-05, "loss": 0.2146, "num_input_tokens_seen": 481784, "step": 5345 }, { "epoch": 1.3903326403326404, "grad_norm": 0.49338242411613464, "learning_rate": 3.475181912681913e-05, "loss": 0.3346, "num_input_tokens_seen": 482216, "step": 5350 }, { "epoch": 1.3916320166320166, "grad_norm": 0.4409925937652588, "learning_rate": 3.478430353430354e-05, "loss": 0.2872, "num_input_tokens_seen": 482664, "step": 5355 }, { "epoch": 1.392931392931393, "grad_norm": 0.70050448179245, "learning_rate": 3.481678794178794e-05, "loss": 0.3192, "num_input_tokens_seen": 483096, "step": 5360 }, { "epoch": 1.3942307692307692, "grad_norm": 0.619236171245575, "learning_rate": 3.484927234927235e-05, "loss": 0.2142, "num_input_tokens_seen": 483560, "step": 5365 }, { "epoch": 1.3955301455301456, "grad_norm": 0.4808286726474762, "learning_rate": 3.488175675675675e-05, "loss": 0.2717, "num_input_tokens_seen": 483992, "step": 5370 }, { "epoch": 1.3968295218295217, "grad_norm": 0.38441765308380127, "learning_rate": 3.491424116424117e-05, "loss": 0.2969, "num_input_tokens_seen": 484440, "step": 5375 }, { "epoch": 1.3981288981288982, "grad_norm": 0.39256054162979126, "learning_rate": 3.494672557172557e-05, "loss": 0.3505, "num_input_tokens_seen": 484888, "step": 5380 }, { "epoch": 1.3994282744282744, "grad_norm": 0.3145352005958557, "learning_rate": 3.497920997920998e-05, "loss": 0.2441, "num_input_tokens_seen": 485320, "step": 5385 }, { "epoch": 1.4007276507276507, "grad_norm": 0.43574076890945435, "learning_rate": 3.5011694386694385e-05, "loss": 0.2656, "num_input_tokens_seen": 485784, "step": 5390 }, { "epoch": 1.402027027027027, "grad_norm": 0.7527055144309998, "learning_rate": 3.5044178794178795e-05, "loss": 0.267, "num_input_tokens_seen": 486232, "step": 5395 }, { "epoch": 1.4033264033264032, "grad_norm": 0.5766013860702515, "learning_rate": 3.5076663201663205e-05, "loss": 0.2591, "num_input_tokens_seen": 486696, "step": 5400 }, { "epoch": 1.4046257796257797, "grad_norm": 0.8862217664718628, "learning_rate": 3.5109147609147614e-05, "loss": 0.3116, "num_input_tokens_seen": 487160, "step": 5405 }, { "epoch": 1.405925155925156, "grad_norm": 0.2679983377456665, "learning_rate": 3.514163201663202e-05, "loss": 0.3246, "num_input_tokens_seen": 487592, "step": 5410 }, { "epoch": 1.4072245322245323, "grad_norm": 0.1501082330942154, "learning_rate": 3.517411642411643e-05, "loss": 0.2792, "num_input_tokens_seen": 488024, "step": 5415 }, { "epoch": 1.4085239085239085, "grad_norm": 0.8642001748085022, "learning_rate": 3.5206600831600836e-05, "loss": 0.2985, "num_input_tokens_seen": 488488, "step": 5420 }, { "epoch": 1.4098232848232848, "grad_norm": 0.12770520150661469, "learning_rate": 3.523908523908524e-05, "loss": 0.2858, "num_input_tokens_seen": 488920, "step": 5425 }, { "epoch": 1.411122661122661, "grad_norm": 0.11284149438142776, "learning_rate": 3.527156964656965e-05, "loss": 0.2862, "num_input_tokens_seen": 489352, "step": 5430 }, { "epoch": 1.4124220374220373, "grad_norm": 0.13609781861305237, "learning_rate": 3.530405405405405e-05, "loss": 0.2775, "num_input_tokens_seen": 489816, "step": 5435 }, { "epoch": 1.4137214137214138, "grad_norm": 0.8612658381462097, "learning_rate": 3.533653846153847e-05, "loss": 0.3098, "num_input_tokens_seen": 490264, "step": 5440 }, { "epoch": 1.41502079002079, "grad_norm": 0.2742963135242462, "learning_rate": 3.536902286902287e-05, "loss": 0.271, "num_input_tokens_seen": 490728, "step": 5445 }, { "epoch": 1.4163201663201663, "grad_norm": 0.4290614128112793, "learning_rate": 3.540150727650728e-05, "loss": 0.2188, "num_input_tokens_seen": 491176, "step": 5450 }, { "epoch": 1.4176195426195426, "grad_norm": 0.37411829829216003, "learning_rate": 3.5433991683991684e-05, "loss": 0.2057, "num_input_tokens_seen": 491624, "step": 5455 }, { "epoch": 1.4189189189189189, "grad_norm": 0.6822190284729004, "learning_rate": 3.546647609147609e-05, "loss": 0.2953, "num_input_tokens_seen": 492136, "step": 5460 }, { "epoch": 1.4202182952182953, "grad_norm": 0.5813432335853577, "learning_rate": 3.5498960498960496e-05, "loss": 0.3739, "num_input_tokens_seen": 492600, "step": 5465 }, { "epoch": 1.4215176715176714, "grad_norm": 0.48394593596458435, "learning_rate": 3.553144490644491e-05, "loss": 0.2624, "num_input_tokens_seen": 493016, "step": 5470 }, { "epoch": 1.4228170478170479, "grad_norm": 0.991047739982605, "learning_rate": 3.5563929313929315e-05, "loss": 0.3128, "num_input_tokens_seen": 493480, "step": 5475 }, { "epoch": 1.4241164241164241, "grad_norm": 0.5614458918571472, "learning_rate": 3.5596413721413725e-05, "loss": 0.1976, "num_input_tokens_seen": 493912, "step": 5480 }, { "epoch": 1.4254158004158004, "grad_norm": 0.5348777770996094, "learning_rate": 3.562889812889813e-05, "loss": 0.2302, "num_input_tokens_seen": 494344, "step": 5485 }, { "epoch": 1.4267151767151767, "grad_norm": 0.5129832029342651, "learning_rate": 3.566138253638254e-05, "loss": 0.2221, "num_input_tokens_seen": 494792, "step": 5490 }, { "epoch": 1.428014553014553, "grad_norm": 0.3973398804664612, "learning_rate": 3.569386694386694e-05, "loss": 0.1633, "num_input_tokens_seen": 495224, "step": 5495 }, { "epoch": 1.4293139293139294, "grad_norm": 0.3368563950061798, "learning_rate": 3.572635135135135e-05, "loss": 0.2044, "num_input_tokens_seen": 495720, "step": 5500 }, { "epoch": 1.4306133056133057, "grad_norm": 1.2871404886245728, "learning_rate": 3.575883575883576e-05, "loss": 0.4463, "num_input_tokens_seen": 496152, "step": 5505 }, { "epoch": 1.431912681912682, "grad_norm": 0.49365559220314026, "learning_rate": 3.579132016632017e-05, "loss": 0.2041, "num_input_tokens_seen": 496552, "step": 5510 }, { "epoch": 1.4332120582120582, "grad_norm": 0.4950096309185028, "learning_rate": 3.582380457380458e-05, "loss": 0.2212, "num_input_tokens_seen": 496968, "step": 5515 }, { "epoch": 1.4345114345114345, "grad_norm": 0.40753045678138733, "learning_rate": 3.585628898128898e-05, "loss": 0.3021, "num_input_tokens_seen": 497432, "step": 5520 }, { "epoch": 1.4358108108108107, "grad_norm": 0.40960848331451416, "learning_rate": 3.588877338877339e-05, "loss": 0.2596, "num_input_tokens_seen": 497896, "step": 5525 }, { "epoch": 1.437110187110187, "grad_norm": 0.29907989501953125, "learning_rate": 3.5921257796257795e-05, "loss": 0.2292, "num_input_tokens_seen": 498344, "step": 5530 }, { "epoch": 1.4384095634095635, "grad_norm": 0.48821771144866943, "learning_rate": 3.595374220374221e-05, "loss": 0.1547, "num_input_tokens_seen": 498776, "step": 5535 }, { "epoch": 1.4397089397089398, "grad_norm": 0.3142526149749756, "learning_rate": 3.5986226611226614e-05, "loss": 0.1075, "num_input_tokens_seen": 499240, "step": 5540 }, { "epoch": 1.441008316008316, "grad_norm": 0.25924208760261536, "learning_rate": 3.6018711018711024e-05, "loss": 0.2388, "num_input_tokens_seen": 499736, "step": 5545 }, { "epoch": 1.4423076923076923, "grad_norm": 0.7123463749885559, "learning_rate": 3.6051195426195426e-05, "loss": 0.4022, "num_input_tokens_seen": 500168, "step": 5550 }, { "epoch": 1.4436070686070686, "grad_norm": 0.43459123373031616, "learning_rate": 3.6083679833679836e-05, "loss": 0.3806, "num_input_tokens_seen": 500632, "step": 5555 }, { "epoch": 1.444906444906445, "grad_norm": 1.3254209756851196, "learning_rate": 3.611616424116424e-05, "loss": 0.3119, "num_input_tokens_seen": 501064, "step": 5560 }, { "epoch": 1.446205821205821, "grad_norm": 1.2722039222717285, "learning_rate": 3.6148648648648655e-05, "loss": 0.2346, "num_input_tokens_seen": 501496, "step": 5565 }, { "epoch": 1.4475051975051976, "grad_norm": 4.440818786621094, "learning_rate": 3.618113305613306e-05, "loss": 0.2611, "num_input_tokens_seen": 501976, "step": 5570 }, { "epoch": 1.4488045738045738, "grad_norm": 0.397348552942276, "learning_rate": 3.621361746361747e-05, "loss": 0.2874, "num_input_tokens_seen": 502392, "step": 5575 }, { "epoch": 1.45010395010395, "grad_norm": 0.46366965770721436, "learning_rate": 3.624610187110187e-05, "loss": 0.3838, "num_input_tokens_seen": 502824, "step": 5580 }, { "epoch": 1.4514033264033264, "grad_norm": 0.7166075110435486, "learning_rate": 3.627858627858628e-05, "loss": 0.2304, "num_input_tokens_seen": 503304, "step": 5585 }, { "epoch": 1.4527027027027026, "grad_norm": 0.4607461094856262, "learning_rate": 3.631107068607068e-05, "loss": 0.2205, "num_input_tokens_seen": 503720, "step": 5590 }, { "epoch": 1.4540020790020791, "grad_norm": 0.8742169737815857, "learning_rate": 3.634355509355509e-05, "loss": 0.2235, "num_input_tokens_seen": 504184, "step": 5595 }, { "epoch": 1.4553014553014554, "grad_norm": 0.4286826252937317, "learning_rate": 3.63760395010395e-05, "loss": 0.1587, "num_input_tokens_seen": 504664, "step": 5600 }, { "epoch": 1.4566008316008316, "grad_norm": 0.5801926255226135, "learning_rate": 3.640852390852391e-05, "loss": 0.3989, "num_input_tokens_seen": 505112, "step": 5605 }, { "epoch": 1.457900207900208, "grad_norm": 0.5763977766036987, "learning_rate": 3.6441008316008315e-05, "loss": 0.2794, "num_input_tokens_seen": 505560, "step": 5610 }, { "epoch": 1.4591995841995842, "grad_norm": 0.9526160955429077, "learning_rate": 3.6473492723492725e-05, "loss": 0.275, "num_input_tokens_seen": 505992, "step": 5615 }, { "epoch": 1.4604989604989604, "grad_norm": 0.7298789620399475, "learning_rate": 3.6505977130977134e-05, "loss": 0.2734, "num_input_tokens_seen": 506456, "step": 5620 }, { "epoch": 1.4617983367983367, "grad_norm": 1.0158121585845947, "learning_rate": 3.653846153846154e-05, "loss": 0.2771, "num_input_tokens_seen": 506872, "step": 5625 }, { "epoch": 1.4630977130977132, "grad_norm": 0.4099932610988617, "learning_rate": 3.657094594594595e-05, "loss": 0.3451, "num_input_tokens_seen": 507320, "step": 5630 }, { "epoch": 1.4643970893970895, "grad_norm": 0.7569889426231384, "learning_rate": 3.660343035343036e-05, "loss": 0.3346, "num_input_tokens_seen": 507832, "step": 5635 }, { "epoch": 1.4656964656964657, "grad_norm": 0.7597914338111877, "learning_rate": 3.6635914760914766e-05, "loss": 0.2429, "num_input_tokens_seen": 508296, "step": 5640 }, { "epoch": 1.466995841995842, "grad_norm": 0.8040224313735962, "learning_rate": 3.666839916839917e-05, "loss": 0.2855, "num_input_tokens_seen": 508760, "step": 5645 }, { "epoch": 1.4682952182952183, "grad_norm": 0.5097697377204895, "learning_rate": 3.670088357588358e-05, "loss": 0.3005, "num_input_tokens_seen": 509208, "step": 5650 }, { "epoch": 1.4695945945945945, "grad_norm": 0.4064059555530548, "learning_rate": 3.673336798336798e-05, "loss": 0.2942, "num_input_tokens_seen": 509640, "step": 5655 }, { "epoch": 1.4708939708939708, "grad_norm": 0.6765508055686951, "learning_rate": 3.676585239085239e-05, "loss": 0.2358, "num_input_tokens_seen": 510104, "step": 5660 }, { "epoch": 1.4721933471933473, "grad_norm": 0.5484420657157898, "learning_rate": 3.67983367983368e-05, "loss": 0.2082, "num_input_tokens_seen": 510520, "step": 5665 }, { "epoch": 1.4734927234927235, "grad_norm": 0.4996025562286377, "learning_rate": 3.683082120582121e-05, "loss": 0.3282, "num_input_tokens_seen": 510968, "step": 5670 }, { "epoch": 1.4747920997920998, "grad_norm": 0.4757772386074066, "learning_rate": 3.6863305613305614e-05, "loss": 0.2229, "num_input_tokens_seen": 511400, "step": 5675 }, { "epoch": 1.476091476091476, "grad_norm": 0.3880458474159241, "learning_rate": 3.689579002079002e-05, "loss": 0.1633, "num_input_tokens_seen": 511848, "step": 5680 }, { "epoch": 1.4773908523908523, "grad_norm": 0.6343756318092346, "learning_rate": 3.6928274428274426e-05, "loss": 0.2138, "num_input_tokens_seen": 512312, "step": 5685 }, { "epoch": 1.4786902286902288, "grad_norm": 0.607902467250824, "learning_rate": 3.6960758835758836e-05, "loss": 0.3321, "num_input_tokens_seen": 512760, "step": 5690 }, { "epoch": 1.4799896049896049, "grad_norm": 0.4583757817745209, "learning_rate": 3.6993243243243245e-05, "loss": 0.3374, "num_input_tokens_seen": 513272, "step": 5695 }, { "epoch": 1.4812889812889813, "grad_norm": 0.24758365750312805, "learning_rate": 3.7025727650727655e-05, "loss": 0.2895, "num_input_tokens_seen": 513736, "step": 5700 }, { "epoch": 1.4825883575883576, "grad_norm": 0.18531233072280884, "learning_rate": 3.705821205821206e-05, "loss": 0.2839, "num_input_tokens_seen": 514168, "step": 5705 }, { "epoch": 1.4838877338877339, "grad_norm": 0.1222895085811615, "learning_rate": 3.709069646569647e-05, "loss": 0.2762, "num_input_tokens_seen": 514600, "step": 5710 }, { "epoch": 1.4851871101871101, "grad_norm": 0.8972460627555847, "learning_rate": 3.712318087318088e-05, "loss": 0.2768, "num_input_tokens_seen": 515112, "step": 5715 }, { "epoch": 1.4864864864864864, "grad_norm": 0.9418618083000183, "learning_rate": 3.715566528066528e-05, "loss": 0.2751, "num_input_tokens_seen": 515560, "step": 5720 }, { "epoch": 1.487785862785863, "grad_norm": 0.39803043007850647, "learning_rate": 3.718814968814969e-05, "loss": 0.1696, "num_input_tokens_seen": 515976, "step": 5725 }, { "epoch": 1.4890852390852392, "grad_norm": 0.35725733637809753, "learning_rate": 3.72206340956341e-05, "loss": 0.3319, "num_input_tokens_seen": 516504, "step": 5730 }, { "epoch": 1.4903846153846154, "grad_norm": 0.3276252746582031, "learning_rate": 3.725311850311851e-05, "loss": 0.1943, "num_input_tokens_seen": 516920, "step": 5735 }, { "epoch": 1.4916839916839917, "grad_norm": 0.6144804954528809, "learning_rate": 3.728560291060291e-05, "loss": 0.3327, "num_input_tokens_seen": 517352, "step": 5740 }, { "epoch": 1.492983367983368, "grad_norm": 0.49424681067466736, "learning_rate": 3.731808731808732e-05, "loss": 0.3304, "num_input_tokens_seen": 517816, "step": 5745 }, { "epoch": 1.4942827442827442, "grad_norm": 0.41510045528411865, "learning_rate": 3.7350571725571725e-05, "loss": 0.1735, "num_input_tokens_seen": 518296, "step": 5750 }, { "epoch": 1.4955821205821205, "grad_norm": 0.44305419921875, "learning_rate": 3.7383056133056134e-05, "loss": 0.3286, "num_input_tokens_seen": 518744, "step": 5755 }, { "epoch": 1.496881496881497, "grad_norm": 0.5757625102996826, "learning_rate": 3.7415540540540544e-05, "loss": 0.2308, "num_input_tokens_seen": 519224, "step": 5760 }, { "epoch": 1.4981808731808732, "grad_norm": 0.3715832233428955, "learning_rate": 3.7448024948024953e-05, "loss": 0.281, "num_input_tokens_seen": 519672, "step": 5765 }, { "epoch": 1.4994802494802495, "grad_norm": 0.2869332432746887, "learning_rate": 3.7480509355509356e-05, "loss": 0.3025, "num_input_tokens_seen": 520120, "step": 5770 }, { "epoch": 1.5007796257796258, "grad_norm": 0.8152077794075012, "learning_rate": 3.7512993762993766e-05, "loss": 0.2471, "num_input_tokens_seen": 520552, "step": 5775 }, { "epoch": 1.502079002079002, "grad_norm": 0.6033880710601807, "learning_rate": 3.754547817047817e-05, "loss": 0.2577, "num_input_tokens_seen": 520984, "step": 5780 }, { "epoch": 1.5033783783783785, "grad_norm": 0.5137702226638794, "learning_rate": 3.757796257796258e-05, "loss": 0.228, "num_input_tokens_seen": 521448, "step": 5785 }, { "epoch": 1.5046777546777546, "grad_norm": 1.0382946729660034, "learning_rate": 3.761044698544699e-05, "loss": 0.366, "num_input_tokens_seen": 521896, "step": 5790 }, { "epoch": 1.505977130977131, "grad_norm": 0.43165323138237, "learning_rate": 3.76429313929314e-05, "loss": 0.177, "num_input_tokens_seen": 522344, "step": 5795 }, { "epoch": 1.5072765072765073, "grad_norm": 0.4024536907672882, "learning_rate": 3.76754158004158e-05, "loss": 0.2918, "num_input_tokens_seen": 522760, "step": 5800 }, { "epoch": 1.5085758835758836, "grad_norm": 0.5059704780578613, "learning_rate": 3.770790020790021e-05, "loss": 0.2278, "num_input_tokens_seen": 523240, "step": 5805 }, { "epoch": 1.5098752598752598, "grad_norm": 0.40660157799720764, "learning_rate": 3.774038461538461e-05, "loss": 0.1806, "num_input_tokens_seen": 523688, "step": 5810 }, { "epoch": 1.511174636174636, "grad_norm": 0.3578947186470032, "learning_rate": 3.777286902286902e-05, "loss": 0.2258, "num_input_tokens_seen": 524184, "step": 5815 }, { "epoch": 1.5124740124740126, "grad_norm": 0.5430552959442139, "learning_rate": 3.780535343035343e-05, "loss": 0.3478, "num_input_tokens_seen": 524664, "step": 5820 }, { "epoch": 1.5137733887733886, "grad_norm": 0.8975651264190674, "learning_rate": 3.783783783783784e-05, "loss": 0.3982, "num_input_tokens_seen": 525160, "step": 5825 }, { "epoch": 1.5150727650727651, "grad_norm": 0.3199855089187622, "learning_rate": 3.7870322245322245e-05, "loss": 0.234, "num_input_tokens_seen": 525624, "step": 5830 }, { "epoch": 1.5163721413721414, "grad_norm": 0.3407924473285675, "learning_rate": 3.7902806652806655e-05, "loss": 0.2615, "num_input_tokens_seen": 526056, "step": 5835 }, { "epoch": 1.5176715176715176, "grad_norm": 0.24153843522071838, "learning_rate": 3.7935291060291064e-05, "loss": 0.2927, "num_input_tokens_seen": 526472, "step": 5840 }, { "epoch": 1.5189708939708941, "grad_norm": 0.7238161563873291, "learning_rate": 3.796777546777547e-05, "loss": 0.2867, "num_input_tokens_seen": 526920, "step": 5845 }, { "epoch": 1.5202702702702702, "grad_norm": 0.6994080543518066, "learning_rate": 3.800025987525988e-05, "loss": 0.2623, "num_input_tokens_seen": 527368, "step": 5850 }, { "epoch": 1.5215696465696467, "grad_norm": 0.5064318180084229, "learning_rate": 3.8032744282744287e-05, "loss": 0.1997, "num_input_tokens_seen": 527784, "step": 5855 }, { "epoch": 1.5228690228690227, "grad_norm": 0.5553390383720398, "learning_rate": 3.8065228690228696e-05, "loss": 0.2658, "num_input_tokens_seen": 528264, "step": 5860 }, { "epoch": 1.5241683991683992, "grad_norm": 0.5397229790687561, "learning_rate": 3.80977130977131e-05, "loss": 0.3128, "num_input_tokens_seen": 528696, "step": 5865 }, { "epoch": 1.5254677754677755, "grad_norm": 0.3381687104701996, "learning_rate": 3.813019750519751e-05, "loss": 0.2169, "num_input_tokens_seen": 529128, "step": 5870 }, { "epoch": 1.5267671517671517, "grad_norm": 0.4837164282798767, "learning_rate": 3.816268191268191e-05, "loss": 0.3493, "num_input_tokens_seen": 529576, "step": 5875 }, { "epoch": 1.5280665280665282, "grad_norm": 0.40644827485084534, "learning_rate": 3.819516632016632e-05, "loss": 0.264, "num_input_tokens_seen": 530024, "step": 5880 }, { "epoch": 1.5293659043659042, "grad_norm": 1.1587257385253906, "learning_rate": 3.8227650727650724e-05, "loss": 0.2942, "num_input_tokens_seen": 530488, "step": 5885 }, { "epoch": 1.5306652806652807, "grad_norm": 0.9804296493530273, "learning_rate": 3.826013513513514e-05, "loss": 0.2711, "num_input_tokens_seen": 530920, "step": 5890 }, { "epoch": 1.531964656964657, "grad_norm": 0.6448507905006409, "learning_rate": 3.8292619542619543e-05, "loss": 0.2554, "num_input_tokens_seen": 531352, "step": 5895 }, { "epoch": 1.5332640332640333, "grad_norm": 0.9120684862136841, "learning_rate": 3.832510395010395e-05, "loss": 0.2967, "num_input_tokens_seen": 531784, "step": 5900 }, { "epoch": 1.5345634095634095, "grad_norm": 0.38405096530914307, "learning_rate": 3.8357588357588356e-05, "loss": 0.2415, "num_input_tokens_seen": 532280, "step": 5905 }, { "epoch": 1.5358627858627858, "grad_norm": 0.8761555552482605, "learning_rate": 3.8390072765072766e-05, "loss": 0.2446, "num_input_tokens_seen": 532744, "step": 5910 }, { "epoch": 1.5371621621621623, "grad_norm": 0.43049323558807373, "learning_rate": 3.8422557172557175e-05, "loss": 0.2227, "num_input_tokens_seen": 533208, "step": 5915 }, { "epoch": 1.5384615384615383, "grad_norm": 0.45020970702171326, "learning_rate": 3.8455041580041585e-05, "loss": 0.2638, "num_input_tokens_seen": 533656, "step": 5920 }, { "epoch": 1.5397609147609148, "grad_norm": 0.36757680773735046, "learning_rate": 3.848752598752599e-05, "loss": 0.3044, "num_input_tokens_seen": 534104, "step": 5925 }, { "epoch": 1.541060291060291, "grad_norm": 0.3010944128036499, "learning_rate": 3.85200103950104e-05, "loss": 0.298, "num_input_tokens_seen": 534536, "step": 5930 }, { "epoch": 1.5423596673596673, "grad_norm": 0.29868170619010925, "learning_rate": 3.855249480249481e-05, "loss": 0.2112, "num_input_tokens_seen": 535000, "step": 5935 }, { "epoch": 1.5436590436590436, "grad_norm": 0.500500500202179, "learning_rate": 3.858497920997921e-05, "loss": 0.2796, "num_input_tokens_seen": 535448, "step": 5940 }, { "epoch": 1.5449584199584199, "grad_norm": 0.5505921840667725, "learning_rate": 3.861746361746362e-05, "loss": 0.2315, "num_input_tokens_seen": 535928, "step": 5945 }, { "epoch": 1.5462577962577964, "grad_norm": 0.37176257371902466, "learning_rate": 3.864994802494803e-05, "loss": 0.2575, "num_input_tokens_seen": 536360, "step": 5950 }, { "epoch": 1.5475571725571724, "grad_norm": 0.36234158277511597, "learning_rate": 3.868243243243244e-05, "loss": 0.3245, "num_input_tokens_seen": 536760, "step": 5955 }, { "epoch": 1.5488565488565489, "grad_norm": 0.8218351006507874, "learning_rate": 3.871491683991684e-05, "loss": 0.28, "num_input_tokens_seen": 537176, "step": 5960 }, { "epoch": 1.5501559251559252, "grad_norm": 0.682638943195343, "learning_rate": 3.874740124740125e-05, "loss": 0.3386, "num_input_tokens_seen": 537656, "step": 5965 }, { "epoch": 1.5514553014553014, "grad_norm": 0.138752281665802, "learning_rate": 3.8779885654885654e-05, "loss": 0.2821, "num_input_tokens_seen": 538104, "step": 5970 }, { "epoch": 1.552754677754678, "grad_norm": 0.592427670955658, "learning_rate": 3.8812370062370064e-05, "loss": 0.2541, "num_input_tokens_seen": 538584, "step": 5975 }, { "epoch": 1.554054054054054, "grad_norm": 0.4635239541530609, "learning_rate": 3.884485446985447e-05, "loss": 0.2535, "num_input_tokens_seen": 539064, "step": 5980 }, { "epoch": 1.5553534303534304, "grad_norm": 0.4203489124774933, "learning_rate": 3.8877338877338883e-05, "loss": 0.2164, "num_input_tokens_seen": 539480, "step": 5985 }, { "epoch": 1.5566528066528067, "grad_norm": 0.7944439053535461, "learning_rate": 3.8909823284823286e-05, "loss": 0.2182, "num_input_tokens_seen": 539912, "step": 5990 }, { "epoch": 1.557952182952183, "grad_norm": 0.7717284560203552, "learning_rate": 3.8942307692307696e-05, "loss": 0.3255, "num_input_tokens_seen": 540360, "step": 5995 }, { "epoch": 1.5592515592515592, "grad_norm": 0.43591076135635376, "learning_rate": 3.89747920997921e-05, "loss": 0.273, "num_input_tokens_seen": 540824, "step": 6000 }, { "epoch": 1.5605509355509355, "grad_norm": 0.6678225994110107, "learning_rate": 3.900727650727651e-05, "loss": 0.3186, "num_input_tokens_seen": 541320, "step": 6005 }, { "epoch": 1.561850311850312, "grad_norm": 0.618170976638794, "learning_rate": 3.903976091476091e-05, "loss": 0.2186, "num_input_tokens_seen": 541768, "step": 6010 }, { "epoch": 1.563149688149688, "grad_norm": 0.5657948851585388, "learning_rate": 3.907224532224533e-05, "loss": 0.2006, "num_input_tokens_seen": 542248, "step": 6015 }, { "epoch": 1.5644490644490645, "grad_norm": 0.45394760370254517, "learning_rate": 3.910472972972973e-05, "loss": 0.2073, "num_input_tokens_seen": 542680, "step": 6020 }, { "epoch": 1.5657484407484408, "grad_norm": 0.40495431423187256, "learning_rate": 3.913721413721414e-05, "loss": 0.1431, "num_input_tokens_seen": 543112, "step": 6025 }, { "epoch": 1.567047817047817, "grad_norm": 0.9091681241989136, "learning_rate": 3.916969854469854e-05, "loss": 0.4523, "num_input_tokens_seen": 543576, "step": 6030 }, { "epoch": 1.5683471933471933, "grad_norm": 0.4547826945781708, "learning_rate": 3.920218295218295e-05, "loss": 0.2679, "num_input_tokens_seen": 544040, "step": 6035 }, { "epoch": 1.5696465696465696, "grad_norm": 0.5605474710464478, "learning_rate": 3.923466735966736e-05, "loss": 0.232, "num_input_tokens_seen": 544488, "step": 6040 }, { "epoch": 1.570945945945946, "grad_norm": 0.9667084813117981, "learning_rate": 3.9267151767151765e-05, "loss": 0.3124, "num_input_tokens_seen": 544904, "step": 6045 }, { "epoch": 1.572245322245322, "grad_norm": 0.509564220905304, "learning_rate": 3.929963617463618e-05, "loss": 0.205, "num_input_tokens_seen": 545384, "step": 6050 }, { "epoch": 1.5735446985446986, "grad_norm": 0.4169420599937439, "learning_rate": 3.9332120582120585e-05, "loss": 0.2848, "num_input_tokens_seen": 545880, "step": 6055 }, { "epoch": 1.5748440748440748, "grad_norm": 0.42040830850601196, "learning_rate": 3.9364604989604994e-05, "loss": 0.1883, "num_input_tokens_seen": 546328, "step": 6060 }, { "epoch": 1.5761434511434511, "grad_norm": 0.5103831887245178, "learning_rate": 3.93970893970894e-05, "loss": 0.2584, "num_input_tokens_seen": 546792, "step": 6065 }, { "epoch": 1.5774428274428276, "grad_norm": 0.5066548585891724, "learning_rate": 3.942957380457381e-05, "loss": 0.2689, "num_input_tokens_seen": 547240, "step": 6070 }, { "epoch": 1.5787422037422036, "grad_norm": 0.37254786491394043, "learning_rate": 3.946205821205821e-05, "loss": 0.1108, "num_input_tokens_seen": 547688, "step": 6075 }, { "epoch": 1.5800415800415801, "grad_norm": 0.33056122064590454, "learning_rate": 3.9494542619542626e-05, "loss": 0.1893, "num_input_tokens_seen": 548120, "step": 6080 }, { "epoch": 1.5813409563409564, "grad_norm": 1.3552724123001099, "learning_rate": 3.952702702702703e-05, "loss": 0.318, "num_input_tokens_seen": 548584, "step": 6085 }, { "epoch": 1.5826403326403327, "grad_norm": 0.30109426379203796, "learning_rate": 3.955951143451144e-05, "loss": 0.2628, "num_input_tokens_seen": 549016, "step": 6090 }, { "epoch": 1.583939708939709, "grad_norm": 0.4224948585033417, "learning_rate": 3.959199584199584e-05, "loss": 0.3853, "num_input_tokens_seen": 549448, "step": 6095 }, { "epoch": 1.5852390852390852, "grad_norm": 0.8012471199035645, "learning_rate": 3.962448024948025e-05, "loss": 0.2711, "num_input_tokens_seen": 549928, "step": 6100 }, { "epoch": 1.5865384615384617, "grad_norm": 0.2651899755001068, "learning_rate": 3.9656964656964654e-05, "loss": 0.2834, "num_input_tokens_seen": 550408, "step": 6105 }, { "epoch": 1.5878378378378377, "grad_norm": 0.7092350125312805, "learning_rate": 3.968944906444907e-05, "loss": 0.2721, "num_input_tokens_seen": 550856, "step": 6110 }, { "epoch": 1.5891372141372142, "grad_norm": 0.6861375570297241, "learning_rate": 3.9721933471933473e-05, "loss": 0.2798, "num_input_tokens_seen": 551304, "step": 6115 }, { "epoch": 1.5904365904365905, "grad_norm": 0.6644079685211182, "learning_rate": 3.975441787941788e-05, "loss": 0.2697, "num_input_tokens_seen": 551768, "step": 6120 }, { "epoch": 1.5917359667359667, "grad_norm": 0.3866196274757385, "learning_rate": 3.9786902286902286e-05, "loss": 0.2599, "num_input_tokens_seen": 552216, "step": 6125 }, { "epoch": 1.593035343035343, "grad_norm": 0.5825464129447937, "learning_rate": 3.9819386694386696e-05, "loss": 0.2794, "num_input_tokens_seen": 552680, "step": 6130 }, { "epoch": 1.5943347193347193, "grad_norm": 0.4941476583480835, "learning_rate": 3.9851871101871105e-05, "loss": 0.2241, "num_input_tokens_seen": 553112, "step": 6135 }, { "epoch": 1.5956340956340958, "grad_norm": 0.4557637870311737, "learning_rate": 3.988435550935551e-05, "loss": 0.1794, "num_input_tokens_seen": 553528, "step": 6140 }, { "epoch": 1.5969334719334718, "grad_norm": 0.40234172344207764, "learning_rate": 3.991683991683992e-05, "loss": 0.2787, "num_input_tokens_seen": 553960, "step": 6145 }, { "epoch": 1.5982328482328483, "grad_norm": 0.642802894115448, "learning_rate": 3.994932432432433e-05, "loss": 0.251, "num_input_tokens_seen": 554408, "step": 6150 }, { "epoch": 1.5995322245322245, "grad_norm": 0.39194580912590027, "learning_rate": 3.998180873180874e-05, "loss": 0.2768, "num_input_tokens_seen": 554856, "step": 6155 }, { "epoch": 1.6008316008316008, "grad_norm": 0.46998751163482666, "learning_rate": 4.001429313929314e-05, "loss": 0.2538, "num_input_tokens_seen": 555304, "step": 6160 }, { "epoch": 1.6021309771309773, "grad_norm": 0.42195260524749756, "learning_rate": 4.004677754677755e-05, "loss": 0.2524, "num_input_tokens_seen": 555800, "step": 6165 }, { "epoch": 1.6034303534303533, "grad_norm": 0.4596182107925415, "learning_rate": 4.007926195426195e-05, "loss": 0.2173, "num_input_tokens_seen": 556264, "step": 6170 }, { "epoch": 1.6047297297297298, "grad_norm": 0.41835325956344604, "learning_rate": 4.011174636174637e-05, "loss": 0.3072, "num_input_tokens_seen": 556680, "step": 6175 }, { "epoch": 1.6060291060291059, "grad_norm": 0.38105863332748413, "learning_rate": 4.014423076923077e-05, "loss": 0.2583, "num_input_tokens_seen": 557176, "step": 6180 }, { "epoch": 1.6073284823284824, "grad_norm": 0.5101937055587769, "learning_rate": 4.017671517671518e-05, "loss": 0.1959, "num_input_tokens_seen": 557672, "step": 6185 }, { "epoch": 1.6086278586278586, "grad_norm": 0.3839322626590729, "learning_rate": 4.0209199584199584e-05, "loss": 0.2515, "num_input_tokens_seen": 558136, "step": 6190 }, { "epoch": 1.6099272349272349, "grad_norm": 0.3864680826663971, "learning_rate": 4.0241683991683994e-05, "loss": 0.2582, "num_input_tokens_seen": 558568, "step": 6195 }, { "epoch": 1.6112266112266114, "grad_norm": 0.37225934863090515, "learning_rate": 4.02741683991684e-05, "loss": 0.3152, "num_input_tokens_seen": 559000, "step": 6200 }, { "epoch": 1.6125259875259874, "grad_norm": 0.5072019696235657, "learning_rate": 4.0306652806652807e-05, "loss": 0.2879, "num_input_tokens_seen": 559448, "step": 6205 }, { "epoch": 1.613825363825364, "grad_norm": 0.6197595000267029, "learning_rate": 4.0339137214137216e-05, "loss": 0.2624, "num_input_tokens_seen": 559928, "step": 6210 }, { "epoch": 1.6151247401247402, "grad_norm": 0.6206514239311218, "learning_rate": 4.0371621621621626e-05, "loss": 0.2488, "num_input_tokens_seen": 560360, "step": 6215 }, { "epoch": 1.6164241164241164, "grad_norm": 0.4230825901031494, "learning_rate": 4.040410602910603e-05, "loss": 0.2236, "num_input_tokens_seen": 560792, "step": 6220 }, { "epoch": 1.6177234927234927, "grad_norm": 0.4509347677230835, "learning_rate": 4.043659043659044e-05, "loss": 0.2769, "num_input_tokens_seen": 561240, "step": 6225 }, { "epoch": 1.619022869022869, "grad_norm": 0.872218668460846, "learning_rate": 4.046907484407484e-05, "loss": 0.34, "num_input_tokens_seen": 561688, "step": 6230 }, { "epoch": 1.6203222453222454, "grad_norm": 0.3022949993610382, "learning_rate": 4.050155925155925e-05, "loss": 0.2934, "num_input_tokens_seen": 562120, "step": 6235 }, { "epoch": 1.6216216216216215, "grad_norm": 0.5139979720115662, "learning_rate": 4.053404365904366e-05, "loss": 0.2094, "num_input_tokens_seen": 562632, "step": 6240 }, { "epoch": 1.622920997920998, "grad_norm": 0.45572927594184875, "learning_rate": 4.056652806652807e-05, "loss": 0.2229, "num_input_tokens_seen": 563080, "step": 6245 }, { "epoch": 1.6242203742203742, "grad_norm": 0.37422940135002136, "learning_rate": 4.059901247401248e-05, "loss": 0.2181, "num_input_tokens_seen": 563496, "step": 6250 }, { "epoch": 1.6255197505197505, "grad_norm": 0.32435575127601624, "learning_rate": 4.063149688149688e-05, "loss": 0.1637, "num_input_tokens_seen": 563944, "step": 6255 }, { "epoch": 1.6268191268191268, "grad_norm": 0.2781877815723419, "learning_rate": 4.066398128898129e-05, "loss": 0.2584, "num_input_tokens_seen": 564392, "step": 6260 }, { "epoch": 1.628118503118503, "grad_norm": 1.0782698392868042, "learning_rate": 4.0696465696465695e-05, "loss": 0.3802, "num_input_tokens_seen": 564840, "step": 6265 }, { "epoch": 1.6294178794178795, "grad_norm": 0.38218656182289124, "learning_rate": 4.0728950103950105e-05, "loss": 0.289, "num_input_tokens_seen": 565320, "step": 6270 }, { "epoch": 1.6307172557172556, "grad_norm": 0.4519829750061035, "learning_rate": 4.0761434511434515e-05, "loss": 0.2228, "num_input_tokens_seen": 565784, "step": 6275 }, { "epoch": 1.632016632016632, "grad_norm": 0.3420472741127014, "learning_rate": 4.0793918918918924e-05, "loss": 0.2924, "num_input_tokens_seen": 566216, "step": 6280 }, { "epoch": 1.6333160083160083, "grad_norm": 0.269174188375473, "learning_rate": 4.082640332640333e-05, "loss": 0.2818, "num_input_tokens_seen": 566664, "step": 6285 }, { "epoch": 1.6346153846153846, "grad_norm": 0.5760965347290039, "learning_rate": 4.085888773388774e-05, "loss": 0.2255, "num_input_tokens_seen": 567080, "step": 6290 }, { "epoch": 1.635914760914761, "grad_norm": 0.4537227749824524, "learning_rate": 4.089137214137214e-05, "loss": 0.1989, "num_input_tokens_seen": 567496, "step": 6295 }, { "epoch": 1.637214137214137, "grad_norm": 0.43311893939971924, "learning_rate": 4.092385654885655e-05, "loss": 0.2694, "num_input_tokens_seen": 567912, "step": 6300 }, { "epoch": 1.6385135135135136, "grad_norm": 0.5554285049438477, "learning_rate": 4.095634095634096e-05, "loss": 0.303, "num_input_tokens_seen": 568360, "step": 6305 }, { "epoch": 1.6398128898128899, "grad_norm": 1.0241925716400146, "learning_rate": 4.098882536382537e-05, "loss": 0.3098, "num_input_tokens_seen": 568792, "step": 6310 }, { "epoch": 1.6411122661122661, "grad_norm": 0.5031588077545166, "learning_rate": 4.102130977130977e-05, "loss": 0.2614, "num_input_tokens_seen": 569208, "step": 6315 }, { "epoch": 1.6424116424116424, "grad_norm": 0.5378952026367188, "learning_rate": 4.105379417879418e-05, "loss": 0.2587, "num_input_tokens_seen": 569656, "step": 6320 }, { "epoch": 1.6437110187110187, "grad_norm": 0.3073250651359558, "learning_rate": 4.1086278586278584e-05, "loss": 0.2597, "num_input_tokens_seen": 570088, "step": 6325 }, { "epoch": 1.6450103950103951, "grad_norm": 0.564811646938324, "learning_rate": 4.1118762993762994e-05, "loss": 0.2461, "num_input_tokens_seen": 570536, "step": 6330 }, { "epoch": 1.6463097713097712, "grad_norm": 0.573776364326477, "learning_rate": 4.11512474012474e-05, "loss": 0.1996, "num_input_tokens_seen": 571000, "step": 6335 }, { "epoch": 1.6476091476091477, "grad_norm": 0.4511926472187042, "learning_rate": 4.118373180873181e-05, "loss": 0.2039, "num_input_tokens_seen": 571496, "step": 6340 }, { "epoch": 1.648908523908524, "grad_norm": 0.335523396730423, "learning_rate": 4.1216216216216216e-05, "loss": 0.1572, "num_input_tokens_seen": 571928, "step": 6345 }, { "epoch": 1.6502079002079002, "grad_norm": 0.24608729779720306, "learning_rate": 4.1248700623700626e-05, "loss": 0.1628, "num_input_tokens_seen": 572408, "step": 6350 }, { "epoch": 1.6515072765072765, "grad_norm": 0.23900866508483887, "learning_rate": 4.1281185031185035e-05, "loss": 0.1493, "num_input_tokens_seen": 572840, "step": 6355 }, { "epoch": 1.6528066528066527, "grad_norm": 1.1871956586837769, "learning_rate": 4.131366943866944e-05, "loss": 0.441, "num_input_tokens_seen": 573304, "step": 6360 }, { "epoch": 1.6541060291060292, "grad_norm": 1.029726505279541, "learning_rate": 4.134615384615385e-05, "loss": 0.3477, "num_input_tokens_seen": 573736, "step": 6365 }, { "epoch": 1.6554054054054053, "grad_norm": 5.980270862579346, "learning_rate": 4.137863825363826e-05, "loss": 0.3636, "num_input_tokens_seen": 574152, "step": 6370 }, { "epoch": 1.6567047817047817, "grad_norm": 4.465409278869629, "learning_rate": 4.141112266112267e-05, "loss": 0.3726, "num_input_tokens_seen": 574616, "step": 6375 }, { "epoch": 1.658004158004158, "grad_norm": 1.658638834953308, "learning_rate": 4.144360706860707e-05, "loss": 0.2792, "num_input_tokens_seen": 575064, "step": 6380 }, { "epoch": 1.6593035343035343, "grad_norm": 0.7959581613540649, "learning_rate": 4.147609147609148e-05, "loss": 0.284, "num_input_tokens_seen": 575512, "step": 6385 }, { "epoch": 1.6606029106029108, "grad_norm": 14.894265174865723, "learning_rate": 4.150857588357588e-05, "loss": 0.5268, "num_input_tokens_seen": 575960, "step": 6390 }, { "epoch": 1.6619022869022868, "grad_norm": 5.325194358825684, "learning_rate": 4.154106029106029e-05, "loss": 0.5131, "num_input_tokens_seen": 576392, "step": 6395 }, { "epoch": 1.6632016632016633, "grad_norm": 8.48084831237793, "learning_rate": 4.15735446985447e-05, "loss": 0.297, "num_input_tokens_seen": 576840, "step": 6400 }, { "epoch": 1.6645010395010393, "grad_norm": 0.07369343936443329, "learning_rate": 4.160602910602911e-05, "loss": 0.2625, "num_input_tokens_seen": 577320, "step": 6405 }, { "epoch": 1.6658004158004158, "grad_norm": 0.678521454334259, "learning_rate": 4.1638513513513514e-05, "loss": 0.3858, "num_input_tokens_seen": 577768, "step": 6410 }, { "epoch": 1.667099792099792, "grad_norm": 1.4991718530654907, "learning_rate": 4.1670997920997924e-05, "loss": 0.665, "num_input_tokens_seen": 578232, "step": 6415 }, { "epoch": 1.6683991683991684, "grad_norm": 3.8964452743530273, "learning_rate": 4.170348232848233e-05, "loss": 0.4043, "num_input_tokens_seen": 578664, "step": 6420 }, { "epoch": 1.6696985446985448, "grad_norm": 0.646805465221405, "learning_rate": 4.1735966735966736e-05, "loss": 0.2675, "num_input_tokens_seen": 579160, "step": 6425 }, { "epoch": 1.6709979209979209, "grad_norm": 0.7362461090087891, "learning_rate": 4.176845114345114e-05, "loss": 0.2689, "num_input_tokens_seen": 579576, "step": 6430 }, { "epoch": 1.6722972972972974, "grad_norm": 0.7703577876091003, "learning_rate": 4.1800935550935556e-05, "loss": 0.2325, "num_input_tokens_seen": 580024, "step": 6435 }, { "epoch": 1.6735966735966736, "grad_norm": 0.4802606999874115, "learning_rate": 4.183341995841996e-05, "loss": 0.2141, "num_input_tokens_seen": 580504, "step": 6440 }, { "epoch": 1.67489604989605, "grad_norm": 0.653410792350769, "learning_rate": 4.186590436590437e-05, "loss": 0.2221, "num_input_tokens_seen": 580936, "step": 6445 }, { "epoch": 1.6761954261954262, "grad_norm": 0.5814205408096313, "learning_rate": 4.189838877338878e-05, "loss": 0.3093, "num_input_tokens_seen": 581384, "step": 6450 }, { "epoch": 1.6774948024948024, "grad_norm": 0.5543256998062134, "learning_rate": 4.193087318087318e-05, "loss": 0.2269, "num_input_tokens_seen": 581816, "step": 6455 }, { "epoch": 1.678794178794179, "grad_norm": 0.3987285792827606, "learning_rate": 4.196335758835759e-05, "loss": 0.2137, "num_input_tokens_seen": 582280, "step": 6460 }, { "epoch": 1.680093555093555, "grad_norm": 1.0678032636642456, "learning_rate": 4.1995841995842e-05, "loss": 0.3475, "num_input_tokens_seen": 582696, "step": 6465 }, { "epoch": 1.6813929313929314, "grad_norm": 0.48584139347076416, "learning_rate": 4.202832640332641e-05, "loss": 0.1379, "num_input_tokens_seen": 583144, "step": 6470 }, { "epoch": 1.6826923076923077, "grad_norm": 0.5057379007339478, "learning_rate": 4.206081081081081e-05, "loss": 0.3228, "num_input_tokens_seen": 583608, "step": 6475 }, { "epoch": 1.683991683991684, "grad_norm": 0.4971606135368347, "learning_rate": 4.209329521829522e-05, "loss": 0.2558, "num_input_tokens_seen": 584088, "step": 6480 }, { "epoch": 1.6852910602910602, "grad_norm": 0.4895883798599243, "learning_rate": 4.2125779625779625e-05, "loss": 0.2461, "num_input_tokens_seen": 584568, "step": 6485 }, { "epoch": 1.6865904365904365, "grad_norm": 0.6464132070541382, "learning_rate": 4.2158264033264035e-05, "loss": 0.325, "num_input_tokens_seen": 585032, "step": 6490 }, { "epoch": 1.687889812889813, "grad_norm": 0.8420343399047852, "learning_rate": 4.2190748440748445e-05, "loss": 0.267, "num_input_tokens_seen": 585432, "step": 6495 }, { "epoch": 1.689189189189189, "grad_norm": 0.8098533749580383, "learning_rate": 4.2223232848232854e-05, "loss": 0.2656, "num_input_tokens_seen": 585864, "step": 6500 }, { "epoch": 1.6904885654885655, "grad_norm": 0.8858453631401062, "learning_rate": 4.225571725571726e-05, "loss": 0.2717, "num_input_tokens_seen": 586312, "step": 6505 }, { "epoch": 1.6917879417879418, "grad_norm": 0.5440831184387207, "learning_rate": 4.228820166320167e-05, "loss": 0.2291, "num_input_tokens_seen": 586776, "step": 6510 }, { "epoch": 1.693087318087318, "grad_norm": 1.0629585981369019, "learning_rate": 4.232068607068607e-05, "loss": 0.3184, "num_input_tokens_seen": 587240, "step": 6515 }, { "epoch": 1.6943866943866945, "grad_norm": 0.35506781935691833, "learning_rate": 4.235317047817048e-05, "loss": 0.2161, "num_input_tokens_seen": 587688, "step": 6520 }, { "epoch": 1.6956860706860706, "grad_norm": 0.47696223855018616, "learning_rate": 4.238565488565488e-05, "loss": 0.2783, "num_input_tokens_seen": 588104, "step": 6525 }, { "epoch": 1.696985446985447, "grad_norm": 0.4930365979671478, "learning_rate": 4.24181392931393e-05, "loss": 0.3159, "num_input_tokens_seen": 588584, "step": 6530 }, { "epoch": 1.6982848232848233, "grad_norm": 0.6919560432434082, "learning_rate": 4.24506237006237e-05, "loss": 0.2799, "num_input_tokens_seen": 589032, "step": 6535 }, { "epoch": 1.6995841995841996, "grad_norm": 0.649807870388031, "learning_rate": 4.248310810810811e-05, "loss": 0.2831, "num_input_tokens_seen": 589448, "step": 6540 }, { "epoch": 1.7008835758835759, "grad_norm": 0.49222898483276367, "learning_rate": 4.2515592515592514e-05, "loss": 0.2408, "num_input_tokens_seen": 589912, "step": 6545 }, { "epoch": 1.7021829521829521, "grad_norm": 0.40037956833839417, "learning_rate": 4.2548076923076924e-05, "loss": 0.3145, "num_input_tokens_seen": 590376, "step": 6550 }, { "epoch": 1.7034823284823286, "grad_norm": 0.5080562829971313, "learning_rate": 4.258056133056133e-05, "loss": 0.242, "num_input_tokens_seen": 590824, "step": 6555 }, { "epoch": 1.7047817047817047, "grad_norm": 0.47001248598098755, "learning_rate": 4.261304573804574e-05, "loss": 0.1913, "num_input_tokens_seen": 591272, "step": 6560 }, { "epoch": 1.7060810810810811, "grad_norm": 0.4519476592540741, "learning_rate": 4.264553014553015e-05, "loss": 0.2301, "num_input_tokens_seen": 591752, "step": 6565 }, { "epoch": 1.7073804573804574, "grad_norm": 0.5765719413757324, "learning_rate": 4.2678014553014555e-05, "loss": 0.1563, "num_input_tokens_seen": 592168, "step": 6570 }, { "epoch": 1.7086798336798337, "grad_norm": 0.23938877880573273, "learning_rate": 4.2710498960498965e-05, "loss": 0.1394, "num_input_tokens_seen": 592648, "step": 6575 }, { "epoch": 1.70997920997921, "grad_norm": 0.771944522857666, "learning_rate": 4.274298336798337e-05, "loss": 0.2536, "num_input_tokens_seen": 593096, "step": 6580 }, { "epoch": 1.7112785862785862, "grad_norm": 0.6895413398742676, "learning_rate": 4.277546777546778e-05, "loss": 0.2829, "num_input_tokens_seen": 593496, "step": 6585 }, { "epoch": 1.7125779625779627, "grad_norm": 0.28154832124710083, "learning_rate": 4.280795218295218e-05, "loss": 0.3012, "num_input_tokens_seen": 593928, "step": 6590 }, { "epoch": 1.7138773388773387, "grad_norm": 0.35458317399024963, "learning_rate": 4.28404365904366e-05, "loss": 0.2869, "num_input_tokens_seen": 594360, "step": 6595 }, { "epoch": 1.7151767151767152, "grad_norm": 0.37347105145454407, "learning_rate": 4.2872920997921e-05, "loss": 0.2642, "num_input_tokens_seen": 594776, "step": 6600 }, { "epoch": 1.7164760914760915, "grad_norm": 0.29778769612312317, "learning_rate": 4.290540540540541e-05, "loss": 0.2519, "num_input_tokens_seen": 595192, "step": 6605 }, { "epoch": 1.7177754677754677, "grad_norm": 0.2920040488243103, "learning_rate": 4.293788981288981e-05, "loss": 0.2957, "num_input_tokens_seen": 595624, "step": 6610 }, { "epoch": 1.7190748440748442, "grad_norm": 0.6845489144325256, "learning_rate": 4.297037422037422e-05, "loss": 0.2801, "num_input_tokens_seen": 596056, "step": 6615 }, { "epoch": 1.7203742203742203, "grad_norm": 0.34828802943229675, "learning_rate": 4.3002858627858625e-05, "loss": 0.2564, "num_input_tokens_seen": 596520, "step": 6620 }, { "epoch": 1.7216735966735968, "grad_norm": 0.4960200786590576, "learning_rate": 4.303534303534304e-05, "loss": 0.2176, "num_input_tokens_seen": 596968, "step": 6625 }, { "epoch": 1.722972972972973, "grad_norm": 0.478447824716568, "learning_rate": 4.3067827442827444e-05, "loss": 0.2715, "num_input_tokens_seen": 597432, "step": 6630 }, { "epoch": 1.7242723492723493, "grad_norm": 0.4739665389060974, "learning_rate": 4.3100311850311854e-05, "loss": 0.266, "num_input_tokens_seen": 597896, "step": 6635 }, { "epoch": 1.7255717255717256, "grad_norm": 0.37021881341934204, "learning_rate": 4.313279625779626e-05, "loss": 0.254, "num_input_tokens_seen": 598344, "step": 6640 }, { "epoch": 1.7268711018711018, "grad_norm": 0.4743438959121704, "learning_rate": 4.3165280665280666e-05, "loss": 0.3156, "num_input_tokens_seen": 598792, "step": 6645 }, { "epoch": 1.7281704781704783, "grad_norm": 0.5096452236175537, "learning_rate": 4.3197765072765076e-05, "loss": 0.243, "num_input_tokens_seen": 599208, "step": 6650 }, { "epoch": 1.7294698544698544, "grad_norm": 0.968651294708252, "learning_rate": 4.3230249480249486e-05, "loss": 0.2725, "num_input_tokens_seen": 599672, "step": 6655 }, { "epoch": 1.7307692307692308, "grad_norm": 0.3512127995491028, "learning_rate": 4.326273388773389e-05, "loss": 0.2588, "num_input_tokens_seen": 600120, "step": 6660 }, { "epoch": 1.732068607068607, "grad_norm": 0.6217081546783447, "learning_rate": 4.32952182952183e-05, "loss": 0.2464, "num_input_tokens_seen": 600568, "step": 6665 }, { "epoch": 1.7333679833679834, "grad_norm": 0.9104228615760803, "learning_rate": 4.332770270270271e-05, "loss": 0.2872, "num_input_tokens_seen": 601016, "step": 6670 }, { "epoch": 1.7346673596673596, "grad_norm": 0.40955835580825806, "learning_rate": 4.336018711018711e-05, "loss": 0.2177, "num_input_tokens_seen": 601496, "step": 6675 }, { "epoch": 1.735966735966736, "grad_norm": 0.355857253074646, "learning_rate": 4.339267151767152e-05, "loss": 0.16, "num_input_tokens_seen": 601928, "step": 6680 }, { "epoch": 1.7372661122661124, "grad_norm": 0.32837769389152527, "learning_rate": 4.342515592515592e-05, "loss": 0.2566, "num_input_tokens_seen": 602376, "step": 6685 }, { "epoch": 1.7385654885654884, "grad_norm": 0.3079654276371002, "learning_rate": 4.345764033264034e-05, "loss": 0.2159, "num_input_tokens_seen": 602824, "step": 6690 }, { "epoch": 1.739864864864865, "grad_norm": 0.3233065903186798, "learning_rate": 4.349012474012474e-05, "loss": 0.2547, "num_input_tokens_seen": 603256, "step": 6695 }, { "epoch": 1.7411642411642412, "grad_norm": 0.6049924492835999, "learning_rate": 4.352260914760915e-05, "loss": 0.313, "num_input_tokens_seen": 603720, "step": 6700 }, { "epoch": 1.7424636174636174, "grad_norm": 0.41496530175209045, "learning_rate": 4.3555093555093555e-05, "loss": 0.2159, "num_input_tokens_seen": 604168, "step": 6705 }, { "epoch": 1.743762993762994, "grad_norm": 0.43084391951560974, "learning_rate": 4.3587577962577965e-05, "loss": 0.1784, "num_input_tokens_seen": 604600, "step": 6710 }, { "epoch": 1.74506237006237, "grad_norm": 0.5139052271842957, "learning_rate": 4.362006237006237e-05, "loss": 0.1672, "num_input_tokens_seen": 605048, "step": 6715 }, { "epoch": 1.7463617463617465, "grad_norm": 0.4953571856021881, "learning_rate": 4.3652546777546784e-05, "loss": 0.3436, "num_input_tokens_seen": 605512, "step": 6720 }, { "epoch": 1.7476611226611225, "grad_norm": 0.3662929832935333, "learning_rate": 4.368503118503119e-05, "loss": 0.1596, "num_input_tokens_seen": 605992, "step": 6725 }, { "epoch": 1.748960498960499, "grad_norm": 0.29652151465415955, "learning_rate": 4.37175155925156e-05, "loss": 0.2122, "num_input_tokens_seen": 606408, "step": 6730 }, { "epoch": 1.7502598752598753, "grad_norm": 0.3937038779258728, "learning_rate": 4.375e-05, "loss": 0.2976, "num_input_tokens_seen": 606888, "step": 6735 }, { "epoch": 1.7515592515592515, "grad_norm": 0.3454665541648865, "learning_rate": 4.378248440748441e-05, "loss": 0.2298, "num_input_tokens_seen": 607320, "step": 6740 }, { "epoch": 1.752858627858628, "grad_norm": 0.39002490043640137, "learning_rate": 4.381496881496881e-05, "loss": 0.1715, "num_input_tokens_seen": 607752, "step": 6745 }, { "epoch": 1.754158004158004, "grad_norm": 0.3085915148258209, "learning_rate": 4.384745322245322e-05, "loss": 0.2174, "num_input_tokens_seen": 608168, "step": 6750 }, { "epoch": 1.7554573804573805, "grad_norm": 0.9984143376350403, "learning_rate": 4.387993762993763e-05, "loss": 0.2897, "num_input_tokens_seen": 608600, "step": 6755 }, { "epoch": 1.7567567567567568, "grad_norm": 0.4714805781841278, "learning_rate": 4.391242203742204e-05, "loss": 0.2663, "num_input_tokens_seen": 609064, "step": 6760 }, { "epoch": 1.758056133056133, "grad_norm": 0.3506949841976166, "learning_rate": 4.394490644490645e-05, "loss": 0.3516, "num_input_tokens_seen": 609496, "step": 6765 }, { "epoch": 1.7593555093555093, "grad_norm": 0.2832396924495697, "learning_rate": 4.3977390852390854e-05, "loss": 0.2558, "num_input_tokens_seen": 609944, "step": 6770 }, { "epoch": 1.7606548856548856, "grad_norm": 0.6047461628913879, "learning_rate": 4.400987525987526e-05, "loss": 0.2498, "num_input_tokens_seen": 610424, "step": 6775 }, { "epoch": 1.761954261954262, "grad_norm": 0.8163511157035828, "learning_rate": 4.4042359667359666e-05, "loss": 0.3431, "num_input_tokens_seen": 610888, "step": 6780 }, { "epoch": 1.7632536382536381, "grad_norm": 0.5397114753723145, "learning_rate": 4.407484407484408e-05, "loss": 0.2829, "num_input_tokens_seen": 611304, "step": 6785 }, { "epoch": 1.7645530145530146, "grad_norm": 0.5187711715698242, "learning_rate": 4.4107328482328485e-05, "loss": 0.2072, "num_input_tokens_seen": 611736, "step": 6790 }, { "epoch": 1.7658523908523909, "grad_norm": 0.38181471824645996, "learning_rate": 4.4139812889812895e-05, "loss": 0.2461, "num_input_tokens_seen": 612200, "step": 6795 }, { "epoch": 1.7671517671517671, "grad_norm": 0.37668517231941223, "learning_rate": 4.41722972972973e-05, "loss": 0.261, "num_input_tokens_seen": 612648, "step": 6800 }, { "epoch": 1.7684511434511434, "grad_norm": 0.3961270749568939, "learning_rate": 4.420478170478171e-05, "loss": 0.3425, "num_input_tokens_seen": 613080, "step": 6805 }, { "epoch": 1.7697505197505197, "grad_norm": 0.3754716217517853, "learning_rate": 4.423726611226611e-05, "loss": 0.1757, "num_input_tokens_seen": 613480, "step": 6810 }, { "epoch": 1.7710498960498962, "grad_norm": 0.3983878195285797, "learning_rate": 4.426975051975052e-05, "loss": 0.2254, "num_input_tokens_seen": 613912, "step": 6815 }, { "epoch": 1.7723492723492722, "grad_norm": 0.3006362020969391, "learning_rate": 4.430223492723493e-05, "loss": 0.1624, "num_input_tokens_seen": 614344, "step": 6820 }, { "epoch": 1.7736486486486487, "grad_norm": 0.5488753318786621, "learning_rate": 4.433471933471934e-05, "loss": 0.1704, "num_input_tokens_seen": 614808, "step": 6825 }, { "epoch": 1.774948024948025, "grad_norm": 0.21263372898101807, "learning_rate": 4.436720374220374e-05, "loss": 0.2207, "num_input_tokens_seen": 615208, "step": 6830 }, { "epoch": 1.7762474012474012, "grad_norm": 0.5416462421417236, "learning_rate": 4.439968814968815e-05, "loss": 0.3016, "num_input_tokens_seen": 615656, "step": 6835 }, { "epoch": 1.7775467775467777, "grad_norm": 0.2683987021446228, "learning_rate": 4.4432172557172555e-05, "loss": 0.2815, "num_input_tokens_seen": 616088, "step": 6840 }, { "epoch": 1.7788461538461537, "grad_norm": 0.3663121163845062, "learning_rate": 4.4464656964656965e-05, "loss": 0.3066, "num_input_tokens_seen": 616520, "step": 6845 }, { "epoch": 1.7801455301455302, "grad_norm": 0.3354509174823761, "learning_rate": 4.4497141372141374e-05, "loss": 0.1796, "num_input_tokens_seen": 616968, "step": 6850 }, { "epoch": 1.7814449064449065, "grad_norm": 0.46686917543411255, "learning_rate": 4.4529625779625784e-05, "loss": 0.307, "num_input_tokens_seen": 617432, "step": 6855 }, { "epoch": 1.7827442827442828, "grad_norm": 0.7939720153808594, "learning_rate": 4.456211018711019e-05, "loss": 0.325, "num_input_tokens_seen": 617880, "step": 6860 }, { "epoch": 1.784043659043659, "grad_norm": 0.15063777565956116, "learning_rate": 4.4594594594594596e-05, "loss": 0.2848, "num_input_tokens_seen": 618392, "step": 6865 }, { "epoch": 1.7853430353430353, "grad_norm": 0.13406141102313995, "learning_rate": 4.4627079002079006e-05, "loss": 0.2816, "num_input_tokens_seen": 618856, "step": 6870 }, { "epoch": 1.7866424116424118, "grad_norm": 1.0966615676879883, "learning_rate": 4.465956340956341e-05, "loss": 0.3671, "num_input_tokens_seen": 619288, "step": 6875 }, { "epoch": 1.7879417879417878, "grad_norm": 0.12511567771434784, "learning_rate": 4.469204781704782e-05, "loss": 0.272, "num_input_tokens_seen": 619752, "step": 6880 }, { "epoch": 1.7892411642411643, "grad_norm": 0.6281819343566895, "learning_rate": 4.472453222453223e-05, "loss": 0.2821, "num_input_tokens_seen": 620168, "step": 6885 }, { "epoch": 1.7905405405405406, "grad_norm": 0.24151155352592468, "learning_rate": 4.475701663201664e-05, "loss": 0.254, "num_input_tokens_seen": 620616, "step": 6890 }, { "epoch": 1.7918399168399168, "grad_norm": 0.32666248083114624, "learning_rate": 4.478950103950104e-05, "loss": 0.2268, "num_input_tokens_seen": 621064, "step": 6895 }, { "epoch": 1.793139293139293, "grad_norm": 0.4542793929576874, "learning_rate": 4.482198544698545e-05, "loss": 0.2516, "num_input_tokens_seen": 621528, "step": 6900 }, { "epoch": 1.7944386694386694, "grad_norm": 0.36276549100875854, "learning_rate": 4.485446985446985e-05, "loss": 0.4098, "num_input_tokens_seen": 621992, "step": 6905 }, { "epoch": 1.7957380457380459, "grad_norm": 0.4974256753921509, "learning_rate": 4.488695426195426e-05, "loss": 0.1987, "num_input_tokens_seen": 622424, "step": 6910 }, { "epoch": 1.797037422037422, "grad_norm": 0.4945085048675537, "learning_rate": 4.491943866943867e-05, "loss": 0.2905, "num_input_tokens_seen": 622872, "step": 6915 }, { "epoch": 1.7983367983367984, "grad_norm": 0.40273284912109375, "learning_rate": 4.495192307692308e-05, "loss": 0.2643, "num_input_tokens_seen": 623336, "step": 6920 }, { "epoch": 1.7996361746361746, "grad_norm": 0.5506315231323242, "learning_rate": 4.4984407484407485e-05, "loss": 0.2684, "num_input_tokens_seen": 623816, "step": 6925 }, { "epoch": 1.800935550935551, "grad_norm": 0.6273744106292725, "learning_rate": 4.5016891891891895e-05, "loss": 0.2235, "num_input_tokens_seen": 624312, "step": 6930 }, { "epoch": 1.8022349272349274, "grad_norm": 0.8170303106307983, "learning_rate": 4.50493762993763e-05, "loss": 0.1828, "num_input_tokens_seen": 624760, "step": 6935 }, { "epoch": 1.8035343035343034, "grad_norm": 0.3057401180267334, "learning_rate": 4.508186070686071e-05, "loss": 0.0461, "num_input_tokens_seen": 625224, "step": 6940 }, { "epoch": 1.80483367983368, "grad_norm": 0.11492546647787094, "learning_rate": 4.511434511434512e-05, "loss": 0.498, "num_input_tokens_seen": 625656, "step": 6945 }, { "epoch": 1.806133056133056, "grad_norm": 0.2342827171087265, "learning_rate": 4.514682952182953e-05, "loss": 0.3447, "num_input_tokens_seen": 626104, "step": 6950 }, { "epoch": 1.8074324324324325, "grad_norm": 3.8027501106262207, "learning_rate": 4.517931392931393e-05, "loss": 0.4269, "num_input_tokens_seen": 626552, "step": 6955 }, { "epoch": 1.8087318087318087, "grad_norm": 30.25678062438965, "learning_rate": 4.521179833679834e-05, "loss": 0.3787, "num_input_tokens_seen": 626984, "step": 6960 }, { "epoch": 1.810031185031185, "grad_norm": 9.779452323913574, "learning_rate": 4.524428274428275e-05, "loss": 0.5512, "num_input_tokens_seen": 627416, "step": 6965 }, { "epoch": 1.8113305613305615, "grad_norm": 7.52706241607666, "learning_rate": 4.527676715176715e-05, "loss": 0.3317, "num_input_tokens_seen": 627864, "step": 6970 }, { "epoch": 1.8126299376299375, "grad_norm": 3.625094175338745, "learning_rate": 4.530925155925156e-05, "loss": 0.5403, "num_input_tokens_seen": 628328, "step": 6975 }, { "epoch": 1.813929313929314, "grad_norm": 0.9191709756851196, "learning_rate": 4.534173596673597e-05, "loss": 0.3437, "num_input_tokens_seen": 628776, "step": 6980 }, { "epoch": 1.8152286902286903, "grad_norm": 21.492464065551758, "learning_rate": 4.537422037422038e-05, "loss": 1.1302, "num_input_tokens_seen": 629208, "step": 6985 }, { "epoch": 1.8165280665280665, "grad_norm": 1.3536173105239868, "learning_rate": 4.5406704781704784e-05, "loss": 0.277, "num_input_tokens_seen": 629672, "step": 6990 }, { "epoch": 1.8178274428274428, "grad_norm": 0.64480060338974, "learning_rate": 4.543918918918919e-05, "loss": 0.2799, "num_input_tokens_seen": 630120, "step": 6995 }, { "epoch": 1.819126819126819, "grad_norm": 0.3278752863407135, "learning_rate": 4.5471673596673596e-05, "loss": 0.2735, "num_input_tokens_seen": 630584, "step": 7000 }, { "epoch": 1.8204261954261955, "grad_norm": 0.6839632391929626, "learning_rate": 4.5504158004158006e-05, "loss": 0.3188, "num_input_tokens_seen": 631032, "step": 7005 }, { "epoch": 1.8217255717255716, "grad_norm": 0.8022751212120056, "learning_rate": 4.5536642411642415e-05, "loss": 0.1845, "num_input_tokens_seen": 631464, "step": 7010 }, { "epoch": 1.823024948024948, "grad_norm": 0.9894481897354126, "learning_rate": 4.5569126819126825e-05, "loss": 0.1565, "num_input_tokens_seen": 631928, "step": 7015 }, { "epoch": 1.8243243243243243, "grad_norm": 0.7219380736351013, "learning_rate": 4.560161122661123e-05, "loss": 0.4041, "num_input_tokens_seen": 632408, "step": 7020 }, { "epoch": 1.8256237006237006, "grad_norm": 2.121323823928833, "learning_rate": 4.563409563409564e-05, "loss": 0.2856, "num_input_tokens_seen": 632872, "step": 7025 }, { "epoch": 1.8269230769230769, "grad_norm": 17.73909568786621, "learning_rate": 4.566658004158004e-05, "loss": 0.5452, "num_input_tokens_seen": 633336, "step": 7030 }, { "epoch": 1.8282224532224531, "grad_norm": 0.9586365818977356, "learning_rate": 4.569906444906445e-05, "loss": 0.2883, "num_input_tokens_seen": 633832, "step": 7035 }, { "epoch": 1.8295218295218296, "grad_norm": 1.0976169109344482, "learning_rate": 4.573154885654886e-05, "loss": 0.2844, "num_input_tokens_seen": 634312, "step": 7040 }, { "epoch": 1.8308212058212057, "grad_norm": 0.5030800104141235, "learning_rate": 4.576403326403327e-05, "loss": 0.2918, "num_input_tokens_seen": 634776, "step": 7045 }, { "epoch": 1.8321205821205822, "grad_norm": 0.3837577700614929, "learning_rate": 4.579651767151767e-05, "loss": 0.3212, "num_input_tokens_seen": 635256, "step": 7050 }, { "epoch": 1.8334199584199584, "grad_norm": 0.5287837386131287, "learning_rate": 4.582900207900208e-05, "loss": 0.22, "num_input_tokens_seen": 635736, "step": 7055 }, { "epoch": 1.8347193347193347, "grad_norm": 0.6110936403274536, "learning_rate": 4.5861486486486485e-05, "loss": 0.2571, "num_input_tokens_seen": 636184, "step": 7060 }, { "epoch": 1.8360187110187112, "grad_norm": 0.6106201410293579, "learning_rate": 4.5893970893970894e-05, "loss": 0.2779, "num_input_tokens_seen": 636632, "step": 7065 }, { "epoch": 1.8373180873180872, "grad_norm": 0.6161503195762634, "learning_rate": 4.5926455301455304e-05, "loss": 0.2578, "num_input_tokens_seen": 637064, "step": 7070 }, { "epoch": 1.8386174636174637, "grad_norm": 0.34748926758766174, "learning_rate": 4.5958939708939714e-05, "loss": 0.3023, "num_input_tokens_seen": 637512, "step": 7075 }, { "epoch": 1.83991683991684, "grad_norm": 0.49710023403167725, "learning_rate": 4.599142411642412e-05, "loss": 0.1988, "num_input_tokens_seen": 637960, "step": 7080 }, { "epoch": 1.8412162162162162, "grad_norm": 0.34826064109802246, "learning_rate": 4.6023908523908526e-05, "loss": 0.4236, "num_input_tokens_seen": 638376, "step": 7085 }, { "epoch": 1.8425155925155925, "grad_norm": 0.11324219405651093, "learning_rate": 4.6056392931392936e-05, "loss": 0.2842, "num_input_tokens_seen": 638792, "step": 7090 }, { "epoch": 1.8438149688149688, "grad_norm": 0.23980295658111572, "learning_rate": 4.608887733887734e-05, "loss": 0.2842, "num_input_tokens_seen": 639224, "step": 7095 }, { "epoch": 1.8451143451143452, "grad_norm": 0.9576981663703918, "learning_rate": 4.612136174636175e-05, "loss": 0.2449, "num_input_tokens_seen": 639640, "step": 7100 }, { "epoch": 1.8464137214137213, "grad_norm": 0.2860308289527893, "learning_rate": 4.615384615384616e-05, "loss": 0.2079, "num_input_tokens_seen": 640088, "step": 7105 }, { "epoch": 1.8477130977130978, "grad_norm": 0.5457742810249329, "learning_rate": 4.618633056133057e-05, "loss": 0.3321, "num_input_tokens_seen": 640536, "step": 7110 }, { "epoch": 1.849012474012474, "grad_norm": 0.5115193724632263, "learning_rate": 4.621881496881497e-05, "loss": 0.2169, "num_input_tokens_seen": 640984, "step": 7115 }, { "epoch": 1.8503118503118503, "grad_norm": 0.3960241675376892, "learning_rate": 4.625129937629938e-05, "loss": 0.3788, "num_input_tokens_seen": 641464, "step": 7120 }, { "epoch": 1.8516112266112266, "grad_norm": 2.906743049621582, "learning_rate": 4.628378378378378e-05, "loss": 0.3022, "num_input_tokens_seen": 641928, "step": 7125 }, { "epoch": 1.8529106029106028, "grad_norm": 0.1498330682516098, "learning_rate": 4.631626819126819e-05, "loss": 0.3147, "num_input_tokens_seen": 642392, "step": 7130 }, { "epoch": 1.8542099792099793, "grad_norm": 0.6585544943809509, "learning_rate": 4.6348752598752596e-05, "loss": 0.2533, "num_input_tokens_seen": 642840, "step": 7135 }, { "epoch": 1.8555093555093554, "grad_norm": 0.537283182144165, "learning_rate": 4.638123700623701e-05, "loss": 0.2587, "num_input_tokens_seen": 643288, "step": 7140 }, { "epoch": 1.8568087318087318, "grad_norm": 0.7196905612945557, "learning_rate": 4.6413721413721415e-05, "loss": 0.3129, "num_input_tokens_seen": 643736, "step": 7145 }, { "epoch": 1.8581081081081081, "grad_norm": 0.5969760417938232, "learning_rate": 4.6446205821205825e-05, "loss": 0.2092, "num_input_tokens_seen": 644184, "step": 7150 }, { "epoch": 1.8594074844074844, "grad_norm": 0.419609934091568, "learning_rate": 4.647869022869023e-05, "loss": 0.3758, "num_input_tokens_seen": 644696, "step": 7155 }, { "epoch": 1.8607068607068609, "grad_norm": 0.8253301978111267, "learning_rate": 4.651117463617464e-05, "loss": 0.2419, "num_input_tokens_seen": 645144, "step": 7160 }, { "epoch": 1.862006237006237, "grad_norm": 0.8209514021873474, "learning_rate": 4.654365904365905e-05, "loss": 0.2827, "num_input_tokens_seen": 645608, "step": 7165 }, { "epoch": 1.8633056133056134, "grad_norm": 0.2823975384235382, "learning_rate": 4.6576143451143457e-05, "loss": 0.3201, "num_input_tokens_seen": 646120, "step": 7170 }, { "epoch": 1.8646049896049897, "grad_norm": 0.7263445854187012, "learning_rate": 4.660862785862786e-05, "loss": 0.2567, "num_input_tokens_seen": 646600, "step": 7175 }, { "epoch": 1.865904365904366, "grad_norm": 0.5799904465675354, "learning_rate": 4.664111226611227e-05, "loss": 0.2621, "num_input_tokens_seen": 647032, "step": 7180 }, { "epoch": 1.8672037422037422, "grad_norm": 0.3360745310783386, "learning_rate": 4.667359667359668e-05, "loss": 0.307, "num_input_tokens_seen": 647496, "step": 7185 }, { "epoch": 1.8685031185031185, "grad_norm": 0.22553999722003937, "learning_rate": 4.670608108108108e-05, "loss": 0.283, "num_input_tokens_seen": 647928, "step": 7190 }, { "epoch": 1.869802494802495, "grad_norm": 0.6165371537208557, "learning_rate": 4.673856548856549e-05, "loss": 0.2606, "num_input_tokens_seen": 648376, "step": 7195 }, { "epoch": 1.871101871101871, "grad_norm": 0.25950753688812256, "learning_rate": 4.67710498960499e-05, "loss": 0.2549, "num_input_tokens_seen": 648824, "step": 7200 }, { "epoch": 1.8724012474012475, "grad_norm": 0.43323561549186707, "learning_rate": 4.680353430353431e-05, "loss": 0.2779, "num_input_tokens_seen": 649240, "step": 7205 }, { "epoch": 1.8737006237006237, "grad_norm": 0.34456107020378113, "learning_rate": 4.6836018711018713e-05, "loss": 0.2287, "num_input_tokens_seen": 649656, "step": 7210 }, { "epoch": 1.875, "grad_norm": 0.35287269949913025, "learning_rate": 4.686850311850312e-05, "loss": 0.2018, "num_input_tokens_seen": 650088, "step": 7215 }, { "epoch": 1.8762993762993763, "grad_norm": 0.8765450716018677, "learning_rate": 4.6900987525987526e-05, "loss": 0.3872, "num_input_tokens_seen": 650536, "step": 7220 }, { "epoch": 1.8775987525987525, "grad_norm": 0.3173234760761261, "learning_rate": 4.6933471933471936e-05, "loss": 0.2158, "num_input_tokens_seen": 650952, "step": 7225 }, { "epoch": 1.878898128898129, "grad_norm": 0.32270190119743347, "learning_rate": 4.696595634095634e-05, "loss": 0.1841, "num_input_tokens_seen": 651416, "step": 7230 }, { "epoch": 1.880197505197505, "grad_norm": 0.35807564854621887, "learning_rate": 4.6998440748440755e-05, "loss": 0.3088, "num_input_tokens_seen": 651896, "step": 7235 }, { "epoch": 1.8814968814968815, "grad_norm": 0.44387856125831604, "learning_rate": 4.703092515592516e-05, "loss": 0.3033, "num_input_tokens_seen": 652296, "step": 7240 }, { "epoch": 1.8827962577962578, "grad_norm": 0.5837923884391785, "learning_rate": 4.706340956340957e-05, "loss": 0.2799, "num_input_tokens_seen": 652760, "step": 7245 }, { "epoch": 1.884095634095634, "grad_norm": 0.22082389891147614, "learning_rate": 4.709589397089397e-05, "loss": 0.2626, "num_input_tokens_seen": 653208, "step": 7250 }, { "epoch": 1.8853950103950103, "grad_norm": 0.4660550355911255, "learning_rate": 4.712837837837838e-05, "loss": 0.2451, "num_input_tokens_seen": 653704, "step": 7255 }, { "epoch": 1.8866943866943866, "grad_norm": 0.36044445633888245, "learning_rate": 4.716086278586278e-05, "loss": 0.243, "num_input_tokens_seen": 654168, "step": 7260 }, { "epoch": 1.887993762993763, "grad_norm": 0.3919093906879425, "learning_rate": 4.71933471933472e-05, "loss": 0.2637, "num_input_tokens_seen": 654664, "step": 7265 }, { "epoch": 1.8892931392931391, "grad_norm": 0.4226205348968506, "learning_rate": 4.72258316008316e-05, "loss": 0.22, "num_input_tokens_seen": 655096, "step": 7270 }, { "epoch": 1.8905925155925156, "grad_norm": 0.3055659830570221, "learning_rate": 4.725831600831601e-05, "loss": 0.3186, "num_input_tokens_seen": 655528, "step": 7275 }, { "epoch": 1.8918918918918919, "grad_norm": 0.31559959053993225, "learning_rate": 4.7290800415800415e-05, "loss": 0.1682, "num_input_tokens_seen": 655960, "step": 7280 }, { "epoch": 1.8931912681912682, "grad_norm": 0.353615939617157, "learning_rate": 4.7323284823284824e-05, "loss": 0.2697, "num_input_tokens_seen": 656424, "step": 7285 }, { "epoch": 1.8944906444906446, "grad_norm": 0.37315505743026733, "learning_rate": 4.7355769230769234e-05, "loss": 0.2182, "num_input_tokens_seen": 656872, "step": 7290 }, { "epoch": 1.8957900207900207, "grad_norm": 0.3458234369754791, "learning_rate": 4.738825363825364e-05, "loss": 0.3472, "num_input_tokens_seen": 657336, "step": 7295 }, { "epoch": 1.8970893970893972, "grad_norm": 0.3968425989151001, "learning_rate": 4.742073804573805e-05, "loss": 0.2438, "num_input_tokens_seen": 657768, "step": 7300 }, { "epoch": 1.8983887733887734, "grad_norm": 0.21247614920139313, "learning_rate": 4.7453222453222456e-05, "loss": 0.254, "num_input_tokens_seen": 658184, "step": 7305 }, { "epoch": 1.8996881496881497, "grad_norm": 0.21837735176086426, "learning_rate": 4.7485706860706866e-05, "loss": 0.3168, "num_input_tokens_seen": 658632, "step": 7310 }, { "epoch": 1.900987525987526, "grad_norm": 0.20811952650547028, "learning_rate": 4.751819126819127e-05, "loss": 0.2549, "num_input_tokens_seen": 659048, "step": 7315 }, { "epoch": 1.9022869022869022, "grad_norm": 0.2937939763069153, "learning_rate": 4.755067567567568e-05, "loss": 0.2119, "num_input_tokens_seen": 659480, "step": 7320 }, { "epoch": 1.9035862785862787, "grad_norm": 0.3308296799659729, "learning_rate": 4.758316008316008e-05, "loss": 0.179, "num_input_tokens_seen": 660008, "step": 7325 }, { "epoch": 1.9048856548856548, "grad_norm": 0.2229268103837967, "learning_rate": 4.76156444906445e-05, "loss": 0.0933, "num_input_tokens_seen": 660424, "step": 7330 }, { "epoch": 1.9061850311850312, "grad_norm": 0.20065420866012573, "learning_rate": 4.76481288981289e-05, "loss": 0.3896, "num_input_tokens_seen": 660888, "step": 7335 }, { "epoch": 1.9074844074844075, "grad_norm": 0.3504914939403534, "learning_rate": 4.768061330561331e-05, "loss": 0.3894, "num_input_tokens_seen": 661352, "step": 7340 }, { "epoch": 1.9087837837837838, "grad_norm": 0.400312602519989, "learning_rate": 4.771309771309771e-05, "loss": 0.2741, "num_input_tokens_seen": 661800, "step": 7345 }, { "epoch": 1.91008316008316, "grad_norm": 0.6069068908691406, "learning_rate": 4.774558212058212e-05, "loss": 0.3319, "num_input_tokens_seen": 662264, "step": 7350 }, { "epoch": 1.9113825363825363, "grad_norm": 2.3145527839660645, "learning_rate": 4.7778066528066526e-05, "loss": 0.3384, "num_input_tokens_seen": 662680, "step": 7355 }, { "epoch": 1.9126819126819128, "grad_norm": 0.28083574771881104, "learning_rate": 4.7810550935550935e-05, "loss": 0.3544, "num_input_tokens_seen": 663144, "step": 7360 }, { "epoch": 1.9139812889812888, "grad_norm": 0.3370571434497833, "learning_rate": 4.7843035343035345e-05, "loss": 0.2877, "num_input_tokens_seen": 663608, "step": 7365 }, { "epoch": 1.9152806652806653, "grad_norm": 0.5627428293228149, "learning_rate": 4.7875519750519755e-05, "loss": 0.1934, "num_input_tokens_seen": 664024, "step": 7370 }, { "epoch": 1.9165800415800416, "grad_norm": 0.37685707211494446, "learning_rate": 4.790800415800416e-05, "loss": 0.0559, "num_input_tokens_seen": 664488, "step": 7375 }, { "epoch": 1.9178794178794178, "grad_norm": 11.948236465454102, "learning_rate": 4.794048856548857e-05, "loss": 0.5102, "num_input_tokens_seen": 664920, "step": 7380 }, { "epoch": 1.9191787941787943, "grad_norm": 7.853895664215088, "learning_rate": 4.797297297297298e-05, "loss": 0.6609, "num_input_tokens_seen": 665368, "step": 7385 }, { "epoch": 1.9204781704781704, "grad_norm": 14.859299659729004, "learning_rate": 4.800545738045738e-05, "loss": 0.8141, "num_input_tokens_seen": 665784, "step": 7390 }, { "epoch": 1.9217775467775469, "grad_norm": 0.6180768609046936, "learning_rate": 4.803794178794179e-05, "loss": 0.1806, "num_input_tokens_seen": 666280, "step": 7395 }, { "epoch": 1.9230769230769231, "grad_norm": 0.1599651724100113, "learning_rate": 4.80704261954262e-05, "loss": 0.3647, "num_input_tokens_seen": 666728, "step": 7400 }, { "epoch": 1.9243762993762994, "grad_norm": 2.930027961730957, "learning_rate": 4.810291060291061e-05, "loss": 0.6089, "num_input_tokens_seen": 667160, "step": 7405 }, { "epoch": 1.9256756756756757, "grad_norm": 1.3043471574783325, "learning_rate": 4.813539501039501e-05, "loss": 0.1884, "num_input_tokens_seen": 667576, "step": 7410 }, { "epoch": 1.926975051975052, "grad_norm": 1.8975884914398193, "learning_rate": 4.816787941787942e-05, "loss": 0.3522, "num_input_tokens_seen": 668008, "step": 7415 }, { "epoch": 1.9282744282744284, "grad_norm": 0.806225597858429, "learning_rate": 4.8200363825363824e-05, "loss": 0.215, "num_input_tokens_seen": 668488, "step": 7420 }, { "epoch": 1.9295738045738045, "grad_norm": 0.6403327584266663, "learning_rate": 4.823284823284824e-05, "loss": 0.1805, "num_input_tokens_seen": 668984, "step": 7425 }, { "epoch": 1.930873180873181, "grad_norm": 1.3390027284622192, "learning_rate": 4.8265332640332643e-05, "loss": 0.4176, "num_input_tokens_seen": 669448, "step": 7430 }, { "epoch": 1.9321725571725572, "grad_norm": 0.12910428643226624, "learning_rate": 4.829781704781705e-05, "loss": 0.2619, "num_input_tokens_seen": 669928, "step": 7435 }, { "epoch": 1.9334719334719335, "grad_norm": 1.7074170112609863, "learning_rate": 4.8330301455301456e-05, "loss": 0.348, "num_input_tokens_seen": 670344, "step": 7440 }, { "epoch": 1.9347713097713097, "grad_norm": 0.9299015402793884, "learning_rate": 4.8362785862785866e-05, "loss": 0.2874, "num_input_tokens_seen": 670792, "step": 7445 }, { "epoch": 1.936070686070686, "grad_norm": 0.6300871968269348, "learning_rate": 4.839527027027027e-05, "loss": 0.3101, "num_input_tokens_seen": 671288, "step": 7450 }, { "epoch": 1.9373700623700625, "grad_norm": 0.5166355967521667, "learning_rate": 4.842775467775468e-05, "loss": 0.271, "num_input_tokens_seen": 671736, "step": 7455 }, { "epoch": 1.9386694386694385, "grad_norm": 0.5320115685462952, "learning_rate": 4.846023908523909e-05, "loss": 0.2681, "num_input_tokens_seen": 672248, "step": 7460 }, { "epoch": 1.939968814968815, "grad_norm": 0.6376543045043945, "learning_rate": 4.84927234927235e-05, "loss": 0.2294, "num_input_tokens_seen": 672712, "step": 7465 }, { "epoch": 1.9412681912681913, "grad_norm": 0.5414319634437561, "learning_rate": 4.85252079002079e-05, "loss": 0.2783, "num_input_tokens_seen": 673176, "step": 7470 }, { "epoch": 1.9425675675675675, "grad_norm": 0.5148892998695374, "learning_rate": 4.855769230769231e-05, "loss": 0.2226, "num_input_tokens_seen": 673608, "step": 7475 }, { "epoch": 1.943866943866944, "grad_norm": 0.4922657310962677, "learning_rate": 4.859017671517671e-05, "loss": 0.2622, "num_input_tokens_seen": 674072, "step": 7480 }, { "epoch": 1.94516632016632, "grad_norm": 0.34387531876564026, "learning_rate": 4.862266112266112e-05, "loss": 0.1659, "num_input_tokens_seen": 674568, "step": 7485 }, { "epoch": 1.9464656964656966, "grad_norm": 0.2907506227493286, "learning_rate": 4.865514553014553e-05, "loss": 0.2123, "num_input_tokens_seen": 675016, "step": 7490 }, { "epoch": 1.9477650727650726, "grad_norm": 0.3171491324901581, "learning_rate": 4.868762993762994e-05, "loss": 0.2615, "num_input_tokens_seen": 675464, "step": 7495 }, { "epoch": 1.949064449064449, "grad_norm": 0.42465174198150635, "learning_rate": 4.872011434511435e-05, "loss": 0.3592, "num_input_tokens_seen": 675928, "step": 7500 }, { "epoch": 1.9503638253638254, "grad_norm": 0.615798830986023, "learning_rate": 4.8752598752598754e-05, "loss": 0.2445, "num_input_tokens_seen": 676376, "step": 7505 }, { "epoch": 1.9516632016632016, "grad_norm": 0.5554379820823669, "learning_rate": 4.8785083160083164e-05, "loss": 0.2365, "num_input_tokens_seen": 676840, "step": 7510 }, { "epoch": 1.952962577962578, "grad_norm": 0.4675489664077759, "learning_rate": 4.881756756756757e-05, "loss": 0.2272, "num_input_tokens_seen": 677304, "step": 7515 }, { "epoch": 1.9542619542619541, "grad_norm": 0.2944481074810028, "learning_rate": 4.8850051975051977e-05, "loss": 0.1522, "num_input_tokens_seen": 677768, "step": 7520 }, { "epoch": 1.9555613305613306, "grad_norm": 0.18328161537647247, "learning_rate": 4.8882536382536386e-05, "loss": 0.1503, "num_input_tokens_seen": 678200, "step": 7525 }, { "epoch": 1.956860706860707, "grad_norm": 0.6291869878768921, "learning_rate": 4.8915020790020796e-05, "loss": 0.3992, "num_input_tokens_seen": 678664, "step": 7530 }, { "epoch": 1.9581600831600832, "grad_norm": 0.440236896276474, "learning_rate": 4.89475051975052e-05, "loss": 0.2919, "num_input_tokens_seen": 679128, "step": 7535 }, { "epoch": 1.9594594594594594, "grad_norm": 0.3713102638721466, "learning_rate": 4.897998960498961e-05, "loss": 0.2211, "num_input_tokens_seen": 679560, "step": 7540 }, { "epoch": 1.9607588357588357, "grad_norm": 0.4122040271759033, "learning_rate": 4.901247401247401e-05, "loss": 0.2162, "num_input_tokens_seen": 680024, "step": 7545 }, { "epoch": 1.9620582120582122, "grad_norm": 0.34968358278274536, "learning_rate": 4.904495841995842e-05, "loss": 0.2211, "num_input_tokens_seen": 680456, "step": 7550 }, { "epoch": 1.9633575883575882, "grad_norm": 0.3104795813560486, "learning_rate": 4.907744282744283e-05, "loss": 0.2075, "num_input_tokens_seen": 680888, "step": 7555 }, { "epoch": 1.9646569646569647, "grad_norm": 0.8213828802108765, "learning_rate": 4.910992723492724e-05, "loss": 0.3892, "num_input_tokens_seen": 681352, "step": 7560 }, { "epoch": 1.965956340956341, "grad_norm": 0.6183084845542908, "learning_rate": 4.914241164241164e-05, "loss": 0.2773, "num_input_tokens_seen": 681800, "step": 7565 }, { "epoch": 1.9672557172557172, "grad_norm": 0.2376890629529953, "learning_rate": 4.917489604989605e-05, "loss": 0.2491, "num_input_tokens_seen": 682248, "step": 7570 }, { "epoch": 1.9685550935550935, "grad_norm": 0.6091116666793823, "learning_rate": 4.9207380457380456e-05, "loss": 0.287, "num_input_tokens_seen": 682696, "step": 7575 }, { "epoch": 1.9698544698544698, "grad_norm": 0.18490305542945862, "learning_rate": 4.9239864864864865e-05, "loss": 0.328, "num_input_tokens_seen": 683176, "step": 7580 }, { "epoch": 1.9711538461538463, "grad_norm": 0.6966219544410706, "learning_rate": 4.9272349272349275e-05, "loss": 0.2759, "num_input_tokens_seen": 683624, "step": 7585 }, { "epoch": 1.9724532224532223, "grad_norm": 0.25035861134529114, "learning_rate": 4.9304833679833685e-05, "loss": 0.2889, "num_input_tokens_seen": 684104, "step": 7590 }, { "epoch": 1.9737525987525988, "grad_norm": 0.46419933438301086, "learning_rate": 4.933731808731809e-05, "loss": 0.2353, "num_input_tokens_seen": 684552, "step": 7595 }, { "epoch": 1.975051975051975, "grad_norm": 0.36378034949302673, "learning_rate": 4.93698024948025e-05, "loss": 0.219, "num_input_tokens_seen": 685000, "step": 7600 }, { "epoch": 1.9763513513513513, "grad_norm": 0.27192869782447815, "learning_rate": 4.940228690228691e-05, "loss": 0.1672, "num_input_tokens_seen": 685448, "step": 7605 }, { "epoch": 1.9776507276507278, "grad_norm": 0.5525782704353333, "learning_rate": 4.943477130977131e-05, "loss": 0.2905, "num_input_tokens_seen": 685912, "step": 7610 }, { "epoch": 1.9789501039501038, "grad_norm": 0.5098966360092163, "learning_rate": 4.946725571725572e-05, "loss": 0.2151, "num_input_tokens_seen": 686376, "step": 7615 }, { "epoch": 1.9802494802494803, "grad_norm": 0.4004964232444763, "learning_rate": 4.949974012474013e-05, "loss": 0.3295, "num_input_tokens_seen": 686824, "step": 7620 }, { "epoch": 1.9815488565488566, "grad_norm": 0.3158430755138397, "learning_rate": 4.953222453222454e-05, "loss": 0.3245, "num_input_tokens_seen": 687304, "step": 7625 }, { "epoch": 1.9828482328482329, "grad_norm": 0.2468181997537613, "learning_rate": 4.956470893970894e-05, "loss": 0.2638, "num_input_tokens_seen": 687768, "step": 7630 }, { "epoch": 1.9841476091476091, "grad_norm": 0.4628356397151947, "learning_rate": 4.959719334719335e-05, "loss": 0.2397, "num_input_tokens_seen": 688200, "step": 7635 }, { "epoch": 1.9854469854469854, "grad_norm": 0.45073843002319336, "learning_rate": 4.9629677754677754e-05, "loss": 0.2337, "num_input_tokens_seen": 688712, "step": 7640 }, { "epoch": 1.9867463617463619, "grad_norm": 0.3969971537590027, "learning_rate": 4.9662162162162164e-05, "loss": 0.2193, "num_input_tokens_seen": 689176, "step": 7645 }, { "epoch": 1.988045738045738, "grad_norm": 0.48481523990631104, "learning_rate": 4.969464656964657e-05, "loss": 0.2671, "num_input_tokens_seen": 689640, "step": 7650 }, { "epoch": 1.9893451143451144, "grad_norm": 0.3535934388637543, "learning_rate": 4.972713097713098e-05, "loss": 0.3715, "num_input_tokens_seen": 690056, "step": 7655 }, { "epoch": 1.9906444906444907, "grad_norm": 0.3999886214733124, "learning_rate": 4.9759615384615386e-05, "loss": 0.1885, "num_input_tokens_seen": 690520, "step": 7660 }, { "epoch": 1.991943866943867, "grad_norm": 0.3412964642047882, "learning_rate": 4.9792099792099796e-05, "loss": 0.1483, "num_input_tokens_seen": 691000, "step": 7665 }, { "epoch": 1.9932432432432432, "grad_norm": 0.3947204351425171, "learning_rate": 4.98245841995842e-05, "loss": 0.3273, "num_input_tokens_seen": 691432, "step": 7670 }, { "epoch": 1.9945426195426195, "grad_norm": 0.4214434027671814, "learning_rate": 4.985706860706861e-05, "loss": 0.2197, "num_input_tokens_seen": 691880, "step": 7675 }, { "epoch": 1.995841995841996, "grad_norm": 0.32610204815864563, "learning_rate": 4.988955301455302e-05, "loss": 0.3314, "num_input_tokens_seen": 692312, "step": 7680 }, { "epoch": 1.997141372141372, "grad_norm": 0.3155263364315033, "learning_rate": 4.992203742203743e-05, "loss": 0.1706, "num_input_tokens_seen": 692776, "step": 7685 }, { "epoch": 1.9984407484407485, "grad_norm": 0.3384334146976471, "learning_rate": 4.995452182952183e-05, "loss": 0.2496, "num_input_tokens_seen": 693256, "step": 7690 }, { "epoch": 1.9997401247401247, "grad_norm": 0.309944212436676, "learning_rate": 4.998700623700624e-05, "loss": 0.3132, "num_input_tokens_seen": 693704, "step": 7695 }, { "epoch": 2.0, "eval_loss": 0.24168933928012848, "eval_runtime": 13.1557, "eval_samples_per_second": 65.067, "eval_steps_per_second": 32.533, "num_input_tokens_seen": 693752, "step": 7696 }, { "epoch": 2.001039501039501, "grad_norm": 0.4359293282032013, "learning_rate": 4.999999976856069e-05, "loss": 0.3253, "num_input_tokens_seen": 694120, "step": 7700 }, { "epoch": 2.0023388773388775, "grad_norm": 0.5459543466567993, "learning_rate": 4.9999998354209355e-05, "loss": 0.2705, "num_input_tokens_seen": 694552, "step": 7705 }, { "epoch": 2.0036382536382535, "grad_norm": 0.45919376611709595, "learning_rate": 4.999999565408414e-05, "loss": 0.2336, "num_input_tokens_seen": 695000, "step": 7710 }, { "epoch": 2.00493762993763, "grad_norm": 0.377464234828949, "learning_rate": 4.999999166818521e-05, "loss": 0.2006, "num_input_tokens_seen": 695400, "step": 7715 }, { "epoch": 2.006237006237006, "grad_norm": 0.29517319798469543, "learning_rate": 4.999998639651274e-05, "loss": 0.3032, "num_input_tokens_seen": 695848, "step": 7720 }, { "epoch": 2.0075363825363826, "grad_norm": 0.3629200756549835, "learning_rate": 4.999997983906703e-05, "loss": 0.2704, "num_input_tokens_seen": 696296, "step": 7725 }, { "epoch": 2.008835758835759, "grad_norm": 0.31934189796447754, "learning_rate": 4.999997199584839e-05, "loss": 0.2609, "num_input_tokens_seen": 696744, "step": 7730 }, { "epoch": 2.010135135135135, "grad_norm": 0.33884504437446594, "learning_rate": 4.999996286685725e-05, "loss": 0.2254, "num_input_tokens_seen": 697192, "step": 7735 }, { "epoch": 2.0114345114345116, "grad_norm": 0.29810255765914917, "learning_rate": 4.9999952452094055e-05, "loss": 0.2215, "num_input_tokens_seen": 697640, "step": 7740 }, { "epoch": 2.0127338877338876, "grad_norm": 0.27302104234695435, "learning_rate": 4.999994075155936e-05, "loss": 0.2143, "num_input_tokens_seen": 698072, "step": 7745 }, { "epoch": 2.014033264033264, "grad_norm": 0.34194210171699524, "learning_rate": 4.999992776525375e-05, "loss": 0.3008, "num_input_tokens_seen": 698520, "step": 7750 }, { "epoch": 2.01533264033264, "grad_norm": 0.3119703531265259, "learning_rate": 4.999991349317791e-05, "loss": 0.2173, "num_input_tokens_seen": 698968, "step": 7755 }, { "epoch": 2.0166320166320166, "grad_norm": 0.3405916392803192, "learning_rate": 4.9999897935332555e-05, "loss": 0.2661, "num_input_tokens_seen": 699416, "step": 7760 }, { "epoch": 2.017931392931393, "grad_norm": 0.306471586227417, "learning_rate": 4.9999881091718506e-05, "loss": 0.2218, "num_input_tokens_seen": 699880, "step": 7765 }, { "epoch": 2.019230769230769, "grad_norm": 0.7209691405296326, "learning_rate": 4.9999862962336606e-05, "loss": 0.2493, "num_input_tokens_seen": 700312, "step": 7770 }, { "epoch": 2.0205301455301456, "grad_norm": 0.31612536311149597, "learning_rate": 4.999984354718781e-05, "loss": 0.2605, "num_input_tokens_seen": 700744, "step": 7775 }, { "epoch": 2.0218295218295217, "grad_norm": 0.310549795627594, "learning_rate": 4.999982284627311e-05, "loss": 0.3015, "num_input_tokens_seen": 701208, "step": 7780 }, { "epoch": 2.023128898128898, "grad_norm": 0.26301661133766174, "learning_rate": 4.999980085959356e-05, "loss": 0.2619, "num_input_tokens_seen": 701608, "step": 7785 }, { "epoch": 2.024428274428274, "grad_norm": 0.4732113182544708, "learning_rate": 4.9999777587150295e-05, "loss": 0.2544, "num_input_tokens_seen": 702120, "step": 7790 }, { "epoch": 2.0257276507276507, "grad_norm": 0.2647179663181305, "learning_rate": 4.999975302894452e-05, "loss": 0.264, "num_input_tokens_seen": 702568, "step": 7795 }, { "epoch": 2.027027027027027, "grad_norm": 0.24614174664020538, "learning_rate": 4.99997271849775e-05, "loss": 0.2507, "num_input_tokens_seen": 703032, "step": 7800 }, { "epoch": 2.0283264033264032, "grad_norm": 0.4007647633552551, "learning_rate": 4.9999700055250545e-05, "loss": 0.233, "num_input_tokens_seen": 703464, "step": 7805 }, { "epoch": 2.0296257796257797, "grad_norm": 0.298317015171051, "learning_rate": 4.9999671639765065e-05, "loss": 0.3213, "num_input_tokens_seen": 703912, "step": 7810 }, { "epoch": 2.0309251559251558, "grad_norm": 0.3023882508277893, "learning_rate": 4.999964193852252e-05, "loss": 0.2214, "num_input_tokens_seen": 704360, "step": 7815 }, { "epoch": 2.0322245322245323, "grad_norm": 0.30313998460769653, "learning_rate": 4.9999610951524445e-05, "loss": 0.2578, "num_input_tokens_seen": 704792, "step": 7820 }, { "epoch": 2.0335239085239087, "grad_norm": 0.34838175773620605, "learning_rate": 4.999957867877242e-05, "loss": 0.2617, "num_input_tokens_seen": 705272, "step": 7825 }, { "epoch": 2.034823284823285, "grad_norm": 0.4130825102329254, "learning_rate": 4.9999545120268105e-05, "loss": 0.2854, "num_input_tokens_seen": 705688, "step": 7830 }, { "epoch": 2.0361226611226613, "grad_norm": 0.20400582253932953, "learning_rate": 4.999951027601324e-05, "loss": 0.3052, "num_input_tokens_seen": 706152, "step": 7835 }, { "epoch": 2.0374220374220373, "grad_norm": 0.7025163769721985, "learning_rate": 4.99994741460096e-05, "loss": 0.3165, "num_input_tokens_seen": 706648, "step": 7840 }, { "epoch": 2.038721413721414, "grad_norm": 0.5769108533859253, "learning_rate": 4.9999436730259053e-05, "loss": 0.2882, "num_input_tokens_seen": 707112, "step": 7845 }, { "epoch": 2.04002079002079, "grad_norm": 0.1174430102109909, "learning_rate": 4.999939802876352e-05, "loss": 0.2893, "num_input_tokens_seen": 707592, "step": 7850 }, { "epoch": 2.0413201663201663, "grad_norm": 0.649105966091156, "learning_rate": 4.9999358041525004e-05, "loss": 0.2816, "num_input_tokens_seen": 708040, "step": 7855 }, { "epoch": 2.042619542619543, "grad_norm": 0.22973401844501495, "learning_rate": 4.999931676854554e-05, "loss": 0.2793, "num_input_tokens_seen": 708504, "step": 7860 }, { "epoch": 2.043918918918919, "grad_norm": 0.48104530572891235, "learning_rate": 4.999927420982726e-05, "loss": 0.2149, "num_input_tokens_seen": 708984, "step": 7865 }, { "epoch": 2.0452182952182953, "grad_norm": 0.45836037397384644, "learning_rate": 4.999923036537236e-05, "loss": 0.2901, "num_input_tokens_seen": 709448, "step": 7870 }, { "epoch": 2.0465176715176714, "grad_norm": 0.39448249340057373, "learning_rate": 4.999918523518309e-05, "loss": 0.22, "num_input_tokens_seen": 709944, "step": 7875 }, { "epoch": 2.047817047817048, "grad_norm": 0.3423321843147278, "learning_rate": 4.9999138819261764e-05, "loss": 0.3173, "num_input_tokens_seen": 710376, "step": 7880 }, { "epoch": 2.049116424116424, "grad_norm": 0.35756874084472656, "learning_rate": 4.999909111761078e-05, "loss": 0.2267, "num_input_tokens_seen": 710856, "step": 7885 }, { "epoch": 2.0504158004158004, "grad_norm": 0.3685116171836853, "learning_rate": 4.9999042130232585e-05, "loss": 0.251, "num_input_tokens_seen": 711304, "step": 7890 }, { "epoch": 2.051715176715177, "grad_norm": 0.3684084117412567, "learning_rate": 4.9998991857129704e-05, "loss": 0.2325, "num_input_tokens_seen": 711784, "step": 7895 }, { "epoch": 2.053014553014553, "grad_norm": 0.3116242587566376, "learning_rate": 4.999894029830472e-05, "loss": 0.1155, "num_input_tokens_seen": 712264, "step": 7900 }, { "epoch": 2.0543139293139294, "grad_norm": 0.2631776034832001, "learning_rate": 4.999888745376028e-05, "loss": 0.2868, "num_input_tokens_seen": 712728, "step": 7905 }, { "epoch": 2.0556133056133055, "grad_norm": 0.2557147741317749, "learning_rate": 4.9998833323499104e-05, "loss": 0.2141, "num_input_tokens_seen": 713192, "step": 7910 }, { "epoch": 2.056912681912682, "grad_norm": 0.47330695390701294, "learning_rate": 4.999877790752398e-05, "loss": 0.2864, "num_input_tokens_seen": 713656, "step": 7915 }, { "epoch": 2.0582120582120584, "grad_norm": 0.2519369125366211, "learning_rate": 4.999872120583775e-05, "loss": 0.148, "num_input_tokens_seen": 714088, "step": 7920 }, { "epoch": 2.0595114345114345, "grad_norm": 0.46376243233680725, "learning_rate": 4.9998663218443344e-05, "loss": 0.2181, "num_input_tokens_seen": 714504, "step": 7925 }, { "epoch": 2.060810810810811, "grad_norm": 0.24616461992263794, "learning_rate": 4.999860394534373e-05, "loss": 0.0836, "num_input_tokens_seen": 715000, "step": 7930 }, { "epoch": 2.062110187110187, "grad_norm": 0.6193587183952332, "learning_rate": 4.9998543386541964e-05, "loss": 0.1456, "num_input_tokens_seen": 715480, "step": 7935 }, { "epoch": 2.0634095634095635, "grad_norm": 0.5801534652709961, "learning_rate": 4.9998481542041164e-05, "loss": 0.4051, "num_input_tokens_seen": 715912, "step": 7940 }, { "epoch": 2.0647089397089395, "grad_norm": 0.4043143093585968, "learning_rate": 4.99984184118445e-05, "loss": 0.2875, "num_input_tokens_seen": 716360, "step": 7945 }, { "epoch": 2.066008316008316, "grad_norm": 0.2849465608596802, "learning_rate": 4.999835399595523e-05, "loss": 0.3367, "num_input_tokens_seen": 716840, "step": 7950 }, { "epoch": 2.0673076923076925, "grad_norm": 0.23150748014450073, "learning_rate": 4.999828829437666e-05, "loss": 0.256, "num_input_tokens_seen": 717272, "step": 7955 }, { "epoch": 2.0686070686070686, "grad_norm": 0.20800799131393433, "learning_rate": 4.999822130711217e-05, "loss": 0.2473, "num_input_tokens_seen": 717704, "step": 7960 }, { "epoch": 2.069906444906445, "grad_norm": 0.7423439025878906, "learning_rate": 4.99981530341652e-05, "loss": 0.2958, "num_input_tokens_seen": 718168, "step": 7965 }, { "epoch": 2.071205821205821, "grad_norm": 0.2693825662136078, "learning_rate": 4.999808347553927e-05, "loss": 0.2423, "num_input_tokens_seen": 718600, "step": 7970 }, { "epoch": 2.0725051975051976, "grad_norm": 0.24962028861045837, "learning_rate": 4.999801263123796e-05, "loss": 0.3273, "num_input_tokens_seen": 719080, "step": 7975 }, { "epoch": 2.0738045738045736, "grad_norm": 0.5431593656539917, "learning_rate": 4.99979405012649e-05, "loss": 0.2272, "num_input_tokens_seen": 719496, "step": 7980 }, { "epoch": 2.07510395010395, "grad_norm": 0.5091651082038879, "learning_rate": 4.9997867085623824e-05, "loss": 0.2515, "num_input_tokens_seen": 719960, "step": 7985 }, { "epoch": 2.0764033264033266, "grad_norm": 0.48899567127227783, "learning_rate": 4.9997792384318475e-05, "loss": 0.3196, "num_input_tokens_seen": 720408, "step": 7990 }, { "epoch": 2.0777027027027026, "grad_norm": 0.4729642868041992, "learning_rate": 4.9997716397352725e-05, "loss": 0.2823, "num_input_tokens_seen": 720856, "step": 7995 }, { "epoch": 2.079002079002079, "grad_norm": 0.1630844622850418, "learning_rate": 4.9997639124730464e-05, "loss": 0.322, "num_input_tokens_seen": 721288, "step": 8000 }, { "epoch": 2.080301455301455, "grad_norm": 0.6208122968673706, "learning_rate": 4.999756056645567e-05, "loss": 0.2745, "num_input_tokens_seen": 721736, "step": 8005 }, { "epoch": 2.0816008316008316, "grad_norm": 0.1481602042913437, "learning_rate": 4.999748072253239e-05, "loss": 0.2846, "num_input_tokens_seen": 722200, "step": 8010 }, { "epoch": 2.0829002079002077, "grad_norm": 0.6356961727142334, "learning_rate": 4.999739959296471e-05, "loss": 0.2743, "num_input_tokens_seen": 722632, "step": 8015 }, { "epoch": 2.084199584199584, "grad_norm": 0.3653762936592102, "learning_rate": 4.999731717775683e-05, "loss": 0.2262, "num_input_tokens_seen": 723048, "step": 8020 }, { "epoch": 2.0854989604989607, "grad_norm": 0.7119756937026978, "learning_rate": 4.9997233476912977e-05, "loss": 0.286, "num_input_tokens_seen": 723512, "step": 8025 }, { "epoch": 2.0867983367983367, "grad_norm": 0.31567758321762085, "learning_rate": 4.999714849043745e-05, "loss": 0.2648, "num_input_tokens_seen": 723960, "step": 8030 }, { "epoch": 2.088097713097713, "grad_norm": 0.3535041809082031, "learning_rate": 4.9997062218334627e-05, "loss": 0.29, "num_input_tokens_seen": 724392, "step": 8035 }, { "epoch": 2.0893970893970892, "grad_norm": 0.3802124261856079, "learning_rate": 4.999697466060894e-05, "loss": 0.2637, "num_input_tokens_seen": 724856, "step": 8040 }, { "epoch": 2.0906964656964657, "grad_norm": 0.1867804378271103, "learning_rate": 4.9996885817264904e-05, "loss": 0.3438, "num_input_tokens_seen": 725320, "step": 8045 }, { "epoch": 2.091995841995842, "grad_norm": 0.5921372771263123, "learning_rate": 4.999679568830707e-05, "loss": 0.2774, "num_input_tokens_seen": 725768, "step": 8050 }, { "epoch": 2.0932952182952183, "grad_norm": 0.21428833901882172, "learning_rate": 4.999670427374009e-05, "loss": 0.2587, "num_input_tokens_seen": 726216, "step": 8055 }, { "epoch": 2.0945945945945947, "grad_norm": 0.2773233652114868, "learning_rate": 4.999661157356865e-05, "loss": 0.2907, "num_input_tokens_seen": 726664, "step": 8060 }, { "epoch": 2.095893970893971, "grad_norm": 0.3534236252307892, "learning_rate": 4.999651758779754e-05, "loss": 0.1764, "num_input_tokens_seen": 727096, "step": 8065 }, { "epoch": 2.0971933471933473, "grad_norm": 0.43404528498649597, "learning_rate": 4.999642231643157e-05, "loss": 0.2716, "num_input_tokens_seen": 727544, "step": 8070 }, { "epoch": 2.0984927234927233, "grad_norm": 0.30102238059043884, "learning_rate": 4.999632575947565e-05, "loss": 0.2598, "num_input_tokens_seen": 728024, "step": 8075 }, { "epoch": 2.0997920997921, "grad_norm": 0.3512255549430847, "learning_rate": 4.999622791693475e-05, "loss": 0.2915, "num_input_tokens_seen": 728456, "step": 8080 }, { "epoch": 2.1010914760914763, "grad_norm": 0.3203223943710327, "learning_rate": 4.999612878881389e-05, "loss": 0.3096, "num_input_tokens_seen": 728920, "step": 8085 }, { "epoch": 2.1023908523908523, "grad_norm": 0.6976293921470642, "learning_rate": 4.999602837511818e-05, "loss": 0.2515, "num_input_tokens_seen": 729368, "step": 8090 }, { "epoch": 2.103690228690229, "grad_norm": 0.6993454694747925, "learning_rate": 4.9995926675852784e-05, "loss": 0.261, "num_input_tokens_seen": 729832, "step": 8095 }, { "epoch": 2.104989604989605, "grad_norm": 0.6515407562255859, "learning_rate": 4.9995823691022925e-05, "loss": 0.3459, "num_input_tokens_seen": 730296, "step": 8100 }, { "epoch": 2.1062889812889813, "grad_norm": 0.43884363770484924, "learning_rate": 4.999571942063391e-05, "loss": 0.2285, "num_input_tokens_seen": 730760, "step": 8105 }, { "epoch": 2.1075883575883574, "grad_norm": 0.3934211730957031, "learning_rate": 4.999561386469109e-05, "loss": 0.2034, "num_input_tokens_seen": 731224, "step": 8110 }, { "epoch": 2.108887733887734, "grad_norm": 0.27405545115470886, "learning_rate": 4.99955070231999e-05, "loss": 0.2444, "num_input_tokens_seen": 731688, "step": 8115 }, { "epoch": 2.1101871101871104, "grad_norm": 0.28639817237854004, "learning_rate": 4.999539889616583e-05, "loss": 0.2494, "num_input_tokens_seen": 732104, "step": 8120 }, { "epoch": 2.1114864864864864, "grad_norm": 0.31605154275894165, "learning_rate": 4.9995289483594456e-05, "loss": 0.2189, "num_input_tokens_seen": 732568, "step": 8125 }, { "epoch": 2.112785862785863, "grad_norm": 0.30544158816337585, "learning_rate": 4.9995178785491394e-05, "loss": 0.3214, "num_input_tokens_seen": 733080, "step": 8130 }, { "epoch": 2.114085239085239, "grad_norm": 0.31121626496315, "learning_rate": 4.9995066801862326e-05, "loss": 0.3081, "num_input_tokens_seen": 733512, "step": 8135 }, { "epoch": 2.1153846153846154, "grad_norm": 0.383901447057724, "learning_rate": 4.999495353271303e-05, "loss": 0.2255, "num_input_tokens_seen": 733960, "step": 8140 }, { "epoch": 2.116683991683992, "grad_norm": 0.31533220410346985, "learning_rate": 4.999483897804933e-05, "loss": 0.1927, "num_input_tokens_seen": 734424, "step": 8145 }, { "epoch": 2.117983367983368, "grad_norm": 0.29018285870552063, "learning_rate": 4.99947231378771e-05, "loss": 0.257, "num_input_tokens_seen": 734904, "step": 8150 }, { "epoch": 2.1192827442827444, "grad_norm": 0.2640551030635834, "learning_rate": 4.999460601220232e-05, "loss": 0.2116, "num_input_tokens_seen": 735368, "step": 8155 }, { "epoch": 2.1205821205821205, "grad_norm": 0.3767295777797699, "learning_rate": 4.9994487601031006e-05, "loss": 0.2743, "num_input_tokens_seen": 735832, "step": 8160 }, { "epoch": 2.121881496881497, "grad_norm": 0.27165496349334717, "learning_rate": 4.999436790436924e-05, "loss": 0.2126, "num_input_tokens_seen": 736264, "step": 8165 }, { "epoch": 2.123180873180873, "grad_norm": 0.34366485476493835, "learning_rate": 4.999424692222319e-05, "loss": 0.3183, "num_input_tokens_seen": 736728, "step": 8170 }, { "epoch": 2.1244802494802495, "grad_norm": 0.31710541248321533, "learning_rate": 4.9994124654599064e-05, "loss": 0.2199, "num_input_tokens_seen": 737160, "step": 8175 }, { "epoch": 2.125779625779626, "grad_norm": 0.2588222622871399, "learning_rate": 4.999400110150316e-05, "loss": 0.3564, "num_input_tokens_seen": 737640, "step": 8180 }, { "epoch": 2.127079002079002, "grad_norm": 0.48958632349967957, "learning_rate": 4.999387626294183e-05, "loss": 0.2239, "num_input_tokens_seen": 738104, "step": 8185 }, { "epoch": 2.1283783783783785, "grad_norm": 0.28189167380332947, "learning_rate": 4.99937501389215e-05, "loss": 0.2325, "num_input_tokens_seen": 738536, "step": 8190 }, { "epoch": 2.1296777546777546, "grad_norm": 0.43255046010017395, "learning_rate": 4.9993622729448656e-05, "loss": 0.2446, "num_input_tokens_seen": 738936, "step": 8195 }, { "epoch": 2.130977130977131, "grad_norm": 0.5839219689369202, "learning_rate": 4.9993494034529846e-05, "loss": 0.2435, "num_input_tokens_seen": 739464, "step": 8200 }, { "epoch": 2.132276507276507, "grad_norm": 0.7461224794387817, "learning_rate": 4.999336405417169e-05, "loss": 0.2161, "num_input_tokens_seen": 739896, "step": 8205 }, { "epoch": 2.1335758835758836, "grad_norm": 0.6728612184524536, "learning_rate": 4.999323278838087e-05, "loss": 0.2146, "num_input_tokens_seen": 740376, "step": 8210 }, { "epoch": 2.13487525987526, "grad_norm": 0.4194898009300232, "learning_rate": 4.9993100237164144e-05, "loss": 0.2843, "num_input_tokens_seen": 740856, "step": 8215 }, { "epoch": 2.136174636174636, "grad_norm": 0.321262001991272, "learning_rate": 4.999296640052832e-05, "loss": 0.3624, "num_input_tokens_seen": 741320, "step": 8220 }, { "epoch": 2.1374740124740126, "grad_norm": 0.48465463519096375, "learning_rate": 4.999283127848029e-05, "loss": 0.2201, "num_input_tokens_seen": 741768, "step": 8225 }, { "epoch": 2.1387733887733886, "grad_norm": 0.46683362126350403, "learning_rate": 4.9992694871027e-05, "loss": 0.2664, "num_input_tokens_seen": 742216, "step": 8230 }, { "epoch": 2.140072765072765, "grad_norm": 0.5294101238250732, "learning_rate": 4.999255717817547e-05, "loss": 0.266, "num_input_tokens_seen": 742632, "step": 8235 }, { "epoch": 2.141372141372141, "grad_norm": 0.32145121693611145, "learning_rate": 4.999241819993277e-05, "loss": 0.2329, "num_input_tokens_seen": 743080, "step": 8240 }, { "epoch": 2.1426715176715176, "grad_norm": 0.3643297851085663, "learning_rate": 4.999227793630606e-05, "loss": 0.3015, "num_input_tokens_seen": 743560, "step": 8245 }, { "epoch": 2.143970893970894, "grad_norm": 0.27915072441101074, "learning_rate": 4.999213638730255e-05, "loss": 0.2581, "num_input_tokens_seen": 744008, "step": 8250 }, { "epoch": 2.14527027027027, "grad_norm": 0.3011475205421448, "learning_rate": 4.9991993552929514e-05, "loss": 0.2576, "num_input_tokens_seen": 744456, "step": 8255 }, { "epoch": 2.1465696465696467, "grad_norm": 0.3802259564399719, "learning_rate": 4.999184943319432e-05, "loss": 0.2457, "num_input_tokens_seen": 744888, "step": 8260 }, { "epoch": 2.1478690228690227, "grad_norm": 0.39934614300727844, "learning_rate": 4.9991704028104345e-05, "loss": 0.218, "num_input_tokens_seen": 745384, "step": 8265 }, { "epoch": 2.149168399168399, "grad_norm": 0.30473124980926514, "learning_rate": 4.9991557337667096e-05, "loss": 0.3358, "num_input_tokens_seen": 745816, "step": 8270 }, { "epoch": 2.1504677754677752, "grad_norm": 0.2351352423429489, "learning_rate": 4.99914093618901e-05, "loss": 0.2913, "num_input_tokens_seen": 746264, "step": 8275 }, { "epoch": 2.1517671517671517, "grad_norm": 0.6052897572517395, "learning_rate": 4.999126010078098e-05, "loss": 0.2917, "num_input_tokens_seen": 746664, "step": 8280 }, { "epoch": 2.153066528066528, "grad_norm": 0.20232555270195007, "learning_rate": 4.99911095543474e-05, "loss": 0.279, "num_input_tokens_seen": 747128, "step": 8285 }, { "epoch": 2.1543659043659042, "grad_norm": 0.23100921511650085, "learning_rate": 4.999095772259712e-05, "loss": 0.2901, "num_input_tokens_seen": 747608, "step": 8290 }, { "epoch": 2.1556652806652807, "grad_norm": 0.6576434969902039, "learning_rate": 4.999080460553793e-05, "loss": 0.268, "num_input_tokens_seen": 748040, "step": 8295 }, { "epoch": 2.156964656964657, "grad_norm": 0.24265414476394653, "learning_rate": 4.9990650203177724e-05, "loss": 0.2531, "num_input_tokens_seen": 748456, "step": 8300 }, { "epoch": 2.1582640332640333, "grad_norm": 0.4158915579319, "learning_rate": 4.999049451552443e-05, "loss": 0.2568, "num_input_tokens_seen": 748904, "step": 8305 }, { "epoch": 2.1595634095634098, "grad_norm": 0.3686743378639221, "learning_rate": 4.999033754258605e-05, "loss": 0.1674, "num_input_tokens_seen": 749384, "step": 8310 }, { "epoch": 2.160862785862786, "grad_norm": 0.33672159910202026, "learning_rate": 4.9990179284370675e-05, "loss": 0.2575, "num_input_tokens_seen": 749848, "step": 8315 }, { "epoch": 2.1621621621621623, "grad_norm": 0.5359681248664856, "learning_rate": 4.999001974088644e-05, "loss": 0.2577, "num_input_tokens_seen": 750296, "step": 8320 }, { "epoch": 2.1634615384615383, "grad_norm": 0.3367993235588074, "learning_rate": 4.998985891214153e-05, "loss": 0.2867, "num_input_tokens_seen": 750744, "step": 8325 }, { "epoch": 2.164760914760915, "grad_norm": 0.3643845021724701, "learning_rate": 4.998969679814424e-05, "loss": 0.3342, "num_input_tokens_seen": 751224, "step": 8330 }, { "epoch": 2.166060291060291, "grad_norm": 0.3219309449195862, "learning_rate": 4.99895333989029e-05, "loss": 0.251, "num_input_tokens_seen": 751688, "step": 8335 }, { "epoch": 2.1673596673596673, "grad_norm": 0.7890896797180176, "learning_rate": 4.998936871442591e-05, "loss": 0.2993, "num_input_tokens_seen": 752152, "step": 8340 }, { "epoch": 2.168659043659044, "grad_norm": 0.23281624913215637, "learning_rate": 4.998920274472175e-05, "loss": 0.2766, "num_input_tokens_seen": 752632, "step": 8345 }, { "epoch": 2.16995841995842, "grad_norm": 0.25837060809135437, "learning_rate": 4.9989035489798944e-05, "loss": 0.2558, "num_input_tokens_seen": 753096, "step": 8350 }, { "epoch": 2.1712577962577964, "grad_norm": 0.45318886637687683, "learning_rate": 4.9988866949666105e-05, "loss": 0.2318, "num_input_tokens_seen": 753544, "step": 8355 }, { "epoch": 2.1725571725571724, "grad_norm": 0.3963797390460968, "learning_rate": 4.99886971243319e-05, "loss": 0.2247, "num_input_tokens_seen": 753992, "step": 8360 }, { "epoch": 2.173856548856549, "grad_norm": 0.8832217454910278, "learning_rate": 4.998852601380504e-05, "loss": 0.3085, "num_input_tokens_seen": 754456, "step": 8365 }, { "epoch": 2.1751559251559254, "grad_norm": 0.33425018191337585, "learning_rate": 4.998835361809436e-05, "loss": 0.302, "num_input_tokens_seen": 754920, "step": 8370 }, { "epoch": 2.1764553014553014, "grad_norm": 0.23599304258823395, "learning_rate": 4.9988179937208704e-05, "loss": 0.3243, "num_input_tokens_seen": 755352, "step": 8375 }, { "epoch": 2.177754677754678, "grad_norm": 0.16960333287715912, "learning_rate": 4.998800497115701e-05, "loss": 0.303, "num_input_tokens_seen": 755784, "step": 8380 }, { "epoch": 2.179054054054054, "grad_norm": 0.1277938038110733, "learning_rate": 4.9987828719948284e-05, "loss": 0.2858, "num_input_tokens_seen": 756232, "step": 8385 }, { "epoch": 2.1803534303534304, "grad_norm": 0.0697021409869194, "learning_rate": 4.9987651183591574e-05, "loss": 0.2843, "num_input_tokens_seen": 756712, "step": 8390 }, { "epoch": 2.1816528066528065, "grad_norm": 0.554061233997345, "learning_rate": 4.998747236209603e-05, "loss": 0.2742, "num_input_tokens_seen": 757192, "step": 8395 }, { "epoch": 2.182952182952183, "grad_norm": 0.4529387056827545, "learning_rate": 4.998729225547085e-05, "loss": 0.2595, "num_input_tokens_seen": 757672, "step": 8400 }, { "epoch": 2.1842515592515594, "grad_norm": 0.3922203779220581, "learning_rate": 4.998711086372527e-05, "loss": 0.2948, "num_input_tokens_seen": 758152, "step": 8405 }, { "epoch": 2.1855509355509355, "grad_norm": 0.3231333792209625, "learning_rate": 4.9986928186868645e-05, "loss": 0.3009, "num_input_tokens_seen": 758600, "step": 8410 }, { "epoch": 2.186850311850312, "grad_norm": 0.4619082510471344, "learning_rate": 4.998674422491036e-05, "loss": 0.254, "num_input_tokens_seen": 759080, "step": 8415 }, { "epoch": 2.188149688149688, "grad_norm": 0.42330849170684814, "learning_rate": 4.998655897785989e-05, "loss": 0.2365, "num_input_tokens_seen": 759528, "step": 8420 }, { "epoch": 2.1894490644490645, "grad_norm": 0.41254258155822754, "learning_rate": 4.998637244572674e-05, "loss": 0.2789, "num_input_tokens_seen": 759976, "step": 8425 }, { "epoch": 2.1907484407484406, "grad_norm": 0.34965765476226807, "learning_rate": 4.998618462852051e-05, "loss": 0.2497, "num_input_tokens_seen": 760408, "step": 8430 }, { "epoch": 2.192047817047817, "grad_norm": 0.3947889804840088, "learning_rate": 4.9985995526250876e-05, "loss": 0.2135, "num_input_tokens_seen": 760856, "step": 8435 }, { "epoch": 2.1933471933471935, "grad_norm": 0.3511009216308594, "learning_rate": 4.998580513892754e-05, "loss": 0.2942, "num_input_tokens_seen": 761352, "step": 8440 }, { "epoch": 2.1946465696465696, "grad_norm": 0.37089788913726807, "learning_rate": 4.998561346656031e-05, "loss": 0.227, "num_input_tokens_seen": 761816, "step": 8445 }, { "epoch": 2.195945945945946, "grad_norm": 0.4168611764907837, "learning_rate": 4.998542050915904e-05, "loss": 0.2269, "num_input_tokens_seen": 762280, "step": 8450 }, { "epoch": 2.197245322245322, "grad_norm": 0.2855779528617859, "learning_rate": 4.9985226266733645e-05, "loss": 0.1587, "num_input_tokens_seen": 762696, "step": 8455 }, { "epoch": 2.1985446985446986, "grad_norm": 1.0175724029541016, "learning_rate": 4.9985030739294136e-05, "loss": 0.33, "num_input_tokens_seen": 763160, "step": 8460 }, { "epoch": 2.1998440748440746, "grad_norm": 0.2989063560962677, "learning_rate": 4.998483392685055e-05, "loss": 0.2294, "num_input_tokens_seen": 763640, "step": 8465 }, { "epoch": 2.201143451143451, "grad_norm": 0.3726067841053009, "learning_rate": 4.998463582941302e-05, "loss": 0.2586, "num_input_tokens_seen": 764088, "step": 8470 }, { "epoch": 2.2024428274428276, "grad_norm": 0.32639071345329285, "learning_rate": 4.998443644699172e-05, "loss": 0.1808, "num_input_tokens_seen": 764536, "step": 8475 }, { "epoch": 2.2037422037422036, "grad_norm": 0.33762869238853455, "learning_rate": 4.9984235779596925e-05, "loss": 0.288, "num_input_tokens_seen": 764984, "step": 8480 }, { "epoch": 2.20504158004158, "grad_norm": 0.3092462420463562, "learning_rate": 4.9984033827238944e-05, "loss": 0.2898, "num_input_tokens_seen": 765416, "step": 8485 }, { "epoch": 2.206340956340956, "grad_norm": 0.4140019714832306, "learning_rate": 4.998383058992817e-05, "loss": 0.2961, "num_input_tokens_seen": 765848, "step": 8490 }, { "epoch": 2.2076403326403327, "grad_norm": 0.3628382384777069, "learning_rate": 4.998362606767504e-05, "loss": 0.2076, "num_input_tokens_seen": 766312, "step": 8495 }, { "epoch": 2.208939708939709, "grad_norm": 0.3652889132499695, "learning_rate": 4.998342026049009e-05, "loss": 0.2628, "num_input_tokens_seen": 766760, "step": 8500 }, { "epoch": 2.210239085239085, "grad_norm": 0.329863578081131, "learning_rate": 4.998321316838389e-05, "loss": 0.217, "num_input_tokens_seen": 767176, "step": 8505 }, { "epoch": 2.2115384615384617, "grad_norm": 0.7231184840202332, "learning_rate": 4.9983004791367104e-05, "loss": 0.2986, "num_input_tokens_seen": 767672, "step": 8510 }, { "epoch": 2.2128378378378377, "grad_norm": 0.2887502610683441, "learning_rate": 4.9982795129450444e-05, "loss": 0.2582, "num_input_tokens_seen": 768120, "step": 8515 }, { "epoch": 2.214137214137214, "grad_norm": 0.3195134997367859, "learning_rate": 4.998258418264469e-05, "loss": 0.2176, "num_input_tokens_seen": 768632, "step": 8520 }, { "epoch": 2.2154365904365902, "grad_norm": 0.32714372873306274, "learning_rate": 4.99823719509607e-05, "loss": 0.2231, "num_input_tokens_seen": 769128, "step": 8525 }, { "epoch": 2.2167359667359667, "grad_norm": 0.3127601146697998, "learning_rate": 4.9982158434409374e-05, "loss": 0.1989, "num_input_tokens_seen": 769608, "step": 8530 }, { "epoch": 2.218035343035343, "grad_norm": 0.2699388861656189, "learning_rate": 4.9981943633001715e-05, "loss": 0.3446, "num_input_tokens_seen": 770072, "step": 8535 }, { "epoch": 2.2193347193347193, "grad_norm": 0.3574341833591461, "learning_rate": 4.998172754674876e-05, "loss": 0.2678, "num_input_tokens_seen": 770488, "step": 8540 }, { "epoch": 2.2206340956340958, "grad_norm": 0.24366945028305054, "learning_rate": 4.9981510175661606e-05, "loss": 0.223, "num_input_tokens_seen": 770920, "step": 8545 }, { "epoch": 2.221933471933472, "grad_norm": 0.29150524735450745, "learning_rate": 4.998129151975146e-05, "loss": 0.2275, "num_input_tokens_seen": 771384, "step": 8550 }, { "epoch": 2.2232328482328483, "grad_norm": 0.304086297750473, "learning_rate": 4.998107157902955e-05, "loss": 0.174, "num_input_tokens_seen": 771832, "step": 8555 }, { "epoch": 2.2245322245322248, "grad_norm": 0.2794502079486847, "learning_rate": 4.998085035350719e-05, "loss": 0.2658, "num_input_tokens_seen": 772264, "step": 8560 }, { "epoch": 2.225831600831601, "grad_norm": 0.2500487267971039, "learning_rate": 4.9980627843195774e-05, "loss": 0.2568, "num_input_tokens_seen": 772712, "step": 8565 }, { "epoch": 2.2271309771309773, "grad_norm": 0.3024572432041168, "learning_rate": 4.998040404810672e-05, "loss": 0.2622, "num_input_tokens_seen": 773160, "step": 8570 }, { "epoch": 2.2284303534303533, "grad_norm": 0.3061693608760834, "learning_rate": 4.9980178968251554e-05, "loss": 0.3098, "num_input_tokens_seen": 773560, "step": 8575 }, { "epoch": 2.22972972972973, "grad_norm": 0.25965720415115356, "learning_rate": 4.9979952603641846e-05, "loss": 0.2944, "num_input_tokens_seen": 773976, "step": 8580 }, { "epoch": 2.231029106029106, "grad_norm": 0.25605958700180054, "learning_rate": 4.9979724954289244e-05, "loss": 0.2893, "num_input_tokens_seen": 774440, "step": 8585 }, { "epoch": 2.2323284823284824, "grad_norm": 0.2000574916601181, "learning_rate": 4.997949602020545e-05, "loss": 0.2748, "num_input_tokens_seen": 774888, "step": 8590 }, { "epoch": 2.233627858627859, "grad_norm": 0.4700242280960083, "learning_rate": 4.997926580140225e-05, "loss": 0.2453, "num_input_tokens_seen": 775304, "step": 8595 }, { "epoch": 2.234927234927235, "grad_norm": 0.7198705077171326, "learning_rate": 4.997903429789147e-05, "loss": 0.3003, "num_input_tokens_seen": 775752, "step": 8600 }, { "epoch": 2.2362266112266114, "grad_norm": 0.4292985200881958, "learning_rate": 4.997880150968502e-05, "loss": 0.2272, "num_input_tokens_seen": 776168, "step": 8605 }, { "epoch": 2.2375259875259874, "grad_norm": 0.44621431827545166, "learning_rate": 4.997856743679487e-05, "loss": 0.2956, "num_input_tokens_seen": 776584, "step": 8610 }, { "epoch": 2.238825363825364, "grad_norm": 0.5693262815475464, "learning_rate": 4.997833207923308e-05, "loss": 0.2858, "num_input_tokens_seen": 777048, "step": 8615 }, { "epoch": 2.24012474012474, "grad_norm": 0.28730976581573486, "learning_rate": 4.997809543701173e-05, "loss": 0.3115, "num_input_tokens_seen": 777544, "step": 8620 }, { "epoch": 2.2414241164241164, "grad_norm": 0.5266662836074829, "learning_rate": 4.9977857510143e-05, "loss": 0.2159, "num_input_tokens_seen": 777960, "step": 8625 }, { "epoch": 2.242723492723493, "grad_norm": 0.5767003893852234, "learning_rate": 4.9977618298639114e-05, "loss": 0.2799, "num_input_tokens_seen": 778424, "step": 8630 }, { "epoch": 2.244022869022869, "grad_norm": 0.5227214694023132, "learning_rate": 4.9977377802512405e-05, "loss": 0.173, "num_input_tokens_seen": 778856, "step": 8635 }, { "epoch": 2.2453222453222454, "grad_norm": 0.9621784687042236, "learning_rate": 4.997713602177521e-05, "loss": 0.3257, "num_input_tokens_seen": 779288, "step": 8640 }, { "epoch": 2.2466216216216215, "grad_norm": 0.4114360213279724, "learning_rate": 4.997689295643998e-05, "loss": 0.205, "num_input_tokens_seen": 779752, "step": 8645 }, { "epoch": 2.247920997920998, "grad_norm": 0.3748674690723419, "learning_rate": 4.997664860651922e-05, "loss": 0.1579, "num_input_tokens_seen": 780184, "step": 8650 }, { "epoch": 2.249220374220374, "grad_norm": 0.2757696807384491, "learning_rate": 4.997640297202548e-05, "loss": 0.0809, "num_input_tokens_seen": 780584, "step": 8655 }, { "epoch": 2.2505197505197505, "grad_norm": 0.15776391327381134, "learning_rate": 4.997615605297141e-05, "loss": 0.1595, "num_input_tokens_seen": 781032, "step": 8660 }, { "epoch": 2.251819126819127, "grad_norm": 0.14059554040431976, "learning_rate": 4.997590784936971e-05, "loss": 0.1564, "num_input_tokens_seen": 781496, "step": 8665 }, { "epoch": 2.253118503118503, "grad_norm": 1.336375117301941, "learning_rate": 4.997565836123313e-05, "loss": 0.6772, "num_input_tokens_seen": 781944, "step": 8670 }, { "epoch": 2.2544178794178795, "grad_norm": 0.7913801670074463, "learning_rate": 4.9975407588574506e-05, "loss": 0.3177, "num_input_tokens_seen": 782408, "step": 8675 }, { "epoch": 2.2557172557172556, "grad_norm": 2.779033660888672, "learning_rate": 4.9975155531406745e-05, "loss": 0.3779, "num_input_tokens_seen": 782824, "step": 8680 }, { "epoch": 2.257016632016632, "grad_norm": 1.4455029964447021, "learning_rate": 4.9974902189742814e-05, "loss": 0.2794, "num_input_tokens_seen": 783240, "step": 8685 }, { "epoch": 2.258316008316008, "grad_norm": 0.7568178772926331, "learning_rate": 4.997464756359572e-05, "loss": 0.2593, "num_input_tokens_seen": 783672, "step": 8690 }, { "epoch": 2.2596153846153846, "grad_norm": 0.49525904655456543, "learning_rate": 4.997439165297858e-05, "loss": 0.321, "num_input_tokens_seen": 784152, "step": 8695 }, { "epoch": 2.260914760914761, "grad_norm": 0.24258722364902496, "learning_rate": 4.997413445790454e-05, "loss": 0.2794, "num_input_tokens_seen": 784632, "step": 8700 }, { "epoch": 2.262214137214137, "grad_norm": 0.5543224215507507, "learning_rate": 4.9973875978386843e-05, "loss": 0.1955, "num_input_tokens_seen": 785064, "step": 8705 }, { "epoch": 2.2635135135135136, "grad_norm": 2.016465902328491, "learning_rate": 4.997361621443877e-05, "loss": 0.4612, "num_input_tokens_seen": 785512, "step": 8710 }, { "epoch": 2.2648128898128896, "grad_norm": 0.4601844847202301, "learning_rate": 4.997335516607369e-05, "loss": 0.2838, "num_input_tokens_seen": 785992, "step": 8715 }, { "epoch": 2.266112266112266, "grad_norm": 0.5914899706840515, "learning_rate": 4.997309283330503e-05, "loss": 0.2255, "num_input_tokens_seen": 786424, "step": 8720 }, { "epoch": 2.267411642411642, "grad_norm": 0.48962581157684326, "learning_rate": 4.997282921614627e-05, "loss": 0.1938, "num_input_tokens_seen": 786840, "step": 8725 }, { "epoch": 2.2687110187110187, "grad_norm": 0.9311036467552185, "learning_rate": 4.997256431461098e-05, "loss": 0.4199, "num_input_tokens_seen": 787320, "step": 8730 }, { "epoch": 2.270010395010395, "grad_norm": 0.5420421361923218, "learning_rate": 4.997229812871278e-05, "loss": 0.2813, "num_input_tokens_seen": 787752, "step": 8735 }, { "epoch": 2.271309771309771, "grad_norm": 0.6419205069541931, "learning_rate": 4.997203065846536e-05, "loss": 0.2709, "num_input_tokens_seen": 788200, "step": 8740 }, { "epoch": 2.2726091476091477, "grad_norm": 0.26878267526626587, "learning_rate": 4.997176190388247e-05, "loss": 0.2654, "num_input_tokens_seen": 788664, "step": 8745 }, { "epoch": 2.2739085239085237, "grad_norm": 0.33865633606910706, "learning_rate": 4.9971491864977946e-05, "loss": 0.2576, "num_input_tokens_seen": 789144, "step": 8750 }, { "epoch": 2.2752079002079, "grad_norm": 0.4006807804107666, "learning_rate": 4.997122054176566e-05, "loss": 0.2823, "num_input_tokens_seen": 789592, "step": 8755 }, { "epoch": 2.2765072765072767, "grad_norm": 0.3704787790775299, "learning_rate": 4.997094793425958e-05, "loss": 0.2216, "num_input_tokens_seen": 790040, "step": 8760 }, { "epoch": 2.2778066528066527, "grad_norm": 0.33467647433280945, "learning_rate": 4.997067404247373e-05, "loss": 0.2146, "num_input_tokens_seen": 790504, "step": 8765 }, { "epoch": 2.279106029106029, "grad_norm": 0.2939055860042572, "learning_rate": 4.997039886642218e-05, "loss": 0.1567, "num_input_tokens_seen": 790952, "step": 8770 }, { "epoch": 2.2804054054054053, "grad_norm": 0.2789788544178009, "learning_rate": 4.997012240611909e-05, "loss": 0.151, "num_input_tokens_seen": 791384, "step": 8775 }, { "epoch": 2.2817047817047817, "grad_norm": 0.2489999681711197, "learning_rate": 4.996984466157868e-05, "loss": 0.2253, "num_input_tokens_seen": 791864, "step": 8780 }, { "epoch": 2.2830041580041582, "grad_norm": 1.2450125217437744, "learning_rate": 4.996956563281524e-05, "loss": 0.368, "num_input_tokens_seen": 792328, "step": 8785 }, { "epoch": 2.2843035343035343, "grad_norm": 0.287307471036911, "learning_rate": 4.9969285319843105e-05, "loss": 0.1462, "num_input_tokens_seen": 792792, "step": 8790 }, { "epoch": 2.2856029106029108, "grad_norm": 0.26841291785240173, "learning_rate": 4.996900372267671e-05, "loss": 0.1499, "num_input_tokens_seen": 793256, "step": 8795 }, { "epoch": 2.286902286902287, "grad_norm": 0.42303213477134705, "learning_rate": 4.996872084133053e-05, "loss": 0.2654, "num_input_tokens_seen": 793720, "step": 8800 }, { "epoch": 2.2882016632016633, "grad_norm": 0.8869578242301941, "learning_rate": 4.996843667581911e-05, "loss": 0.2022, "num_input_tokens_seen": 794152, "step": 8805 }, { "epoch": 2.2895010395010393, "grad_norm": 0.34032168984413147, "learning_rate": 4.9968151226157065e-05, "loss": 0.4079, "num_input_tokens_seen": 794584, "step": 8810 }, { "epoch": 2.290800415800416, "grad_norm": 0.4442999064922333, "learning_rate": 4.996786449235908e-05, "loss": 0.2238, "num_input_tokens_seen": 795032, "step": 8815 }, { "epoch": 2.2920997920997923, "grad_norm": 0.21147102117538452, "learning_rate": 4.9967576474439905e-05, "loss": 0.2615, "num_input_tokens_seen": 795480, "step": 8820 }, { "epoch": 2.2933991683991684, "grad_norm": 0.6996681094169617, "learning_rate": 4.9967287172414345e-05, "loss": 0.2935, "num_input_tokens_seen": 795928, "step": 8825 }, { "epoch": 2.294698544698545, "grad_norm": 0.419453889131546, "learning_rate": 4.996699658629729e-05, "loss": 0.2265, "num_input_tokens_seen": 796344, "step": 8830 }, { "epoch": 2.295997920997921, "grad_norm": 0.23704056441783905, "learning_rate": 4.996670471610367e-05, "loss": 0.2736, "num_input_tokens_seen": 796808, "step": 8835 }, { "epoch": 2.2972972972972974, "grad_norm": 0.262392520904541, "learning_rate": 4.99664115618485e-05, "loss": 0.2757, "num_input_tokens_seen": 797240, "step": 8840 }, { "epoch": 2.2985966735966734, "grad_norm": 0.2055354118347168, "learning_rate": 4.996611712354687e-05, "loss": 0.282, "num_input_tokens_seen": 797704, "step": 8845 }, { "epoch": 2.29989604989605, "grad_norm": 0.5846949815750122, "learning_rate": 4.996582140121392e-05, "loss": 0.2874, "num_input_tokens_seen": 798136, "step": 8850 }, { "epoch": 2.3011954261954264, "grad_norm": 0.572752833366394, "learning_rate": 4.9965524394864846e-05, "loss": 0.261, "num_input_tokens_seen": 798584, "step": 8855 }, { "epoch": 2.3024948024948024, "grad_norm": 0.41354358196258545, "learning_rate": 4.996522610451494e-05, "loss": 0.2474, "num_input_tokens_seen": 799016, "step": 8860 }, { "epoch": 2.303794178794179, "grad_norm": 0.3047899007797241, "learning_rate": 4.996492653017952e-05, "loss": 0.2613, "num_input_tokens_seen": 799464, "step": 8865 }, { "epoch": 2.305093555093555, "grad_norm": 0.30616846680641174, "learning_rate": 4.996462567187402e-05, "loss": 0.3821, "num_input_tokens_seen": 799912, "step": 8870 }, { "epoch": 2.3063929313929314, "grad_norm": 0.48278817534446716, "learning_rate": 4.9964323529613905e-05, "loss": 0.2859, "num_input_tokens_seen": 800328, "step": 8875 }, { "epoch": 2.3076923076923075, "grad_norm": 0.4241349697113037, "learning_rate": 4.9964020103414706e-05, "loss": 0.2183, "num_input_tokens_seen": 800856, "step": 8880 }, { "epoch": 2.308991683991684, "grad_norm": 0.350868821144104, "learning_rate": 4.996371539329203e-05, "loss": 0.2247, "num_input_tokens_seen": 801320, "step": 8885 }, { "epoch": 2.3102910602910605, "grad_norm": 0.7494425177574158, "learning_rate": 4.996340939926156e-05, "loss": 0.289, "num_input_tokens_seen": 801784, "step": 8890 }, { "epoch": 2.3115904365904365, "grad_norm": 0.3228685259819031, "learning_rate": 4.996310212133902e-05, "loss": 0.2212, "num_input_tokens_seen": 802248, "step": 8895 }, { "epoch": 2.312889812889813, "grad_norm": 0.2889350950717926, "learning_rate": 4.9962793559540224e-05, "loss": 0.2175, "num_input_tokens_seen": 802760, "step": 8900 }, { "epoch": 2.314189189189189, "grad_norm": 0.3611694276332855, "learning_rate": 4.996248371388103e-05, "loss": 0.3185, "num_input_tokens_seen": 803192, "step": 8905 }, { "epoch": 2.3154885654885655, "grad_norm": 0.3052821457386017, "learning_rate": 4.9962172584377386e-05, "loss": 0.2182, "num_input_tokens_seen": 803656, "step": 8910 }, { "epoch": 2.3167879417879416, "grad_norm": 0.2913222014904022, "learning_rate": 4.9961860171045284e-05, "loss": 0.1744, "num_input_tokens_seen": 804104, "step": 8915 }, { "epoch": 2.318087318087318, "grad_norm": 0.31626632809638977, "learning_rate": 4.99615464739008e-05, "loss": 0.3177, "num_input_tokens_seen": 804552, "step": 8920 }, { "epoch": 2.3193866943866945, "grad_norm": 0.290680468082428, "learning_rate": 4.996123149296007e-05, "loss": 0.2197, "num_input_tokens_seen": 805000, "step": 8925 }, { "epoch": 2.3206860706860706, "grad_norm": 0.280299574136734, "learning_rate": 4.9960915228239274e-05, "loss": 0.172, "num_input_tokens_seen": 805480, "step": 8930 }, { "epoch": 2.321985446985447, "grad_norm": 0.27083620429039, "learning_rate": 4.99605976797547e-05, "loss": 0.2564, "num_input_tokens_seen": 805928, "step": 8935 }, { "epoch": 2.323284823284823, "grad_norm": 0.3455985486507416, "learning_rate": 4.996027884752267e-05, "loss": 0.3515, "num_input_tokens_seen": 806408, "step": 8940 }, { "epoch": 2.3245841995841996, "grad_norm": 0.33561861515045166, "learning_rate": 4.995995873155958e-05, "loss": 0.2735, "num_input_tokens_seen": 806840, "step": 8945 }, { "epoch": 2.3258835758835756, "grad_norm": 0.3568795621395111, "learning_rate": 4.99596373318819e-05, "loss": 0.228, "num_input_tokens_seen": 807288, "step": 8950 }, { "epoch": 2.327182952182952, "grad_norm": 0.35828858613967896, "learning_rate": 4.995931464850616e-05, "loss": 0.223, "num_input_tokens_seen": 807752, "step": 8955 }, { "epoch": 2.3284823284823286, "grad_norm": 0.3023236095905304, "learning_rate": 4.995899068144895e-05, "loss": 0.2941, "num_input_tokens_seen": 808152, "step": 8960 }, { "epoch": 2.3297817047817047, "grad_norm": 0.38140246272087097, "learning_rate": 4.9958665430726924e-05, "loss": 0.3187, "num_input_tokens_seen": 808584, "step": 8965 }, { "epoch": 2.331081081081081, "grad_norm": 0.5992932915687561, "learning_rate": 4.995833889635683e-05, "loss": 0.3338, "num_input_tokens_seen": 809032, "step": 8970 }, { "epoch": 2.3323804573804576, "grad_norm": 0.5740867257118225, "learning_rate": 4.995801107835546e-05, "loss": 0.2674, "num_input_tokens_seen": 809496, "step": 8975 }, { "epoch": 2.3336798336798337, "grad_norm": 0.6127086281776428, "learning_rate": 4.995768197673966e-05, "loss": 0.2727, "num_input_tokens_seen": 809928, "step": 8980 }, { "epoch": 2.33497920997921, "grad_norm": 0.6095922589302063, "learning_rate": 4.9957351591526356e-05, "loss": 0.2473, "num_input_tokens_seen": 810360, "step": 8985 }, { "epoch": 2.336278586278586, "grad_norm": 0.3442152738571167, "learning_rate": 4.995701992273255e-05, "loss": 0.2759, "num_input_tokens_seen": 810808, "step": 8990 }, { "epoch": 2.3375779625779627, "grad_norm": 0.3204038143157959, "learning_rate": 4.995668697037531e-05, "loss": 0.302, "num_input_tokens_seen": 811272, "step": 8995 }, { "epoch": 2.3388773388773387, "grad_norm": 0.35739484429359436, "learning_rate": 4.995635273447173e-05, "loss": 0.259, "num_input_tokens_seen": 811720, "step": 9000 }, { "epoch": 2.340176715176715, "grad_norm": 0.32478684186935425, "learning_rate": 4.995601721503902e-05, "loss": 0.2995, "num_input_tokens_seen": 812152, "step": 9005 }, { "epoch": 2.3414760914760917, "grad_norm": 0.7677115201950073, "learning_rate": 4.995568041209444e-05, "loss": 0.303, "num_input_tokens_seen": 812568, "step": 9010 }, { "epoch": 2.3427754677754677, "grad_norm": 0.2705598473548889, "learning_rate": 4.9955342325655295e-05, "loss": 0.2476, "num_input_tokens_seen": 813016, "step": 9015 }, { "epoch": 2.3440748440748442, "grad_norm": 0.43680477142333984, "learning_rate": 4.995500295573899e-05, "loss": 0.2403, "num_input_tokens_seen": 813432, "step": 9020 }, { "epoch": 2.3453742203742203, "grad_norm": 0.28091591596603394, "learning_rate": 4.9954662302362973e-05, "loss": 0.2581, "num_input_tokens_seen": 813864, "step": 9025 }, { "epoch": 2.3466735966735968, "grad_norm": 0.7431975603103638, "learning_rate": 4.9954320365544765e-05, "loss": 0.2509, "num_input_tokens_seen": 814280, "step": 9030 }, { "epoch": 2.347972972972973, "grad_norm": 0.33825767040252686, "learning_rate": 4.995397714530194e-05, "loss": 0.248, "num_input_tokens_seen": 814696, "step": 9035 }, { "epoch": 2.3492723492723493, "grad_norm": 0.30996549129486084, "learning_rate": 4.9953632641652174e-05, "loss": 0.2398, "num_input_tokens_seen": 815176, "step": 9040 }, { "epoch": 2.350571725571726, "grad_norm": 0.31982848048210144, "learning_rate": 4.995328685461317e-05, "loss": 0.2691, "num_input_tokens_seen": 815640, "step": 9045 }, { "epoch": 2.351871101871102, "grad_norm": 0.3738236427307129, "learning_rate": 4.995293978420271e-05, "loss": 0.1807, "num_input_tokens_seen": 816088, "step": 9050 }, { "epoch": 2.3531704781704783, "grad_norm": 0.3300192952156067, "learning_rate": 4.9952591430438646e-05, "loss": 0.2666, "num_input_tokens_seen": 816552, "step": 9055 }, { "epoch": 2.3544698544698544, "grad_norm": 0.31495630741119385, "learning_rate": 4.9952241793338897e-05, "loss": 0.2124, "num_input_tokens_seen": 816984, "step": 9060 }, { "epoch": 2.355769230769231, "grad_norm": 0.3271489441394806, "learning_rate": 4.995189087292145e-05, "loss": 0.2554, "num_input_tokens_seen": 817464, "step": 9065 }, { "epoch": 2.357068607068607, "grad_norm": 0.30865001678466797, "learning_rate": 4.995153866920434e-05, "loss": 0.2672, "num_input_tokens_seen": 817896, "step": 9070 }, { "epoch": 2.3583679833679834, "grad_norm": 0.27197954058647156, "learning_rate": 4.9951185182205694e-05, "loss": 0.3459, "num_input_tokens_seen": 818328, "step": 9075 }, { "epoch": 2.35966735966736, "grad_norm": 0.6224559545516968, "learning_rate": 4.9950830411943684e-05, "loss": 0.2194, "num_input_tokens_seen": 818776, "step": 9080 }, { "epoch": 2.360966735966736, "grad_norm": 0.20583797991275787, "learning_rate": 4.995047435843656e-05, "loss": 0.3325, "num_input_tokens_seen": 819208, "step": 9085 }, { "epoch": 2.3622661122661124, "grad_norm": 0.5532911419868469, "learning_rate": 4.995011702170264e-05, "loss": 0.2692, "num_input_tokens_seen": 819704, "step": 9090 }, { "epoch": 2.3635654885654884, "grad_norm": 0.45648103952407837, "learning_rate": 4.994975840176028e-05, "loss": 0.2452, "num_input_tokens_seen": 820184, "step": 9095 }, { "epoch": 2.364864864864865, "grad_norm": 0.3381078243255615, "learning_rate": 4.9949398498627955e-05, "loss": 0.2296, "num_input_tokens_seen": 820616, "step": 9100 }, { "epoch": 2.366164241164241, "grad_norm": 0.2832771837711334, "learning_rate": 4.9949037312324155e-05, "loss": 0.175, "num_input_tokens_seen": 821096, "step": 9105 }, { "epoch": 2.3674636174636174, "grad_norm": 0.7559969425201416, "learning_rate": 4.994867484286746e-05, "loss": 0.3601, "num_input_tokens_seen": 821560, "step": 9110 }, { "epoch": 2.368762993762994, "grad_norm": 0.367387056350708, "learning_rate": 4.9948311090276515e-05, "loss": 0.2517, "num_input_tokens_seen": 822040, "step": 9115 }, { "epoch": 2.37006237006237, "grad_norm": 0.3491326868534088, "learning_rate": 4.994794605457002e-05, "loss": 0.2543, "num_input_tokens_seen": 822504, "step": 9120 }, { "epoch": 2.3713617463617465, "grad_norm": 0.789927065372467, "learning_rate": 4.994757973576676e-05, "loss": 0.2521, "num_input_tokens_seen": 822968, "step": 9125 }, { "epoch": 2.3726611226611225, "grad_norm": 0.35690414905548096, "learning_rate": 4.9947212133885566e-05, "loss": 0.2208, "num_input_tokens_seen": 823416, "step": 9130 }, { "epoch": 2.373960498960499, "grad_norm": 0.40361905097961426, "learning_rate": 4.9946843248945353e-05, "loss": 0.2258, "num_input_tokens_seen": 823864, "step": 9135 }, { "epoch": 2.375259875259875, "grad_norm": 0.39380595088005066, "learning_rate": 4.994647308096509e-05, "loss": 0.3385, "num_input_tokens_seen": 824296, "step": 9140 }, { "epoch": 2.3765592515592515, "grad_norm": 0.6850389242172241, "learning_rate": 4.9946101629963816e-05, "loss": 0.2936, "num_input_tokens_seen": 824776, "step": 9145 }, { "epoch": 2.377858627858628, "grad_norm": 0.45836636424064636, "learning_rate": 4.994572889596063e-05, "loss": 0.2136, "num_input_tokens_seen": 825240, "step": 9150 }, { "epoch": 2.379158004158004, "grad_norm": 0.30709612369537354, "learning_rate": 4.99453548789747e-05, "loss": 0.1334, "num_input_tokens_seen": 825704, "step": 9155 }, { "epoch": 2.3804573804573805, "grad_norm": 0.4194350242614746, "learning_rate": 4.994497957902528e-05, "loss": 0.3222, "num_input_tokens_seen": 826120, "step": 9160 }, { "epoch": 2.3817567567567566, "grad_norm": 0.3976006805896759, "learning_rate": 4.9944602996131646e-05, "loss": 0.3754, "num_input_tokens_seen": 826552, "step": 9165 }, { "epoch": 2.383056133056133, "grad_norm": 0.40132173895835876, "learning_rate": 4.9944225130313183e-05, "loss": 0.1713, "num_input_tokens_seen": 826968, "step": 9170 }, { "epoch": 2.384355509355509, "grad_norm": 0.32364943623542786, "learning_rate": 4.994384598158932e-05, "loss": 0.2248, "num_input_tokens_seen": 827416, "step": 9175 }, { "epoch": 2.3856548856548856, "grad_norm": 0.34857314825057983, "learning_rate": 4.994346554997956e-05, "loss": 0.2605, "num_input_tokens_seen": 827848, "step": 9180 }, { "epoch": 2.386954261954262, "grad_norm": 0.3474026322364807, "learning_rate": 4.9943083835503467e-05, "loss": 0.176, "num_input_tokens_seen": 828296, "step": 9185 }, { "epoch": 2.388253638253638, "grad_norm": 0.8739005923271179, "learning_rate": 4.994270083818068e-05, "loss": 0.3197, "num_input_tokens_seen": 828744, "step": 9190 }, { "epoch": 2.3895530145530146, "grad_norm": 0.399352103471756, "learning_rate": 4.994231655803088e-05, "loss": 0.3576, "num_input_tokens_seen": 829176, "step": 9195 }, { "epoch": 2.390852390852391, "grad_norm": 0.8063971996307373, "learning_rate": 4.994193099507384e-05, "loss": 0.2533, "num_input_tokens_seen": 829624, "step": 9200 }, { "epoch": 2.392151767151767, "grad_norm": 0.33920586109161377, "learning_rate": 4.99415441493294e-05, "loss": 0.2285, "num_input_tokens_seen": 830056, "step": 9205 }, { "epoch": 2.3934511434511436, "grad_norm": 0.45737719535827637, "learning_rate": 4.9941156020817436e-05, "loss": 0.2433, "num_input_tokens_seen": 830504, "step": 9210 }, { "epoch": 2.3947505197505197, "grad_norm": 0.45405131578445435, "learning_rate": 4.994076660955793e-05, "loss": 0.173, "num_input_tokens_seen": 830952, "step": 9215 }, { "epoch": 2.396049896049896, "grad_norm": 0.7320113182067871, "learning_rate": 4.9940375915570895e-05, "loss": 0.3348, "num_input_tokens_seen": 831384, "step": 9220 }, { "epoch": 2.397349272349272, "grad_norm": 0.38864609599113464, "learning_rate": 4.993998393887643e-05, "loss": 0.2683, "num_input_tokens_seen": 831832, "step": 9225 }, { "epoch": 2.3986486486486487, "grad_norm": 0.2950562536716461, "learning_rate": 4.9939590679494694e-05, "loss": 0.2612, "num_input_tokens_seen": 832264, "step": 9230 }, { "epoch": 2.399948024948025, "grad_norm": 0.4365553557872772, "learning_rate": 4.993919613744592e-05, "loss": 0.2613, "num_input_tokens_seen": 832712, "step": 9235 }, { "epoch": 2.401247401247401, "grad_norm": 0.21751992404460907, "learning_rate": 4.993880031275039e-05, "loss": 0.3107, "num_input_tokens_seen": 833176, "step": 9240 }, { "epoch": 2.4025467775467777, "grad_norm": 0.6057740449905396, "learning_rate": 4.9938403205428466e-05, "loss": 0.2641, "num_input_tokens_seen": 833608, "step": 9245 }, { "epoch": 2.4038461538461537, "grad_norm": 0.46011602878570557, "learning_rate": 4.993800481550056e-05, "loss": 0.2319, "num_input_tokens_seen": 834120, "step": 9250 }, { "epoch": 2.4051455301455302, "grad_norm": 0.2800394594669342, "learning_rate": 4.993760514298718e-05, "loss": 0.2974, "num_input_tokens_seen": 834568, "step": 9255 }, { "epoch": 2.4064449064449063, "grad_norm": 0.3255080282688141, "learning_rate": 4.993720418790887e-05, "loss": 0.2236, "num_input_tokens_seen": 835032, "step": 9260 }, { "epoch": 2.4077442827442828, "grad_norm": 0.2892865240573883, "learning_rate": 4.993680195028626e-05, "loss": 0.367, "num_input_tokens_seen": 835528, "step": 9265 }, { "epoch": 2.4090436590436592, "grad_norm": 0.45110490918159485, "learning_rate": 4.993639843014003e-05, "loss": 0.3231, "num_input_tokens_seen": 835992, "step": 9270 }, { "epoch": 2.4103430353430353, "grad_norm": 0.5167623162269592, "learning_rate": 4.993599362749094e-05, "loss": 0.2633, "num_input_tokens_seen": 836440, "step": 9275 }, { "epoch": 2.4116424116424118, "grad_norm": 0.4811563789844513, "learning_rate": 4.99355875423598e-05, "loss": 0.2326, "num_input_tokens_seen": 836904, "step": 9280 }, { "epoch": 2.412941787941788, "grad_norm": 0.859072744846344, "learning_rate": 4.9935180174767496e-05, "loss": 0.3023, "num_input_tokens_seen": 837368, "step": 9285 }, { "epoch": 2.4142411642411643, "grad_norm": 0.4101199507713318, "learning_rate": 4.993477152473499e-05, "loss": 0.2885, "num_input_tokens_seen": 837816, "step": 9290 }, { "epoch": 2.4155405405405403, "grad_norm": 0.3727741539478302, "learning_rate": 4.9934361592283297e-05, "loss": 0.1986, "num_input_tokens_seen": 838280, "step": 9295 }, { "epoch": 2.416839916839917, "grad_norm": 0.35503390431404114, "learning_rate": 4.9933950377433494e-05, "loss": 0.2593, "num_input_tokens_seen": 838728, "step": 9300 }, { "epoch": 2.4181392931392933, "grad_norm": 0.3233613073825836, "learning_rate": 4.993353788020673e-05, "loss": 0.304, "num_input_tokens_seen": 839176, "step": 9305 }, { "epoch": 2.4194386694386694, "grad_norm": 0.29902252554893494, "learning_rate": 4.993312410062422e-05, "loss": 0.2855, "num_input_tokens_seen": 839640, "step": 9310 }, { "epoch": 2.420738045738046, "grad_norm": 0.2158231884241104, "learning_rate": 4.9932709038707246e-05, "loss": 0.2801, "num_input_tokens_seen": 840072, "step": 9315 }, { "epoch": 2.422037422037422, "grad_norm": 0.5360488891601562, "learning_rate": 4.9932292694477165e-05, "loss": 0.2697, "num_input_tokens_seen": 840520, "step": 9320 }, { "epoch": 2.4233367983367984, "grad_norm": 0.2251569628715515, "learning_rate": 4.993187506795538e-05, "loss": 0.2651, "num_input_tokens_seen": 840968, "step": 9325 }, { "epoch": 2.4246361746361744, "grad_norm": 0.21546393632888794, "learning_rate": 4.993145615916337e-05, "loss": 0.3056, "num_input_tokens_seen": 841464, "step": 9330 }, { "epoch": 2.425935550935551, "grad_norm": 0.18437346816062927, "learning_rate": 4.993103596812268e-05, "loss": 0.2921, "num_input_tokens_seen": 841880, "step": 9335 }, { "epoch": 2.4272349272349274, "grad_norm": 0.08676786720752716, "learning_rate": 4.993061449485492e-05, "loss": 0.2868, "num_input_tokens_seen": 842312, "step": 9340 }, { "epoch": 2.4285343035343034, "grad_norm": 0.5706855654716492, "learning_rate": 4.9930191739381775e-05, "loss": 0.2993, "num_input_tokens_seen": 842776, "step": 9345 }, { "epoch": 2.42983367983368, "grad_norm": 0.204625204205513, "learning_rate": 4.992976770172498e-05, "loss": 0.2962, "num_input_tokens_seen": 843240, "step": 9350 }, { "epoch": 2.431133056133056, "grad_norm": 0.45431169867515564, "learning_rate": 4.9929342381906344e-05, "loss": 0.2739, "num_input_tokens_seen": 843688, "step": 9355 }, { "epoch": 2.4324324324324325, "grad_norm": 0.2978680729866028, "learning_rate": 4.992891577994775e-05, "loss": 0.2141, "num_input_tokens_seen": 844136, "step": 9360 }, { "epoch": 2.4337318087318085, "grad_norm": 0.4884237051010132, "learning_rate": 4.9928487895871125e-05, "loss": 0.2752, "num_input_tokens_seen": 844568, "step": 9365 }, { "epoch": 2.435031185031185, "grad_norm": 0.345036119222641, "learning_rate": 4.9928058729698487e-05, "loss": 0.3055, "num_input_tokens_seen": 844984, "step": 9370 }, { "epoch": 2.4363305613305615, "grad_norm": 0.34301382303237915, "learning_rate": 4.99276282814519e-05, "loss": 0.2621, "num_input_tokens_seen": 845496, "step": 9375 }, { "epoch": 2.4376299376299375, "grad_norm": 0.3008035123348236, "learning_rate": 4.99271965511535e-05, "loss": 0.1388, "num_input_tokens_seen": 845928, "step": 9380 }, { "epoch": 2.438929313929314, "grad_norm": 0.23400771617889404, "learning_rate": 4.9926763538825505e-05, "loss": 0.2544, "num_input_tokens_seen": 846392, "step": 9385 }, { "epoch": 2.44022869022869, "grad_norm": 0.21707403659820557, "learning_rate": 4.9926329244490174e-05, "loss": 0.1553, "num_input_tokens_seen": 846824, "step": 9390 }, { "epoch": 2.4415280665280665, "grad_norm": 0.22569096088409424, "learning_rate": 4.9925893668169855e-05, "loss": 0.2848, "num_input_tokens_seen": 847272, "step": 9395 }, { "epoch": 2.442827442827443, "grad_norm": 0.20122793316841125, "learning_rate": 4.9925456809886935e-05, "loss": 0.086, "num_input_tokens_seen": 847752, "step": 9400 }, { "epoch": 2.444126819126819, "grad_norm": 0.2080511450767517, "learning_rate": 4.9925018669663885e-05, "loss": 0.305, "num_input_tokens_seen": 848184, "step": 9405 }, { "epoch": 2.4454261954261955, "grad_norm": 0.2323983907699585, "learning_rate": 4.992457924752325e-05, "loss": 0.3236, "num_input_tokens_seen": 848600, "step": 9410 }, { "epoch": 2.4467255717255716, "grad_norm": 0.28369349241256714, "learning_rate": 4.992413854348762e-05, "loss": 0.2998, "num_input_tokens_seen": 849032, "step": 9415 }, { "epoch": 2.448024948024948, "grad_norm": 0.33746373653411865, "learning_rate": 4.9923696557579665e-05, "loss": 0.2335, "num_input_tokens_seen": 849512, "step": 9420 }, { "epoch": 2.4493243243243246, "grad_norm": 0.636358916759491, "learning_rate": 4.9923253289822116e-05, "loss": 0.2541, "num_input_tokens_seen": 849944, "step": 9425 }, { "epoch": 2.4506237006237006, "grad_norm": 0.32343384623527527, "learning_rate": 4.9922808740237764e-05, "loss": 0.224, "num_input_tokens_seen": 850424, "step": 9430 }, { "epoch": 2.451923076923077, "grad_norm": 0.4302825629711151, "learning_rate": 4.9922362908849484e-05, "loss": 0.2186, "num_input_tokens_seen": 850856, "step": 9435 }, { "epoch": 2.453222453222453, "grad_norm": 0.31045037508010864, "learning_rate": 4.9921915795680194e-05, "loss": 0.266, "num_input_tokens_seen": 851256, "step": 9440 }, { "epoch": 2.4545218295218296, "grad_norm": 0.25649622082710266, "learning_rate": 4.99214674007529e-05, "loss": 0.3711, "num_input_tokens_seen": 851752, "step": 9445 }, { "epoch": 2.4558212058212057, "grad_norm": 0.37405210733413696, "learning_rate": 4.992101772409066e-05, "loss": 0.1962, "num_input_tokens_seen": 852232, "step": 9450 }, { "epoch": 2.457120582120582, "grad_norm": 0.3269602656364441, "learning_rate": 4.99205667657166e-05, "loss": 0.1578, "num_input_tokens_seen": 852712, "step": 9455 }, { "epoch": 2.4584199584199586, "grad_norm": 0.35225462913513184, "learning_rate": 4.992011452565392e-05, "loss": 0.2669, "num_input_tokens_seen": 853160, "step": 9460 }, { "epoch": 2.4597193347193347, "grad_norm": 0.27484750747680664, "learning_rate": 4.991966100392586e-05, "loss": 0.3487, "num_input_tokens_seen": 853624, "step": 9465 }, { "epoch": 2.461018711018711, "grad_norm": 0.3391101062297821, "learning_rate": 4.991920620055576e-05, "loss": 0.3318, "num_input_tokens_seen": 854056, "step": 9470 }, { "epoch": 2.462318087318087, "grad_norm": 0.465613454580307, "learning_rate": 4.991875011556702e-05, "loss": 0.3075, "num_input_tokens_seen": 854520, "step": 9475 }, { "epoch": 2.4636174636174637, "grad_norm": 0.1911012977361679, "learning_rate": 4.9918292748983074e-05, "loss": 0.2667, "num_input_tokens_seen": 854968, "step": 9480 }, { "epoch": 2.4649168399168397, "grad_norm": 0.46812957525253296, "learning_rate": 4.991783410082745e-05, "loss": 0.2563, "num_input_tokens_seen": 855400, "step": 9485 }, { "epoch": 2.4662162162162162, "grad_norm": 0.40198761224746704, "learning_rate": 4.991737417112376e-05, "loss": 0.2133, "num_input_tokens_seen": 855832, "step": 9490 }, { "epoch": 2.4675155925155927, "grad_norm": 0.3109671175479889, "learning_rate": 4.991691295989563e-05, "loss": 0.3813, "num_input_tokens_seen": 856264, "step": 9495 }, { "epoch": 2.4688149688149688, "grad_norm": 0.2662392556667328, "learning_rate": 4.9916450467166804e-05, "loss": 0.3198, "num_input_tokens_seen": 856712, "step": 9500 }, { "epoch": 2.4701143451143452, "grad_norm": 0.6889498233795166, "learning_rate": 4.9915986692961045e-05, "loss": 0.2508, "num_input_tokens_seen": 857160, "step": 9505 }, { "epoch": 2.4714137214137213, "grad_norm": 0.5180881023406982, "learning_rate": 4.991552163730222e-05, "loss": 0.2651, "num_input_tokens_seen": 857592, "step": 9510 }, { "epoch": 2.4727130977130978, "grad_norm": 0.38939303159713745, "learning_rate": 4.9915055300214256e-05, "loss": 0.2291, "num_input_tokens_seen": 858008, "step": 9515 }, { "epoch": 2.474012474012474, "grad_norm": 0.4119836390018463, "learning_rate": 4.991458768172111e-05, "loss": 0.2885, "num_input_tokens_seen": 858504, "step": 9520 }, { "epoch": 2.4753118503118503, "grad_norm": 0.8308788537979126, "learning_rate": 4.991411878184685e-05, "loss": 0.287, "num_input_tokens_seen": 858984, "step": 9525 }, { "epoch": 2.476611226611227, "grad_norm": 0.3452894687652588, "learning_rate": 4.991364860061559e-05, "loss": 0.1744, "num_input_tokens_seen": 859448, "step": 9530 }, { "epoch": 2.477910602910603, "grad_norm": 0.5101956129074097, "learning_rate": 4.9913177138051516e-05, "loss": 0.2223, "num_input_tokens_seen": 859912, "step": 9535 }, { "epoch": 2.4792099792099793, "grad_norm": 0.8362678289413452, "learning_rate": 4.991270439417887e-05, "loss": 0.4293, "num_input_tokens_seen": 860360, "step": 9540 }, { "epoch": 2.4805093555093554, "grad_norm": 0.19048357009887695, "learning_rate": 4.991223036902197e-05, "loss": 0.3062, "num_input_tokens_seen": 860824, "step": 9545 }, { "epoch": 2.481808731808732, "grad_norm": 0.7527651786804199, "learning_rate": 4.991175506260518e-05, "loss": 0.2934, "num_input_tokens_seen": 861256, "step": 9550 }, { "epoch": 2.483108108108108, "grad_norm": 0.8878771066665649, "learning_rate": 4.9911278474952974e-05, "loss": 0.3002, "num_input_tokens_seen": 861704, "step": 9555 }, { "epoch": 2.4844074844074844, "grad_norm": 0.26446178555488586, "learning_rate": 4.9910800606089834e-05, "loss": 0.2759, "num_input_tokens_seen": 862184, "step": 9560 }, { "epoch": 2.485706860706861, "grad_norm": 0.24474522471427917, "learning_rate": 4.9910321456040356e-05, "loss": 0.2886, "num_input_tokens_seen": 862616, "step": 9565 }, { "epoch": 2.487006237006237, "grad_norm": 0.7369071245193481, "learning_rate": 4.990984102482918e-05, "loss": 0.2061, "num_input_tokens_seen": 863032, "step": 9570 }, { "epoch": 2.4883056133056134, "grad_norm": 1.0056788921356201, "learning_rate": 4.9909359312481006e-05, "loss": 0.3516, "num_input_tokens_seen": 863496, "step": 9575 }, { "epoch": 2.4896049896049894, "grad_norm": 0.49336525797843933, "learning_rate": 4.990887631902062e-05, "loss": 0.2583, "num_input_tokens_seen": 863912, "step": 9580 }, { "epoch": 2.490904365904366, "grad_norm": 0.5239165425300598, "learning_rate": 4.9908392044472865e-05, "loss": 0.2179, "num_input_tokens_seen": 864328, "step": 9585 }, { "epoch": 2.492203742203742, "grad_norm": 0.5201740860939026, "learning_rate": 4.9907906488862626e-05, "loss": 0.1813, "num_input_tokens_seen": 864760, "step": 9590 }, { "epoch": 2.4935031185031185, "grad_norm": 0.6375786066055298, "learning_rate": 4.99074196522149e-05, "loss": 0.2547, "num_input_tokens_seen": 865192, "step": 9595 }, { "epoch": 2.494802494802495, "grad_norm": 0.49805667996406555, "learning_rate": 4.9906931534554714e-05, "loss": 0.1397, "num_input_tokens_seen": 865624, "step": 9600 }, { "epoch": 2.496101871101871, "grad_norm": 2.3475120067596436, "learning_rate": 4.990644213590717e-05, "loss": 0.288, "num_input_tokens_seen": 866088, "step": 9605 }, { "epoch": 2.4974012474012475, "grad_norm": 1.0273137092590332, "learning_rate": 4.9905951456297454e-05, "loss": 0.4961, "num_input_tokens_seen": 866536, "step": 9610 }, { "epoch": 2.4987006237006235, "grad_norm": 0.5553402900695801, "learning_rate": 4.9905459495750786e-05, "loss": 0.2608, "num_input_tokens_seen": 866968, "step": 9615 }, { "epoch": 2.5, "grad_norm": 0.4325787425041199, "learning_rate": 4.9904966254292475e-05, "loss": 0.2469, "num_input_tokens_seen": 867416, "step": 9620 }, { "epoch": 2.501299376299376, "grad_norm": 0.34043431282043457, "learning_rate": 4.990447173194788e-05, "loss": 0.2555, "num_input_tokens_seen": 867880, "step": 9625 }, { "epoch": 2.5025987525987525, "grad_norm": 0.5056717991828918, "learning_rate": 4.990397592874244e-05, "loss": 0.3563, "num_input_tokens_seen": 868328, "step": 9630 }, { "epoch": 2.503898128898129, "grad_norm": 0.16205552220344543, "learning_rate": 4.990347884470167e-05, "loss": 0.2897, "num_input_tokens_seen": 868792, "step": 9635 }, { "epoch": 2.505197505197505, "grad_norm": 0.8064384460449219, "learning_rate": 4.99029804798511e-05, "loss": 0.2959, "num_input_tokens_seen": 869208, "step": 9640 }, { "epoch": 2.5064968814968815, "grad_norm": 0.1501506119966507, "learning_rate": 4.99024808342164e-05, "loss": 0.2865, "num_input_tokens_seen": 869688, "step": 9645 }, { "epoch": 2.507796257796258, "grad_norm": 0.4141940772533417, "learning_rate": 4.990197990782325e-05, "loss": 0.262, "num_input_tokens_seen": 870136, "step": 9650 }, { "epoch": 2.509095634095634, "grad_norm": 0.40379929542541504, "learning_rate": 4.990147770069741e-05, "loss": 0.2945, "num_input_tokens_seen": 870568, "step": 9655 }, { "epoch": 2.51039501039501, "grad_norm": 0.2272632122039795, "learning_rate": 4.990097421286471e-05, "loss": 0.2642, "num_input_tokens_seen": 871048, "step": 9660 }, { "epoch": 2.5116943866943866, "grad_norm": 0.34207651019096375, "learning_rate": 4.990046944435105e-05, "loss": 0.2342, "num_input_tokens_seen": 871512, "step": 9665 }, { "epoch": 2.512993762993763, "grad_norm": 0.3593064248561859, "learning_rate": 4.989996339518239e-05, "loss": 0.1835, "num_input_tokens_seen": 871960, "step": 9670 }, { "epoch": 2.514293139293139, "grad_norm": 0.26877453923225403, "learning_rate": 4.989945606538475e-05, "loss": 0.1569, "num_input_tokens_seen": 872440, "step": 9675 }, { "epoch": 2.5155925155925156, "grad_norm": 0.24172230064868927, "learning_rate": 4.989894745498422e-05, "loss": 0.2839, "num_input_tokens_seen": 872872, "step": 9680 }, { "epoch": 2.516891891891892, "grad_norm": 0.4383428394794464, "learning_rate": 4.989843756400698e-05, "loss": 0.2758, "num_input_tokens_seen": 873304, "step": 9685 }, { "epoch": 2.518191268191268, "grad_norm": 0.3594106435775757, "learning_rate": 4.9897926392479235e-05, "loss": 0.319, "num_input_tokens_seen": 873816, "step": 9690 }, { "epoch": 2.5194906444906446, "grad_norm": 0.32991790771484375, "learning_rate": 4.989741394042727e-05, "loss": 0.27, "num_input_tokens_seen": 874264, "step": 9695 }, { "epoch": 2.5207900207900207, "grad_norm": 0.4397948682308197, "learning_rate": 4.9896900207877464e-05, "loss": 0.2336, "num_input_tokens_seen": 874744, "step": 9700 }, { "epoch": 2.522089397089397, "grad_norm": 0.6263600587844849, "learning_rate": 4.989638519485622e-05, "loss": 0.288, "num_input_tokens_seen": 875160, "step": 9705 }, { "epoch": 2.523388773388773, "grad_norm": 0.28608831763267517, "learning_rate": 4.989586890139003e-05, "loss": 0.2235, "num_input_tokens_seen": 875576, "step": 9710 }, { "epoch": 2.5246881496881497, "grad_norm": 0.3011750876903534, "learning_rate": 4.9895351327505454e-05, "loss": 0.2813, "num_input_tokens_seen": 876008, "step": 9715 }, { "epoch": 2.525987525987526, "grad_norm": 0.3684898614883423, "learning_rate": 4.98948324732291e-05, "loss": 0.313, "num_input_tokens_seen": 876424, "step": 9720 }, { "epoch": 2.5272869022869022, "grad_norm": 0.3812906742095947, "learning_rate": 4.989431233858766e-05, "loss": 0.2392, "num_input_tokens_seen": 876888, "step": 9725 }, { "epoch": 2.5285862785862787, "grad_norm": 0.20393472909927368, "learning_rate": 4.989379092360788e-05, "loss": 0.2303, "num_input_tokens_seen": 877304, "step": 9730 }, { "epoch": 2.5298856548856548, "grad_norm": 0.29834893345832825, "learning_rate": 4.9893268228316584e-05, "loss": 0.153, "num_input_tokens_seen": 877752, "step": 9735 }, { "epoch": 2.5311850311850312, "grad_norm": 0.294559121131897, "learning_rate": 4.989274425274065e-05, "loss": 0.1578, "num_input_tokens_seen": 878200, "step": 9740 }, { "epoch": 2.5324844074844073, "grad_norm": 0.4898948073387146, "learning_rate": 4.989221899690704e-05, "loss": 0.3624, "num_input_tokens_seen": 878632, "step": 9745 }, { "epoch": 2.5337837837837838, "grad_norm": 0.4640296697616577, "learning_rate": 4.9891692460842736e-05, "loss": 0.2814, "num_input_tokens_seen": 879096, "step": 9750 }, { "epoch": 2.5350831600831603, "grad_norm": 0.7283058166503906, "learning_rate": 4.989116464457485e-05, "loss": 0.3956, "num_input_tokens_seen": 879592, "step": 9755 }, { "epoch": 2.5363825363825363, "grad_norm": 0.35663747787475586, "learning_rate": 4.989063554813051e-05, "loss": 0.1954, "num_input_tokens_seen": 880024, "step": 9760 }, { "epoch": 2.537681912681913, "grad_norm": 0.3721310496330261, "learning_rate": 4.989010517153695e-05, "loss": 0.2277, "num_input_tokens_seen": 880424, "step": 9765 }, { "epoch": 2.538981288981289, "grad_norm": 0.38481923937797546, "learning_rate": 4.988957351482142e-05, "loss": 0.2275, "num_input_tokens_seen": 880872, "step": 9770 }, { "epoch": 2.5402806652806653, "grad_norm": 0.32061445713043213, "learning_rate": 4.9889040578011284e-05, "loss": 0.307, "num_input_tokens_seen": 881304, "step": 9775 }, { "epoch": 2.5415800415800414, "grad_norm": 0.33173078298568726, "learning_rate": 4.988850636113394e-05, "loss": 0.259, "num_input_tokens_seen": 881736, "step": 9780 }, { "epoch": 2.542879417879418, "grad_norm": 0.3574594557285309, "learning_rate": 4.988797086421686e-05, "loss": 0.3358, "num_input_tokens_seen": 882168, "step": 9785 }, { "epoch": 2.5441787941787943, "grad_norm": 0.24279062449932098, "learning_rate": 4.988743408728759e-05, "loss": 0.2251, "num_input_tokens_seen": 882616, "step": 9790 }, { "epoch": 2.5454781704781704, "grad_norm": 0.4555855393409729, "learning_rate": 4.9886896030373755e-05, "loss": 0.2983, "num_input_tokens_seen": 883064, "step": 9795 }, { "epoch": 2.546777546777547, "grad_norm": 0.21294564008712769, "learning_rate": 4.9886356693503e-05, "loss": 0.2554, "num_input_tokens_seen": 883496, "step": 9800 }, { "epoch": 2.5480769230769234, "grad_norm": 0.3655322790145874, "learning_rate": 4.9885816076703075e-05, "loss": 0.2097, "num_input_tokens_seen": 883944, "step": 9805 }, { "epoch": 2.5493762993762994, "grad_norm": 0.3360641300678253, "learning_rate": 4.988527418000179e-05, "loss": 0.2235, "num_input_tokens_seen": 884408, "step": 9810 }, { "epoch": 2.5506756756756754, "grad_norm": 0.43311694264411926, "learning_rate": 4.988473100342701e-05, "loss": 0.2673, "num_input_tokens_seen": 884856, "step": 9815 }, { "epoch": 2.551975051975052, "grad_norm": 0.27752581238746643, "learning_rate": 4.988418654700666e-05, "loss": 0.0867, "num_input_tokens_seen": 885320, "step": 9820 }, { "epoch": 2.5532744282744284, "grad_norm": 0.9999790787696838, "learning_rate": 4.9883640810768764e-05, "loss": 0.4661, "num_input_tokens_seen": 885784, "step": 9825 }, { "epoch": 2.5545738045738045, "grad_norm": 0.2655417025089264, "learning_rate": 4.988309379474137e-05, "loss": 0.1595, "num_input_tokens_seen": 886216, "step": 9830 }, { "epoch": 2.555873180873181, "grad_norm": 0.3581129312515259, "learning_rate": 4.988254549895264e-05, "loss": 0.167, "num_input_tokens_seen": 886632, "step": 9835 }, { "epoch": 2.5571725571725574, "grad_norm": 0.3407928943634033, "learning_rate": 4.988199592343073e-05, "loss": 0.2054, "num_input_tokens_seen": 887112, "step": 9840 }, { "epoch": 2.5584719334719335, "grad_norm": 0.34701699018478394, "learning_rate": 4.988144506820395e-05, "loss": 0.2775, "num_input_tokens_seen": 887576, "step": 9845 }, { "epoch": 2.5597713097713095, "grad_norm": 0.3320903182029724, "learning_rate": 4.9880892933300593e-05, "loss": 0.263, "num_input_tokens_seen": 888024, "step": 9850 }, { "epoch": 2.561070686070686, "grad_norm": 0.2964380085468292, "learning_rate": 4.988033951874908e-05, "loss": 0.2616, "num_input_tokens_seen": 888472, "step": 9855 }, { "epoch": 2.5623700623700625, "grad_norm": 0.29340308904647827, "learning_rate": 4.9879784824577866e-05, "loss": 0.2511, "num_input_tokens_seen": 888920, "step": 9860 }, { "epoch": 2.5636694386694385, "grad_norm": 0.21494922041893005, "learning_rate": 4.9879228850815476e-05, "loss": 0.3517, "num_input_tokens_seen": 889352, "step": 9865 }, { "epoch": 2.564968814968815, "grad_norm": 0.1668809950351715, "learning_rate": 4.987867159749051e-05, "loss": 0.2798, "num_input_tokens_seen": 889768, "step": 9870 }, { "epoch": 2.5662681912681915, "grad_norm": 0.5722551345825195, "learning_rate": 4.987811306463163e-05, "loss": 0.2583, "num_input_tokens_seen": 890216, "step": 9875 }, { "epoch": 2.5675675675675675, "grad_norm": 0.5892682671546936, "learning_rate": 4.9877553252267564e-05, "loss": 0.2611, "num_input_tokens_seen": 890680, "step": 9880 }, { "epoch": 2.568866943866944, "grad_norm": 0.317322313785553, "learning_rate": 4.987699216042708e-05, "loss": 0.2232, "num_input_tokens_seen": 891160, "step": 9885 }, { "epoch": 2.57016632016632, "grad_norm": 0.2957312762737274, "learning_rate": 4.987642978913907e-05, "loss": 0.2226, "num_input_tokens_seen": 891624, "step": 9890 }, { "epoch": 2.5714656964656966, "grad_norm": 0.7333657741546631, "learning_rate": 4.9875866138432426e-05, "loss": 0.3802, "num_input_tokens_seen": 892072, "step": 9895 }, { "epoch": 2.5727650727650726, "grad_norm": 0.28878548741340637, "learning_rate": 4.987530120833616e-05, "loss": 0.296, "num_input_tokens_seen": 892536, "step": 9900 }, { "epoch": 2.574064449064449, "grad_norm": 0.26379796862602234, "learning_rate": 4.9874734998879316e-05, "loss": 0.2547, "num_input_tokens_seen": 893000, "step": 9905 }, { "epoch": 2.5753638253638256, "grad_norm": 0.257651686668396, "learning_rate": 4.987416751009102e-05, "loss": 0.2801, "num_input_tokens_seen": 893432, "step": 9910 }, { "epoch": 2.5766632016632016, "grad_norm": 0.23901870846748352, "learning_rate": 4.987359874200045e-05, "loss": 0.2649, "num_input_tokens_seen": 893896, "step": 9915 }, { "epoch": 2.577962577962578, "grad_norm": 0.5866069197654724, "learning_rate": 4.987302869463687e-05, "loss": 0.2696, "num_input_tokens_seen": 894360, "step": 9920 }, { "epoch": 2.579261954261954, "grad_norm": 0.774832546710968, "learning_rate": 4.987245736802959e-05, "loss": 0.2259, "num_input_tokens_seen": 894808, "step": 9925 }, { "epoch": 2.5805613305613306, "grad_norm": 0.3480294346809387, "learning_rate": 4.9871884762208e-05, "loss": 0.279, "num_input_tokens_seen": 895240, "step": 9930 }, { "epoch": 2.5818607068607067, "grad_norm": 0.34850916266441345, "learning_rate": 4.9871310877201535e-05, "loss": 0.2216, "num_input_tokens_seen": 895688, "step": 9935 }, { "epoch": 2.583160083160083, "grad_norm": 0.28845033049583435, "learning_rate": 4.987073571303973e-05, "loss": 0.1664, "num_input_tokens_seen": 896136, "step": 9940 }, { "epoch": 2.5844594594594597, "grad_norm": 0.266168475151062, "learning_rate": 4.987015926975216e-05, "loss": 0.2125, "num_input_tokens_seen": 896600, "step": 9945 }, { "epoch": 2.5857588357588357, "grad_norm": 0.23478424549102783, "learning_rate": 4.986958154736846e-05, "loss": 0.1934, "num_input_tokens_seen": 897032, "step": 9950 }, { "epoch": 2.587058212058212, "grad_norm": 0.24126571416854858, "learning_rate": 4.986900254591835e-05, "loss": 0.3183, "num_input_tokens_seen": 897464, "step": 9955 }, { "epoch": 2.5883575883575882, "grad_norm": 0.2743096351623535, "learning_rate": 4.986842226543162e-05, "loss": 0.3031, "num_input_tokens_seen": 897912, "step": 9960 }, { "epoch": 2.5896569646569647, "grad_norm": 0.4878128468990326, "learning_rate": 4.986784070593811e-05, "loss": 0.2717, "num_input_tokens_seen": 898344, "step": 9965 }, { "epoch": 2.5909563409563408, "grad_norm": 0.6199511885643005, "learning_rate": 4.986725786746771e-05, "loss": 0.2762, "num_input_tokens_seen": 898776, "step": 9970 }, { "epoch": 2.5922557172557172, "grad_norm": 0.3146408200263977, "learning_rate": 4.986667375005042e-05, "loss": 0.2519, "num_input_tokens_seen": 899240, "step": 9975 }, { "epoch": 2.5935550935550937, "grad_norm": 0.4331851899623871, "learning_rate": 4.986608835371627e-05, "loss": 0.3126, "num_input_tokens_seen": 899752, "step": 9980 }, { "epoch": 2.5948544698544698, "grad_norm": 0.4902029037475586, "learning_rate": 4.9865501678495375e-05, "loss": 0.263, "num_input_tokens_seen": 900168, "step": 9985 }, { "epoch": 2.5961538461538463, "grad_norm": 0.41581401228904724, "learning_rate": 4.98649137244179e-05, "loss": 0.2192, "num_input_tokens_seen": 900648, "step": 9990 }, { "epoch": 2.5974532224532223, "grad_norm": 0.25317370891571045, "learning_rate": 4.986432449151409e-05, "loss": 0.3107, "num_input_tokens_seen": 901096, "step": 9995 }, { "epoch": 2.598752598752599, "grad_norm": 0.6360588073730469, "learning_rate": 4.9863733979814244e-05, "loss": 0.2528, "num_input_tokens_seen": 901496, "step": 10000 }, { "epoch": 2.600051975051975, "grad_norm": 0.2886197566986084, "learning_rate": 4.9863142189348735e-05, "loss": 0.1313, "num_input_tokens_seen": 901944, "step": 10005 }, { "epoch": 2.6013513513513513, "grad_norm": 0.8124774694442749, "learning_rate": 4.9862549120148005e-05, "loss": 0.3138, "num_input_tokens_seen": 902408, "step": 10010 }, { "epoch": 2.602650727650728, "grad_norm": 0.2647361755371094, "learning_rate": 4.9861954772242546e-05, "loss": 0.2191, "num_input_tokens_seen": 902856, "step": 10015 }, { "epoch": 2.603950103950104, "grad_norm": 0.7070188522338867, "learning_rate": 4.986135914566294e-05, "loss": 0.307, "num_input_tokens_seen": 903288, "step": 10020 }, { "epoch": 2.6052494802494803, "grad_norm": 0.3181648552417755, "learning_rate": 4.986076224043981e-05, "loss": 0.2174, "num_input_tokens_seen": 903720, "step": 10025 }, { "epoch": 2.606548856548857, "grad_norm": 0.26815861463546753, "learning_rate": 4.986016405660385e-05, "loss": 0.2434, "num_input_tokens_seen": 904152, "step": 10030 }, { "epoch": 2.607848232848233, "grad_norm": 0.42965999245643616, "learning_rate": 4.985956459418584e-05, "loss": 0.2953, "num_input_tokens_seen": 904616, "step": 10035 }, { "epoch": 2.609147609147609, "grad_norm": 0.26611605286598206, "learning_rate": 4.98589638532166e-05, "loss": 0.2348, "num_input_tokens_seen": 905080, "step": 10040 }, { "epoch": 2.6104469854469854, "grad_norm": 0.6107804179191589, "learning_rate": 4.985836183372703e-05, "loss": 0.2485, "num_input_tokens_seen": 905528, "step": 10045 }, { "epoch": 2.611746361746362, "grad_norm": 0.3133287727832794, "learning_rate": 4.98577585357481e-05, "loss": 0.2967, "num_input_tokens_seen": 906008, "step": 10050 }, { "epoch": 2.613045738045738, "grad_norm": 0.5888065099716187, "learning_rate": 4.985715395931083e-05, "loss": 0.2812, "num_input_tokens_seen": 906456, "step": 10055 }, { "epoch": 2.6143451143451144, "grad_norm": 0.3231661319732666, "learning_rate": 4.98565481044463e-05, "loss": 0.1627, "num_input_tokens_seen": 906872, "step": 10060 }, { "epoch": 2.615644490644491, "grad_norm": 0.2722456157207489, "learning_rate": 4.98559409711857e-05, "loss": 0.252, "num_input_tokens_seen": 907352, "step": 10065 }, { "epoch": 2.616943866943867, "grad_norm": 0.6193125247955322, "learning_rate": 4.9855332559560235e-05, "loss": 0.2869, "num_input_tokens_seen": 907800, "step": 10070 }, { "epoch": 2.618243243243243, "grad_norm": 0.2786122262477875, "learning_rate": 4.98547228696012e-05, "loss": 0.2655, "num_input_tokens_seen": 908248, "step": 10075 }, { "epoch": 2.6195426195426195, "grad_norm": 0.2925240397453308, "learning_rate": 4.9854111901339954e-05, "loss": 0.2569, "num_input_tokens_seen": 908728, "step": 10080 }, { "epoch": 2.620841995841996, "grad_norm": 0.20154011249542236, "learning_rate": 4.9853499654807924e-05, "loss": 0.3517, "num_input_tokens_seen": 909160, "step": 10085 }, { "epoch": 2.622141372141372, "grad_norm": 0.1385360062122345, "learning_rate": 4.9852886130036586e-05, "loss": 0.2931, "num_input_tokens_seen": 909576, "step": 10090 }, { "epoch": 2.6234407484407485, "grad_norm": 0.1304589956998825, "learning_rate": 4.9852271327057504e-05, "loss": 0.2835, "num_input_tokens_seen": 910040, "step": 10095 }, { "epoch": 2.624740124740125, "grad_norm": 0.7059158682823181, "learning_rate": 4.98516552459023e-05, "loss": 0.3092, "num_input_tokens_seen": 910488, "step": 10100 }, { "epoch": 2.626039501039501, "grad_norm": 0.16199134290218353, "learning_rate": 4.985103788660265e-05, "loss": 0.2823, "num_input_tokens_seen": 910968, "step": 10105 }, { "epoch": 2.6273388773388775, "grad_norm": 0.1760709583759308, "learning_rate": 4.985041924919031e-05, "loss": 0.2907, "num_input_tokens_seen": 911384, "step": 10110 }, { "epoch": 2.6286382536382535, "grad_norm": 0.18437853455543518, "learning_rate": 4.9849799333697095e-05, "loss": 0.2668, "num_input_tokens_seen": 911848, "step": 10115 }, { "epoch": 2.62993762993763, "grad_norm": 0.2428193986415863, "learning_rate": 4.984917814015489e-05, "loss": 0.2641, "num_input_tokens_seen": 912296, "step": 10120 }, { "epoch": 2.631237006237006, "grad_norm": 0.2514469623565674, "learning_rate": 4.984855566859565e-05, "loss": 0.2252, "num_input_tokens_seen": 912776, "step": 10125 }, { "epoch": 2.6325363825363826, "grad_norm": 0.3730570077896118, "learning_rate": 4.984793191905138e-05, "loss": 0.2502, "num_input_tokens_seen": 913224, "step": 10130 }, { "epoch": 2.633835758835759, "grad_norm": 0.4889146387577057, "learning_rate": 4.984730689155417e-05, "loss": 0.2887, "num_input_tokens_seen": 913704, "step": 10135 }, { "epoch": 2.635135135135135, "grad_norm": 0.6774154305458069, "learning_rate": 4.984668058613615e-05, "loss": 0.282, "num_input_tokens_seen": 914152, "step": 10140 }, { "epoch": 2.6364345114345116, "grad_norm": 0.5672652721405029, "learning_rate": 4.984605300282954e-05, "loss": 0.2816, "num_input_tokens_seen": 914584, "step": 10145 }, { "epoch": 2.6377338877338876, "grad_norm": 0.5454232096672058, "learning_rate": 4.984542414166663e-05, "loss": 0.2451, "num_input_tokens_seen": 915016, "step": 10150 }, { "epoch": 2.639033264033264, "grad_norm": 0.45081236958503723, "learning_rate": 4.984479400267974e-05, "loss": 0.2319, "num_input_tokens_seen": 915464, "step": 10155 }, { "epoch": 2.64033264033264, "grad_norm": 0.359798401594162, "learning_rate": 4.984416258590129e-05, "loss": 0.2207, "num_input_tokens_seen": 915896, "step": 10160 }, { "epoch": 2.6416320166320166, "grad_norm": 0.582020103931427, "learning_rate": 4.984352989136375e-05, "loss": 0.265, "num_input_tokens_seen": 916392, "step": 10165 }, { "epoch": 2.642931392931393, "grad_norm": 0.3080858886241913, "learning_rate": 4.984289591909967e-05, "loss": 0.2059, "num_input_tokens_seen": 916888, "step": 10170 }, { "epoch": 2.644230769230769, "grad_norm": 0.3055424392223358, "learning_rate": 4.984226066914165e-05, "loss": 0.1514, "num_input_tokens_seen": 917320, "step": 10175 }, { "epoch": 2.6455301455301456, "grad_norm": 1.060807466506958, "learning_rate": 4.984162414152236e-05, "loss": 0.3943, "num_input_tokens_seen": 917768, "step": 10180 }, { "epoch": 2.6468295218295217, "grad_norm": 0.30132681131362915, "learning_rate": 4.9840986336274534e-05, "loss": 0.1096, "num_input_tokens_seen": 918232, "step": 10185 }, { "epoch": 2.648128898128898, "grad_norm": 0.5493287444114685, "learning_rate": 4.9840347253430984e-05, "loss": 0.1697, "num_input_tokens_seen": 918680, "step": 10190 }, { "epoch": 2.649428274428274, "grad_norm": 0.23663003742694855, "learning_rate": 4.983970689302457e-05, "loss": 0.2165, "num_input_tokens_seen": 919160, "step": 10195 }, { "epoch": 2.6507276507276507, "grad_norm": 0.9735227823257446, "learning_rate": 4.9839065255088234e-05, "loss": 0.3845, "num_input_tokens_seen": 919576, "step": 10200 }, { "epoch": 2.652027027027027, "grad_norm": 0.33946993947029114, "learning_rate": 4.9838422339654974e-05, "loss": 0.21, "num_input_tokens_seen": 920008, "step": 10205 }, { "epoch": 2.6533264033264032, "grad_norm": 0.30129075050354004, "learning_rate": 4.983777814675785e-05, "loss": 0.255, "num_input_tokens_seen": 920440, "step": 10210 }, { "epoch": 2.6546257796257797, "grad_norm": 0.3700699210166931, "learning_rate": 4.983713267643e-05, "loss": 0.2643, "num_input_tokens_seen": 920872, "step": 10215 }, { "epoch": 2.6559251559251558, "grad_norm": 0.3989569842815399, "learning_rate": 4.983648592870462e-05, "loss": 0.2316, "num_input_tokens_seen": 921288, "step": 10220 }, { "epoch": 2.6572245322245323, "grad_norm": 0.29867053031921387, "learning_rate": 4.983583790361497e-05, "loss": 0.1878, "num_input_tokens_seen": 921704, "step": 10225 }, { "epoch": 2.6585239085239083, "grad_norm": 0.29100027680397034, "learning_rate": 4.9835188601194374e-05, "loss": 0.2632, "num_input_tokens_seen": 922152, "step": 10230 }, { "epoch": 2.659823284823285, "grad_norm": 0.2503606379032135, "learning_rate": 4.983453802147624e-05, "loss": 0.2141, "num_input_tokens_seen": 922600, "step": 10235 }, { "epoch": 2.6611226611226613, "grad_norm": 0.3896554410457611, "learning_rate": 4.9833886164494014e-05, "loss": 0.2219, "num_input_tokens_seen": 923080, "step": 10240 }, { "epoch": 2.6624220374220373, "grad_norm": 0.3702031672000885, "learning_rate": 4.983323303028124e-05, "loss": 0.3379, "num_input_tokens_seen": 923512, "step": 10245 }, { "epoch": 2.663721413721414, "grad_norm": 0.32665643095970154, "learning_rate": 4.9832578618871485e-05, "loss": 0.2169, "num_input_tokens_seen": 923960, "step": 10250 }, { "epoch": 2.6650207900207903, "grad_norm": 0.29663270711898804, "learning_rate": 4.983192293029843e-05, "loss": 0.2594, "num_input_tokens_seen": 924408, "step": 10255 }, { "epoch": 2.6663201663201663, "grad_norm": 0.28299763798713684, "learning_rate": 4.983126596459577e-05, "loss": 0.2115, "num_input_tokens_seen": 924872, "step": 10260 }, { "epoch": 2.6676195426195424, "grad_norm": 0.2654508948326111, "learning_rate": 4.983060772179732e-05, "loss": 0.2262, "num_input_tokens_seen": 925272, "step": 10265 }, { "epoch": 2.668918918918919, "grad_norm": 0.31788092851638794, "learning_rate": 4.982994820193692e-05, "loss": 0.2605, "num_input_tokens_seen": 925704, "step": 10270 }, { "epoch": 2.6702182952182953, "grad_norm": 0.29485777020454407, "learning_rate": 4.982928740504849e-05, "loss": 0.2187, "num_input_tokens_seen": 926168, "step": 10275 }, { "epoch": 2.6715176715176714, "grad_norm": 0.2796163260936737, "learning_rate": 4.9828625331166024e-05, "loss": 0.2125, "num_input_tokens_seen": 926600, "step": 10280 }, { "epoch": 2.672817047817048, "grad_norm": 0.24065236747264862, "learning_rate": 4.982796198032357e-05, "loss": 0.2197, "num_input_tokens_seen": 927032, "step": 10285 }, { "epoch": 2.6741164241164244, "grad_norm": 0.8143368363380432, "learning_rate": 4.982729735255523e-05, "loss": 0.2029, "num_input_tokens_seen": 927496, "step": 10290 }, { "epoch": 2.6754158004158004, "grad_norm": 0.8583540916442871, "learning_rate": 4.982663144789521e-05, "loss": 0.3218, "num_input_tokens_seen": 927944, "step": 10295 }, { "epoch": 2.6767151767151764, "grad_norm": 0.24994948506355286, "learning_rate": 4.982596426637774e-05, "loss": 0.3051, "num_input_tokens_seen": 928392, "step": 10300 }, { "epoch": 2.678014553014553, "grad_norm": 0.33587387204170227, "learning_rate": 4.982529580803714e-05, "loss": 0.2296, "num_input_tokens_seen": 928904, "step": 10305 }, { "epoch": 2.6793139293139294, "grad_norm": 0.3184526562690735, "learning_rate": 4.98246260729078e-05, "loss": 0.1824, "num_input_tokens_seen": 929336, "step": 10310 }, { "epoch": 2.6806133056133055, "grad_norm": 0.28516820073127747, "learning_rate": 4.982395506102415e-05, "loss": 0.3347, "num_input_tokens_seen": 929800, "step": 10315 }, { "epoch": 2.681912681912682, "grad_norm": 0.32567304372787476, "learning_rate": 4.982328277242071e-05, "loss": 0.1855, "num_input_tokens_seen": 930248, "step": 10320 }, { "epoch": 2.6832120582120584, "grad_norm": 0.27480944991111755, "learning_rate": 4.9822609207132045e-05, "loss": 0.13, "num_input_tokens_seen": 930680, "step": 10325 }, { "epoch": 2.6845114345114345, "grad_norm": 0.5717644095420837, "learning_rate": 4.9821934365192805e-05, "loss": 0.1644, "num_input_tokens_seen": 931112, "step": 10330 }, { "epoch": 2.685810810810811, "grad_norm": 0.23137539625167847, "learning_rate": 4.98212582466377e-05, "loss": 0.2147, "num_input_tokens_seen": 931544, "step": 10335 }, { "epoch": 2.687110187110187, "grad_norm": 0.681191086769104, "learning_rate": 4.98205808515015e-05, "loss": 0.1476, "num_input_tokens_seen": 931976, "step": 10340 }, { "epoch": 2.6884095634095635, "grad_norm": 0.18520337343215942, "learning_rate": 4.981990217981905e-05, "loss": 0.137, "num_input_tokens_seen": 932424, "step": 10345 }, { "epoch": 2.6897089397089395, "grad_norm": 0.17928989231586456, "learning_rate": 4.9819222231625245e-05, "loss": 0.2113, "num_input_tokens_seen": 932872, "step": 10350 }, { "epoch": 2.691008316008316, "grad_norm": 0.17602545022964478, "learning_rate": 4.981854100695506e-05, "loss": 0.3572, "num_input_tokens_seen": 933320, "step": 10355 }, { "epoch": 2.6923076923076925, "grad_norm": 0.41651734709739685, "learning_rate": 4.981785850584353e-05, "loss": 0.3437, "num_input_tokens_seen": 933752, "step": 10360 }, { "epoch": 2.6936070686070686, "grad_norm": 0.3243671953678131, "learning_rate": 4.981717472832576e-05, "loss": 0.2157, "num_input_tokens_seen": 934200, "step": 10365 }, { "epoch": 2.694906444906445, "grad_norm": 0.35116901993751526, "learning_rate": 4.9816489674436916e-05, "loss": 0.2792, "num_input_tokens_seen": 934632, "step": 10370 }, { "epoch": 2.696205821205821, "grad_norm": 0.43021294474601746, "learning_rate": 4.981580334421223e-05, "loss": 0.1876, "num_input_tokens_seen": 935064, "step": 10375 }, { "epoch": 2.6975051975051976, "grad_norm": 0.3172835111618042, "learning_rate": 4.9815115737686995e-05, "loss": 0.1731, "num_input_tokens_seen": 935496, "step": 10380 }, { "epoch": 2.6988045738045736, "grad_norm": 0.250892698764801, "learning_rate": 4.981442685489659e-05, "loss": 0.2087, "num_input_tokens_seen": 935928, "step": 10385 }, { "epoch": 2.70010395010395, "grad_norm": 1.4161083698272705, "learning_rate": 4.981373669587642e-05, "loss": 0.2765, "num_input_tokens_seen": 936376, "step": 10390 }, { "epoch": 2.7014033264033266, "grad_norm": 0.7297537922859192, "learning_rate": 4.981304526066202e-05, "loss": 0.2259, "num_input_tokens_seen": 936824, "step": 10395 }, { "epoch": 2.7027027027027026, "grad_norm": 0.4670954942703247, "learning_rate": 4.981235254928891e-05, "loss": 0.2077, "num_input_tokens_seen": 937288, "step": 10400 }, { "epoch": 2.704002079002079, "grad_norm": 0.3227354884147644, "learning_rate": 4.981165856179274e-05, "loss": 0.3465, "num_input_tokens_seen": 937720, "step": 10405 }, { "epoch": 2.705301455301455, "grad_norm": 0.9091821312904358, "learning_rate": 4.981096329820919e-05, "loss": 0.2817, "num_input_tokens_seen": 938152, "step": 10410 }, { "epoch": 2.7066008316008316, "grad_norm": 0.1730560064315796, "learning_rate": 4.981026675857403e-05, "loss": 0.2746, "num_input_tokens_seen": 938600, "step": 10415 }, { "epoch": 2.7079002079002077, "grad_norm": 0.26936864852905273, "learning_rate": 4.980956894292308e-05, "loss": 0.244, "num_input_tokens_seen": 939080, "step": 10420 }, { "epoch": 2.709199584199584, "grad_norm": 0.31525909900665283, "learning_rate": 4.980886985129223e-05, "loss": 0.1774, "num_input_tokens_seen": 939528, "step": 10425 }, { "epoch": 2.7104989604989607, "grad_norm": 0.5440437197685242, "learning_rate": 4.9808169483717427e-05, "loss": 0.2756, "num_input_tokens_seen": 939960, "step": 10430 }, { "epoch": 2.7117983367983367, "grad_norm": 0.7280185222625732, "learning_rate": 4.98074678402347e-05, "loss": 0.4027, "num_input_tokens_seen": 940440, "step": 10435 }, { "epoch": 2.713097713097713, "grad_norm": 0.4366860091686249, "learning_rate": 4.980676492088013e-05, "loss": 0.2425, "num_input_tokens_seen": 940904, "step": 10440 }, { "epoch": 2.7143970893970892, "grad_norm": 0.5341721177101135, "learning_rate": 4.980606072568988e-05, "loss": 0.2575, "num_input_tokens_seen": 941336, "step": 10445 }, { "epoch": 2.7156964656964657, "grad_norm": 0.5668947696685791, "learning_rate": 4.980535525470015e-05, "loss": 0.2054, "num_input_tokens_seen": 941800, "step": 10450 }, { "epoch": 2.7169958419958418, "grad_norm": 0.3296639621257782, "learning_rate": 4.980464850794724e-05, "loss": 0.2334, "num_input_tokens_seen": 942248, "step": 10455 }, { "epoch": 2.7182952182952183, "grad_norm": 0.09142360836267471, "learning_rate": 4.980394048546748e-05, "loss": 0.0117, "num_input_tokens_seen": 942680, "step": 10460 }, { "epoch": 2.7195945945945947, "grad_norm": 0.08465686440467834, "learning_rate": 4.9803231187297304e-05, "loss": 0.2272, "num_input_tokens_seen": 943176, "step": 10465 }, { "epoch": 2.720893970893971, "grad_norm": 8.637269973754883, "learning_rate": 4.980252061347318e-05, "loss": 0.3016, "num_input_tokens_seen": 943640, "step": 10470 }, { "epoch": 2.7221933471933473, "grad_norm": 21.601226806640625, "learning_rate": 4.980180876403166e-05, "loss": 0.664, "num_input_tokens_seen": 944072, "step": 10475 }, { "epoch": 2.7234927234927238, "grad_norm": 1.5054463148117065, "learning_rate": 4.9801095639009346e-05, "loss": 0.5838, "num_input_tokens_seen": 944504, "step": 10480 }, { "epoch": 2.7247920997921, "grad_norm": 1.3999111652374268, "learning_rate": 4.980038123844292e-05, "loss": 0.3847, "num_input_tokens_seen": 944952, "step": 10485 }, { "epoch": 2.726091476091476, "grad_norm": 1.1259046792984009, "learning_rate": 4.979966556236912e-05, "loss": 0.2558, "num_input_tokens_seen": 945432, "step": 10490 }, { "epoch": 2.7273908523908523, "grad_norm": 0.9428913593292236, "learning_rate": 4.9798948610824765e-05, "loss": 0.2249, "num_input_tokens_seen": 945896, "step": 10495 }, { "epoch": 2.728690228690229, "grad_norm": 0.5357194542884827, "learning_rate": 4.9798230383846725e-05, "loss": 0.3411, "num_input_tokens_seen": 946360, "step": 10500 }, { "epoch": 2.729989604989605, "grad_norm": 0.7687686085700989, "learning_rate": 4.979751088147192e-05, "loss": 0.2611, "num_input_tokens_seen": 946776, "step": 10505 }, { "epoch": 2.7312889812889813, "grad_norm": 0.7827091217041016, "learning_rate": 4.979679010373739e-05, "loss": 0.2902, "num_input_tokens_seen": 947256, "step": 10510 }, { "epoch": 2.732588357588358, "grad_norm": 0.5281159281730652, "learning_rate": 4.979606805068017e-05, "loss": 0.2575, "num_input_tokens_seen": 947736, "step": 10515 }, { "epoch": 2.733887733887734, "grad_norm": 0.8564766049385071, "learning_rate": 4.979534472233742e-05, "loss": 0.3285, "num_input_tokens_seen": 948200, "step": 10520 }, { "epoch": 2.73518711018711, "grad_norm": 0.2738010585308075, "learning_rate": 4.9794620118746336e-05, "loss": 0.2449, "num_input_tokens_seen": 948632, "step": 10525 }, { "epoch": 2.7364864864864864, "grad_norm": 0.3158833086490631, "learning_rate": 4.979389423994418e-05, "loss": 0.2382, "num_input_tokens_seen": 949080, "step": 10530 }, { "epoch": 2.737785862785863, "grad_norm": 0.32955339550971985, "learning_rate": 4.979316708596829e-05, "loss": 0.2179, "num_input_tokens_seen": 949576, "step": 10535 }, { "epoch": 2.739085239085239, "grad_norm": 0.3915672302246094, "learning_rate": 4.9792438656856044e-05, "loss": 0.2149, "num_input_tokens_seen": 950024, "step": 10540 }, { "epoch": 2.7403846153846154, "grad_norm": 0.2601853609085083, "learning_rate": 4.979170895264494e-05, "loss": 0.2224, "num_input_tokens_seen": 950472, "step": 10545 }, { "epoch": 2.741683991683992, "grad_norm": 0.32031697034835815, "learning_rate": 4.979097797337249e-05, "loss": 0.421, "num_input_tokens_seen": 950920, "step": 10550 }, { "epoch": 2.742983367983368, "grad_norm": 0.41052916646003723, "learning_rate": 4.979024571907628e-05, "loss": 0.2164, "num_input_tokens_seen": 951384, "step": 10555 }, { "epoch": 2.7442827442827444, "grad_norm": 0.8133533000946045, "learning_rate": 4.978951218979398e-05, "loss": 0.298, "num_input_tokens_seen": 951832, "step": 10560 }, { "epoch": 2.7455821205821205, "grad_norm": 0.62374347448349, "learning_rate": 4.978877738556332e-05, "loss": 0.2284, "num_input_tokens_seen": 952280, "step": 10565 }, { "epoch": 2.746881496881497, "grad_norm": 0.6683977842330933, "learning_rate": 4.9788041306422094e-05, "loss": 0.2401, "num_input_tokens_seen": 952728, "step": 10570 }, { "epoch": 2.748180873180873, "grad_norm": 0.3340015709400177, "learning_rate": 4.978730395240815e-05, "loss": 0.2692, "num_input_tokens_seen": 953176, "step": 10575 }, { "epoch": 2.7494802494802495, "grad_norm": 0.2977784276008606, "learning_rate": 4.978656532355941e-05, "loss": 0.2027, "num_input_tokens_seen": 953592, "step": 10580 }, { "epoch": 2.750779625779626, "grad_norm": 0.4667227268218994, "learning_rate": 4.978582541991386e-05, "loss": 0.2099, "num_input_tokens_seen": 954072, "step": 10585 }, { "epoch": 2.752079002079002, "grad_norm": 0.23818965256214142, "learning_rate": 4.978508424150957e-05, "loss": 0.1534, "num_input_tokens_seen": 954552, "step": 10590 }, { "epoch": 2.7533783783783785, "grad_norm": 0.9393228888511658, "learning_rate": 4.9784341788384646e-05, "loss": 0.3351, "num_input_tokens_seen": 954952, "step": 10595 }, { "epoch": 2.7546777546777546, "grad_norm": 0.37075164914131165, "learning_rate": 4.978359806057727e-05, "loss": 0.2846, "num_input_tokens_seen": 955416, "step": 10600 }, { "epoch": 2.755977130977131, "grad_norm": 0.30046066641807556, "learning_rate": 4.978285305812571e-05, "loss": 0.2572, "num_input_tokens_seen": 955880, "step": 10605 }, { "epoch": 2.757276507276507, "grad_norm": 0.3630262315273285, "learning_rate": 4.978210678106827e-05, "loss": 0.1848, "num_input_tokens_seen": 956328, "step": 10610 }, { "epoch": 2.7585758835758836, "grad_norm": 0.3093824088573456, "learning_rate": 4.978135922944333e-05, "loss": 0.2212, "num_input_tokens_seen": 956792, "step": 10615 }, { "epoch": 2.75987525987526, "grad_norm": 0.660738468170166, "learning_rate": 4.9780610403289344e-05, "loss": 0.4174, "num_input_tokens_seen": 957192, "step": 10620 }, { "epoch": 2.761174636174636, "grad_norm": 0.36524057388305664, "learning_rate": 4.977986030264482e-05, "loss": 0.1998, "num_input_tokens_seen": 957640, "step": 10625 }, { "epoch": 2.7624740124740126, "grad_norm": 0.2260740101337433, "learning_rate": 4.9779108927548336e-05, "loss": 0.3082, "num_input_tokens_seen": 958072, "step": 10630 }, { "epoch": 2.7637733887733886, "grad_norm": 0.656559944152832, "learning_rate": 4.9778356278038535e-05, "loss": 0.3381, "num_input_tokens_seen": 958536, "step": 10635 }, { "epoch": 2.765072765072765, "grad_norm": 0.06429427862167358, "learning_rate": 4.9777602354154126e-05, "loss": 0.2985, "num_input_tokens_seen": 959000, "step": 10640 }, { "epoch": 2.766372141372141, "grad_norm": 0.8031464219093323, "learning_rate": 4.9776847155933896e-05, "loss": 0.3165, "num_input_tokens_seen": 959448, "step": 10645 }, { "epoch": 2.7676715176715176, "grad_norm": 0.16479288041591644, "learning_rate": 4.9776090683416674e-05, "loss": 0.2761, "num_input_tokens_seen": 959864, "step": 10650 }, { "epoch": 2.768970893970894, "grad_norm": 0.19826315343379974, "learning_rate": 4.9775332936641374e-05, "loss": 0.3023, "num_input_tokens_seen": 960328, "step": 10655 }, { "epoch": 2.77027027027027, "grad_norm": 0.24564866721630096, "learning_rate": 4.9774573915646955e-05, "loss": 0.2395, "num_input_tokens_seen": 960760, "step": 10660 }, { "epoch": 2.7715696465696467, "grad_norm": 0.3466947376728058, "learning_rate": 4.977381362047247e-05, "loss": 0.1832, "num_input_tokens_seen": 961192, "step": 10665 }, { "epoch": 2.7728690228690227, "grad_norm": 0.399402916431427, "learning_rate": 4.9773052051157e-05, "loss": 0.2933, "num_input_tokens_seen": 961608, "step": 10670 }, { "epoch": 2.774168399168399, "grad_norm": 0.3816690444946289, "learning_rate": 4.977228920773974e-05, "loss": 0.247, "num_input_tokens_seen": 962056, "step": 10675 }, { "epoch": 2.7754677754677752, "grad_norm": 0.5159576535224915, "learning_rate": 4.9771525090259896e-05, "loss": 0.2048, "num_input_tokens_seen": 962488, "step": 10680 }, { "epoch": 2.7767671517671517, "grad_norm": 0.44663161039352417, "learning_rate": 4.9770759698756797e-05, "loss": 0.2012, "num_input_tokens_seen": 962984, "step": 10685 }, { "epoch": 2.778066528066528, "grad_norm": 1.033126711845398, "learning_rate": 4.976999303326978e-05, "loss": 0.2908, "num_input_tokens_seen": 963480, "step": 10690 }, { "epoch": 2.7793659043659042, "grad_norm": 0.6819586157798767, "learning_rate": 4.9769225093838294e-05, "loss": 0.296, "num_input_tokens_seen": 963928, "step": 10695 }, { "epoch": 2.7806652806652807, "grad_norm": 0.736377477645874, "learning_rate": 4.976845588050183e-05, "loss": 0.2533, "num_input_tokens_seen": 964360, "step": 10700 }, { "epoch": 2.7819646569646572, "grad_norm": 0.6651831865310669, "learning_rate": 4.976768539329994e-05, "loss": 0.2949, "num_input_tokens_seen": 964792, "step": 10705 }, { "epoch": 2.7832640332640333, "grad_norm": 0.5126408338546753, "learning_rate": 4.976691363227227e-05, "loss": 0.2749, "num_input_tokens_seen": 965256, "step": 10710 }, { "epoch": 2.7845634095634093, "grad_norm": 2.0510528087615967, "learning_rate": 4.97661405974585e-05, "loss": 0.3522, "num_input_tokens_seen": 965720, "step": 10715 }, { "epoch": 2.785862785862786, "grad_norm": 1.0032012462615967, "learning_rate": 4.9765366288898384e-05, "loss": 0.2569, "num_input_tokens_seen": 966136, "step": 10720 }, { "epoch": 2.7871621621621623, "grad_norm": 0.3408985137939453, "learning_rate": 4.9764590706631755e-05, "loss": 0.2886, "num_input_tokens_seen": 966600, "step": 10725 }, { "epoch": 2.7884615384615383, "grad_norm": 0.322319895029068, "learning_rate": 4.9763813850698494e-05, "loss": 0.1599, "num_input_tokens_seen": 967048, "step": 10730 }, { "epoch": 2.789760914760915, "grad_norm": 0.4588909149169922, "learning_rate": 4.9763035721138556e-05, "loss": 0.3823, "num_input_tokens_seen": 967480, "step": 10735 }, { "epoch": 2.7910602910602913, "grad_norm": 0.3455658555030823, "learning_rate": 4.976225631799197e-05, "loss": 0.2271, "num_input_tokens_seen": 967928, "step": 10740 }, { "epoch": 2.7923596673596673, "grad_norm": 0.318840354681015, "learning_rate": 4.9761475641298815e-05, "loss": 0.2574, "num_input_tokens_seen": 968360, "step": 10745 }, { "epoch": 2.7936590436590434, "grad_norm": 0.3677924573421478, "learning_rate": 4.9760693691099244e-05, "loss": 0.1906, "num_input_tokens_seen": 968808, "step": 10750 }, { "epoch": 2.79495841995842, "grad_norm": 0.33141160011291504, "learning_rate": 4.975991046743347e-05, "loss": 0.3968, "num_input_tokens_seen": 969272, "step": 10755 }, { "epoch": 2.7962577962577964, "grad_norm": 0.23307529091835022, "learning_rate": 4.975912597034177e-05, "loss": 0.2521, "num_input_tokens_seen": 969736, "step": 10760 }, { "epoch": 2.7975571725571724, "grad_norm": 0.44467082619667053, "learning_rate": 4.975834019986451e-05, "loss": 0.2168, "num_input_tokens_seen": 970152, "step": 10765 }, { "epoch": 2.798856548856549, "grad_norm": 0.28365758061408997, "learning_rate": 4.975755315604208e-05, "loss": 0.2577, "num_input_tokens_seen": 970584, "step": 10770 }, { "epoch": 2.8001559251559254, "grad_norm": 0.3502534329891205, "learning_rate": 4.9756764838914976e-05, "loss": 0.2903, "num_input_tokens_seen": 971000, "step": 10775 }, { "epoch": 2.8014553014553014, "grad_norm": 0.3981557786464691, "learning_rate": 4.975597524852374e-05, "loss": 0.2881, "num_input_tokens_seen": 971416, "step": 10780 }, { "epoch": 2.802754677754678, "grad_norm": 0.3217216432094574, "learning_rate": 4.975518438490897e-05, "loss": 0.1648, "num_input_tokens_seen": 971864, "step": 10785 }, { "epoch": 2.804054054054054, "grad_norm": 0.28963345289230347, "learning_rate": 4.975439224811135e-05, "loss": 0.3211, "num_input_tokens_seen": 972296, "step": 10790 }, { "epoch": 2.8053534303534304, "grad_norm": 0.31141015887260437, "learning_rate": 4.975359883817161e-05, "loss": 0.3028, "num_input_tokens_seen": 972760, "step": 10795 }, { "epoch": 2.8066528066528065, "grad_norm": 0.33568093180656433, "learning_rate": 4.9752804155130574e-05, "loss": 0.2269, "num_input_tokens_seen": 973208, "step": 10800 }, { "epoch": 2.807952182952183, "grad_norm": 0.2547631859779358, "learning_rate": 4.97520081990291e-05, "loss": 0.2605, "num_input_tokens_seen": 973656, "step": 10805 }, { "epoch": 2.8092515592515594, "grad_norm": 0.2601790130138397, "learning_rate": 4.9751210969908134e-05, "loss": 0.2642, "num_input_tokens_seen": 974104, "step": 10810 }, { "epoch": 2.8105509355509355, "grad_norm": 0.25482261180877686, "learning_rate": 4.975041246780866e-05, "loss": 0.2633, "num_input_tokens_seen": 974584, "step": 10815 }, { "epoch": 2.811850311850312, "grad_norm": 0.33948299288749695, "learning_rate": 4.9749612692771776e-05, "loss": 0.2263, "num_input_tokens_seen": 975032, "step": 10820 }, { "epoch": 2.813149688149688, "grad_norm": 0.2805023193359375, "learning_rate": 4.9748811644838584e-05, "loss": 0.2179, "num_input_tokens_seen": 975464, "step": 10825 }, { "epoch": 2.8144490644490645, "grad_norm": 0.27694839239120483, "learning_rate": 4.974800932405029e-05, "loss": 0.2629, "num_input_tokens_seen": 975864, "step": 10830 }, { "epoch": 2.8157484407484406, "grad_norm": 0.274758905172348, "learning_rate": 4.974720573044818e-05, "loss": 0.256, "num_input_tokens_seen": 976360, "step": 10835 }, { "epoch": 2.817047817047817, "grad_norm": 0.31753942370414734, "learning_rate": 4.974640086407356e-05, "loss": 0.2, "num_input_tokens_seen": 976792, "step": 10840 }, { "epoch": 2.8183471933471935, "grad_norm": 0.2528200149536133, "learning_rate": 4.974559472496784e-05, "loss": 0.3297, "num_input_tokens_seen": 977240, "step": 10845 }, { "epoch": 2.8196465696465696, "grad_norm": 0.46729764342308044, "learning_rate": 4.974478731317246e-05, "loss": 0.3262, "num_input_tokens_seen": 977704, "step": 10850 }, { "epoch": 2.820945945945946, "grad_norm": 0.2270454317331314, "learning_rate": 4.9743978628728965e-05, "loss": 0.2158, "num_input_tokens_seen": 978168, "step": 10855 }, { "epoch": 2.822245322245322, "grad_norm": 0.24774660170078278, "learning_rate": 4.974316867167894e-05, "loss": 0.1924, "num_input_tokens_seen": 978648, "step": 10860 }, { "epoch": 2.8235446985446986, "grad_norm": 0.25196754932403564, "learning_rate": 4.9742357442064045e-05, "loss": 0.2223, "num_input_tokens_seen": 979080, "step": 10865 }, { "epoch": 2.8248440748440746, "grad_norm": 0.32678699493408203, "learning_rate": 4.9741544939926e-05, "loss": 0.2631, "num_input_tokens_seen": 979512, "step": 10870 }, { "epoch": 2.826143451143451, "grad_norm": 0.34371551871299744, "learning_rate": 4.9740731165306585e-05, "loss": 0.3197, "num_input_tokens_seen": 979992, "step": 10875 }, { "epoch": 2.8274428274428276, "grad_norm": 0.3184674084186554, "learning_rate": 4.973991611824767e-05, "loss": 0.2165, "num_input_tokens_seen": 980424, "step": 10880 }, { "epoch": 2.8287422037422036, "grad_norm": 0.28327998518943787, "learning_rate": 4.973909979879116e-05, "loss": 0.3029, "num_input_tokens_seen": 980872, "step": 10885 }, { "epoch": 2.83004158004158, "grad_norm": 0.301400363445282, "learning_rate": 4.973828220697905e-05, "loss": 0.2111, "num_input_tokens_seen": 981304, "step": 10890 }, { "epoch": 2.8313409563409566, "grad_norm": 0.2546318471431732, "learning_rate": 4.973746334285337e-05, "loss": 0.2876, "num_input_tokens_seen": 981736, "step": 10895 }, { "epoch": 2.8326403326403327, "grad_norm": 0.3630441129207611, "learning_rate": 4.9736643206456256e-05, "loss": 0.2648, "num_input_tokens_seen": 982200, "step": 10900 }, { "epoch": 2.8339397089397087, "grad_norm": 0.2792193293571472, "learning_rate": 4.9735821797829884e-05, "loss": 0.2996, "num_input_tokens_seen": 982664, "step": 10905 }, { "epoch": 2.835239085239085, "grad_norm": 0.7851760387420654, "learning_rate": 4.973499911701649e-05, "loss": 0.3081, "num_input_tokens_seen": 983064, "step": 10910 }, { "epoch": 2.8365384615384617, "grad_norm": 0.7998102903366089, "learning_rate": 4.97341751640584e-05, "loss": 0.2811, "num_input_tokens_seen": 983512, "step": 10915 }, { "epoch": 2.8378378378378377, "grad_norm": 0.6341899633407593, "learning_rate": 4.973334993899798e-05, "loss": 0.2809, "num_input_tokens_seen": 983944, "step": 10920 }, { "epoch": 2.839137214137214, "grad_norm": 0.38022559881210327, "learning_rate": 4.973252344187766e-05, "loss": 0.2263, "num_input_tokens_seen": 984424, "step": 10925 }, { "epoch": 2.8404365904365907, "grad_norm": 0.3154451549053192, "learning_rate": 4.973169567273998e-05, "loss": 0.2271, "num_input_tokens_seen": 984872, "step": 10930 }, { "epoch": 2.8417359667359667, "grad_norm": 0.3346388041973114, "learning_rate": 4.9730866631627484e-05, "loss": 0.2484, "num_input_tokens_seen": 985304, "step": 10935 }, { "epoch": 2.8430353430353428, "grad_norm": 0.42084184288978577, "learning_rate": 4.973003631858282e-05, "loss": 0.3743, "num_input_tokens_seen": 985720, "step": 10940 }, { "epoch": 2.8443347193347193, "grad_norm": 0.4319007396697998, "learning_rate": 4.972920473364869e-05, "loss": 0.2719, "num_input_tokens_seen": 986168, "step": 10945 }, { "epoch": 2.8456340956340958, "grad_norm": 0.314181387424469, "learning_rate": 4.972837187686788e-05, "loss": 0.2689, "num_input_tokens_seen": 986616, "step": 10950 }, { "epoch": 2.846933471933472, "grad_norm": 0.3236224055290222, "learning_rate": 4.9727537748283206e-05, "loss": 0.1664, "num_input_tokens_seen": 987048, "step": 10955 }, { "epoch": 2.8482328482328483, "grad_norm": 0.2432047575712204, "learning_rate": 4.972670234793757e-05, "loss": 0.215, "num_input_tokens_seen": 987480, "step": 10960 }, { "epoch": 2.8495322245322248, "grad_norm": 0.4819989502429962, "learning_rate": 4.972586567587394e-05, "loss": 0.155, "num_input_tokens_seen": 987944, "step": 10965 }, { "epoch": 2.850831600831601, "grad_norm": 0.16521133482456207, "learning_rate": 4.972502773213534e-05, "loss": 0.1378, "num_input_tokens_seen": 988424, "step": 10970 }, { "epoch": 2.8521309771309773, "grad_norm": 0.5249848365783691, "learning_rate": 4.972418851676489e-05, "loss": 0.306, "num_input_tokens_seen": 988888, "step": 10975 }, { "epoch": 2.8534303534303533, "grad_norm": 0.2260316014289856, "learning_rate": 4.972334802980573e-05, "loss": 0.346, "num_input_tokens_seen": 989368, "step": 10980 }, { "epoch": 2.85472972972973, "grad_norm": 0.3050822615623474, "learning_rate": 4.972250627130108e-05, "loss": 0.3959, "num_input_tokens_seen": 989816, "step": 10985 }, { "epoch": 2.856029106029106, "grad_norm": 0.8569634556770325, "learning_rate": 4.9721663241294256e-05, "loss": 0.2694, "num_input_tokens_seen": 990296, "step": 10990 }, { "epoch": 2.8573284823284824, "grad_norm": 0.579189121723175, "learning_rate": 4.97208189398286e-05, "loss": 0.2753, "num_input_tokens_seen": 990760, "step": 10995 }, { "epoch": 2.858627858627859, "grad_norm": 0.43748506903648376, "learning_rate": 4.9719973366947545e-05, "loss": 0.2632, "num_input_tokens_seen": 991208, "step": 11000 }, { "epoch": 2.859927234927235, "grad_norm": 0.4049612581729889, "learning_rate": 4.971912652269457e-05, "loss": 0.1757, "num_input_tokens_seen": 991688, "step": 11005 }, { "epoch": 2.8612266112266114, "grad_norm": 0.7493085861206055, "learning_rate": 4.971827840711323e-05, "loss": 0.28, "num_input_tokens_seen": 992120, "step": 11010 }, { "epoch": 2.8625259875259874, "grad_norm": 0.31518688797950745, "learning_rate": 4.9717429020247156e-05, "loss": 0.0825, "num_input_tokens_seen": 992568, "step": 11015 }, { "epoch": 2.863825363825364, "grad_norm": 0.3544139266014099, "learning_rate": 4.9716578362140016e-05, "loss": 0.3019, "num_input_tokens_seen": 993032, "step": 11020 }, { "epoch": 2.86512474012474, "grad_norm": 0.5673320889472961, "learning_rate": 4.971572643283557e-05, "loss": 0.3698, "num_input_tokens_seen": 993480, "step": 11025 }, { "epoch": 2.8664241164241164, "grad_norm": 0.4111165702342987, "learning_rate": 4.9714873232377635e-05, "loss": 0.2235, "num_input_tokens_seen": 993896, "step": 11030 }, { "epoch": 2.867723492723493, "grad_norm": 0.3369550406932831, "learning_rate": 4.971401876081009e-05, "loss": 0.3222, "num_input_tokens_seen": 994392, "step": 11035 }, { "epoch": 2.869022869022869, "grad_norm": 0.8517348170280457, "learning_rate": 4.9713163018176876e-05, "loss": 0.2599, "num_input_tokens_seen": 994872, "step": 11040 }, { "epoch": 2.8703222453222454, "grad_norm": 0.8764755725860596, "learning_rate": 4.971230600452201e-05, "loss": 0.245, "num_input_tokens_seen": 995288, "step": 11045 }, { "epoch": 2.8716216216216215, "grad_norm": 0.443183034658432, "learning_rate": 4.971144771988957e-05, "loss": 0.288, "num_input_tokens_seen": 995736, "step": 11050 }, { "epoch": 2.872920997920998, "grad_norm": 0.3184202015399933, "learning_rate": 4.971058816432369e-05, "loss": 0.2882, "num_input_tokens_seen": 996168, "step": 11055 }, { "epoch": 2.874220374220374, "grad_norm": 0.34894290566444397, "learning_rate": 4.970972733786859e-05, "loss": 0.2422, "num_input_tokens_seen": 996584, "step": 11060 }, { "epoch": 2.8755197505197505, "grad_norm": 0.3236933648586273, "learning_rate": 4.970886524056854e-05, "loss": 0.2291, "num_input_tokens_seen": 997032, "step": 11065 }, { "epoch": 2.876819126819127, "grad_norm": 0.7530671954154968, "learning_rate": 4.970800187246787e-05, "loss": 0.3267, "num_input_tokens_seen": 997480, "step": 11070 }, { "epoch": 2.878118503118503, "grad_norm": 0.3695976138114929, "learning_rate": 4.9707137233610994e-05, "loss": 0.2176, "num_input_tokens_seen": 997944, "step": 11075 }, { "epoch": 2.8794178794178795, "grad_norm": 0.28079259395599365, "learning_rate": 4.970627132404238e-05, "loss": 0.2515, "num_input_tokens_seen": 998408, "step": 11080 }, { "epoch": 2.8807172557172556, "grad_norm": 0.2416723519563675, "learning_rate": 4.9705404143806555e-05, "loss": 0.2708, "num_input_tokens_seen": 998856, "step": 11085 }, { "epoch": 2.882016632016632, "grad_norm": 0.43522879481315613, "learning_rate": 4.970453569294812e-05, "loss": 0.2921, "num_input_tokens_seen": 999304, "step": 11090 }, { "epoch": 2.883316008316008, "grad_norm": 0.3980727195739746, "learning_rate": 4.970366597151175e-05, "loss": 0.2352, "num_input_tokens_seen": 999752, "step": 11095 }, { "epoch": 2.8846153846153846, "grad_norm": 0.3631994426250458, "learning_rate": 4.970279497954216e-05, "loss": 0.2745, "num_input_tokens_seen": 1000184, "step": 11100 }, { "epoch": 2.885914760914761, "grad_norm": 0.33065906167030334, "learning_rate": 4.970192271708416e-05, "loss": 0.242, "num_input_tokens_seen": 1000632, "step": 11105 }, { "epoch": 2.887214137214137, "grad_norm": 0.355428546667099, "learning_rate": 4.9701049184182616e-05, "loss": 0.3146, "num_input_tokens_seen": 1001080, "step": 11110 }, { "epoch": 2.8885135135135136, "grad_norm": 0.24547283351421356, "learning_rate": 4.970017438088243e-05, "loss": 0.2013, "num_input_tokens_seen": 1001528, "step": 11115 }, { "epoch": 2.88981288981289, "grad_norm": 0.3205449879169464, "learning_rate": 4.9699298307228616e-05, "loss": 0.2186, "num_input_tokens_seen": 1001976, "step": 11120 }, { "epoch": 2.891112266112266, "grad_norm": 0.2577130198478699, "learning_rate": 4.969842096326622e-05, "loss": 0.1574, "num_input_tokens_seen": 1002424, "step": 11125 }, { "epoch": 2.892411642411642, "grad_norm": 0.9212538003921509, "learning_rate": 4.9697542349040375e-05, "loss": 0.3257, "num_input_tokens_seen": 1002888, "step": 11130 }, { "epoch": 2.8937110187110187, "grad_norm": 0.2516918182373047, "learning_rate": 4.969666246459626e-05, "loss": 0.2828, "num_input_tokens_seen": 1003320, "step": 11135 }, { "epoch": 2.895010395010395, "grad_norm": 0.3255087733268738, "learning_rate": 4.9695781309979135e-05, "loss": 0.2648, "num_input_tokens_seen": 1003784, "step": 11140 }, { "epoch": 2.896309771309771, "grad_norm": 0.218874990940094, "learning_rate": 4.969489888523431e-05, "loss": 0.3482, "num_input_tokens_seen": 1004280, "step": 11145 }, { "epoch": 2.8976091476091477, "grad_norm": 0.39566388726234436, "learning_rate": 4.9694015190407176e-05, "loss": 0.2188, "num_input_tokens_seen": 1004728, "step": 11150 }, { "epoch": 2.898908523908524, "grad_norm": 0.38281360268592834, "learning_rate": 4.969313022554318e-05, "loss": 0.2792, "num_input_tokens_seen": 1005192, "step": 11155 }, { "epoch": 2.9002079002079, "grad_norm": 0.5277111530303955, "learning_rate": 4.9692243990687834e-05, "loss": 0.2808, "num_input_tokens_seen": 1005608, "step": 11160 }, { "epoch": 2.9015072765072762, "grad_norm": 0.35351625084877014, "learning_rate": 4.969135648588672e-05, "loss": 0.2762, "num_input_tokens_seen": 1006072, "step": 11165 }, { "epoch": 2.9028066528066527, "grad_norm": 0.30985188484191895, "learning_rate": 4.969046771118549e-05, "loss": 0.1792, "num_input_tokens_seen": 1006488, "step": 11170 }, { "epoch": 2.904106029106029, "grad_norm": 0.2651936411857605, "learning_rate": 4.968957766662985e-05, "loss": 0.2191, "num_input_tokens_seen": 1006936, "step": 11175 }, { "epoch": 2.9054054054054053, "grad_norm": 0.2590669095516205, "learning_rate": 4.9688686352265566e-05, "loss": 0.2622, "num_input_tokens_seen": 1007336, "step": 11180 }, { "epoch": 2.9067047817047817, "grad_norm": 0.24639692902565002, "learning_rate": 4.968779376813849e-05, "loss": 0.2109, "num_input_tokens_seen": 1007784, "step": 11185 }, { "epoch": 2.9080041580041582, "grad_norm": 0.2588089406490326, "learning_rate": 4.968689991429453e-05, "loss": 0.3417, "num_input_tokens_seen": 1008264, "step": 11190 }, { "epoch": 2.9093035343035343, "grad_norm": 0.2491486668586731, "learning_rate": 4.968600479077964e-05, "loss": 0.3361, "num_input_tokens_seen": 1008728, "step": 11195 }, { "epoch": 2.9106029106029108, "grad_norm": 0.5554271936416626, "learning_rate": 4.968510839763989e-05, "loss": 0.2497, "num_input_tokens_seen": 1009192, "step": 11200 }, { "epoch": 2.911902286902287, "grad_norm": 0.17679668962955475, "learning_rate": 4.968421073492135e-05, "loss": 0.3076, "num_input_tokens_seen": 1009656, "step": 11205 }, { "epoch": 2.9132016632016633, "grad_norm": 0.4022502303123474, "learning_rate": 4.968331180267021e-05, "loss": 0.2438, "num_input_tokens_seen": 1010120, "step": 11210 }, { "epoch": 2.9145010395010393, "grad_norm": 0.17163795232772827, "learning_rate": 4.968241160093268e-05, "loss": 0.2743, "num_input_tokens_seen": 1010552, "step": 11215 }, { "epoch": 2.915800415800416, "grad_norm": 0.40098875761032104, "learning_rate": 4.968151012975508e-05, "loss": 0.2776, "num_input_tokens_seen": 1011016, "step": 11220 }, { "epoch": 2.9170997920997923, "grad_norm": 0.15130293369293213, "learning_rate": 4.968060738918376e-05, "loss": 0.2845, "num_input_tokens_seen": 1011432, "step": 11225 }, { "epoch": 2.9183991683991684, "grad_norm": 0.45420369505882263, "learning_rate": 4.967970337926516e-05, "loss": 0.2777, "num_input_tokens_seen": 1011848, "step": 11230 }, { "epoch": 2.919698544698545, "grad_norm": 0.45238861441612244, "learning_rate": 4.9678798100045776e-05, "loss": 0.2702, "num_input_tokens_seen": 1012264, "step": 11235 }, { "epoch": 2.920997920997921, "grad_norm": 0.39307230710983276, "learning_rate": 4.967789155157215e-05, "loss": 0.2741, "num_input_tokens_seen": 1012696, "step": 11240 }, { "epoch": 2.9222972972972974, "grad_norm": 0.20159855484962463, "learning_rate": 4.9676983733890913e-05, "loss": 0.2535, "num_input_tokens_seen": 1013144, "step": 11245 }, { "epoch": 2.9235966735966734, "grad_norm": 0.17192442715168, "learning_rate": 4.9676074647048766e-05, "loss": 0.3106, "num_input_tokens_seen": 1013592, "step": 11250 }, { "epoch": 2.92489604989605, "grad_norm": 0.4165033996105194, "learning_rate": 4.967516429109245e-05, "loss": 0.2338, "num_input_tokens_seen": 1014040, "step": 11255 }, { "epoch": 2.9261954261954264, "grad_norm": 0.5607776641845703, "learning_rate": 4.96742526660688e-05, "loss": 0.2984, "num_input_tokens_seen": 1014520, "step": 11260 }, { "epoch": 2.9274948024948024, "grad_norm": 0.37412819266319275, "learning_rate": 4.967333977202469e-05, "loss": 0.2374, "num_input_tokens_seen": 1014968, "step": 11265 }, { "epoch": 2.928794178794179, "grad_norm": 0.22792354226112366, "learning_rate": 4.9672425609007064e-05, "loss": 0.2611, "num_input_tokens_seen": 1015416, "step": 11270 }, { "epoch": 2.930093555093555, "grad_norm": 0.5823106169700623, "learning_rate": 4.967151017706296e-05, "loss": 0.2706, "num_input_tokens_seen": 1015848, "step": 11275 }, { "epoch": 2.9313929313929314, "grad_norm": 0.33820897340774536, "learning_rate": 4.967059347623945e-05, "loss": 0.2536, "num_input_tokens_seen": 1016296, "step": 11280 }, { "epoch": 2.9326923076923075, "grad_norm": 0.23080754280090332, "learning_rate": 4.9669675506583675e-05, "loss": 0.2597, "num_input_tokens_seen": 1016744, "step": 11285 }, { "epoch": 2.933991683991684, "grad_norm": 0.3122304379940033, "learning_rate": 4.966875626814285e-05, "loss": 0.281, "num_input_tokens_seen": 1017176, "step": 11290 }, { "epoch": 2.9352910602910605, "grad_norm": 0.3221740126609802, "learning_rate": 4.966783576096426e-05, "loss": 0.214, "num_input_tokens_seen": 1017624, "step": 11295 }, { "epoch": 2.9365904365904365, "grad_norm": 0.24970795214176178, "learning_rate": 4.966691398509523e-05, "loss": 0.2971, "num_input_tokens_seen": 1018104, "step": 11300 }, { "epoch": 2.937889812889813, "grad_norm": 0.23753182590007782, "learning_rate": 4.966599094058319e-05, "loss": 0.2176, "num_input_tokens_seen": 1018536, "step": 11305 }, { "epoch": 2.939189189189189, "grad_norm": 0.3082329332828522, "learning_rate": 4.96650666274756e-05, "loss": 0.2763, "num_input_tokens_seen": 1018968, "step": 11310 }, { "epoch": 2.9404885654885655, "grad_norm": 0.24056832492351532, "learning_rate": 4.966414104581999e-05, "loss": 0.2218, "num_input_tokens_seen": 1019400, "step": 11315 }, { "epoch": 2.9417879417879416, "grad_norm": 0.28409647941589355, "learning_rate": 4.966321419566399e-05, "loss": 0.1346, "num_input_tokens_seen": 1019848, "step": 11320 }, { "epoch": 2.943087318087318, "grad_norm": 0.3547050356864929, "learning_rate": 4.966228607705524e-05, "loss": 0.2575, "num_input_tokens_seen": 1020296, "step": 11325 }, { "epoch": 2.9443866943866945, "grad_norm": 0.836749792098999, "learning_rate": 4.9661356690041494e-05, "loss": 0.3166, "num_input_tokens_seen": 1020760, "step": 11330 }, { "epoch": 2.9456860706860706, "grad_norm": 0.2514040470123291, "learning_rate": 4.966042603467055e-05, "loss": 0.2559, "num_input_tokens_seen": 1021224, "step": 11335 }, { "epoch": 2.946985446985447, "grad_norm": 0.28792527318000793, "learning_rate": 4.9659494110990256e-05, "loss": 0.2534, "num_input_tokens_seen": 1021656, "step": 11340 }, { "epoch": 2.9482848232848236, "grad_norm": 0.397506982088089, "learning_rate": 4.965856091904855e-05, "loss": 0.236, "num_input_tokens_seen": 1022088, "step": 11345 }, { "epoch": 2.9495841995841996, "grad_norm": 0.2733784317970276, "learning_rate": 4.9657626458893436e-05, "loss": 0.1811, "num_input_tokens_seen": 1022552, "step": 11350 }, { "epoch": 2.9508835758835756, "grad_norm": 0.2274097502231598, "learning_rate": 4.9656690730572965e-05, "loss": 0.2209, "num_input_tokens_seen": 1023032, "step": 11355 }, { "epoch": 2.952182952182952, "grad_norm": 0.31662651896476746, "learning_rate": 4.965575373413527e-05, "loss": 0.2045, "num_input_tokens_seen": 1023496, "step": 11360 }, { "epoch": 2.9534823284823286, "grad_norm": 0.3369264304637909, "learning_rate": 4.965481546962853e-05, "loss": 0.2124, "num_input_tokens_seen": 1024024, "step": 11365 }, { "epoch": 2.9547817047817047, "grad_norm": 0.3000832498073578, "learning_rate": 4.965387593710101e-05, "loss": 0.3944, "num_input_tokens_seen": 1024472, "step": 11370 }, { "epoch": 2.956081081081081, "grad_norm": 0.30784639716148376, "learning_rate": 4.965293513660103e-05, "loss": 0.2717, "num_input_tokens_seen": 1024920, "step": 11375 }, { "epoch": 2.9573804573804576, "grad_norm": 0.2955529987812042, "learning_rate": 4.965199306817697e-05, "loss": 0.2056, "num_input_tokens_seen": 1025336, "step": 11380 }, { "epoch": 2.9586798336798337, "grad_norm": 0.31427833437919617, "learning_rate": 4.9651049731877283e-05, "loss": 0.2576, "num_input_tokens_seen": 1025768, "step": 11385 }, { "epoch": 2.9599792099792097, "grad_norm": 0.348789781332016, "learning_rate": 4.96501051277505e-05, "loss": 0.2801, "num_input_tokens_seen": 1026200, "step": 11390 }, { "epoch": 2.961278586278586, "grad_norm": 0.36963218450546265, "learning_rate": 4.9649159255845184e-05, "loss": 0.2576, "num_input_tokens_seen": 1026664, "step": 11395 }, { "epoch": 2.9625779625779627, "grad_norm": 0.18669891357421875, "learning_rate": 4.964821211620999e-05, "loss": 0.2916, "num_input_tokens_seen": 1027160, "step": 11400 }, { "epoch": 2.9638773388773387, "grad_norm": 0.46086761355400085, "learning_rate": 4.964726370889363e-05, "loss": 0.2779, "num_input_tokens_seen": 1027576, "step": 11405 }, { "epoch": 2.965176715176715, "grad_norm": 0.4775330424308777, "learning_rate": 4.9646314033944884e-05, "loss": 0.2515, "num_input_tokens_seen": 1028024, "step": 11410 }, { "epoch": 2.9664760914760917, "grad_norm": 0.34315016865730286, "learning_rate": 4.964536309141259e-05, "loss": 0.1818, "num_input_tokens_seen": 1028488, "step": 11415 }, { "epoch": 2.9677754677754677, "grad_norm": 0.23450902104377747, "learning_rate": 4.9644410881345665e-05, "loss": 0.2718, "num_input_tokens_seen": 1029000, "step": 11420 }, { "epoch": 2.9690748440748442, "grad_norm": 0.3464885950088501, "learning_rate": 4.964345740379307e-05, "loss": 0.2233, "num_input_tokens_seen": 1029464, "step": 11425 }, { "epoch": 2.9703742203742203, "grad_norm": 0.296036034822464, "learning_rate": 4.9642502658803846e-05, "loss": 0.3758, "num_input_tokens_seen": 1029912, "step": 11430 }, { "epoch": 2.9716735966735968, "grad_norm": 0.7906910181045532, "learning_rate": 4.964154664642711e-05, "loss": 0.3498, "num_input_tokens_seen": 1030360, "step": 11435 }, { "epoch": 2.972972972972973, "grad_norm": 0.28506380319595337, "learning_rate": 4.9640589366712e-05, "loss": 0.2866, "num_input_tokens_seen": 1030808, "step": 11440 }, { "epoch": 2.9742723492723493, "grad_norm": 3.5445449352264404, "learning_rate": 4.963963081970778e-05, "loss": 0.3949, "num_input_tokens_seen": 1031272, "step": 11445 }, { "epoch": 2.975571725571726, "grad_norm": 0.2030554562807083, "learning_rate": 4.9638671005463746e-05, "loss": 0.358, "num_input_tokens_seen": 1031752, "step": 11450 }, { "epoch": 2.976871101871102, "grad_norm": 0.19384035468101501, "learning_rate": 4.9637709924029244e-05, "loss": 0.278, "num_input_tokens_seen": 1032184, "step": 11455 }, { "epoch": 2.9781704781704783, "grad_norm": 0.16909493505954742, "learning_rate": 4.963674757545372e-05, "loss": 0.34, "num_input_tokens_seen": 1032616, "step": 11460 }, { "epoch": 2.9794698544698544, "grad_norm": 0.5126104950904846, "learning_rate": 4.9635783959786656e-05, "loss": 0.293, "num_input_tokens_seen": 1033080, "step": 11465 }, { "epoch": 2.980769230769231, "grad_norm": 0.6789242029190063, "learning_rate": 4.963481907707762e-05, "loss": 0.3192, "num_input_tokens_seen": 1033560, "step": 11470 }, { "epoch": 2.982068607068607, "grad_norm": 0.49289265275001526, "learning_rate": 4.963385292737624e-05, "loss": 0.2918, "num_input_tokens_seen": 1034040, "step": 11475 }, { "epoch": 2.9833679833679834, "grad_norm": 0.18919208645820618, "learning_rate": 4.96328855107322e-05, "loss": 0.252, "num_input_tokens_seen": 1034488, "step": 11480 }, { "epoch": 2.98466735966736, "grad_norm": 0.25733721256256104, "learning_rate": 4.963191682719525e-05, "loss": 0.2322, "num_input_tokens_seen": 1034936, "step": 11485 }, { "epoch": 2.985966735966736, "grad_norm": 0.3164377212524414, "learning_rate": 4.963094687681522e-05, "loss": 0.2238, "num_input_tokens_seen": 1035384, "step": 11490 }, { "epoch": 2.9872661122661124, "grad_norm": 0.2963978946208954, "learning_rate": 4.9629975659641994e-05, "loss": 0.4644, "num_input_tokens_seen": 1035816, "step": 11495 }, { "epoch": 2.9885654885654884, "grad_norm": 0.23347778618335724, "learning_rate": 4.962900317572552e-05, "loss": 0.2535, "num_input_tokens_seen": 1036264, "step": 11500 }, { "epoch": 2.989864864864865, "grad_norm": 0.18802091479301453, "learning_rate": 4.962802942511581e-05, "loss": 0.264, "num_input_tokens_seen": 1036680, "step": 11505 }, { "epoch": 2.991164241164241, "grad_norm": 0.19742780923843384, "learning_rate": 4.962705440786295e-05, "loss": 0.2842, "num_input_tokens_seen": 1037096, "step": 11510 }, { "epoch": 2.9924636174636174, "grad_norm": 0.475980669260025, "learning_rate": 4.96260781240171e-05, "loss": 0.2395, "num_input_tokens_seen": 1037560, "step": 11515 }, { "epoch": 2.993762993762994, "grad_norm": 0.6940547823905945, "learning_rate": 4.962510057362844e-05, "loss": 0.2742, "num_input_tokens_seen": 1038040, "step": 11520 }, { "epoch": 2.99506237006237, "grad_norm": 0.4058719575405121, "learning_rate": 4.962412175674726e-05, "loss": 0.2721, "num_input_tokens_seen": 1038472, "step": 11525 }, { "epoch": 2.9963617463617465, "grad_norm": 0.3466864228248596, "learning_rate": 4.962314167342391e-05, "loss": 0.2499, "num_input_tokens_seen": 1038968, "step": 11530 }, { "epoch": 2.9976611226611225, "grad_norm": 0.33524996042251587, "learning_rate": 4.96221603237088e-05, "loss": 0.1882, "num_input_tokens_seen": 1039384, "step": 11535 }, { "epoch": 2.998960498960499, "grad_norm": 0.3259657621383667, "learning_rate": 4.962117770765238e-05, "loss": 0.2179, "num_input_tokens_seen": 1039832, "step": 11540 }, { "epoch": 3.0, "eval_loss": 0.24048779904842377, "eval_runtime": 13.1973, "eval_samples_per_second": 64.862, "eval_steps_per_second": 32.431, "num_input_tokens_seen": 1040128, "step": 11544 }, { "epoch": 3.0002598752598755, "grad_norm": 0.30883243680000305, "learning_rate": 4.962019382530521e-05, "loss": 0.2596, "num_input_tokens_seen": 1040224, "step": 11545 }, { "epoch": 3.0015592515592515, "grad_norm": 0.8744833469390869, "learning_rate": 4.9619208676717874e-05, "loss": 0.245, "num_input_tokens_seen": 1040720, "step": 11550 }, { "epoch": 3.002858627858628, "grad_norm": 0.7489253282546997, "learning_rate": 4.961822226194104e-05, "loss": 0.3867, "num_input_tokens_seen": 1041168, "step": 11555 }, { "epoch": 3.004158004158004, "grad_norm": 0.24393221735954285, "learning_rate": 4.9617234581025465e-05, "loss": 0.2219, "num_input_tokens_seen": 1041616, "step": 11560 }, { "epoch": 3.0054573804573805, "grad_norm": 0.21973319351673126, "learning_rate": 4.961624563402192e-05, "loss": 0.2821, "num_input_tokens_seen": 1042096, "step": 11565 }, { "epoch": 3.0067567567567566, "grad_norm": 0.19102157652378082, "learning_rate": 4.9615255420981266e-05, "loss": 0.2927, "num_input_tokens_seen": 1042640, "step": 11570 }, { "epoch": 3.008056133056133, "grad_norm": 0.5065321922302246, "learning_rate": 4.961426394195445e-05, "loss": 0.2896, "num_input_tokens_seen": 1043072, "step": 11575 }, { "epoch": 3.0093555093555096, "grad_norm": 0.48047250509262085, "learning_rate": 4.961327119699246e-05, "loss": 0.2831, "num_input_tokens_seen": 1043488, "step": 11580 }, { "epoch": 3.0106548856548856, "grad_norm": 0.1193900927901268, "learning_rate": 4.9612277186146335e-05, "loss": 0.2749, "num_input_tokens_seen": 1043936, "step": 11585 }, { "epoch": 3.011954261954262, "grad_norm": 0.4734954535961151, "learning_rate": 4.961128190946723e-05, "loss": 0.2915, "num_input_tokens_seen": 1044368, "step": 11590 }, { "epoch": 3.013253638253638, "grad_norm": 0.14264163374900818, "learning_rate": 4.9610285367006305e-05, "loss": 0.2813, "num_input_tokens_seen": 1044816, "step": 11595 }, { "epoch": 3.0145530145530146, "grad_norm": 0.416459858417511, "learning_rate": 4.960928755881482e-05, "loss": 0.2509, "num_input_tokens_seen": 1045232, "step": 11600 }, { "epoch": 3.0158523908523907, "grad_norm": 0.19352391362190247, "learning_rate": 4.960828848494411e-05, "loss": 0.2475, "num_input_tokens_seen": 1045680, "step": 11605 }, { "epoch": 3.017151767151767, "grad_norm": 0.3167785406112671, "learning_rate": 4.960728814544553e-05, "loss": 0.1622, "num_input_tokens_seen": 1046112, "step": 11610 }, { "epoch": 3.0184511434511436, "grad_norm": 0.41186681389808655, "learning_rate": 4.960628654037055e-05, "loss": 0.3539, "num_input_tokens_seen": 1046528, "step": 11615 }, { "epoch": 3.0197505197505197, "grad_norm": 0.32273146510124207, "learning_rate": 4.9605283669770674e-05, "loss": 0.216, "num_input_tokens_seen": 1046976, "step": 11620 }, { "epoch": 3.021049896049896, "grad_norm": 0.34784895181655884, "learning_rate": 4.960427953369749e-05, "loss": 0.2978, "num_input_tokens_seen": 1047440, "step": 11625 }, { "epoch": 3.022349272349272, "grad_norm": 0.2578054368495941, "learning_rate": 4.960327413220262e-05, "loss": 0.2633, "num_input_tokens_seen": 1047888, "step": 11630 }, { "epoch": 3.0236486486486487, "grad_norm": 0.33772996068000793, "learning_rate": 4.960226746533779e-05, "loss": 0.287, "num_input_tokens_seen": 1048336, "step": 11635 }, { "epoch": 3.024948024948025, "grad_norm": 0.3197817802429199, "learning_rate": 4.960125953315479e-05, "loss": 0.2448, "num_input_tokens_seen": 1048800, "step": 11640 }, { "epoch": 3.026247401247401, "grad_norm": 0.1626310497522354, "learning_rate": 4.9600250335705425e-05, "loss": 0.3215, "num_input_tokens_seen": 1049280, "step": 11645 }, { "epoch": 3.0275467775467777, "grad_norm": 0.4132397174835205, "learning_rate": 4.959923987304161e-05, "loss": 0.2261, "num_input_tokens_seen": 1049728, "step": 11650 }, { "epoch": 3.0288461538461537, "grad_norm": 0.3667006492614746, "learning_rate": 4.9598228145215334e-05, "loss": 0.2701, "num_input_tokens_seen": 1050224, "step": 11655 }, { "epoch": 3.0301455301455302, "grad_norm": 0.19264186918735504, "learning_rate": 4.959721515227861e-05, "loss": 0.3203, "num_input_tokens_seen": 1050720, "step": 11660 }, { "epoch": 3.0314449064449063, "grad_norm": 0.48880940675735474, "learning_rate": 4.959620089428354e-05, "loss": 0.2889, "num_input_tokens_seen": 1051216, "step": 11665 }, { "epoch": 3.0327442827442828, "grad_norm": 0.13961803913116455, "learning_rate": 4.959518537128229e-05, "loss": 0.2499, "num_input_tokens_seen": 1051632, "step": 11670 }, { "epoch": 3.0340436590436592, "grad_norm": 0.20917491614818573, "learning_rate": 4.9594168583327094e-05, "loss": 0.2906, "num_input_tokens_seen": 1052064, "step": 11675 }, { "epoch": 3.0353430353430353, "grad_norm": 0.44180262088775635, "learning_rate": 4.959315053047024e-05, "loss": 0.2635, "num_input_tokens_seen": 1052512, "step": 11680 }, { "epoch": 3.0366424116424118, "grad_norm": 0.22854332625865936, "learning_rate": 4.959213121276409e-05, "loss": 0.231, "num_input_tokens_seen": 1052976, "step": 11685 }, { "epoch": 3.037941787941788, "grad_norm": 0.32484641671180725, "learning_rate": 4.9591110630261076e-05, "loss": 0.2749, "num_input_tokens_seen": 1053440, "step": 11690 }, { "epoch": 3.0392411642411643, "grad_norm": 0.303094744682312, "learning_rate": 4.959008878301367e-05, "loss": 0.2611, "num_input_tokens_seen": 1053872, "step": 11695 }, { "epoch": 3.0405405405405403, "grad_norm": 0.3124060034751892, "learning_rate": 4.958906567107444e-05, "loss": 0.3249, "num_input_tokens_seen": 1054320, "step": 11700 }, { "epoch": 3.041839916839917, "grad_norm": 0.3261723220348358, "learning_rate": 4.9588041294496e-05, "loss": 0.2394, "num_input_tokens_seen": 1054752, "step": 11705 }, { "epoch": 3.0431392931392933, "grad_norm": 0.5970593690872192, "learning_rate": 4.958701565333104e-05, "loss": 0.2798, "num_input_tokens_seen": 1055184, "step": 11710 }, { "epoch": 3.0444386694386694, "grad_norm": 0.3840376138687134, "learning_rate": 4.9585988747632306e-05, "loss": 0.2821, "num_input_tokens_seen": 1055600, "step": 11715 }, { "epoch": 3.045738045738046, "grad_norm": 0.3758932054042816, "learning_rate": 4.958496057745262e-05, "loss": 0.2264, "num_input_tokens_seen": 1056064, "step": 11720 }, { "epoch": 3.047037422037422, "grad_norm": 0.2394053339958191, "learning_rate": 4.9583931142844845e-05, "loss": 0.2947, "num_input_tokens_seen": 1056528, "step": 11725 }, { "epoch": 3.0483367983367984, "grad_norm": 0.2245931178331375, "learning_rate": 4.958290044386194e-05, "loss": 0.2744, "num_input_tokens_seen": 1056992, "step": 11730 }, { "epoch": 3.0496361746361744, "grad_norm": 0.5429590940475464, "learning_rate": 4.958186848055691e-05, "loss": 0.2833, "num_input_tokens_seen": 1057440, "step": 11735 }, { "epoch": 3.050935550935551, "grad_norm": 0.4098165035247803, "learning_rate": 4.9580835252982836e-05, "loss": 0.2622, "num_input_tokens_seen": 1057856, "step": 11740 }, { "epoch": 3.0522349272349274, "grad_norm": 0.23052345216274261, "learning_rate": 4.957980076119285e-05, "loss": 0.3018, "num_input_tokens_seen": 1058304, "step": 11745 }, { "epoch": 3.0535343035343034, "grad_norm": 0.18189099431037903, "learning_rate": 4.9578765005240164e-05, "loss": 0.2883, "num_input_tokens_seen": 1058752, "step": 11750 }, { "epoch": 3.05483367983368, "grad_norm": 0.32556620240211487, "learning_rate": 4.9577727985178036e-05, "loss": 0.2998, "num_input_tokens_seen": 1059216, "step": 11755 }, { "epoch": 3.056133056133056, "grad_norm": 0.13685032725334167, "learning_rate": 4.9576689701059815e-05, "loss": 0.2821, "num_input_tokens_seen": 1059664, "step": 11760 }, { "epoch": 3.0574324324324325, "grad_norm": 0.8414954543113708, "learning_rate": 4.957565015293889e-05, "loss": 0.2998, "num_input_tokens_seen": 1060128, "step": 11765 }, { "epoch": 3.058731808731809, "grad_norm": 0.17393162846565247, "learning_rate": 4.957460934086873e-05, "loss": 0.2817, "num_input_tokens_seen": 1060608, "step": 11770 }, { "epoch": 3.060031185031185, "grad_norm": 0.2607022821903229, "learning_rate": 4.9573567264902865e-05, "loss": 0.27, "num_input_tokens_seen": 1061056, "step": 11775 }, { "epoch": 3.0613305613305615, "grad_norm": 0.3847953677177429, "learning_rate": 4.95725239250949e-05, "loss": 0.2112, "num_input_tokens_seen": 1061520, "step": 11780 }, { "epoch": 3.0626299376299375, "grad_norm": 0.3479543626308441, "learning_rate": 4.957147932149847e-05, "loss": 0.3359, "num_input_tokens_seen": 1061952, "step": 11785 }, { "epoch": 3.063929313929314, "grad_norm": 0.42011138796806335, "learning_rate": 4.957043345416732e-05, "loss": 0.2239, "num_input_tokens_seen": 1062384, "step": 11790 }, { "epoch": 3.06522869022869, "grad_norm": 0.2409452199935913, "learning_rate": 4.956938632315524e-05, "loss": 0.279, "num_input_tokens_seen": 1062832, "step": 11795 }, { "epoch": 3.0665280665280665, "grad_norm": 0.3880605399608612, "learning_rate": 4.956833792851608e-05, "loss": 0.1976, "num_input_tokens_seen": 1063248, "step": 11800 }, { "epoch": 3.067827442827443, "grad_norm": 0.29493746161460876, "learning_rate": 4.956728827030376e-05, "loss": 0.2222, "num_input_tokens_seen": 1063696, "step": 11805 }, { "epoch": 3.069126819126819, "grad_norm": 0.25795429944992065, "learning_rate": 4.956623734857226e-05, "loss": 0.2091, "num_input_tokens_seen": 1064112, "step": 11810 }, { "epoch": 3.0704261954261955, "grad_norm": 0.44138720631599426, "learning_rate": 4.956518516337564e-05, "loss": 0.1665, "num_input_tokens_seen": 1064608, "step": 11815 }, { "epoch": 3.0717255717255716, "grad_norm": 0.21639248728752136, "learning_rate": 4.9564131714768e-05, "loss": 0.2096, "num_input_tokens_seen": 1065088, "step": 11820 }, { "epoch": 3.073024948024948, "grad_norm": 0.445697158575058, "learning_rate": 4.956307700280354e-05, "loss": 0.3691, "num_input_tokens_seen": 1065520, "step": 11825 }, { "epoch": 3.074324324324324, "grad_norm": 0.25654712319374084, "learning_rate": 4.9562021027536494e-05, "loss": 0.3175, "num_input_tokens_seen": 1065936, "step": 11830 }, { "epoch": 3.0756237006237006, "grad_norm": 0.3039679229259491, "learning_rate": 4.956096378902117e-05, "loss": 0.337, "num_input_tokens_seen": 1066416, "step": 11835 }, { "epoch": 3.076923076923077, "grad_norm": 0.5333613753318787, "learning_rate": 4.955990528731195e-05, "loss": 0.2031, "num_input_tokens_seen": 1066864, "step": 11840 }, { "epoch": 3.078222453222453, "grad_norm": 0.3533966541290283, "learning_rate": 4.955884552246326e-05, "loss": 0.224, "num_input_tokens_seen": 1067360, "step": 11845 }, { "epoch": 3.0795218295218296, "grad_norm": 0.49965184926986694, "learning_rate": 4.955778449452962e-05, "loss": 0.2693, "num_input_tokens_seen": 1067808, "step": 11850 }, { "epoch": 3.0808212058212057, "grad_norm": 0.5792025923728943, "learning_rate": 4.9556722203565585e-05, "loss": 0.3121, "num_input_tokens_seen": 1068288, "step": 11855 }, { "epoch": 3.082120582120582, "grad_norm": 0.34958893060684204, "learning_rate": 4.955565864962581e-05, "loss": 0.2525, "num_input_tokens_seen": 1068736, "step": 11860 }, { "epoch": 3.0834199584199586, "grad_norm": 0.25842398405075073, "learning_rate": 4.955459383276497e-05, "loss": 0.2628, "num_input_tokens_seen": 1069152, "step": 11865 }, { "epoch": 3.0847193347193347, "grad_norm": 0.4253480136394501, "learning_rate": 4.955352775303786e-05, "loss": 0.2639, "num_input_tokens_seen": 1069600, "step": 11870 }, { "epoch": 3.086018711018711, "grad_norm": 0.3472401797771454, "learning_rate": 4.955246041049927e-05, "loss": 0.1707, "num_input_tokens_seen": 1070080, "step": 11875 }, { "epoch": 3.087318087318087, "grad_norm": 0.37365153431892395, "learning_rate": 4.9551391805204126e-05, "loss": 0.2763, "num_input_tokens_seen": 1070528, "step": 11880 }, { "epoch": 3.0886174636174637, "grad_norm": 0.28263989090919495, "learning_rate": 4.955032193720739e-05, "loss": 0.3068, "num_input_tokens_seen": 1071024, "step": 11885 }, { "epoch": 3.0899168399168397, "grad_norm": 0.25612175464630127, "learning_rate": 4.954925080656405e-05, "loss": 0.2181, "num_input_tokens_seen": 1071472, "step": 11890 }, { "epoch": 3.0912162162162162, "grad_norm": 0.3304985761642456, "learning_rate": 4.9548178413329236e-05, "loss": 0.273, "num_input_tokens_seen": 1071952, "step": 11895 }, { "epoch": 3.0925155925155927, "grad_norm": 0.2984769344329834, "learning_rate": 4.954710475755808e-05, "loss": 0.2976, "num_input_tokens_seen": 1072384, "step": 11900 }, { "epoch": 3.0938149688149688, "grad_norm": 0.32816097140312195, "learning_rate": 4.954602983930581e-05, "loss": 0.2194, "num_input_tokens_seen": 1072864, "step": 11905 }, { "epoch": 3.0951143451143452, "grad_norm": 0.26434022188186646, "learning_rate": 4.95449536586277e-05, "loss": 0.2571, "num_input_tokens_seen": 1073296, "step": 11910 }, { "epoch": 3.0964137214137213, "grad_norm": 0.2563956677913666, "learning_rate": 4.954387621557911e-05, "loss": 0.258, "num_input_tokens_seen": 1073712, "step": 11915 }, { "epoch": 3.0977130977130978, "grad_norm": 0.3942703306674957, "learning_rate": 4.954279751021545e-05, "loss": 0.3063, "num_input_tokens_seen": 1074160, "step": 11920 }, { "epoch": 3.099012474012474, "grad_norm": 0.2149813175201416, "learning_rate": 4.954171754259219e-05, "loss": 0.2918, "num_input_tokens_seen": 1074592, "step": 11925 }, { "epoch": 3.1003118503118503, "grad_norm": 0.6117647290229797, "learning_rate": 4.9540636312764886e-05, "loss": 0.2601, "num_input_tokens_seen": 1075024, "step": 11930 }, { "epoch": 3.101611226611227, "grad_norm": 0.6125718355178833, "learning_rate": 4.953955382078915e-05, "loss": 0.2839, "num_input_tokens_seen": 1075456, "step": 11935 }, { "epoch": 3.102910602910603, "grad_norm": 0.3165912926197052, "learning_rate": 4.953847006672064e-05, "loss": 0.1875, "num_input_tokens_seen": 1075936, "step": 11940 }, { "epoch": 3.1042099792099793, "grad_norm": 0.24179819226264954, "learning_rate": 4.95373850506151e-05, "loss": 0.2263, "num_input_tokens_seen": 1076384, "step": 11945 }, { "epoch": 3.1055093555093554, "grad_norm": 0.2720138132572174, "learning_rate": 4.953629877252835e-05, "loss": 0.3752, "num_input_tokens_seen": 1076864, "step": 11950 }, { "epoch": 3.106808731808732, "grad_norm": 0.2880620062351227, "learning_rate": 4.953521123251624e-05, "loss": 0.2563, "num_input_tokens_seen": 1077312, "step": 11955 }, { "epoch": 3.108108108108108, "grad_norm": 0.32120251655578613, "learning_rate": 4.95341224306347e-05, "loss": 0.1846, "num_input_tokens_seen": 1077744, "step": 11960 }, { "epoch": 3.1094074844074844, "grad_norm": 0.42888012528419495, "learning_rate": 4.9533032366939744e-05, "loss": 0.1793, "num_input_tokens_seen": 1078208, "step": 11965 }, { "epoch": 3.110706860706861, "grad_norm": 0.2225526124238968, "learning_rate": 4.9531941041487414e-05, "loss": 0.3446, "num_input_tokens_seen": 1078672, "step": 11970 }, { "epoch": 3.112006237006237, "grad_norm": 0.3284223973751068, "learning_rate": 4.9530848454333865e-05, "loss": 0.177, "num_input_tokens_seen": 1079120, "step": 11975 }, { "epoch": 3.1133056133056134, "grad_norm": 0.21481303870677948, "learning_rate": 4.952975460553527e-05, "loss": 0.2742, "num_input_tokens_seen": 1079568, "step": 11980 }, { "epoch": 3.1146049896049894, "grad_norm": 0.594039261341095, "learning_rate": 4.95286594951479e-05, "loss": 0.2461, "num_input_tokens_seen": 1080032, "step": 11985 }, { "epoch": 3.115904365904366, "grad_norm": 0.35301893949508667, "learning_rate": 4.952756312322806e-05, "loss": 0.3015, "num_input_tokens_seen": 1080480, "step": 11990 }, { "epoch": 3.1172037422037424, "grad_norm": 0.6406096816062927, "learning_rate": 4.952646548983215e-05, "loss": 0.2163, "num_input_tokens_seen": 1080976, "step": 11995 }, { "epoch": 3.1185031185031185, "grad_norm": 0.28860440850257874, "learning_rate": 4.952536659501662e-05, "loss": 0.3264, "num_input_tokens_seen": 1081424, "step": 12000 }, { "epoch": 3.119802494802495, "grad_norm": 0.3529403507709503, "learning_rate": 4.952426643883799e-05, "loss": 0.2274, "num_input_tokens_seen": 1081840, "step": 12005 }, { "epoch": 3.121101871101871, "grad_norm": 0.35242679715156555, "learning_rate": 4.952316502135284e-05, "loss": 0.2252, "num_input_tokens_seen": 1082288, "step": 12010 }, { "epoch": 3.1224012474012475, "grad_norm": 0.28655508160591125, "learning_rate": 4.952206234261781e-05, "loss": 0.2311, "num_input_tokens_seen": 1082720, "step": 12015 }, { "epoch": 3.1237006237006235, "grad_norm": 0.2981358766555786, "learning_rate": 4.952095840268962e-05, "loss": 0.2653, "num_input_tokens_seen": 1083152, "step": 12020 }, { "epoch": 3.125, "grad_norm": 0.2327883094549179, "learning_rate": 4.9519853201625044e-05, "loss": 0.1061, "num_input_tokens_seen": 1083568, "step": 12025 }, { "epoch": 3.1262993762993765, "grad_norm": 0.32322269678115845, "learning_rate": 4.951874673948093e-05, "loss": 0.3622, "num_input_tokens_seen": 1084032, "step": 12030 }, { "epoch": 3.1275987525987525, "grad_norm": 0.2636926472187042, "learning_rate": 4.951763901631417e-05, "loss": 0.2539, "num_input_tokens_seen": 1084480, "step": 12035 }, { "epoch": 3.128898128898129, "grad_norm": 0.2614841163158417, "learning_rate": 4.9516530032181744e-05, "loss": 0.2532, "num_input_tokens_seen": 1084944, "step": 12040 }, { "epoch": 3.130197505197505, "grad_norm": 0.2847461998462677, "learning_rate": 4.951541978714069e-05, "loss": 0.3106, "num_input_tokens_seen": 1085408, "step": 12045 }, { "epoch": 3.1314968814968815, "grad_norm": 0.33556392788887024, "learning_rate": 4.951430828124811e-05, "loss": 0.198, "num_input_tokens_seen": 1085840, "step": 12050 }, { "epoch": 3.1327962577962576, "grad_norm": 0.30284759402275085, "learning_rate": 4.9513195514561164e-05, "loss": 0.2507, "num_input_tokens_seen": 1086320, "step": 12055 }, { "epoch": 3.134095634095634, "grad_norm": 0.2653781473636627, "learning_rate": 4.951208148713708e-05, "loss": 0.2208, "num_input_tokens_seen": 1086736, "step": 12060 }, { "epoch": 3.1353950103950106, "grad_norm": 0.25473928451538086, "learning_rate": 4.9510966199033174e-05, "loss": 0.2691, "num_input_tokens_seen": 1087184, "step": 12065 }, { "epoch": 3.1366943866943866, "grad_norm": 0.2967732846736908, "learning_rate": 4.950984965030678e-05, "loss": 0.2182, "num_input_tokens_seen": 1087632, "step": 12070 }, { "epoch": 3.137993762993763, "grad_norm": 0.23945613205432892, "learning_rate": 4.9508731841015334e-05, "loss": 0.3108, "num_input_tokens_seen": 1088080, "step": 12075 }, { "epoch": 3.139293139293139, "grad_norm": 0.24119897186756134, "learning_rate": 4.950761277121633e-05, "loss": 0.2195, "num_input_tokens_seen": 1088544, "step": 12080 }, { "epoch": 3.1405925155925156, "grad_norm": 0.4328137934207916, "learning_rate": 4.950649244096731e-05, "loss": 0.3117, "num_input_tokens_seen": 1088992, "step": 12085 }, { "epoch": 3.141891891891892, "grad_norm": 0.3044075071811676, "learning_rate": 4.950537085032591e-05, "loss": 0.2135, "num_input_tokens_seen": 1089440, "step": 12090 }, { "epoch": 3.143191268191268, "grad_norm": 0.2233598679304123, "learning_rate": 4.9504247999349815e-05, "loss": 0.3415, "num_input_tokens_seen": 1089936, "step": 12095 }, { "epoch": 3.1444906444906446, "grad_norm": 0.1988162249326706, "learning_rate": 4.950312388809676e-05, "loss": 0.2541, "num_input_tokens_seen": 1090352, "step": 12100 }, { "epoch": 3.1457900207900207, "grad_norm": 0.3626471757888794, "learning_rate": 4.950199851662456e-05, "loss": 0.2398, "num_input_tokens_seen": 1090768, "step": 12105 }, { "epoch": 3.147089397089397, "grad_norm": 0.2026451826095581, "learning_rate": 4.950087188499111e-05, "loss": 0.2649, "num_input_tokens_seen": 1091232, "step": 12110 }, { "epoch": 3.148388773388773, "grad_norm": 0.5451570749282837, "learning_rate": 4.9499743993254335e-05, "loss": 0.34, "num_input_tokens_seen": 1091712, "step": 12115 }, { "epoch": 3.1496881496881497, "grad_norm": 0.44604113698005676, "learning_rate": 4.949861484147225e-05, "loss": 0.2665, "num_input_tokens_seen": 1092144, "step": 12120 }, { "epoch": 3.150987525987526, "grad_norm": 0.37121498584747314, "learning_rate": 4.949748442970293e-05, "loss": 0.2237, "num_input_tokens_seen": 1092576, "step": 12125 }, { "epoch": 3.1522869022869022, "grad_norm": 0.30699262022972107, "learning_rate": 4.9496352758004524e-05, "loss": 0.2199, "num_input_tokens_seen": 1093024, "step": 12130 }, { "epoch": 3.1535862785862787, "grad_norm": 0.26963263750076294, "learning_rate": 4.949521982643522e-05, "loss": 0.3749, "num_input_tokens_seen": 1093472, "step": 12135 }, { "epoch": 3.1548856548856548, "grad_norm": 0.3033244013786316, "learning_rate": 4.9494085635053286e-05, "loss": 0.225, "num_input_tokens_seen": 1093920, "step": 12140 }, { "epoch": 3.1561850311850312, "grad_norm": 0.33022812008857727, "learning_rate": 4.949295018391706e-05, "loss": 0.2171, "num_input_tokens_seen": 1094384, "step": 12145 }, { "epoch": 3.1574844074844073, "grad_norm": 0.5731655955314636, "learning_rate": 4.949181347308494e-05, "loss": 0.3481, "num_input_tokens_seen": 1094800, "step": 12150 }, { "epoch": 3.1587837837837838, "grad_norm": 0.32583868503570557, "learning_rate": 4.949067550261539e-05, "loss": 0.1983, "num_input_tokens_seen": 1095264, "step": 12155 }, { "epoch": 3.1600831600831603, "grad_norm": 0.2908349335193634, "learning_rate": 4.948953627256693e-05, "loss": 0.1829, "num_input_tokens_seen": 1095696, "step": 12160 }, { "epoch": 3.1613825363825363, "grad_norm": 0.26555386185646057, "learning_rate": 4.948839578299815e-05, "loss": 0.2632, "num_input_tokens_seen": 1096128, "step": 12165 }, { "epoch": 3.162681912681913, "grad_norm": 0.2257785201072693, "learning_rate": 4.948725403396771e-05, "loss": 0.2015, "num_input_tokens_seen": 1096608, "step": 12170 }, { "epoch": 3.163981288981289, "grad_norm": 0.2738654613494873, "learning_rate": 4.948611102553434e-05, "loss": 0.3182, "num_input_tokens_seen": 1097088, "step": 12175 }, { "epoch": 3.1652806652806653, "grad_norm": 0.2125975787639618, "learning_rate": 4.948496675775681e-05, "loss": 0.1136, "num_input_tokens_seen": 1097536, "step": 12180 }, { "epoch": 3.1665800415800414, "grad_norm": 0.21877571940422058, "learning_rate": 4.948382123069399e-05, "loss": 0.2725, "num_input_tokens_seen": 1098000, "step": 12185 }, { "epoch": 3.167879417879418, "grad_norm": 0.35711464285850525, "learning_rate": 4.9482674444404776e-05, "loss": 0.3244, "num_input_tokens_seen": 1098480, "step": 12190 }, { "epoch": 3.1691787941787943, "grad_norm": 0.29206159710884094, "learning_rate": 4.948152639894816e-05, "loss": 0.1589, "num_input_tokens_seen": 1098928, "step": 12195 }, { "epoch": 3.1704781704781704, "grad_norm": 0.2158486247062683, "learning_rate": 4.948037709438319e-05, "loss": 0.221, "num_input_tokens_seen": 1099360, "step": 12200 }, { "epoch": 3.171777546777547, "grad_norm": 0.31744855642318726, "learning_rate": 4.947922653076896e-05, "loss": 0.2684, "num_input_tokens_seen": 1099808, "step": 12205 }, { "epoch": 3.173076923076923, "grad_norm": 0.32582032680511475, "learning_rate": 4.947807470816466e-05, "loss": 0.1676, "num_input_tokens_seen": 1100256, "step": 12210 }, { "epoch": 3.1743762993762994, "grad_norm": 0.22046084702014923, "learning_rate": 4.9476921626629524e-05, "loss": 0.3083, "num_input_tokens_seen": 1100704, "step": 12215 }, { "epoch": 3.175675675675676, "grad_norm": 0.28125977516174316, "learning_rate": 4.9475767286222856e-05, "loss": 0.2369, "num_input_tokens_seen": 1101184, "step": 12220 }, { "epoch": 3.176975051975052, "grad_norm": 0.31560951471328735, "learning_rate": 4.947461168700402e-05, "loss": 0.203, "num_input_tokens_seen": 1101680, "step": 12225 }, { "epoch": 3.1782744282744284, "grad_norm": 0.24636436998844147, "learning_rate": 4.947345482903246e-05, "loss": 0.1814, "num_input_tokens_seen": 1102112, "step": 12230 }, { "epoch": 3.1795738045738045, "grad_norm": 0.2372364103794098, "learning_rate": 4.947229671236767e-05, "loss": 0.3002, "num_input_tokens_seen": 1102544, "step": 12235 }, { "epoch": 3.180873180873181, "grad_norm": 0.2507651746273041, "learning_rate": 4.9471137337069215e-05, "loss": 0.3675, "num_input_tokens_seen": 1102960, "step": 12240 }, { "epoch": 3.182172557172557, "grad_norm": 0.2841126322746277, "learning_rate": 4.946997670319671e-05, "loss": 0.2034, "num_input_tokens_seen": 1103408, "step": 12245 }, { "epoch": 3.1834719334719335, "grad_norm": 0.23987066745758057, "learning_rate": 4.946881481080987e-05, "loss": 0.2874, "num_input_tokens_seen": 1103856, "step": 12250 }, { "epoch": 3.18477130977131, "grad_norm": 0.3699302077293396, "learning_rate": 4.946765165996843e-05, "loss": 0.2433, "num_input_tokens_seen": 1104304, "step": 12255 }, { "epoch": 3.186070686070686, "grad_norm": 0.34677058458328247, "learning_rate": 4.946648725073222e-05, "loss": 0.2963, "num_input_tokens_seen": 1104752, "step": 12260 }, { "epoch": 3.1873700623700625, "grad_norm": 0.24387012422084808, "learning_rate": 4.946532158316113e-05, "loss": 0.2649, "num_input_tokens_seen": 1105168, "step": 12265 }, { "epoch": 3.1886694386694385, "grad_norm": 0.6512274146080017, "learning_rate": 4.946415465731511e-05, "loss": 0.3, "num_input_tokens_seen": 1105616, "step": 12270 }, { "epoch": 3.189968814968815, "grad_norm": 0.29880061745643616, "learning_rate": 4.9462986473254166e-05, "loss": 0.3209, "num_input_tokens_seen": 1106064, "step": 12275 }, { "epoch": 3.1912681912681915, "grad_norm": 0.5391989350318909, "learning_rate": 4.9461817031038405e-05, "loss": 0.2817, "num_input_tokens_seen": 1106480, "step": 12280 }, { "epoch": 3.1925675675675675, "grad_norm": 0.21703580021858215, "learning_rate": 4.946064633072795e-05, "loss": 0.2849, "num_input_tokens_seen": 1106928, "step": 12285 }, { "epoch": 3.193866943866944, "grad_norm": 0.1618267446756363, "learning_rate": 4.945947437238301e-05, "loss": 0.2619, "num_input_tokens_seen": 1107376, "step": 12290 }, { "epoch": 3.19516632016632, "grad_norm": 0.26582756638526917, "learning_rate": 4.945830115606388e-05, "loss": 0.2467, "num_input_tokens_seen": 1107840, "step": 12295 }, { "epoch": 3.1964656964656966, "grad_norm": 0.2879641652107239, "learning_rate": 4.9457126681830876e-05, "loss": 0.2259, "num_input_tokens_seen": 1108288, "step": 12300 }, { "epoch": 3.1977650727650726, "grad_norm": 0.28881680965423584, "learning_rate": 4.945595094974442e-05, "loss": 0.1654, "num_input_tokens_seen": 1108704, "step": 12305 }, { "epoch": 3.199064449064449, "grad_norm": 0.26206105947494507, "learning_rate": 4.945477395986497e-05, "loss": 0.2525, "num_input_tokens_seen": 1109168, "step": 12310 }, { "epoch": 3.2003638253638256, "grad_norm": 0.4166327118873596, "learning_rate": 4.945359571225307e-05, "loss": 0.4178, "num_input_tokens_seen": 1109648, "step": 12315 }, { "epoch": 3.2016632016632016, "grad_norm": 0.31246447563171387, "learning_rate": 4.94524162069693e-05, "loss": 0.2151, "num_input_tokens_seen": 1110080, "step": 12320 }, { "epoch": 3.202962577962578, "grad_norm": 0.7037014961242676, "learning_rate": 4.945123544407434e-05, "loss": 0.3601, "num_input_tokens_seen": 1110576, "step": 12325 }, { "epoch": 3.204261954261954, "grad_norm": 0.6157185435295105, "learning_rate": 4.945005342362892e-05, "loss": 0.2431, "num_input_tokens_seen": 1111024, "step": 12330 }, { "epoch": 3.2055613305613306, "grad_norm": 0.5673896670341492, "learning_rate": 4.944887014569381e-05, "loss": 0.2618, "num_input_tokens_seen": 1111456, "step": 12335 }, { "epoch": 3.2068607068607067, "grad_norm": 0.5225887894630432, "learning_rate": 4.9447685610329905e-05, "loss": 0.2778, "num_input_tokens_seen": 1111904, "step": 12340 }, { "epoch": 3.208160083160083, "grad_norm": 0.6892077922821045, "learning_rate": 4.944649981759809e-05, "loss": 0.2903, "num_input_tokens_seen": 1112352, "step": 12345 }, { "epoch": 3.2094594594594597, "grad_norm": 0.3192542791366577, "learning_rate": 4.944531276755937e-05, "loss": 0.2778, "num_input_tokens_seen": 1112784, "step": 12350 }, { "epoch": 3.2107588357588357, "grad_norm": 0.2649066746234894, "learning_rate": 4.9444124460274785e-05, "loss": 0.2479, "num_input_tokens_seen": 1113232, "step": 12355 }, { "epoch": 3.212058212058212, "grad_norm": 0.47377294301986694, "learning_rate": 4.944293489580547e-05, "loss": 0.1933, "num_input_tokens_seen": 1113696, "step": 12360 }, { "epoch": 3.2133575883575882, "grad_norm": 0.6130356192588806, "learning_rate": 4.944174407421258e-05, "loss": 0.1646, "num_input_tokens_seen": 1114192, "step": 12365 }, { "epoch": 3.2146569646569647, "grad_norm": 3.283036470413208, "learning_rate": 4.9440551995557384e-05, "loss": 0.411, "num_input_tokens_seen": 1114624, "step": 12370 }, { "epoch": 3.2159563409563408, "grad_norm": 0.3573106825351715, "learning_rate": 4.943935865990118e-05, "loss": 0.1486, "num_input_tokens_seen": 1115104, "step": 12375 }, { "epoch": 3.2172557172557172, "grad_norm": 0.3204185664653778, "learning_rate": 4.943816406730534e-05, "loss": 0.141, "num_input_tokens_seen": 1115536, "step": 12380 }, { "epoch": 3.2185550935550937, "grad_norm": 1.0730198621749878, "learning_rate": 4.94369682178313e-05, "loss": 0.4694, "num_input_tokens_seen": 1116000, "step": 12385 }, { "epoch": 3.2198544698544698, "grad_norm": 0.27932634949684143, "learning_rate": 4.943577111154058e-05, "loss": 0.2719, "num_input_tokens_seen": 1116432, "step": 12390 }, { "epoch": 3.2211538461538463, "grad_norm": 0.5133640170097351, "learning_rate": 4.943457274849473e-05, "loss": 0.249, "num_input_tokens_seen": 1116928, "step": 12395 }, { "epoch": 3.2224532224532223, "grad_norm": 0.378111332654953, "learning_rate": 4.94333731287554e-05, "loss": 0.2198, "num_input_tokens_seen": 1117360, "step": 12400 }, { "epoch": 3.223752598752599, "grad_norm": 0.2872215509414673, "learning_rate": 4.9432172252384276e-05, "loss": 0.2095, "num_input_tokens_seen": 1117792, "step": 12405 }, { "epoch": 3.225051975051975, "grad_norm": 1.165810227394104, "learning_rate": 4.9430970119443124e-05, "loss": 0.2117, "num_input_tokens_seen": 1118224, "step": 12410 }, { "epoch": 3.2263513513513513, "grad_norm": 0.4742242097854614, "learning_rate": 4.9429766729993776e-05, "loss": 0.4231, "num_input_tokens_seen": 1118656, "step": 12415 }, { "epoch": 3.227650727650728, "grad_norm": 0.28364720940589905, "learning_rate": 4.9428562084098106e-05, "loss": 0.331, "num_input_tokens_seen": 1119088, "step": 12420 }, { "epoch": 3.228950103950104, "grad_norm": 0.5467734336853027, "learning_rate": 4.942735618181808e-05, "loss": 0.2649, "num_input_tokens_seen": 1119504, "step": 12425 }, { "epoch": 3.2302494802494803, "grad_norm": 0.7289660573005676, "learning_rate": 4.942614902321574e-05, "loss": 0.2598, "num_input_tokens_seen": 1119952, "step": 12430 }, { "epoch": 3.2315488565488564, "grad_norm": 0.24002741277217865, "learning_rate": 4.9424940608353135e-05, "loss": 0.2929, "num_input_tokens_seen": 1120432, "step": 12435 }, { "epoch": 3.232848232848233, "grad_norm": 0.42894095182418823, "learning_rate": 4.9423730937292434e-05, "loss": 0.2093, "num_input_tokens_seen": 1120912, "step": 12440 }, { "epoch": 3.2341476091476093, "grad_norm": 0.41980916261672974, "learning_rate": 4.942252001009585e-05, "loss": 0.2933, "num_input_tokens_seen": 1121360, "step": 12445 }, { "epoch": 3.2354469854469854, "grad_norm": 0.34894445538520813, "learning_rate": 4.942130782682566e-05, "loss": 0.1792, "num_input_tokens_seen": 1121792, "step": 12450 }, { "epoch": 3.236746361746362, "grad_norm": 1.1049412488937378, "learning_rate": 4.942009438754421e-05, "loss": 0.2629, "num_input_tokens_seen": 1122240, "step": 12455 }, { "epoch": 3.238045738045738, "grad_norm": 0.5162599086761475, "learning_rate": 4.9418879692313914e-05, "loss": 0.2835, "num_input_tokens_seen": 1122672, "step": 12460 }, { "epoch": 3.2393451143451144, "grad_norm": 0.36860546469688416, "learning_rate": 4.9417663741197236e-05, "loss": 0.2552, "num_input_tokens_seen": 1123104, "step": 12465 }, { "epoch": 3.2406444906444904, "grad_norm": 0.3383711874485016, "learning_rate": 4.941644653425671e-05, "loss": 0.2582, "num_input_tokens_seen": 1123536, "step": 12470 }, { "epoch": 3.241943866943867, "grad_norm": 0.23647364974021912, "learning_rate": 4.941522807155495e-05, "loss": 0.2672, "num_input_tokens_seen": 1123968, "step": 12475 }, { "epoch": 3.2432432432432434, "grad_norm": 0.2936577796936035, "learning_rate": 4.941400835315461e-05, "loss": 0.2194, "num_input_tokens_seen": 1124400, "step": 12480 }, { "epoch": 3.2445426195426195, "grad_norm": 0.33168548345565796, "learning_rate": 4.941278737911843e-05, "loss": 0.2547, "num_input_tokens_seen": 1124864, "step": 12485 }, { "epoch": 3.245841995841996, "grad_norm": 0.347560852766037, "learning_rate": 4.941156514950921e-05, "loss": 0.2123, "num_input_tokens_seen": 1125280, "step": 12490 }, { "epoch": 3.247141372141372, "grad_norm": 0.3316405415534973, "learning_rate": 4.9410341664389803e-05, "loss": 0.1734, "num_input_tokens_seen": 1125680, "step": 12495 }, { "epoch": 3.2484407484407485, "grad_norm": 0.6332309246063232, "learning_rate": 4.940911692382313e-05, "loss": 0.2191, "num_input_tokens_seen": 1126144, "step": 12500 }, { "epoch": 3.249740124740125, "grad_norm": 0.2974153161048889, "learning_rate": 4.9407890927872184e-05, "loss": 0.3229, "num_input_tokens_seen": 1126608, "step": 12505 }, { "epoch": 3.251039501039501, "grad_norm": 0.3714148700237274, "learning_rate": 4.9406663676600026e-05, "loss": 0.3141, "num_input_tokens_seen": 1127056, "step": 12510 }, { "epoch": 3.2523388773388775, "grad_norm": 0.2758222818374634, "learning_rate": 4.940543517006977e-05, "loss": 0.2475, "num_input_tokens_seen": 1127520, "step": 12515 }, { "epoch": 3.2536382536382535, "grad_norm": 0.3458258807659149, "learning_rate": 4.94042054083446e-05, "loss": 0.2232, "num_input_tokens_seen": 1127936, "step": 12520 }, { "epoch": 3.25493762993763, "grad_norm": 0.673637866973877, "learning_rate": 4.940297439148776e-05, "loss": 0.2137, "num_input_tokens_seen": 1128384, "step": 12525 }, { "epoch": 3.256237006237006, "grad_norm": 0.2756933569908142, "learning_rate": 4.940174211956256e-05, "loss": 0.279, "num_input_tokens_seen": 1128832, "step": 12530 }, { "epoch": 3.2575363825363826, "grad_norm": 0.3365021049976349, "learning_rate": 4.94005085926324e-05, "loss": 0.1855, "num_input_tokens_seen": 1129264, "step": 12535 }, { "epoch": 3.258835758835759, "grad_norm": 0.30397066473960876, "learning_rate": 4.9399273810760685e-05, "loss": 0.1097, "num_input_tokens_seen": 1129680, "step": 12540 }, { "epoch": 3.260135135135135, "grad_norm": 0.31228286027908325, "learning_rate": 4.939803777401095e-05, "loss": 0.2216, "num_input_tokens_seen": 1130128, "step": 12545 }, { "epoch": 3.2614345114345116, "grad_norm": 0.22635166347026825, "learning_rate": 4.9396800482446746e-05, "loss": 0.3092, "num_input_tokens_seen": 1130528, "step": 12550 }, { "epoch": 3.2627338877338876, "grad_norm": 0.5112591981887817, "learning_rate": 4.939556193613173e-05, "loss": 0.2081, "num_input_tokens_seen": 1130992, "step": 12555 }, { "epoch": 3.264033264033264, "grad_norm": 0.6530801653862, "learning_rate": 4.939432213512958e-05, "loss": 0.2816, "num_input_tokens_seen": 1131456, "step": 12560 }, { "epoch": 3.26533264033264, "grad_norm": 0.31705746054649353, "learning_rate": 4.939308107950407e-05, "loss": 0.2272, "num_input_tokens_seen": 1131920, "step": 12565 }, { "epoch": 3.2666320166320166, "grad_norm": 0.32784104347229004, "learning_rate": 4.939183876931903e-05, "loss": 0.2185, "num_input_tokens_seen": 1132352, "step": 12570 }, { "epoch": 3.267931392931393, "grad_norm": 0.2733173370361328, "learning_rate": 4.939059520463835e-05, "loss": 0.2239, "num_input_tokens_seen": 1132816, "step": 12575 }, { "epoch": 3.269230769230769, "grad_norm": 0.23423874378204346, "learning_rate": 4.938935038552599e-05, "loss": 0.2423, "num_input_tokens_seen": 1133264, "step": 12580 }, { "epoch": 3.2705301455301456, "grad_norm": 0.2930796444416046, "learning_rate": 4.938810431204597e-05, "loss": 0.1666, "num_input_tokens_seen": 1133728, "step": 12585 }, { "epoch": 3.2718295218295217, "grad_norm": 0.39093610644340515, "learning_rate": 4.9386856984262374e-05, "loss": 0.2217, "num_input_tokens_seen": 1134144, "step": 12590 }, { "epoch": 3.273128898128898, "grad_norm": 0.22264806926250458, "learning_rate": 4.9385608402239364e-05, "loss": 0.213, "num_input_tokens_seen": 1134560, "step": 12595 }, { "epoch": 3.274428274428274, "grad_norm": 0.18909776210784912, "learning_rate": 4.9384358566041144e-05, "loss": 0.2548, "num_input_tokens_seen": 1135024, "step": 12600 }, { "epoch": 3.2757276507276507, "grad_norm": 0.25173479318618774, "learning_rate": 4.938310747573201e-05, "loss": 0.3305, "num_input_tokens_seen": 1135456, "step": 12605 }, { "epoch": 3.277027027027027, "grad_norm": 0.2713122069835663, "learning_rate": 4.938185513137627e-05, "loss": 0.3608, "num_input_tokens_seen": 1135904, "step": 12610 }, { "epoch": 3.2783264033264032, "grad_norm": 0.3764803111553192, "learning_rate": 4.9380601533038385e-05, "loss": 0.1612, "num_input_tokens_seen": 1136352, "step": 12615 }, { "epoch": 3.2796257796257797, "grad_norm": 0.3189832270145416, "learning_rate": 4.937934668078279e-05, "loss": 0.2531, "num_input_tokens_seen": 1136784, "step": 12620 }, { "epoch": 3.2809251559251558, "grad_norm": 0.281367689371109, "learning_rate": 4.937809057467404e-05, "loss": 0.2637, "num_input_tokens_seen": 1137248, "step": 12625 }, { "epoch": 3.2822245322245323, "grad_norm": 0.3107827305793762, "learning_rate": 4.937683321477673e-05, "loss": 0.2122, "num_input_tokens_seen": 1137728, "step": 12630 }, { "epoch": 3.2835239085239083, "grad_norm": 0.25434964895248413, "learning_rate": 4.9375574601155536e-05, "loss": 0.2474, "num_input_tokens_seen": 1138224, "step": 12635 }, { "epoch": 3.284823284823285, "grad_norm": 0.5933122634887695, "learning_rate": 4.9374314733875184e-05, "loss": 0.3617, "num_input_tokens_seen": 1138656, "step": 12640 }, { "epoch": 3.2861226611226613, "grad_norm": 0.5574665665626526, "learning_rate": 4.937305361300046e-05, "loss": 0.236, "num_input_tokens_seen": 1139056, "step": 12645 }, { "epoch": 3.2874220374220373, "grad_norm": 0.5620163083076477, "learning_rate": 4.937179123859625e-05, "loss": 0.2654, "num_input_tokens_seen": 1139520, "step": 12650 }, { "epoch": 3.288721413721414, "grad_norm": 0.45720794796943665, "learning_rate": 4.937052761072746e-05, "loss": 0.2074, "num_input_tokens_seen": 1139984, "step": 12655 }, { "epoch": 3.29002079002079, "grad_norm": 0.3598051071166992, "learning_rate": 4.936926272945908e-05, "loss": 0.1612, "num_input_tokens_seen": 1140432, "step": 12660 }, { "epoch": 3.2913201663201663, "grad_norm": 1.4167704582214355, "learning_rate": 4.936799659485617e-05, "loss": 0.3134, "num_input_tokens_seen": 1140896, "step": 12665 }, { "epoch": 3.2926195426195424, "grad_norm": 0.6056066751480103, "learning_rate": 4.936672920698385e-05, "loss": 0.2347, "num_input_tokens_seen": 1141360, "step": 12670 }, { "epoch": 3.293918918918919, "grad_norm": 0.4355859160423279, "learning_rate": 4.93654605659073e-05, "loss": 0.3462, "num_input_tokens_seen": 1141824, "step": 12675 }, { "epoch": 3.2952182952182953, "grad_norm": 0.4487825334072113, "learning_rate": 4.936419067169177e-05, "loss": 0.2922, "num_input_tokens_seen": 1142272, "step": 12680 }, { "epoch": 3.2965176715176714, "grad_norm": 0.4272691309452057, "learning_rate": 4.936291952440256e-05, "loss": 0.1798, "num_input_tokens_seen": 1142752, "step": 12685 }, { "epoch": 3.297817047817048, "grad_norm": 0.3063333034515381, "learning_rate": 4.936164712410506e-05, "loss": 0.2254, "num_input_tokens_seen": 1143200, "step": 12690 }, { "epoch": 3.2991164241164244, "grad_norm": 0.21564900875091553, "learning_rate": 4.936037347086471e-05, "loss": 0.323, "num_input_tokens_seen": 1143616, "step": 12695 }, { "epoch": 3.3004158004158004, "grad_norm": 0.2972104251384735, "learning_rate": 4.9359098564747e-05, "loss": 0.3001, "num_input_tokens_seen": 1144064, "step": 12700 }, { "epoch": 3.301715176715177, "grad_norm": 0.3846447467803955, "learning_rate": 4.935782240581752e-05, "loss": 0.2635, "num_input_tokens_seen": 1144544, "step": 12705 }, { "epoch": 3.303014553014553, "grad_norm": 0.42691606283187866, "learning_rate": 4.93565449941419e-05, "loss": 0.2558, "num_input_tokens_seen": 1144976, "step": 12710 }, { "epoch": 3.3043139293139294, "grad_norm": 0.4554917812347412, "learning_rate": 4.935526632978582e-05, "loss": 0.2564, "num_input_tokens_seen": 1145424, "step": 12715 }, { "epoch": 3.3056133056133055, "grad_norm": 0.3384469747543335, "learning_rate": 4.935398641281507e-05, "loss": 0.2352, "num_input_tokens_seen": 1145840, "step": 12720 }, { "epoch": 3.306912681912682, "grad_norm": 0.2550743520259857, "learning_rate": 4.935270524329546e-05, "loss": 0.2236, "num_input_tokens_seen": 1146256, "step": 12725 }, { "epoch": 3.3082120582120584, "grad_norm": 0.2951190769672394, "learning_rate": 4.935142282129288e-05, "loss": 0.3255, "num_input_tokens_seen": 1146704, "step": 12730 }, { "epoch": 3.3095114345114345, "grad_norm": 0.2669551968574524, "learning_rate": 4.93501391468733e-05, "loss": 0.3522, "num_input_tokens_seen": 1147168, "step": 12735 }, { "epoch": 3.310810810810811, "grad_norm": 0.31293079257011414, "learning_rate": 4.934885422010272e-05, "loss": 0.2614, "num_input_tokens_seen": 1147616, "step": 12740 }, { "epoch": 3.312110187110187, "grad_norm": 0.2797352373600006, "learning_rate": 4.934756804104725e-05, "loss": 0.2327, "num_input_tokens_seen": 1148080, "step": 12745 }, { "epoch": 3.3134095634095635, "grad_norm": 0.3445165455341339, "learning_rate": 4.934628060977302e-05, "loss": 0.2288, "num_input_tokens_seen": 1148512, "step": 12750 }, { "epoch": 3.3147089397089395, "grad_norm": 0.28463035821914673, "learning_rate": 4.934499192634626e-05, "loss": 0.2454, "num_input_tokens_seen": 1148976, "step": 12755 }, { "epoch": 3.316008316008316, "grad_norm": 0.22233863174915314, "learning_rate": 4.9343701990833225e-05, "loss": 0.2604, "num_input_tokens_seen": 1149392, "step": 12760 }, { "epoch": 3.3173076923076925, "grad_norm": 0.5677981376647949, "learning_rate": 4.934241080330028e-05, "loss": 0.2836, "num_input_tokens_seen": 1149856, "step": 12765 }, { "epoch": 3.3186070686070686, "grad_norm": 0.24084290862083435, "learning_rate": 4.934111836381383e-05, "loss": 0.213, "num_input_tokens_seen": 1150272, "step": 12770 }, { "epoch": 3.319906444906445, "grad_norm": 0.25750890374183655, "learning_rate": 4.9339824672440325e-05, "loss": 0.1728, "num_input_tokens_seen": 1150720, "step": 12775 }, { "epoch": 3.321205821205821, "grad_norm": 0.25849512219429016, "learning_rate": 4.933852972924633e-05, "loss": 0.2089, "num_input_tokens_seen": 1151168, "step": 12780 }, { "epoch": 3.3225051975051976, "grad_norm": 0.28912511467933655, "learning_rate": 4.9337233534298425e-05, "loss": 0.3719, "num_input_tokens_seen": 1151616, "step": 12785 }, { "epoch": 3.3238045738045736, "grad_norm": 0.2495337575674057, "learning_rate": 4.933593608766328e-05, "loss": 0.2012, "num_input_tokens_seen": 1152048, "step": 12790 }, { "epoch": 3.32510395010395, "grad_norm": 0.21446402370929718, "learning_rate": 4.9334637389407624e-05, "loss": 0.1234, "num_input_tokens_seen": 1152528, "step": 12795 }, { "epoch": 3.3264033264033266, "grad_norm": 0.3029913306236267, "learning_rate": 4.9333337439598247e-05, "loss": 0.2634, "num_input_tokens_seen": 1152944, "step": 12800 }, { "epoch": 3.3277027027027026, "grad_norm": 0.6363791823387146, "learning_rate": 4.933203623830201e-05, "loss": 0.3545, "num_input_tokens_seen": 1153408, "step": 12805 }, { "epoch": 3.329002079002079, "grad_norm": 0.2536679804325104, "learning_rate": 4.9330733785585845e-05, "loss": 0.2128, "num_input_tokens_seen": 1153840, "step": 12810 }, { "epoch": 3.330301455301455, "grad_norm": 0.2168799340724945, "learning_rate": 4.932943008151673e-05, "loss": 0.2948, "num_input_tokens_seen": 1154304, "step": 12815 }, { "epoch": 3.3316008316008316, "grad_norm": 0.1884555071592331, "learning_rate": 4.93281251261617e-05, "loss": 0.2805, "num_input_tokens_seen": 1154704, "step": 12820 }, { "epoch": 3.3329002079002077, "grad_norm": 0.1975007802248001, "learning_rate": 4.932681891958789e-05, "loss": 0.2463, "num_input_tokens_seen": 1155152, "step": 12825 }, { "epoch": 3.334199584199584, "grad_norm": 0.46603038907051086, "learning_rate": 4.9325511461862486e-05, "loss": 0.2982, "num_input_tokens_seen": 1155584, "step": 12830 }, { "epoch": 3.3354989604989607, "grad_norm": 0.4971141815185547, "learning_rate": 4.932420275305271e-05, "loss": 0.2918, "num_input_tokens_seen": 1156064, "step": 12835 }, { "epoch": 3.3367983367983367, "grad_norm": 0.44891631603240967, "learning_rate": 4.932289279322588e-05, "loss": 0.2639, "num_input_tokens_seen": 1156560, "step": 12840 }, { "epoch": 3.338097713097713, "grad_norm": 0.35620298981666565, "learning_rate": 4.9321581582449365e-05, "loss": 0.237, "num_input_tokens_seen": 1157024, "step": 12845 }, { "epoch": 3.3393970893970892, "grad_norm": 0.35645419359207153, "learning_rate": 4.9320269120790616e-05, "loss": 0.2862, "num_input_tokens_seen": 1157488, "step": 12850 }, { "epoch": 3.3406964656964657, "grad_norm": 0.4007503390312195, "learning_rate": 4.9318955408317115e-05, "loss": 0.2704, "num_input_tokens_seen": 1157952, "step": 12855 }, { "epoch": 3.3419958419958418, "grad_norm": 0.32953882217407227, "learning_rate": 4.931764044509643e-05, "loss": 0.2816, "num_input_tokens_seen": 1158400, "step": 12860 }, { "epoch": 3.3432952182952183, "grad_norm": 0.5981128811836243, "learning_rate": 4.931632423119621e-05, "loss": 0.2153, "num_input_tokens_seen": 1158848, "step": 12865 }, { "epoch": 3.3445945945945947, "grad_norm": 0.33926793932914734, "learning_rate": 4.9315006766684135e-05, "loss": 0.2172, "num_input_tokens_seen": 1159280, "step": 12870 }, { "epoch": 3.345893970893971, "grad_norm": 0.38655003905296326, "learning_rate": 4.931368805162796e-05, "loss": 0.2194, "num_input_tokens_seen": 1159728, "step": 12875 }, { "epoch": 3.3471933471933473, "grad_norm": 0.24727854132652283, "learning_rate": 4.931236808609552e-05, "loss": 0.1555, "num_input_tokens_seen": 1160144, "step": 12880 }, { "epoch": 3.3484927234927233, "grad_norm": 0.4978613257408142, "learning_rate": 4.931104687015468e-05, "loss": 0.3162, "num_input_tokens_seen": 1160624, "step": 12885 }, { "epoch": 3.3497920997921, "grad_norm": 0.24687768518924713, "learning_rate": 4.930972440387341e-05, "loss": 0.2111, "num_input_tokens_seen": 1161056, "step": 12890 }, { "epoch": 3.3510914760914763, "grad_norm": 0.3340601623058319, "learning_rate": 4.930840068731973e-05, "loss": 0.3191, "num_input_tokens_seen": 1161536, "step": 12895 }, { "epoch": 3.3523908523908523, "grad_norm": 0.2940766215324402, "learning_rate": 4.9307075720561705e-05, "loss": 0.2724, "num_input_tokens_seen": 1162000, "step": 12900 }, { "epoch": 3.353690228690229, "grad_norm": 0.32757875323295593, "learning_rate": 4.930574950366749e-05, "loss": 0.227, "num_input_tokens_seen": 1162432, "step": 12905 }, { "epoch": 3.354989604989605, "grad_norm": 0.5785524845123291, "learning_rate": 4.930442203670529e-05, "loss": 0.2509, "num_input_tokens_seen": 1162912, "step": 12910 }, { "epoch": 3.3562889812889813, "grad_norm": 0.258979469537735, "learning_rate": 4.9303093319743364e-05, "loss": 0.2569, "num_input_tokens_seen": 1163344, "step": 12915 }, { "epoch": 3.357588357588358, "grad_norm": 0.3161182403564453, "learning_rate": 4.9301763352850075e-05, "loss": 0.3177, "num_input_tokens_seen": 1163792, "step": 12920 }, { "epoch": 3.358887733887734, "grad_norm": 0.3866940438747406, "learning_rate": 4.930043213609381e-05, "loss": 0.2537, "num_input_tokens_seen": 1164224, "step": 12925 }, { "epoch": 3.3601871101871104, "grad_norm": 0.5301572680473328, "learning_rate": 4.9299099669543035e-05, "loss": 0.2313, "num_input_tokens_seen": 1164640, "step": 12930 }, { "epoch": 3.3614864864864864, "grad_norm": 0.28422507643699646, "learning_rate": 4.9297765953266287e-05, "loss": 0.2896, "num_input_tokens_seen": 1165104, "step": 12935 }, { "epoch": 3.362785862785863, "grad_norm": 0.274762898683548, "learning_rate": 4.929643098733215e-05, "loss": 0.1805, "num_input_tokens_seen": 1165568, "step": 12940 }, { "epoch": 3.364085239085239, "grad_norm": 0.28719043731689453, "learning_rate": 4.9295094771809285e-05, "loss": 0.246, "num_input_tokens_seen": 1166000, "step": 12945 }, { "epoch": 3.3653846153846154, "grad_norm": 0.40123942494392395, "learning_rate": 4.929375730676642e-05, "loss": 0.333, "num_input_tokens_seen": 1166432, "step": 12950 }, { "epoch": 3.366683991683992, "grad_norm": 0.3125784993171692, "learning_rate": 4.9292418592272344e-05, "loss": 0.2042, "num_input_tokens_seen": 1166864, "step": 12955 }, { "epoch": 3.367983367983368, "grad_norm": 0.45142653584480286, "learning_rate": 4.92910786283959e-05, "loss": 0.2737, "num_input_tokens_seen": 1167328, "step": 12960 }, { "epoch": 3.3692827442827444, "grad_norm": 0.32936739921569824, "learning_rate": 4.928973741520601e-05, "loss": 0.3122, "num_input_tokens_seen": 1167760, "step": 12965 }, { "epoch": 3.3705821205821205, "grad_norm": 0.5165513157844543, "learning_rate": 4.9288394952771645e-05, "loss": 0.2614, "num_input_tokens_seen": 1168192, "step": 12970 }, { "epoch": 3.371881496881497, "grad_norm": 0.3393429219722748, "learning_rate": 4.9287051241161865e-05, "loss": 0.2319, "num_input_tokens_seen": 1168672, "step": 12975 }, { "epoch": 3.373180873180873, "grad_norm": 0.35843324661254883, "learning_rate": 4.9285706280445756e-05, "loss": 0.2582, "num_input_tokens_seen": 1169088, "step": 12980 }, { "epoch": 3.3744802494802495, "grad_norm": 0.7083838582038879, "learning_rate": 4.928436007069251e-05, "loss": 0.2299, "num_input_tokens_seen": 1169536, "step": 12985 }, { "epoch": 3.375779625779626, "grad_norm": 218.6122589111328, "learning_rate": 4.9283012611971365e-05, "loss": 4.2041, "num_input_tokens_seen": 1170000, "step": 12990 }, { "epoch": 3.377079002079002, "grad_norm": 3.3737313747406006, "learning_rate": 4.9281663904351604e-05, "loss": 0.3513, "num_input_tokens_seen": 1170416, "step": 12995 }, { "epoch": 3.3783783783783785, "grad_norm": 8.197477340698242, "learning_rate": 4.928031394790261e-05, "loss": 0.3522, "num_input_tokens_seen": 1170832, "step": 13000 }, { "epoch": 3.3796777546777546, "grad_norm": 0.4083069860935211, "learning_rate": 4.92789627426938e-05, "loss": 0.2465, "num_input_tokens_seen": 1171280, "step": 13005 }, { "epoch": 3.380977130977131, "grad_norm": 0.508405327796936, "learning_rate": 4.9277610288794675e-05, "loss": 0.1622, "num_input_tokens_seen": 1171712, "step": 13010 }, { "epoch": 3.382276507276507, "grad_norm": 1.149393916130066, "learning_rate": 4.92762565862748e-05, "loss": 0.4014, "num_input_tokens_seen": 1172160, "step": 13015 }, { "epoch": 3.3835758835758836, "grad_norm": 0.4157160818576813, "learning_rate": 4.927490163520377e-05, "loss": 0.3128, "num_input_tokens_seen": 1172640, "step": 13020 }, { "epoch": 3.38487525987526, "grad_norm": 0.3183368444442749, "learning_rate": 4.92735454356513e-05, "loss": 0.5311, "num_input_tokens_seen": 1173072, "step": 13025 }, { "epoch": 3.386174636174636, "grad_norm": 0.46097177267074585, "learning_rate": 4.9272187987687136e-05, "loss": 0.2446, "num_input_tokens_seen": 1173536, "step": 13030 }, { "epoch": 3.3874740124740126, "grad_norm": 0.3571687638759613, "learning_rate": 4.9270829291381084e-05, "loss": 0.226, "num_input_tokens_seen": 1174000, "step": 13035 }, { "epoch": 3.3887733887733886, "grad_norm": 0.39586082100868225, "learning_rate": 4.926946934680302e-05, "loss": 0.208, "num_input_tokens_seen": 1174432, "step": 13040 }, { "epoch": 3.390072765072765, "grad_norm": 0.2578338086605072, "learning_rate": 4.92681081540229e-05, "loss": 0.2775, "num_input_tokens_seen": 1174896, "step": 13045 }, { "epoch": 3.391372141372141, "grad_norm": 0.26416483521461487, "learning_rate": 4.926674571311072e-05, "loss": 0.2728, "num_input_tokens_seen": 1175392, "step": 13050 }, { "epoch": 3.3926715176715176, "grad_norm": 0.33927974104881287, "learning_rate": 4.926538202413656e-05, "loss": 0.2178, "num_input_tokens_seen": 1175840, "step": 13055 }, { "epoch": 3.393970893970894, "grad_norm": 0.24218055605888367, "learning_rate": 4.926401708717055e-05, "loss": 0.1601, "num_input_tokens_seen": 1176272, "step": 13060 }, { "epoch": 3.39527027027027, "grad_norm": 0.3621757924556732, "learning_rate": 4.926265090228289e-05, "loss": 0.2726, "num_input_tokens_seen": 1176704, "step": 13065 }, { "epoch": 3.3965696465696467, "grad_norm": 0.2402178943157196, "learning_rate": 4.926128346954385e-05, "loss": 0.1971, "num_input_tokens_seen": 1177168, "step": 13070 }, { "epoch": 3.3978690228690227, "grad_norm": 0.30805256962776184, "learning_rate": 4.9259914789023764e-05, "loss": 0.25, "num_input_tokens_seen": 1177616, "step": 13075 }, { "epoch": 3.399168399168399, "grad_norm": 0.34182003140449524, "learning_rate": 4.925854486079301e-05, "loss": 0.2627, "num_input_tokens_seen": 1178112, "step": 13080 }, { "epoch": 3.4004677754677752, "grad_norm": 0.3271045982837677, "learning_rate": 4.925717368492204e-05, "loss": 0.266, "num_input_tokens_seen": 1178544, "step": 13085 }, { "epoch": 3.4017671517671517, "grad_norm": 0.35672128200531006, "learning_rate": 4.92558012614814e-05, "loss": 0.2463, "num_input_tokens_seen": 1179008, "step": 13090 }, { "epoch": 3.403066528066528, "grad_norm": 0.3797007203102112, "learning_rate": 4.9254427590541655e-05, "loss": 0.3055, "num_input_tokens_seen": 1179472, "step": 13095 }, { "epoch": 3.4043659043659042, "grad_norm": 0.6919114589691162, "learning_rate": 4.925305267217346e-05, "loss": 0.2524, "num_input_tokens_seen": 1179952, "step": 13100 }, { "epoch": 3.4056652806652807, "grad_norm": 0.3905535042285919, "learning_rate": 4.925167650644752e-05, "loss": 0.2389, "num_input_tokens_seen": 1180480, "step": 13105 }, { "epoch": 3.406964656964657, "grad_norm": 0.4016265571117401, "learning_rate": 4.925029909343463e-05, "loss": 0.1831, "num_input_tokens_seen": 1180928, "step": 13110 }, { "epoch": 3.4082640332640333, "grad_norm": 0.35586750507354736, "learning_rate": 4.924892043320561e-05, "loss": 0.3422, "num_input_tokens_seen": 1181392, "step": 13115 }, { "epoch": 3.4095634095634098, "grad_norm": 0.38378432393074036, "learning_rate": 4.9247540525831394e-05, "loss": 0.2201, "num_input_tokens_seen": 1181840, "step": 13120 }, { "epoch": 3.410862785862786, "grad_norm": 0.3546094596385956, "learning_rate": 4.924615937138293e-05, "loss": 0.26, "num_input_tokens_seen": 1182304, "step": 13125 }, { "epoch": 3.4121621621621623, "grad_norm": 0.34646105766296387, "learning_rate": 4.9244776969931256e-05, "loss": 0.2561, "num_input_tokens_seen": 1182784, "step": 13130 }, { "epoch": 3.4134615384615383, "grad_norm": 0.710594654083252, "learning_rate": 4.9243393321547474e-05, "loss": 0.3454, "num_input_tokens_seen": 1183264, "step": 13135 }, { "epoch": 3.414760914760915, "grad_norm": 0.6321313381195068, "learning_rate": 4.924200842630275e-05, "loss": 0.2362, "num_input_tokens_seen": 1183728, "step": 13140 }, { "epoch": 3.4160602910602913, "grad_norm": 0.5426578521728516, "learning_rate": 4.9240622284268287e-05, "loss": 0.2934, "num_input_tokens_seen": 1184176, "step": 13145 }, { "epoch": 3.4173596673596673, "grad_norm": 0.27057787775993347, "learning_rate": 4.9239234895515406e-05, "loss": 0.2527, "num_input_tokens_seen": 1184624, "step": 13150 }, { "epoch": 3.418659043659044, "grad_norm": 0.41066867113113403, "learning_rate": 4.923784626011545e-05, "loss": 0.2113, "num_input_tokens_seen": 1185056, "step": 13155 }, { "epoch": 3.41995841995842, "grad_norm": 0.32411694526672363, "learning_rate": 4.9236456378139836e-05, "loss": 0.1748, "num_input_tokens_seen": 1185472, "step": 13160 }, { "epoch": 3.4212577962577964, "grad_norm": 0.24573422968387604, "learning_rate": 4.923506524966005e-05, "loss": 0.2067, "num_input_tokens_seen": 1185888, "step": 13165 }, { "epoch": 3.4225571725571724, "grad_norm": 0.6303260922431946, "learning_rate": 4.923367287474764e-05, "loss": 0.4042, "num_input_tokens_seen": 1186352, "step": 13170 }, { "epoch": 3.423856548856549, "grad_norm": 0.9646506905555725, "learning_rate": 4.9232279253474205e-05, "loss": 0.2504, "num_input_tokens_seen": 1186784, "step": 13175 }, { "epoch": 3.4251559251559254, "grad_norm": 0.7277589440345764, "learning_rate": 4.9230884385911436e-05, "loss": 0.3515, "num_input_tokens_seen": 1187248, "step": 13180 }, { "epoch": 3.4264553014553014, "grad_norm": 0.6182223558425903, "learning_rate": 4.9229488272131067e-05, "loss": 0.3831, "num_input_tokens_seen": 1187712, "step": 13185 }, { "epoch": 3.427754677754678, "grad_norm": 0.47601819038391113, "learning_rate": 4.922809091220489e-05, "loss": 0.2877, "num_input_tokens_seen": 1188192, "step": 13190 }, { "epoch": 3.429054054054054, "grad_norm": 1.1924046277999878, "learning_rate": 4.9226692306204795e-05, "loss": 0.2158, "num_input_tokens_seen": 1188640, "step": 13195 }, { "epoch": 3.4303534303534304, "grad_norm": 1.3703495264053345, "learning_rate": 4.92252924542027e-05, "loss": 0.4155, "num_input_tokens_seen": 1189072, "step": 13200 }, { "epoch": 3.4316528066528065, "grad_norm": 0.25805753469467163, "learning_rate": 4.92238913562706e-05, "loss": 0.4508, "num_input_tokens_seen": 1189504, "step": 13205 }, { "epoch": 3.432952182952183, "grad_norm": 0.9829915165901184, "learning_rate": 4.922248901248056e-05, "loss": 0.448, "num_input_tokens_seen": 1189968, "step": 13210 }, { "epoch": 3.4342515592515594, "grad_norm": 1.048309326171875, "learning_rate": 4.92210854229047e-05, "loss": 0.3266, "num_input_tokens_seen": 1190448, "step": 13215 }, { "epoch": 3.4355509355509355, "grad_norm": 0.46795788407325745, "learning_rate": 4.921968058761521e-05, "loss": 0.2409, "num_input_tokens_seen": 1190912, "step": 13220 }, { "epoch": 3.436850311850312, "grad_norm": 0.3087374269962311, "learning_rate": 4.921827450668434e-05, "loss": 0.3289, "num_input_tokens_seen": 1191312, "step": 13225 }, { "epoch": 3.438149688149688, "grad_norm": 0.47055765986442566, "learning_rate": 4.92168671801844e-05, "loss": 0.2078, "num_input_tokens_seen": 1191776, "step": 13230 }, { "epoch": 3.4394490644490645, "grad_norm": 0.36853402853012085, "learning_rate": 4.921545860818779e-05, "loss": 0.2127, "num_input_tokens_seen": 1192224, "step": 13235 }, { "epoch": 3.4407484407484406, "grad_norm": 0.5989190340042114, "learning_rate": 4.921404879076693e-05, "loss": 0.2168, "num_input_tokens_seen": 1192672, "step": 13240 }, { "epoch": 3.442047817047817, "grad_norm": 0.3223884701728821, "learning_rate": 4.921263772799435e-05, "loss": 0.3331, "num_input_tokens_seen": 1193152, "step": 13245 }, { "epoch": 3.4433471933471935, "grad_norm": 0.3452574908733368, "learning_rate": 4.921122541994261e-05, "loss": 0.2115, "num_input_tokens_seen": 1193600, "step": 13250 }, { "epoch": 3.4446465696465696, "grad_norm": 0.27166756987571716, "learning_rate": 4.920981186668435e-05, "loss": 0.3222, "num_input_tokens_seen": 1194048, "step": 13255 }, { "epoch": 3.445945945945946, "grad_norm": 0.2772787809371948, "learning_rate": 4.920839706829226e-05, "loss": 0.2255, "num_input_tokens_seen": 1194512, "step": 13260 }, { "epoch": 3.447245322245322, "grad_norm": 0.41308319568634033, "learning_rate": 4.920698102483912e-05, "loss": 0.2063, "num_input_tokens_seen": 1194944, "step": 13265 }, { "epoch": 3.4485446985446986, "grad_norm": 0.33100587129592896, "learning_rate": 4.920556373639775e-05, "loss": 0.3014, "num_input_tokens_seen": 1195408, "step": 13270 }, { "epoch": 3.4498440748440746, "grad_norm": 0.21069443225860596, "learning_rate": 4.920414520304105e-05, "loss": 0.3674, "num_input_tokens_seen": 1195840, "step": 13275 }, { "epoch": 3.451143451143451, "grad_norm": 0.4069516062736511, "learning_rate": 4.920272542484197e-05, "loss": 0.2356, "num_input_tokens_seen": 1196272, "step": 13280 }, { "epoch": 3.4524428274428276, "grad_norm": 0.6979698538780212, "learning_rate": 4.920130440187352e-05, "loss": 0.2744, "num_input_tokens_seen": 1196704, "step": 13285 }, { "epoch": 3.4537422037422036, "grad_norm": 0.24698825180530548, "learning_rate": 4.919988213420881e-05, "loss": 0.2563, "num_input_tokens_seen": 1197168, "step": 13290 }, { "epoch": 3.45504158004158, "grad_norm": 0.38435837626457214, "learning_rate": 4.919845862192096e-05, "loss": 0.1817, "num_input_tokens_seen": 1197616, "step": 13295 }, { "epoch": 3.456340956340956, "grad_norm": 0.2919781804084778, "learning_rate": 4.9197033865083206e-05, "loss": 0.1697, "num_input_tokens_seen": 1198064, "step": 13300 }, { "epoch": 3.4576403326403327, "grad_norm": 0.22093465924263, "learning_rate": 4.919560786376882e-05, "loss": 0.2221, "num_input_tokens_seen": 1198496, "step": 13305 }, { "epoch": 3.4589397089397087, "grad_norm": 0.566008448600769, "learning_rate": 4.919418061805113e-05, "loss": 0.2221, "num_input_tokens_seen": 1198928, "step": 13310 }, { "epoch": 3.460239085239085, "grad_norm": 5.177806854248047, "learning_rate": 4.9192752128003554e-05, "loss": 0.2283, "num_input_tokens_seen": 1199376, "step": 13315 }, { "epoch": 3.4615384615384617, "grad_norm": 0.5731908082962036, "learning_rate": 4.9191322393699557e-05, "loss": 0.4857, "num_input_tokens_seen": 1199824, "step": 13320 }, { "epoch": 3.4628378378378377, "grad_norm": 0.8121353983879089, "learning_rate": 4.918989141521267e-05, "loss": 0.3225, "num_input_tokens_seen": 1200288, "step": 13325 }, { "epoch": 3.464137214137214, "grad_norm": 0.7337005734443665, "learning_rate": 4.9188459192616484e-05, "loss": 0.2513, "num_input_tokens_seen": 1200736, "step": 13330 }, { "epoch": 3.4654365904365902, "grad_norm": 0.3782058358192444, "learning_rate": 4.918702572598467e-05, "loss": 0.2495, "num_input_tokens_seen": 1201136, "step": 13335 }, { "epoch": 3.4667359667359667, "grad_norm": 0.3590981960296631, "learning_rate": 4.9185591015390955e-05, "loss": 0.2289, "num_input_tokens_seen": 1201584, "step": 13340 }, { "epoch": 3.468035343035343, "grad_norm": 0.3070550560951233, "learning_rate": 4.918415506090911e-05, "loss": 0.1586, "num_input_tokens_seen": 1202016, "step": 13345 }, { "epoch": 3.4693347193347193, "grad_norm": 0.33905577659606934, "learning_rate": 4.9182717862613e-05, "loss": 0.2854, "num_input_tokens_seen": 1202480, "step": 13350 }, { "epoch": 3.4706340956340958, "grad_norm": 0.5494893193244934, "learning_rate": 4.918127942057654e-05, "loss": 0.2787, "num_input_tokens_seen": 1202912, "step": 13355 }, { "epoch": 3.471933471933472, "grad_norm": 0.47839659452438354, "learning_rate": 4.917983973487371e-05, "loss": 0.3391, "num_input_tokens_seen": 1203360, "step": 13360 }, { "epoch": 3.4732328482328483, "grad_norm": 0.37688544392585754, "learning_rate": 4.917839880557855e-05, "loss": 0.3033, "num_input_tokens_seen": 1203808, "step": 13365 }, { "epoch": 3.4745322245322248, "grad_norm": 0.6560546159744263, "learning_rate": 4.917695663276518e-05, "loss": 0.2704, "num_input_tokens_seen": 1204240, "step": 13370 }, { "epoch": 3.475831600831601, "grad_norm": 0.3325541615486145, "learning_rate": 4.917551321650776e-05, "loss": 0.235, "num_input_tokens_seen": 1204640, "step": 13375 }, { "epoch": 3.4771309771309773, "grad_norm": 0.31788763403892517, "learning_rate": 4.917406855688054e-05, "loss": 0.3419, "num_input_tokens_seen": 1205056, "step": 13380 }, { "epoch": 3.4784303534303533, "grad_norm": 0.29890745878219604, "learning_rate": 4.917262265395781e-05, "loss": 0.1732, "num_input_tokens_seen": 1205504, "step": 13385 }, { "epoch": 3.47972972972973, "grad_norm": 0.31812065839767456, "learning_rate": 4.9171175507813924e-05, "loss": 0.3043, "num_input_tokens_seen": 1205936, "step": 13390 }, { "epoch": 3.481029106029106, "grad_norm": 0.22157463431358337, "learning_rate": 4.916972711852333e-05, "loss": 0.3398, "num_input_tokens_seen": 1206368, "step": 13395 }, { "epoch": 3.4823284823284824, "grad_norm": 0.18362173438072205, "learning_rate": 4.916827748616052e-05, "loss": 0.2656, "num_input_tokens_seen": 1206816, "step": 13400 }, { "epoch": 3.483627858627859, "grad_norm": 0.4789181053638458, "learning_rate": 4.9166826610800035e-05, "loss": 0.278, "num_input_tokens_seen": 1207248, "step": 13405 }, { "epoch": 3.484927234927235, "grad_norm": 0.4360195994377136, "learning_rate": 4.916537449251649e-05, "loss": 0.2535, "num_input_tokens_seen": 1207712, "step": 13410 }, { "epoch": 3.4862266112266114, "grad_norm": 0.19093133509159088, "learning_rate": 4.916392113138459e-05, "loss": 0.2956, "num_input_tokens_seen": 1208176, "step": 13415 }, { "epoch": 3.4875259875259874, "grad_norm": 0.5647348761558533, "learning_rate": 4.916246652747908e-05, "loss": 0.3278, "num_input_tokens_seen": 1208672, "step": 13420 }, { "epoch": 3.488825363825364, "grad_norm": 0.5351153016090393, "learning_rate": 4.916101068087476e-05, "loss": 0.2833, "num_input_tokens_seen": 1209104, "step": 13425 }, { "epoch": 3.49012474012474, "grad_norm": 0.7658098936080933, "learning_rate": 4.915955359164651e-05, "loss": 0.2997, "num_input_tokens_seen": 1209520, "step": 13430 }, { "epoch": 3.4914241164241164, "grad_norm": 0.5290349125862122, "learning_rate": 4.9158095259869274e-05, "loss": 0.2878, "num_input_tokens_seen": 1209968, "step": 13435 }, { "epoch": 3.492723492723493, "grad_norm": 0.3342897593975067, "learning_rate": 4.9156635685618045e-05, "loss": 0.2346, "num_input_tokens_seen": 1210400, "step": 13440 }, { "epoch": 3.494022869022869, "grad_norm": 0.24153606593608856, "learning_rate": 4.9155174868967904e-05, "loss": 0.3044, "num_input_tokens_seen": 1210880, "step": 13445 }, { "epoch": 3.4953222453222454, "grad_norm": 0.2917959988117218, "learning_rate": 4.915371280999397e-05, "loss": 0.2593, "num_input_tokens_seen": 1211328, "step": 13450 }, { "epoch": 3.4966216216216215, "grad_norm": 0.2689143121242523, "learning_rate": 4.9152249508771445e-05, "loss": 0.1707, "num_input_tokens_seen": 1211792, "step": 13455 }, { "epoch": 3.497920997920998, "grad_norm": 0.22674255073070526, "learning_rate": 4.9150784965375586e-05, "loss": 0.214, "num_input_tokens_seen": 1212240, "step": 13460 }, { "epoch": 3.499220374220374, "grad_norm": 0.3240133821964264, "learning_rate": 4.9149319179881716e-05, "loss": 0.2216, "num_input_tokens_seen": 1212720, "step": 13465 }, { "epoch": 3.5005197505197505, "grad_norm": 0.27914223074913025, "learning_rate": 4.914785215236522e-05, "loss": 0.2127, "num_input_tokens_seen": 1213168, "step": 13470 }, { "epoch": 3.501819126819127, "grad_norm": 0.2983192205429077, "learning_rate": 4.9146383882901555e-05, "loss": 0.1907, "num_input_tokens_seen": 1213600, "step": 13475 }, { "epoch": 3.503118503118503, "grad_norm": 0.24129506945610046, "learning_rate": 4.914491437156623e-05, "loss": 0.3714, "num_input_tokens_seen": 1214032, "step": 13480 }, { "epoch": 3.5044178794178795, "grad_norm": 0.3160611093044281, "learning_rate": 4.914344361843482e-05, "loss": 0.249, "num_input_tokens_seen": 1214448, "step": 13485 }, { "epoch": 3.5057172557172556, "grad_norm": 0.28285062313079834, "learning_rate": 4.914197162358297e-05, "loss": 0.2561, "num_input_tokens_seen": 1214912, "step": 13490 }, { "epoch": 3.507016632016632, "grad_norm": 0.34834277629852295, "learning_rate": 4.9140498387086396e-05, "loss": 0.1908, "num_input_tokens_seen": 1215392, "step": 13495 }, { "epoch": 3.508316008316008, "grad_norm": 0.40483570098876953, "learning_rate": 4.913902390902085e-05, "loss": 0.2895, "num_input_tokens_seen": 1215840, "step": 13500 }, { "epoch": 3.5096153846153846, "grad_norm": 0.3152134120464325, "learning_rate": 4.913754818946219e-05, "loss": 0.1327, "num_input_tokens_seen": 1216304, "step": 13505 }, { "epoch": 3.510914760914761, "grad_norm": 0.3393908441066742, "learning_rate": 4.913607122848628e-05, "loss": 0.2677, "num_input_tokens_seen": 1216768, "step": 13510 }, { "epoch": 3.512214137214137, "grad_norm": 0.21538971364498138, "learning_rate": 4.913459302616912e-05, "loss": 0.2627, "num_input_tokens_seen": 1217216, "step": 13515 }, { "epoch": 3.5135135135135136, "grad_norm": 0.2905784547328949, "learning_rate": 4.91331135825867e-05, "loss": 0.319, "num_input_tokens_seen": 1217664, "step": 13520 }, { "epoch": 3.51481288981289, "grad_norm": 0.2653823494911194, "learning_rate": 4.913163289781514e-05, "loss": 0.2131, "num_input_tokens_seen": 1218096, "step": 13525 }, { "epoch": 3.516112266112266, "grad_norm": 0.22797317802906036, "learning_rate": 4.913015097193057e-05, "loss": 0.2961, "num_input_tokens_seen": 1218560, "step": 13530 }, { "epoch": 3.517411642411642, "grad_norm": 0.5570852756500244, "learning_rate": 4.912866780500921e-05, "loss": 0.3445, "num_input_tokens_seen": 1219056, "step": 13535 }, { "epoch": 3.5187110187110187, "grad_norm": 0.12789739668369293, "learning_rate": 4.912718339712735e-05, "loss": 0.2884, "num_input_tokens_seen": 1219504, "step": 13540 }, { "epoch": 3.520010395010395, "grad_norm": 0.7642223834991455, "learning_rate": 4.9125697748361335e-05, "loss": 0.2987, "num_input_tokens_seen": 1219952, "step": 13545 }, { "epoch": 3.521309771309771, "grad_norm": 3.1181650161743164, "learning_rate": 4.912421085878757e-05, "loss": 0.2859, "num_input_tokens_seen": 1220432, "step": 13550 }, { "epoch": 3.5226091476091477, "grad_norm": 0.46060940623283386, "learning_rate": 4.912272272848252e-05, "loss": 0.2734, "num_input_tokens_seen": 1220880, "step": 13555 }, { "epoch": 3.523908523908524, "grad_norm": 0.20079553127288818, "learning_rate": 4.9121233357522724e-05, "loss": 0.2789, "num_input_tokens_seen": 1221312, "step": 13560 }, { "epoch": 3.5252079002079, "grad_norm": 0.367895245552063, "learning_rate": 4.911974274598479e-05, "loss": 0.2089, "num_input_tokens_seen": 1221792, "step": 13565 }, { "epoch": 3.5265072765072762, "grad_norm": 0.3754718005657196, "learning_rate": 4.911825089394537e-05, "loss": 0.2451, "num_input_tokens_seen": 1222256, "step": 13570 }, { "epoch": 3.5278066528066527, "grad_norm": 0.4842202365398407, "learning_rate": 4.911675780148121e-05, "loss": 0.2568, "num_input_tokens_seen": 1222704, "step": 13575 }, { "epoch": 3.529106029106029, "grad_norm": 0.33844760060310364, "learning_rate": 4.911526346866907e-05, "loss": 0.3118, "num_input_tokens_seen": 1223152, "step": 13580 }, { "epoch": 3.5304054054054053, "grad_norm": 0.29965758323669434, "learning_rate": 4.911376789558584e-05, "loss": 0.2557, "num_input_tokens_seen": 1223664, "step": 13585 }, { "epoch": 3.5317047817047817, "grad_norm": 0.28954097628593445, "learning_rate": 4.9112271082308415e-05, "loss": 0.1857, "num_input_tokens_seen": 1224128, "step": 13590 }, { "epoch": 3.5330041580041582, "grad_norm": 0.32391873002052307, "learning_rate": 4.9110773028913785e-05, "loss": 0.2217, "num_input_tokens_seen": 1224592, "step": 13595 }, { "epoch": 3.5343035343035343, "grad_norm": 0.694067656993866, "learning_rate": 4.9109273735479e-05, "loss": 0.3474, "num_input_tokens_seen": 1225040, "step": 13600 }, { "epoch": 3.5356029106029108, "grad_norm": 0.3846363425254822, "learning_rate": 4.910777320208117e-05, "loss": 0.3618, "num_input_tokens_seen": 1225488, "step": 13605 }, { "epoch": 3.536902286902287, "grad_norm": 0.4047725796699524, "learning_rate": 4.9106271428797455e-05, "loss": 0.2365, "num_input_tokens_seen": 1225968, "step": 13610 }, { "epoch": 3.5382016632016633, "grad_norm": 0.22199323773384094, "learning_rate": 4.9104768415705106e-05, "loss": 0.2946, "num_input_tokens_seen": 1226416, "step": 13615 }, { "epoch": 3.5395010395010393, "grad_norm": 0.19435536861419678, "learning_rate": 4.910326416288142e-05, "loss": 0.2642, "num_input_tokens_seen": 1226864, "step": 13620 }, { "epoch": 3.540800415800416, "grad_norm": 0.1902359426021576, "learning_rate": 4.910175867040377e-05, "loss": 0.2865, "num_input_tokens_seen": 1227312, "step": 13625 }, { "epoch": 3.5420997920997923, "grad_norm": 0.5475410223007202, "learning_rate": 4.910025193834957e-05, "loss": 0.2652, "num_input_tokens_seen": 1227744, "step": 13630 }, { "epoch": 3.5433991683991684, "grad_norm": 0.4139593243598938, "learning_rate": 4.909874396679633e-05, "loss": 0.2404, "num_input_tokens_seen": 1228192, "step": 13635 }, { "epoch": 3.544698544698545, "grad_norm": 0.36235666275024414, "learning_rate": 4.9097234755821595e-05, "loss": 0.2294, "num_input_tokens_seen": 1228656, "step": 13640 }, { "epoch": 3.545997920997921, "grad_norm": 0.35000836849212646, "learning_rate": 4.909572430550299e-05, "loss": 0.2089, "num_input_tokens_seen": 1229072, "step": 13645 }, { "epoch": 3.5472972972972974, "grad_norm": 0.32278499007225037, "learning_rate": 4.9094212615918186e-05, "loss": 0.2558, "num_input_tokens_seen": 1229536, "step": 13650 }, { "epoch": 3.5485966735966734, "grad_norm": 0.28844693303108215, "learning_rate": 4.909269968714495e-05, "loss": 0.0825, "num_input_tokens_seen": 1229984, "step": 13655 }, { "epoch": 3.54989604989605, "grad_norm": 0.2484765648841858, "learning_rate": 4.909118551926108e-05, "loss": 0.2266, "num_input_tokens_seen": 1230432, "step": 13660 }, { "epoch": 3.5511954261954264, "grad_norm": 0.27108776569366455, "learning_rate": 4.9089670112344456e-05, "loss": 0.3154, "num_input_tokens_seen": 1230896, "step": 13665 }, { "epoch": 3.5524948024948024, "grad_norm": 0.457725465297699, "learning_rate": 4.908815346647302e-05, "loss": 0.208, "num_input_tokens_seen": 1231392, "step": 13670 }, { "epoch": 3.553794178794179, "grad_norm": 0.657808244228363, "learning_rate": 4.9086635581724774e-05, "loss": 0.4175, "num_input_tokens_seen": 1231824, "step": 13675 }, { "epoch": 3.555093555093555, "grad_norm": 0.5780637264251709, "learning_rate": 4.908511645817777e-05, "loss": 0.2726, "num_input_tokens_seen": 1232256, "step": 13680 }, { "epoch": 3.5563929313929314, "grad_norm": 0.22129926085472107, "learning_rate": 4.908359609591016e-05, "loss": 0.2594, "num_input_tokens_seen": 1232688, "step": 13685 }, { "epoch": 3.5576923076923075, "grad_norm": 0.1770630031824112, "learning_rate": 4.9082074495000116e-05, "loss": 0.2494, "num_input_tokens_seen": 1233104, "step": 13690 }, { "epoch": 3.558991683991684, "grad_norm": 0.3837307393550873, "learning_rate": 4.908055165552592e-05, "loss": 0.2099, "num_input_tokens_seen": 1233568, "step": 13695 }, { "epoch": 3.5602910602910605, "grad_norm": 0.3262314796447754, "learning_rate": 4.907902757756587e-05, "loss": 0.2569, "num_input_tokens_seen": 1234032, "step": 13700 }, { "epoch": 3.5615904365904365, "grad_norm": 0.49773865938186646, "learning_rate": 4.907750226119837e-05, "loss": 0.3075, "num_input_tokens_seen": 1234496, "step": 13705 }, { "epoch": 3.562889812889813, "grad_norm": 0.3960755467414856, "learning_rate": 4.907597570650185e-05, "loss": 0.2287, "num_input_tokens_seen": 1234960, "step": 13710 }, { "epoch": 3.564189189189189, "grad_norm": 0.7683447599411011, "learning_rate": 4.907444791355483e-05, "loss": 0.315, "num_input_tokens_seen": 1235392, "step": 13715 }, { "epoch": 3.5654885654885655, "grad_norm": 0.39811640977859497, "learning_rate": 4.9072918882435894e-05, "loss": 0.2337, "num_input_tokens_seen": 1235840, "step": 13720 }, { "epoch": 3.5667879417879416, "grad_norm": 0.370919406414032, "learning_rate": 4.907138861322367e-05, "loss": 0.2363, "num_input_tokens_seen": 1236272, "step": 13725 }, { "epoch": 3.568087318087318, "grad_norm": 0.37424734234809875, "learning_rate": 4.906985710599686e-05, "loss": 0.3853, "num_input_tokens_seen": 1236768, "step": 13730 }, { "epoch": 3.5693866943866945, "grad_norm": 0.411577969789505, "learning_rate": 4.9068324360834245e-05, "loss": 0.2609, "num_input_tokens_seen": 1237200, "step": 13735 }, { "epoch": 3.5706860706860706, "grad_norm": 0.35649439692497253, "learning_rate": 4.9066790377814643e-05, "loss": 0.2115, "num_input_tokens_seen": 1237664, "step": 13740 }, { "epoch": 3.571985446985447, "grad_norm": 0.31402450799942017, "learning_rate": 4.906525515701695e-05, "loss": 0.2237, "num_input_tokens_seen": 1238096, "step": 13745 }, { "epoch": 3.5732848232848236, "grad_norm": 0.25386422872543335, "learning_rate": 4.906371869852013e-05, "loss": 0.2204, "num_input_tokens_seen": 1238544, "step": 13750 }, { "epoch": 3.5745841995841996, "grad_norm": 0.29980000853538513, "learning_rate": 4.906218100240321e-05, "loss": 0.3671, "num_input_tokens_seen": 1238992, "step": 13755 }, { "epoch": 3.5758835758835756, "grad_norm": 0.2661604583263397, "learning_rate": 4.906064206874525e-05, "loss": 0.2222, "num_input_tokens_seen": 1239440, "step": 13760 }, { "epoch": 3.577182952182952, "grad_norm": 0.28886622190475464, "learning_rate": 4.905910189762542e-05, "loss": 0.2567, "num_input_tokens_seen": 1239904, "step": 13765 }, { "epoch": 3.5784823284823286, "grad_norm": 0.3237861692905426, "learning_rate": 4.905756048912293e-05, "loss": 0.2192, "num_input_tokens_seen": 1240368, "step": 13770 }, { "epoch": 3.5797817047817047, "grad_norm": 0.6632832884788513, "learning_rate": 4.905601784331705e-05, "loss": 0.3535, "num_input_tokens_seen": 1240848, "step": 13775 }, { "epoch": 3.581081081081081, "grad_norm": 0.3869102895259857, "learning_rate": 4.9054473960287116e-05, "loss": 0.2353, "num_input_tokens_seen": 1241280, "step": 13780 }, { "epoch": 3.5823804573804576, "grad_norm": 0.25254613161087036, "learning_rate": 4.9052928840112555e-05, "loss": 0.2707, "num_input_tokens_seen": 1241744, "step": 13785 }, { "epoch": 3.5836798336798337, "grad_norm": 0.3495674431324005, "learning_rate": 4.90513824828728e-05, "loss": 0.2697, "num_input_tokens_seen": 1242192, "step": 13790 }, { "epoch": 3.5849792099792097, "grad_norm": 0.19759172201156616, "learning_rate": 4.904983488864741e-05, "loss": 0.2887, "num_input_tokens_seen": 1242656, "step": 13795 }, { "epoch": 3.586278586278586, "grad_norm": 0.17954139411449432, "learning_rate": 4.904828605751597e-05, "loss": 0.2431, "num_input_tokens_seen": 1243152, "step": 13800 }, { "epoch": 3.5875779625779627, "grad_norm": 0.3140222728252411, "learning_rate": 4.904673598955813e-05, "loss": 0.198, "num_input_tokens_seen": 1243584, "step": 13805 }, { "epoch": 3.5888773388773387, "grad_norm": 0.3668217658996582, "learning_rate": 4.904518468485362e-05, "loss": 0.2623, "num_input_tokens_seen": 1244064, "step": 13810 }, { "epoch": 3.590176715176715, "grad_norm": 0.23251396417617798, "learning_rate": 4.904363214348222e-05, "loss": 0.1562, "num_input_tokens_seen": 1244528, "step": 13815 }, { "epoch": 3.5914760914760917, "grad_norm": 0.2282695472240448, "learning_rate": 4.904207836552378e-05, "loss": 0.1446, "num_input_tokens_seen": 1244960, "step": 13820 }, { "epoch": 3.5927754677754677, "grad_norm": 0.5985009670257568, "learning_rate": 4.904052335105822e-05, "loss": 0.2184, "num_input_tokens_seen": 1245424, "step": 13825 }, { "epoch": 3.5940748440748442, "grad_norm": 0.2020372748374939, "learning_rate": 4.903896710016551e-05, "loss": 0.2832, "num_input_tokens_seen": 1245904, "step": 13830 }, { "epoch": 3.5953742203742203, "grad_norm": 0.21841222047805786, "learning_rate": 4.9037409612925675e-05, "loss": 0.2186, "num_input_tokens_seen": 1246336, "step": 13835 }, { "epoch": 3.5966735966735968, "grad_norm": 0.35294172167778015, "learning_rate": 4.903585088941885e-05, "loss": 0.3229, "num_input_tokens_seen": 1246752, "step": 13840 }, { "epoch": 3.597972972972973, "grad_norm": 0.25775331258773804, "learning_rate": 4.9034290929725174e-05, "loss": 0.1163, "num_input_tokens_seen": 1247184, "step": 13845 }, { "epoch": 3.5992723492723493, "grad_norm": 0.6236313581466675, "learning_rate": 4.9032729733924885e-05, "loss": 0.3011, "num_input_tokens_seen": 1247648, "step": 13850 }, { "epoch": 3.600571725571726, "grad_norm": 0.3050759732723236, "learning_rate": 4.9031167302098294e-05, "loss": 0.2684, "num_input_tokens_seen": 1248048, "step": 13855 }, { "epoch": 3.601871101871102, "grad_norm": 0.278255820274353, "learning_rate": 4.9029603634325726e-05, "loss": 0.2104, "num_input_tokens_seen": 1248512, "step": 13860 }, { "epoch": 3.6031704781704783, "grad_norm": 0.25323382019996643, "learning_rate": 4.902803873068763e-05, "loss": 0.2077, "num_input_tokens_seen": 1248976, "step": 13865 }, { "epoch": 3.6044698544698544, "grad_norm": 0.2441430687904358, "learning_rate": 4.902647259126447e-05, "loss": 0.1976, "num_input_tokens_seen": 1249392, "step": 13870 }, { "epoch": 3.605769230769231, "grad_norm": 0.22002895176410675, "learning_rate": 4.902490521613681e-05, "loss": 0.2141, "num_input_tokens_seen": 1249856, "step": 13875 }, { "epoch": 3.607068607068607, "grad_norm": 0.29075589776039124, "learning_rate": 4.902333660538525e-05, "loss": 0.2737, "num_input_tokens_seen": 1250304, "step": 13880 }, { "epoch": 3.6083679833679834, "grad_norm": 0.20842903852462769, "learning_rate": 4.902176675909047e-05, "loss": 0.1539, "num_input_tokens_seen": 1250768, "step": 13885 }, { "epoch": 3.60966735966736, "grad_norm": 0.3290092349052429, "learning_rate": 4.902019567733321e-05, "loss": 0.268, "num_input_tokens_seen": 1251248, "step": 13890 }, { "epoch": 3.610966735966736, "grad_norm": 0.3249741494655609, "learning_rate": 4.9018623360194284e-05, "loss": 0.2134, "num_input_tokens_seen": 1251712, "step": 13895 }, { "epoch": 3.6122661122661124, "grad_norm": 0.6790873408317566, "learning_rate": 4.9017049807754534e-05, "loss": 0.266, "num_input_tokens_seen": 1252176, "step": 13900 }, { "epoch": 3.6135654885654884, "grad_norm": 0.24411599338054657, "learning_rate": 4.90154750200949e-05, "loss": 0.2729, "num_input_tokens_seen": 1252640, "step": 13905 }, { "epoch": 3.614864864864865, "grad_norm": 0.26177164912223816, "learning_rate": 4.901389899729638e-05, "loss": 0.2179, "num_input_tokens_seen": 1253088, "step": 13910 }, { "epoch": 3.616164241164241, "grad_norm": 0.2520216405391693, "learning_rate": 4.901232173944001e-05, "loss": 0.2434, "num_input_tokens_seen": 1253536, "step": 13915 }, { "epoch": 3.6174636174636174, "grad_norm": 0.24095357954502106, "learning_rate": 4.901074324660695e-05, "loss": 0.3, "num_input_tokens_seen": 1254000, "step": 13920 }, { "epoch": 3.618762993762994, "grad_norm": 0.2649507522583008, "learning_rate": 4.900916351887834e-05, "loss": 0.1413, "num_input_tokens_seen": 1254432, "step": 13925 }, { "epoch": 3.62006237006237, "grad_norm": 0.22583192586898804, "learning_rate": 4.9007582556335454e-05, "loss": 0.1808, "num_input_tokens_seen": 1254864, "step": 13930 }, { "epoch": 3.6213617463617465, "grad_norm": 0.19628332555294037, "learning_rate": 4.900600035905959e-05, "loss": 0.2197, "num_input_tokens_seen": 1255312, "step": 13935 }, { "epoch": 3.6226611226611225, "grad_norm": 0.32494381070137024, "learning_rate": 4.900441692713213e-05, "loss": 0.3891, "num_input_tokens_seen": 1255728, "step": 13940 }, { "epoch": 3.623960498960499, "grad_norm": 0.2943713068962097, "learning_rate": 4.90028322606345e-05, "loss": 0.212, "num_input_tokens_seen": 1256128, "step": 13945 }, { "epoch": 3.625259875259875, "grad_norm": 0.25515520572662354, "learning_rate": 4.9001246359648224e-05, "loss": 0.2202, "num_input_tokens_seen": 1256624, "step": 13950 }, { "epoch": 3.6265592515592515, "grad_norm": 0.25358229875564575, "learning_rate": 4.899965922425483e-05, "loss": 0.2542, "num_input_tokens_seen": 1257088, "step": 13955 }, { "epoch": 3.627858627858628, "grad_norm": 0.2681500315666199, "learning_rate": 4.8998070854535984e-05, "loss": 0.25, "num_input_tokens_seen": 1257520, "step": 13960 }, { "epoch": 3.629158004158004, "grad_norm": 0.25791916251182556, "learning_rate": 4.899648125057336e-05, "loss": 0.225, "num_input_tokens_seen": 1257968, "step": 13965 }, { "epoch": 3.6304573804573805, "grad_norm": 0.26554808020591736, "learning_rate": 4.89948904124487e-05, "loss": 0.2957, "num_input_tokens_seen": 1258384, "step": 13970 }, { "epoch": 3.631756756756757, "grad_norm": 0.5647695660591125, "learning_rate": 4.899329834024384e-05, "loss": 0.3458, "num_input_tokens_seen": 1258896, "step": 13975 }, { "epoch": 3.633056133056133, "grad_norm": 0.38494008779525757, "learning_rate": 4.899170503404066e-05, "loss": 0.2385, "num_input_tokens_seen": 1259376, "step": 13980 }, { "epoch": 3.634355509355509, "grad_norm": 0.20109248161315918, "learning_rate": 4.8990110493921105e-05, "loss": 0.2403, "num_input_tokens_seen": 1259776, "step": 13985 }, { "epoch": 3.6356548856548856, "grad_norm": 0.3177633285522461, "learning_rate": 4.8988514719967175e-05, "loss": 0.2606, "num_input_tokens_seen": 1260240, "step": 13990 }, { "epoch": 3.636954261954262, "grad_norm": 0.32741954922676086, "learning_rate": 4.898691771226095e-05, "loss": 0.2878, "num_input_tokens_seen": 1260704, "step": 13995 }, { "epoch": 3.638253638253638, "grad_norm": 0.20380619168281555, "learning_rate": 4.898531947088457e-05, "loss": 0.255, "num_input_tokens_seen": 1261152, "step": 14000 }, { "epoch": 3.6395530145530146, "grad_norm": 0.32267704606056213, "learning_rate": 4.898371999592022e-05, "loss": 0.2674, "num_input_tokens_seen": 1261568, "step": 14005 }, { "epoch": 3.640852390852391, "grad_norm": 0.3025676906108856, "learning_rate": 4.898211928745017e-05, "loss": 0.2637, "num_input_tokens_seen": 1262016, "step": 14010 }, { "epoch": 3.642151767151767, "grad_norm": 0.26010745763778687, "learning_rate": 4.898051734555676e-05, "loss": 0.261, "num_input_tokens_seen": 1262448, "step": 14015 }, { "epoch": 3.643451143451143, "grad_norm": 0.3463980555534363, "learning_rate": 4.897891417032235e-05, "loss": 0.2563, "num_input_tokens_seen": 1262896, "step": 14020 }, { "epoch": 3.6447505197505197, "grad_norm": 0.21253080666065216, "learning_rate": 4.8977309761829416e-05, "loss": 0.3047, "num_input_tokens_seen": 1263344, "step": 14025 }, { "epoch": 3.646049896049896, "grad_norm": 0.5235334634780884, "learning_rate": 4.8975704120160464e-05, "loss": 0.2768, "num_input_tokens_seen": 1263808, "step": 14030 }, { "epoch": 3.647349272349272, "grad_norm": 0.1622868925333023, "learning_rate": 4.897409724539808e-05, "loss": 0.3009, "num_input_tokens_seen": 1264272, "step": 14035 }, { "epoch": 3.6486486486486487, "grad_norm": 0.17738361656665802, "learning_rate": 4.89724891376249e-05, "loss": 0.2785, "num_input_tokens_seen": 1264704, "step": 14040 }, { "epoch": 3.649948024948025, "grad_norm": 0.19759100675582886, "learning_rate": 4.8970879796923635e-05, "loss": 0.2757, "num_input_tokens_seen": 1265152, "step": 14045 }, { "epoch": 3.651247401247401, "grad_norm": 0.1353798359632492, "learning_rate": 4.8969269223377056e-05, "loss": 0.2998, "num_input_tokens_seen": 1265584, "step": 14050 }, { "epoch": 3.6525467775467777, "grad_norm": 0.5730398893356323, "learning_rate": 4.896765741706799e-05, "loss": 0.2918, "num_input_tokens_seen": 1266064, "step": 14055 }, { "epoch": 3.6538461538461537, "grad_norm": 0.3891242444515228, "learning_rate": 4.896604437807935e-05, "loss": 0.2697, "num_input_tokens_seen": 1266528, "step": 14060 }, { "epoch": 3.6551455301455302, "grad_norm": 0.19336260855197906, "learning_rate": 4.8964430106494075e-05, "loss": 0.261, "num_input_tokens_seen": 1266976, "step": 14065 }, { "epoch": 3.6564449064449063, "grad_norm": 0.289054811000824, "learning_rate": 4.89628146023952e-05, "loss": 0.1993, "num_input_tokens_seen": 1267392, "step": 14070 }, { "epoch": 3.6577442827442828, "grad_norm": 0.2383560687303543, "learning_rate": 4.896119786586581e-05, "loss": 0.1772, "num_input_tokens_seen": 1267872, "step": 14075 }, { "epoch": 3.6590436590436592, "grad_norm": 0.205484077334404, "learning_rate": 4.8959579896989054e-05, "loss": 0.2081, "num_input_tokens_seen": 1268272, "step": 14080 }, { "epoch": 3.6603430353430353, "grad_norm": 0.190065398812294, "learning_rate": 4.895796069584815e-05, "loss": 0.257, "num_input_tokens_seen": 1268704, "step": 14085 }, { "epoch": 3.6616424116424118, "grad_norm": 0.2372569888830185, "learning_rate": 4.895634026252637e-05, "loss": 0.3519, "num_input_tokens_seen": 1269152, "step": 14090 }, { "epoch": 3.662941787941788, "grad_norm": 0.2007235586643219, "learning_rate": 4.895471859710705e-05, "loss": 0.0979, "num_input_tokens_seen": 1269568, "step": 14095 }, { "epoch": 3.6642411642411643, "grad_norm": 0.6201173067092896, "learning_rate": 4.895309569967361e-05, "loss": 0.3616, "num_input_tokens_seen": 1270000, "step": 14100 }, { "epoch": 3.6655405405405403, "grad_norm": 0.2559451162815094, "learning_rate": 4.895147157030951e-05, "loss": 0.3003, "num_input_tokens_seen": 1270480, "step": 14105 }, { "epoch": 3.666839916839917, "grad_norm": 0.3061884939670563, "learning_rate": 4.894984620909827e-05, "loss": 0.3299, "num_input_tokens_seen": 1270912, "step": 14110 }, { "epoch": 3.6681392931392933, "grad_norm": 0.19359900057315826, "learning_rate": 4.894821961612349e-05, "loss": 0.311, "num_input_tokens_seen": 1271344, "step": 14115 }, { "epoch": 3.6694386694386694, "grad_norm": 0.5224788188934326, "learning_rate": 4.894659179146883e-05, "loss": 0.2614, "num_input_tokens_seen": 1271792, "step": 14120 }, { "epoch": 3.670738045738046, "grad_norm": 0.49664106965065, "learning_rate": 4.894496273521802e-05, "loss": 0.2805, "num_input_tokens_seen": 1272256, "step": 14125 }, { "epoch": 3.672037422037422, "grad_norm": 0.1827949434518814, "learning_rate": 4.894333244745483e-05, "loss": 0.2462, "num_input_tokens_seen": 1272704, "step": 14130 }, { "epoch": 3.6733367983367984, "grad_norm": 0.34703946113586426, "learning_rate": 4.89417009282631e-05, "loss": 0.1998, "num_input_tokens_seen": 1273168, "step": 14135 }, { "epoch": 3.6746361746361744, "grad_norm": 0.3761211931705475, "learning_rate": 4.894006817772676e-05, "loss": 0.2609, "num_input_tokens_seen": 1273616, "step": 14140 }, { "epoch": 3.675935550935551, "grad_norm": 0.2917877733707428, "learning_rate": 4.893843419592977e-05, "loss": 0.2152, "num_input_tokens_seen": 1274080, "step": 14145 }, { "epoch": 3.6772349272349274, "grad_norm": 0.3112621009349823, "learning_rate": 4.893679898295618e-05, "loss": 0.1492, "num_input_tokens_seen": 1274560, "step": 14150 }, { "epoch": 3.6785343035343034, "grad_norm": 0.5965801477432251, "learning_rate": 4.893516253889008e-05, "loss": 0.2657, "num_input_tokens_seen": 1275008, "step": 14155 }, { "epoch": 3.67983367983368, "grad_norm": 0.40195098519325256, "learning_rate": 4.893352486381564e-05, "loss": 0.4315, "num_input_tokens_seen": 1275456, "step": 14160 }, { "epoch": 3.681133056133056, "grad_norm": 0.34016770124435425, "learning_rate": 4.893188595781708e-05, "loss": 0.1851, "num_input_tokens_seen": 1275920, "step": 14165 }, { "epoch": 3.6824324324324325, "grad_norm": 0.2314443588256836, "learning_rate": 4.893024582097869e-05, "loss": 0.2571, "num_input_tokens_seen": 1276416, "step": 14170 }, { "epoch": 3.6837318087318085, "grad_norm": 0.2794255018234253, "learning_rate": 4.892860445338484e-05, "loss": 0.1822, "num_input_tokens_seen": 1276896, "step": 14175 }, { "epoch": 3.685031185031185, "grad_norm": 0.20167280733585358, "learning_rate": 4.892696185511993e-05, "loss": 0.1652, "num_input_tokens_seen": 1277328, "step": 14180 }, { "epoch": 3.6863305613305615, "grad_norm": 0.18947681784629822, "learning_rate": 4.892531802626844e-05, "loss": 0.1478, "num_input_tokens_seen": 1277760, "step": 14185 }, { "epoch": 3.6876299376299375, "grad_norm": 0.2035616934299469, "learning_rate": 4.892367296691493e-05, "loss": 0.2886, "num_input_tokens_seen": 1278256, "step": 14190 }, { "epoch": 3.688929313929314, "grad_norm": 0.18614110350608826, "learning_rate": 4.8922026677144e-05, "loss": 0.145, "num_input_tokens_seen": 1278704, "step": 14195 }, { "epoch": 3.6902286902286905, "grad_norm": 0.35838210582733154, "learning_rate": 4.8920379157040304e-05, "loss": 0.401, "num_input_tokens_seen": 1279168, "step": 14200 }, { "epoch": 3.6915280665280665, "grad_norm": 0.236890509724617, "learning_rate": 4.89187304066886e-05, "loss": 0.2242, "num_input_tokens_seen": 1279664, "step": 14205 }, { "epoch": 3.6928274428274426, "grad_norm": 0.23618347942829132, "learning_rate": 4.891708042617366e-05, "loss": 0.1213, "num_input_tokens_seen": 1280096, "step": 14210 }, { "epoch": 3.694126819126819, "grad_norm": 0.2028961181640625, "learning_rate": 4.891542921558036e-05, "loss": 0.3006, "num_input_tokens_seen": 1280544, "step": 14215 }, { "epoch": 3.6954261954261955, "grad_norm": 0.5557616949081421, "learning_rate": 4.891377677499363e-05, "loss": 0.3111, "num_input_tokens_seen": 1280976, "step": 14220 }, { "epoch": 3.6967255717255716, "grad_norm": 0.27125072479248047, "learning_rate": 4.891212310449844e-05, "loss": 0.3079, "num_input_tokens_seen": 1281424, "step": 14225 }, { "epoch": 3.698024948024948, "grad_norm": 0.2638159394264221, "learning_rate": 4.891046820417985e-05, "loss": 0.2598, "num_input_tokens_seen": 1281872, "step": 14230 }, { "epoch": 3.6993243243243246, "grad_norm": 0.1964157670736313, "learning_rate": 4.890881207412298e-05, "loss": 0.26, "num_input_tokens_seen": 1282304, "step": 14235 }, { "epoch": 3.7006237006237006, "grad_norm": 0.18654341995716095, "learning_rate": 4.8907154714412984e-05, "loss": 0.256, "num_input_tokens_seen": 1282752, "step": 14240 }, { "epoch": 3.7019230769230766, "grad_norm": 0.23411399126052856, "learning_rate": 4.890549612513512e-05, "loss": 0.2264, "num_input_tokens_seen": 1283248, "step": 14245 }, { "epoch": 3.703222453222453, "grad_norm": 0.20867560803890228, "learning_rate": 4.890383630637468e-05, "loss": 0.2792, "num_input_tokens_seen": 1283712, "step": 14250 }, { "epoch": 3.7045218295218296, "grad_norm": 0.20337843894958496, "learning_rate": 4.890217525821704e-05, "loss": 0.2505, "num_input_tokens_seen": 1284192, "step": 14255 }, { "epoch": 3.7058212058212057, "grad_norm": 0.21968938410282135, "learning_rate": 4.890051298074762e-05, "loss": 0.284, "num_input_tokens_seen": 1284704, "step": 14260 }, { "epoch": 3.707120582120582, "grad_norm": 0.22740896046161652, "learning_rate": 4.8898849474051924e-05, "loss": 0.279, "num_input_tokens_seen": 1285168, "step": 14265 }, { "epoch": 3.7084199584199586, "grad_norm": 0.1937534511089325, "learning_rate": 4.88971847382155e-05, "loss": 0.2901, "num_input_tokens_seen": 1285600, "step": 14270 }, { "epoch": 3.7097193347193347, "grad_norm": 0.40090471506118774, "learning_rate": 4.889551877332396e-05, "loss": 0.2596, "num_input_tokens_seen": 1286064, "step": 14275 }, { "epoch": 3.711018711018711, "grad_norm": 0.3368026912212372, "learning_rate": 4.8893851579462993e-05, "loss": 0.2005, "num_input_tokens_seen": 1286480, "step": 14280 }, { "epoch": 3.712318087318087, "grad_norm": 0.23417574167251587, "learning_rate": 4.8892183156718354e-05, "loss": 0.229, "num_input_tokens_seen": 1286944, "step": 14285 }, { "epoch": 3.7136174636174637, "grad_norm": 0.22993865609169006, "learning_rate": 4.889051350517584e-05, "loss": 0.2166, "num_input_tokens_seen": 1287408, "step": 14290 }, { "epoch": 3.7149168399168397, "grad_norm": 0.5137307643890381, "learning_rate": 4.888884262492132e-05, "loss": 0.2084, "num_input_tokens_seen": 1287824, "step": 14295 }, { "epoch": 3.7162162162162162, "grad_norm": 0.23852580785751343, "learning_rate": 4.888717051604074e-05, "loss": 0.2186, "num_input_tokens_seen": 1288272, "step": 14300 }, { "epoch": 3.7175155925155927, "grad_norm": 0.21367770433425903, "learning_rate": 4.8885497178620095e-05, "loss": 0.3226, "num_input_tokens_seen": 1288720, "step": 14305 }, { "epoch": 3.7188149688149688, "grad_norm": 0.6908769607543945, "learning_rate": 4.888382261274544e-05, "loss": 0.3052, "num_input_tokens_seen": 1289152, "step": 14310 }, { "epoch": 3.7201143451143452, "grad_norm": 0.5117773413658142, "learning_rate": 4.88821468185029e-05, "loss": 0.2547, "num_input_tokens_seen": 1289600, "step": 14315 }, { "epoch": 3.7214137214137213, "grad_norm": 0.43644946813583374, "learning_rate": 4.8880469795978676e-05, "loss": 0.2293, "num_input_tokens_seen": 1290032, "step": 14320 }, { "epoch": 3.7227130977130978, "grad_norm": 0.31960421800613403, "learning_rate": 4.887879154525901e-05, "loss": 0.3141, "num_input_tokens_seen": 1290528, "step": 14325 }, { "epoch": 3.724012474012474, "grad_norm": 0.7524402141571045, "learning_rate": 4.887711206643021e-05, "loss": 0.2549, "num_input_tokens_seen": 1290960, "step": 14330 }, { "epoch": 3.7253118503118503, "grad_norm": 0.26379072666168213, "learning_rate": 4.887543135957865e-05, "loss": 0.2645, "num_input_tokens_seen": 1291376, "step": 14335 }, { "epoch": 3.726611226611227, "grad_norm": 0.2889665961265564, "learning_rate": 4.887374942479079e-05, "loss": 0.258, "num_input_tokens_seen": 1291808, "step": 14340 }, { "epoch": 3.727910602910603, "grad_norm": 0.4051062762737274, "learning_rate": 4.887206626215312e-05, "loss": 0.2256, "num_input_tokens_seen": 1292304, "step": 14345 }, { "epoch": 3.7292099792099793, "grad_norm": 0.27604350447654724, "learning_rate": 4.887038187175221e-05, "loss": 0.2168, "num_input_tokens_seen": 1292768, "step": 14350 }, { "epoch": 3.7305093555093554, "grad_norm": 0.2671705186367035, "learning_rate": 4.886869625367468e-05, "loss": 0.362, "num_input_tokens_seen": 1293248, "step": 14355 }, { "epoch": 3.731808731808732, "grad_norm": 0.2741069495677948, "learning_rate": 4.886700940800725e-05, "loss": 0.2563, "num_input_tokens_seen": 1293728, "step": 14360 }, { "epoch": 3.733108108108108, "grad_norm": 0.20825296640396118, "learning_rate": 4.886532133483664e-05, "loss": 0.3208, "num_input_tokens_seen": 1294176, "step": 14365 }, { "epoch": 3.7344074844074844, "grad_norm": 0.19022217392921448, "learning_rate": 4.8863632034249694e-05, "loss": 0.1995, "num_input_tokens_seen": 1294624, "step": 14370 }, { "epoch": 3.735706860706861, "grad_norm": 0.2757556736469269, "learning_rate": 4.886194150633328e-05, "loss": 0.156, "num_input_tokens_seen": 1295072, "step": 14375 }, { "epoch": 3.737006237006237, "grad_norm": 0.23171758651733398, "learning_rate": 4.886024975117437e-05, "loss": 0.1664, "num_input_tokens_seen": 1295504, "step": 14380 }, { "epoch": 3.7383056133056134, "grad_norm": 0.20308147370815277, "learning_rate": 4.8858556768859944e-05, "loss": 0.2708, "num_input_tokens_seen": 1295984, "step": 14385 }, { "epoch": 3.73960498960499, "grad_norm": 0.3151579201221466, "learning_rate": 4.885686255947708e-05, "loss": 0.3828, "num_input_tokens_seen": 1296448, "step": 14390 }, { "epoch": 3.740904365904366, "grad_norm": 0.22627043724060059, "learning_rate": 4.8855167123112914e-05, "loss": 0.2143, "num_input_tokens_seen": 1296896, "step": 14395 }, { "epoch": 3.742203742203742, "grad_norm": 0.2507995367050171, "learning_rate": 4.885347045985465e-05, "loss": 0.2567, "num_input_tokens_seen": 1297328, "step": 14400 }, { "epoch": 3.7435031185031185, "grad_norm": 0.251862496137619, "learning_rate": 4.8851772569789545e-05, "loss": 0.1734, "num_input_tokens_seen": 1297744, "step": 14405 }, { "epoch": 3.744802494802495, "grad_norm": 0.21337930858135223, "learning_rate": 4.885007345300492e-05, "loss": 0.1175, "num_input_tokens_seen": 1298160, "step": 14410 }, { "epoch": 3.746101871101871, "grad_norm": 0.1923169493675232, "learning_rate": 4.884837310958817e-05, "loss": 0.3417, "num_input_tokens_seen": 1298592, "step": 14415 }, { "epoch": 3.7474012474012475, "grad_norm": 0.1927310824394226, "learning_rate": 4.8846671539626735e-05, "loss": 0.2112, "num_input_tokens_seen": 1299040, "step": 14420 }, { "epoch": 3.748700623700624, "grad_norm": 0.21230415999889374, "learning_rate": 4.8844968743208144e-05, "loss": 0.2165, "num_input_tokens_seen": 1299520, "step": 14425 }, { "epoch": 3.75, "grad_norm": 0.21876317262649536, "learning_rate": 4.884326472041995e-05, "loss": 0.3505, "num_input_tokens_seen": 1300000, "step": 14430 }, { "epoch": 3.751299376299376, "grad_norm": 0.36808839440345764, "learning_rate": 4.884155947134982e-05, "loss": 0.2722, "num_input_tokens_seen": 1300464, "step": 14435 }, { "epoch": 3.7525987525987525, "grad_norm": 0.24157433211803436, "learning_rate": 4.883985299608543e-05, "loss": 0.1739, "num_input_tokens_seen": 1300880, "step": 14440 }, { "epoch": 3.753898128898129, "grad_norm": 0.5350860953330994, "learning_rate": 4.883814529471457e-05, "loss": 0.299, "num_input_tokens_seen": 1301312, "step": 14445 }, { "epoch": 3.755197505197505, "grad_norm": 0.20269379019737244, "learning_rate": 4.8836436367325044e-05, "loss": 0.3433, "num_input_tokens_seen": 1301744, "step": 14450 }, { "epoch": 3.7564968814968815, "grad_norm": 0.18146255612373352, "learning_rate": 4.8834726214004764e-05, "loss": 0.3457, "num_input_tokens_seen": 1302192, "step": 14455 }, { "epoch": 3.757796257796258, "grad_norm": 0.47932320833206177, "learning_rate": 4.883301483484167e-05, "loss": 0.2664, "num_input_tokens_seen": 1302656, "step": 14460 }, { "epoch": 3.759095634095634, "grad_norm": 0.5321845412254333, "learning_rate": 4.88313022299238e-05, "loss": 0.2772, "num_input_tokens_seen": 1303072, "step": 14465 }, { "epoch": 3.76039501039501, "grad_norm": 0.46135371923446655, "learning_rate": 4.882958839933921e-05, "loss": 0.2806, "num_input_tokens_seen": 1303520, "step": 14470 }, { "epoch": 3.7616943866943866, "grad_norm": 0.35067227482795715, "learning_rate": 4.882787334317607e-05, "loss": 0.2233, "num_input_tokens_seen": 1303952, "step": 14475 }, { "epoch": 3.762993762993763, "grad_norm": 0.22197525203227997, "learning_rate": 4.882615706152256e-05, "loss": 0.2674, "num_input_tokens_seen": 1304400, "step": 14480 }, { "epoch": 3.764293139293139, "grad_norm": 0.5445477366447449, "learning_rate": 4.8824439554466974e-05, "loss": 0.347, "num_input_tokens_seen": 1304848, "step": 14485 }, { "epoch": 3.7655925155925156, "grad_norm": 0.3475133180618286, "learning_rate": 4.882272082209762e-05, "loss": 0.2409, "num_input_tokens_seen": 1305328, "step": 14490 }, { "epoch": 3.766891891891892, "grad_norm": 0.200414776802063, "learning_rate": 4.882100086450292e-05, "loss": 0.3304, "num_input_tokens_seen": 1305744, "step": 14495 }, { "epoch": 3.768191268191268, "grad_norm": 0.39593347907066345, "learning_rate": 4.881927968177132e-05, "loss": 0.203, "num_input_tokens_seen": 1306208, "step": 14500 }, { "epoch": 3.7694906444906446, "grad_norm": 0.2615083158016205, "learning_rate": 4.881755727399134e-05, "loss": 0.1948, "num_input_tokens_seen": 1306656, "step": 14505 }, { "epoch": 3.7707900207900207, "grad_norm": 0.22227928042411804, "learning_rate": 4.881583364125157e-05, "loss": 0.1752, "num_input_tokens_seen": 1307104, "step": 14510 }, { "epoch": 3.772089397089397, "grad_norm": 0.27186498045921326, "learning_rate": 4.8814108783640655e-05, "loss": 0.2727, "num_input_tokens_seen": 1307568, "step": 14515 }, { "epoch": 3.773388773388773, "grad_norm": 0.2760579288005829, "learning_rate": 4.881238270124731e-05, "loss": 0.3152, "num_input_tokens_seen": 1308032, "step": 14520 }, { "epoch": 3.7746881496881497, "grad_norm": 0.3261345624923706, "learning_rate": 4.881065539416031e-05, "loss": 0.387, "num_input_tokens_seen": 1308496, "step": 14525 }, { "epoch": 3.775987525987526, "grad_norm": 0.5009936094284058, "learning_rate": 4.880892686246849e-05, "loss": 0.2625, "num_input_tokens_seen": 1308928, "step": 14530 }, { "epoch": 3.7772869022869022, "grad_norm": 0.669230580329895, "learning_rate": 4.880719710626074e-05, "loss": 0.2716, "num_input_tokens_seen": 1309392, "step": 14535 }, { "epoch": 3.7785862785862787, "grad_norm": 0.24473607540130615, "learning_rate": 4.880546612562603e-05, "loss": 0.2752, "num_input_tokens_seen": 1309824, "step": 14540 }, { "epoch": 3.7798856548856548, "grad_norm": 0.2651318907737732, "learning_rate": 4.88037339206534e-05, "loss": 0.2197, "num_input_tokens_seen": 1310288, "step": 14545 }, { "epoch": 3.7811850311850312, "grad_norm": 0.2705098092556, "learning_rate": 4.880200049143191e-05, "loss": 0.2613, "num_input_tokens_seen": 1310752, "step": 14550 }, { "epoch": 3.7824844074844073, "grad_norm": 0.2805916666984558, "learning_rate": 4.880026583805074e-05, "loss": 0.2521, "num_input_tokens_seen": 1311184, "step": 14555 }, { "epoch": 3.7837837837837838, "grad_norm": 0.23437049984931946, "learning_rate": 4.8798529960599096e-05, "loss": 0.2974, "num_input_tokens_seen": 1311616, "step": 14560 }, { "epoch": 3.7850831600831603, "grad_norm": 0.27470171451568604, "learning_rate": 4.879679285916625e-05, "loss": 0.2249, "num_input_tokens_seen": 1312080, "step": 14565 }, { "epoch": 3.7863825363825363, "grad_norm": 0.250429630279541, "learning_rate": 4.879505453384154e-05, "loss": 0.2963, "num_input_tokens_seen": 1312512, "step": 14570 }, { "epoch": 3.787681912681913, "grad_norm": 0.293029248714447, "learning_rate": 4.879331498471439e-05, "loss": 0.2941, "num_input_tokens_seen": 1312944, "step": 14575 }, { "epoch": 3.788981288981289, "grad_norm": 0.15592464804649353, "learning_rate": 4.8791574211874244e-05, "loss": 0.3193, "num_input_tokens_seen": 1313376, "step": 14580 }, { "epoch": 3.7902806652806653, "grad_norm": 0.16280710697174072, "learning_rate": 4.878983221541064e-05, "loss": 0.2697, "num_input_tokens_seen": 1313824, "step": 14585 }, { "epoch": 3.7915800415800414, "grad_norm": 0.37493255734443665, "learning_rate": 4.878808899541317e-05, "loss": 0.2473, "num_input_tokens_seen": 1314256, "step": 14590 }, { "epoch": 3.792879417879418, "grad_norm": 0.169393390417099, "learning_rate": 4.87863445519715e-05, "loss": 0.2822, "num_input_tokens_seen": 1314704, "step": 14595 }, { "epoch": 3.7941787941787943, "grad_norm": 0.20426738262176514, "learning_rate": 4.8784598885175324e-05, "loss": 0.2652, "num_input_tokens_seen": 1315136, "step": 14600 }, { "epoch": 3.7954781704781704, "grad_norm": 0.20797814428806305, "learning_rate": 4.8782851995114455e-05, "loss": 0.2204, "num_input_tokens_seen": 1315600, "step": 14605 }, { "epoch": 3.796777546777547, "grad_norm": 0.24706515669822693, "learning_rate": 4.878110388187871e-05, "loss": 0.1691, "num_input_tokens_seen": 1316064, "step": 14610 }, { "epoch": 3.7980769230769234, "grad_norm": 0.3161207139492035, "learning_rate": 4.8779354545558e-05, "loss": 0.2125, "num_input_tokens_seen": 1316496, "step": 14615 }, { "epoch": 3.7993762993762994, "grad_norm": 0.35701191425323486, "learning_rate": 4.877760398624232e-05, "loss": 0.244, "num_input_tokens_seen": 1316944, "step": 14620 }, { "epoch": 3.8006756756756754, "grad_norm": 0.34744784235954285, "learning_rate": 4.8775852204021665e-05, "loss": 0.2768, "num_input_tokens_seen": 1317360, "step": 14625 }, { "epoch": 3.801975051975052, "grad_norm": 0.27909401059150696, "learning_rate": 4.8774099198986154e-05, "loss": 0.3197, "num_input_tokens_seen": 1317776, "step": 14630 }, { "epoch": 3.8032744282744284, "grad_norm": 0.2830156683921814, "learning_rate": 4.877234497122595e-05, "loss": 0.2667, "num_input_tokens_seen": 1318208, "step": 14635 }, { "epoch": 3.8045738045738045, "grad_norm": 0.32007884979248047, "learning_rate": 4.877058952083126e-05, "loss": 0.2852, "num_input_tokens_seen": 1318672, "step": 14640 }, { "epoch": 3.805873180873181, "grad_norm": 0.19661098718643188, "learning_rate": 4.8768832847892375e-05, "loss": 0.2864, "num_input_tokens_seen": 1319104, "step": 14645 }, { "epoch": 3.8071725571725574, "grad_norm": 0.4151657819747925, "learning_rate": 4.876707495249965e-05, "loss": 0.2465, "num_input_tokens_seen": 1319552, "step": 14650 }, { "epoch": 3.8084719334719335, "grad_norm": 0.20524008572101593, "learning_rate": 4.876531583474349e-05, "loss": 0.2304, "num_input_tokens_seen": 1320016, "step": 14655 }, { "epoch": 3.8097713097713095, "grad_norm": 0.324574738740921, "learning_rate": 4.8763555494714355e-05, "loss": 0.2775, "num_input_tokens_seen": 1320480, "step": 14660 }, { "epoch": 3.811070686070686, "grad_norm": 0.38408738374710083, "learning_rate": 4.876179393250279e-05, "loss": 0.2464, "num_input_tokens_seen": 1320944, "step": 14665 }, { "epoch": 3.8123700623700625, "grad_norm": 0.8183798789978027, "learning_rate": 4.8760031148199404e-05, "loss": 0.3052, "num_input_tokens_seen": 1321424, "step": 14670 }, { "epoch": 3.8136694386694385, "grad_norm": 0.3317517936229706, "learning_rate": 4.8758267141894844e-05, "loss": 0.268, "num_input_tokens_seen": 1321856, "step": 14675 }, { "epoch": 3.814968814968815, "grad_norm": 0.1837327778339386, "learning_rate": 4.875650191367984e-05, "loss": 0.2546, "num_input_tokens_seen": 1322272, "step": 14680 }, { "epoch": 3.8162681912681915, "grad_norm": 0.30109283328056335, "learning_rate": 4.875473546364519e-05, "loss": 0.198, "num_input_tokens_seen": 1322736, "step": 14685 }, { "epoch": 3.8175675675675675, "grad_norm": 0.2632671892642975, "learning_rate": 4.875296779188173e-05, "loss": 0.1887, "num_input_tokens_seen": 1323200, "step": 14690 }, { "epoch": 3.818866943866944, "grad_norm": 0.6638166904449463, "learning_rate": 4.8751198898480376e-05, "loss": 0.3513, "num_input_tokens_seen": 1323648, "step": 14695 }, { "epoch": 3.82016632016632, "grad_norm": 0.2849934995174408, "learning_rate": 4.87494287835321e-05, "loss": 0.2187, "num_input_tokens_seen": 1324112, "step": 14700 }, { "epoch": 3.8214656964656966, "grad_norm": 0.2907973527908325, "learning_rate": 4.874765744712796e-05, "loss": 0.3052, "num_input_tokens_seen": 1324576, "step": 14705 }, { "epoch": 3.8227650727650726, "grad_norm": 0.26433244347572327, "learning_rate": 4.874588488935903e-05, "loss": 0.3032, "num_input_tokens_seen": 1325040, "step": 14710 }, { "epoch": 3.824064449064449, "grad_norm": 0.3576458990573883, "learning_rate": 4.874411111031649e-05, "loss": 0.2635, "num_input_tokens_seen": 1325472, "step": 14715 }, { "epoch": 3.8253638253638256, "grad_norm": 0.4386814832687378, "learning_rate": 4.874233611009157e-05, "loss": 0.2846, "num_input_tokens_seen": 1325920, "step": 14720 }, { "epoch": 3.8266632016632016, "grad_norm": 0.4898562729358673, "learning_rate": 4.874055988877556e-05, "loss": 0.2737, "num_input_tokens_seen": 1326368, "step": 14725 }, { "epoch": 3.827962577962578, "grad_norm": 0.1911373734474182, "learning_rate": 4.87387824464598e-05, "loss": 0.2823, "num_input_tokens_seen": 1326864, "step": 14730 }, { "epoch": 3.829261954261954, "grad_norm": 0.5372874736785889, "learning_rate": 4.873700378323571e-05, "loss": 0.2613, "num_input_tokens_seen": 1327296, "step": 14735 }, { "epoch": 3.8305613305613306, "grad_norm": 0.37519198656082153, "learning_rate": 4.873522389919478e-05, "loss": 0.2391, "num_input_tokens_seen": 1327776, "step": 14740 }, { "epoch": 3.8318607068607067, "grad_norm": 0.30732449889183044, "learning_rate": 4.8733442794428533e-05, "loss": 0.2588, "num_input_tokens_seen": 1328208, "step": 14745 }, { "epoch": 3.833160083160083, "grad_norm": 0.2044474482536316, "learning_rate": 4.873166046902859e-05, "loss": 0.3393, "num_input_tokens_seen": 1328656, "step": 14750 }, { "epoch": 3.8344594594594597, "grad_norm": 0.34282028675079346, "learning_rate": 4.872987692308661e-05, "loss": 0.2298, "num_input_tokens_seen": 1329072, "step": 14755 }, { "epoch": 3.8357588357588357, "grad_norm": 0.1861964613199234, "learning_rate": 4.872809215669432e-05, "loss": 0.2597, "num_input_tokens_seen": 1329520, "step": 14760 }, { "epoch": 3.837058212058212, "grad_norm": 0.2950361967086792, "learning_rate": 4.872630616994352e-05, "loss": 0.2601, "num_input_tokens_seen": 1329968, "step": 14765 }, { "epoch": 3.8383575883575882, "grad_norm": 0.18378622829914093, "learning_rate": 4.8724518962926055e-05, "loss": 0.3156, "num_input_tokens_seen": 1330464, "step": 14770 }, { "epoch": 3.8396569646569647, "grad_norm": 0.49289843440055847, "learning_rate": 4.8722730535733854e-05, "loss": 0.2482, "num_input_tokens_seen": 1330896, "step": 14775 }, { "epoch": 3.8409563409563408, "grad_norm": 0.18379788100719452, "learning_rate": 4.872094088845889e-05, "loss": 0.2642, "num_input_tokens_seen": 1331392, "step": 14780 }, { "epoch": 3.8422557172557172, "grad_norm": 0.18438725173473358, "learning_rate": 4.871915002119321e-05, "loss": 0.2835, "num_input_tokens_seen": 1331808, "step": 14785 }, { "epoch": 3.8435550935550937, "grad_norm": 0.17992371320724487, "learning_rate": 4.871735793402891e-05, "loss": 0.3025, "num_input_tokens_seen": 1332240, "step": 14790 }, { "epoch": 3.8448544698544698, "grad_norm": 0.1418616622686386, "learning_rate": 4.8715564627058165e-05, "loss": 0.3084, "num_input_tokens_seen": 1332688, "step": 14795 }, { "epoch": 3.8461538461538463, "grad_norm": 0.6123404502868652, "learning_rate": 4.8713770100373213e-05, "loss": 0.283, "num_input_tokens_seen": 1333136, "step": 14800 }, { "epoch": 3.8474532224532223, "grad_norm": 0.09006812423467636, "learning_rate": 4.8711974354066344e-05, "loss": 0.2852, "num_input_tokens_seen": 1333552, "step": 14805 }, { "epoch": 3.848752598752599, "grad_norm": 0.16255222260951996, "learning_rate": 4.871017738822992e-05, "loss": 0.2639, "num_input_tokens_seen": 1334016, "step": 14810 }, { "epoch": 3.850051975051975, "grad_norm": 0.308941513299942, "learning_rate": 4.870837920295634e-05, "loss": 0.2199, "num_input_tokens_seen": 1334496, "step": 14815 }, { "epoch": 3.8513513513513513, "grad_norm": 0.29990047216415405, "learning_rate": 4.8706579798338116e-05, "loss": 0.2235, "num_input_tokens_seen": 1334944, "step": 14820 }, { "epoch": 3.852650727650728, "grad_norm": 0.3340696692466736, "learning_rate": 4.870477917446777e-05, "loss": 0.287, "num_input_tokens_seen": 1335392, "step": 14825 }, { "epoch": 3.853950103950104, "grad_norm": 0.694223940372467, "learning_rate": 4.870297733143793e-05, "loss": 0.2493, "num_input_tokens_seen": 1335824, "step": 14830 }, { "epoch": 3.8552494802494803, "grad_norm": 0.29217439889907837, "learning_rate": 4.870117426934124e-05, "loss": 0.3001, "num_input_tokens_seen": 1336272, "step": 14835 }, { "epoch": 3.856548856548857, "grad_norm": 0.4409533739089966, "learning_rate": 4.869936998827045e-05, "loss": 0.246, "num_input_tokens_seen": 1336736, "step": 14840 }, { "epoch": 3.857848232848233, "grad_norm": 0.7842772603034973, "learning_rate": 4.869756448831836e-05, "loss": 0.2694, "num_input_tokens_seen": 1337216, "step": 14845 }, { "epoch": 3.859147609147609, "grad_norm": 0.35608237981796265, "learning_rate": 4.869575776957782e-05, "loss": 0.2893, "num_input_tokens_seen": 1337664, "step": 14850 }, { "epoch": 3.8604469854469854, "grad_norm": 0.2342776209115982, "learning_rate": 4.869394983214175e-05, "loss": 0.2696, "num_input_tokens_seen": 1338080, "step": 14855 }, { "epoch": 3.861746361746362, "grad_norm": 0.23101715743541718, "learning_rate": 4.8692140676103146e-05, "loss": 0.3017, "num_input_tokens_seen": 1338512, "step": 14860 }, { "epoch": 3.863045738045738, "grad_norm": 0.13850778341293335, "learning_rate": 4.8690330301555045e-05, "loss": 0.2808, "num_input_tokens_seen": 1338928, "step": 14865 }, { "epoch": 3.8643451143451144, "grad_norm": 0.13757693767547607, "learning_rate": 4.8688518708590544e-05, "loss": 0.2953, "num_input_tokens_seen": 1339376, "step": 14870 }, { "epoch": 3.865644490644491, "grad_norm": 0.16502320766448975, "learning_rate": 4.8686705897302845e-05, "loss": 0.3172, "num_input_tokens_seen": 1339792, "step": 14875 }, { "epoch": 3.866943866943867, "grad_norm": 0.23514479398727417, "learning_rate": 4.868489186778516e-05, "loss": 0.2586, "num_input_tokens_seen": 1340240, "step": 14880 }, { "epoch": 3.868243243243243, "grad_norm": 0.20791570842266083, "learning_rate": 4.8683076620130794e-05, "loss": 0.2949, "num_input_tokens_seen": 1340704, "step": 14885 }, { "epoch": 3.8695426195426195, "grad_norm": 0.1911831796169281, "learning_rate": 4.86812601544331e-05, "loss": 0.2847, "num_input_tokens_seen": 1341152, "step": 14890 }, { "epoch": 3.870841995841996, "grad_norm": 0.1707717925310135, "learning_rate": 4.867944247078551e-05, "loss": 0.3298, "num_input_tokens_seen": 1341584, "step": 14895 }, { "epoch": 3.872141372141372, "grad_norm": 0.5248904228210449, "learning_rate": 4.8677623569281505e-05, "loss": 0.2955, "num_input_tokens_seen": 1342048, "step": 14900 }, { "epoch": 3.8734407484407485, "grad_norm": 0.651296079158783, "learning_rate": 4.867580345001463e-05, "loss": 0.3004, "num_input_tokens_seen": 1342512, "step": 14905 }, { "epoch": 3.874740124740125, "grad_norm": 0.10636971145868301, "learning_rate": 4.867398211307851e-05, "loss": 0.281, "num_input_tokens_seen": 1342992, "step": 14910 }, { "epoch": 3.876039501039501, "grad_norm": 0.4804610311985016, "learning_rate": 4.8672159558566796e-05, "loss": 0.2658, "num_input_tokens_seen": 1343440, "step": 14915 }, { "epoch": 3.8773388773388775, "grad_norm": 0.19490139186382294, "learning_rate": 4.8670335786573236e-05, "loss": 0.2907, "num_input_tokens_seen": 1343904, "step": 14920 }, { "epoch": 3.8786382536382535, "grad_norm": 0.2558186948299408, "learning_rate": 4.866851079719162e-05, "loss": 0.2858, "num_input_tokens_seen": 1344384, "step": 14925 }, { "epoch": 3.87993762993763, "grad_norm": 0.23706206679344177, "learning_rate": 4.866668459051583e-05, "loss": 0.2279, "num_input_tokens_seen": 1344800, "step": 14930 }, { "epoch": 3.881237006237006, "grad_norm": 0.35485753417015076, "learning_rate": 4.8664857166639764e-05, "loss": 0.279, "num_input_tokens_seen": 1345248, "step": 14935 }, { "epoch": 3.8825363825363826, "grad_norm": 0.36675867438316345, "learning_rate": 4.866302852565743e-05, "loss": 0.1546, "num_input_tokens_seen": 1345696, "step": 14940 }, { "epoch": 3.883835758835759, "grad_norm": 0.6681798696517944, "learning_rate": 4.8661198667662854e-05, "loss": 0.2327, "num_input_tokens_seen": 1346144, "step": 14945 }, { "epoch": 3.885135135135135, "grad_norm": 0.352712482213974, "learning_rate": 4.865936759275017e-05, "loss": 0.2246, "num_input_tokens_seen": 1346608, "step": 14950 }, { "epoch": 3.8864345114345116, "grad_norm": 0.3545260429382324, "learning_rate": 4.8657535301013536e-05, "loss": 0.1547, "num_input_tokens_seen": 1347088, "step": 14955 }, { "epoch": 3.8877338877338876, "grad_norm": 0.29775288701057434, "learning_rate": 4.865570179254719e-05, "loss": 0.2937, "num_input_tokens_seen": 1347568, "step": 14960 }, { "epoch": 3.889033264033264, "grad_norm": 0.25181224942207336, "learning_rate": 4.865386706744544e-05, "loss": 0.1543, "num_input_tokens_seen": 1348032, "step": 14965 }, { "epoch": 3.89033264033264, "grad_norm": 0.30142343044281006, "learning_rate": 4.865203112580265e-05, "loss": 0.3336, "num_input_tokens_seen": 1348496, "step": 14970 }, { "epoch": 3.8916320166320166, "grad_norm": 0.2529584467411041, "learning_rate": 4.865019396771322e-05, "loss": 0.2925, "num_input_tokens_seen": 1348960, "step": 14975 }, { "epoch": 3.892931392931393, "grad_norm": 0.5232053995132446, "learning_rate": 4.8648355593271665e-05, "loss": 0.3069, "num_input_tokens_seen": 1349456, "step": 14980 }, { "epoch": 3.894230769230769, "grad_norm": 0.18808889389038086, "learning_rate": 4.864651600257252e-05, "loss": 0.2873, "num_input_tokens_seen": 1349968, "step": 14985 }, { "epoch": 3.8955301455301456, "grad_norm": 0.15792593359947205, "learning_rate": 4.86446751957104e-05, "loss": 0.2335, "num_input_tokens_seen": 1350400, "step": 14990 }, { "epoch": 3.8968295218295217, "grad_norm": 0.3507956564426422, "learning_rate": 4.864283317277998e-05, "loss": 0.2385, "num_input_tokens_seen": 1350864, "step": 14995 }, { "epoch": 3.898128898128898, "grad_norm": 0.22034434974193573, "learning_rate": 4.8640989933876e-05, "loss": 0.2556, "num_input_tokens_seen": 1351296, "step": 15000 }, { "epoch": 3.899428274428274, "grad_norm": 0.20668275654315948, "learning_rate": 4.863914547909325e-05, "loss": 0.3135, "num_input_tokens_seen": 1351728, "step": 15005 }, { "epoch": 3.9007276507276507, "grad_norm": 0.19074152410030365, "learning_rate": 4.8637299808526606e-05, "loss": 0.2605, "num_input_tokens_seen": 1352144, "step": 15010 }, { "epoch": 3.902027027027027, "grad_norm": 0.17929142713546753, "learning_rate": 4.8635452922270975e-05, "loss": 0.2252, "num_input_tokens_seen": 1352576, "step": 15015 }, { "epoch": 3.9033264033264032, "grad_norm": 0.19594553112983704, "learning_rate": 4.863360482042135e-05, "loss": 0.2953, "num_input_tokens_seen": 1353040, "step": 15020 }, { "epoch": 3.9046257796257797, "grad_norm": 0.1636296808719635, "learning_rate": 4.86317555030728e-05, "loss": 0.2771, "num_input_tokens_seen": 1353488, "step": 15025 }, { "epoch": 3.9059251559251558, "grad_norm": 0.28888949751853943, "learning_rate": 4.862990497032042e-05, "loss": 0.2109, "num_input_tokens_seen": 1353952, "step": 15030 }, { "epoch": 3.9072245322245323, "grad_norm": 0.2887333333492279, "learning_rate": 4.862805322225937e-05, "loss": 0.2649, "num_input_tokens_seen": 1354368, "step": 15035 }, { "epoch": 3.9085239085239083, "grad_norm": 0.22620411217212677, "learning_rate": 4.862620025898492e-05, "loss": 0.1926, "num_input_tokens_seen": 1354800, "step": 15040 }, { "epoch": 3.909823284823285, "grad_norm": 0.23990324139595032, "learning_rate": 4.862434608059234e-05, "loss": 0.1691, "num_input_tokens_seen": 1355232, "step": 15045 }, { "epoch": 3.9111226611226613, "grad_norm": 0.2545238733291626, "learning_rate": 4.862249068717702e-05, "loss": 0.2257, "num_input_tokens_seen": 1355680, "step": 15050 }, { "epoch": 3.9124220374220373, "grad_norm": 0.17667463421821594, "learning_rate": 4.862063407883436e-05, "loss": 0.2668, "num_input_tokens_seen": 1356112, "step": 15055 }, { "epoch": 3.913721413721414, "grad_norm": 0.3445800542831421, "learning_rate": 4.861877625565986e-05, "loss": 0.2514, "num_input_tokens_seen": 1356576, "step": 15060 }, { "epoch": 3.9150207900207903, "grad_norm": 0.19575807452201843, "learning_rate": 4.861691721774906e-05, "loss": 0.2186, "num_input_tokens_seen": 1357024, "step": 15065 }, { "epoch": 3.9163201663201663, "grad_norm": 0.30051302909851074, "learning_rate": 4.861505696519759e-05, "loss": 0.2168, "num_input_tokens_seen": 1357424, "step": 15070 }, { "epoch": 3.9176195426195424, "grad_norm": 0.2418135106563568, "learning_rate": 4.86131954981011e-05, "loss": 0.3942, "num_input_tokens_seen": 1357872, "step": 15075 }, { "epoch": 3.918918918918919, "grad_norm": 0.5189719200134277, "learning_rate": 4.8611332816555354e-05, "loss": 0.2014, "num_input_tokens_seen": 1358320, "step": 15080 }, { "epoch": 3.9202182952182953, "grad_norm": 0.26527222990989685, "learning_rate": 4.860946892065614e-05, "loss": 0.2625, "num_input_tokens_seen": 1358752, "step": 15085 }, { "epoch": 3.9215176715176714, "grad_norm": 0.2627003788948059, "learning_rate": 4.8607603810499305e-05, "loss": 0.2488, "num_input_tokens_seen": 1359200, "step": 15090 }, { "epoch": 3.922817047817048, "grad_norm": 0.2732376754283905, "learning_rate": 4.8605737486180793e-05, "loss": 0.226, "num_input_tokens_seen": 1359616, "step": 15095 }, { "epoch": 3.9241164241164244, "grad_norm": 0.23308764398097992, "learning_rate": 4.860386994779659e-05, "loss": 0.1743, "num_input_tokens_seen": 1360080, "step": 15100 }, { "epoch": 3.9254158004158004, "grad_norm": 0.24271811544895172, "learning_rate": 4.8602001195442725e-05, "loss": 0.2803, "num_input_tokens_seen": 1360576, "step": 15105 }, { "epoch": 3.9267151767151764, "grad_norm": 0.43968817591667175, "learning_rate": 4.8600131229215336e-05, "loss": 0.2237, "num_input_tokens_seen": 1361024, "step": 15110 }, { "epoch": 3.928014553014553, "grad_norm": 0.6616767644882202, "learning_rate": 4.859826004921058e-05, "loss": 0.2866, "num_input_tokens_seen": 1361520, "step": 15115 }, { "epoch": 3.9293139293139294, "grad_norm": 0.28121763467788696, "learning_rate": 4.85963876555247e-05, "loss": 0.2157, "num_input_tokens_seen": 1361936, "step": 15120 }, { "epoch": 3.9306133056133055, "grad_norm": 0.6214901804924011, "learning_rate": 4.859451404825399e-05, "loss": 0.2909, "num_input_tokens_seen": 1362400, "step": 15125 }, { "epoch": 3.931912681912682, "grad_norm": 0.36048662662506104, "learning_rate": 4.859263922749482e-05, "loss": 0.1724, "num_input_tokens_seen": 1362880, "step": 15130 }, { "epoch": 3.9332120582120584, "grad_norm": 0.3003673553466797, "learning_rate": 4.859076319334361e-05, "loss": 0.2659, "num_input_tokens_seen": 1363328, "step": 15135 }, { "epoch": 3.9345114345114345, "grad_norm": 0.27255651354789734, "learning_rate": 4.858888594589685e-05, "loss": 0.2618, "num_input_tokens_seen": 1363760, "step": 15140 }, { "epoch": 3.935810810810811, "grad_norm": 0.4363347887992859, "learning_rate": 4.8587007485251074e-05, "loss": 0.3639, "num_input_tokens_seen": 1364224, "step": 15145 }, { "epoch": 3.937110187110187, "grad_norm": 0.6926752328872681, "learning_rate": 4.858512781150291e-05, "loss": 0.2567, "num_input_tokens_seen": 1364640, "step": 15150 }, { "epoch": 3.9384095634095635, "grad_norm": 0.19719237089157104, "learning_rate": 4.858324692474902e-05, "loss": 0.3109, "num_input_tokens_seen": 1365072, "step": 15155 }, { "epoch": 3.9397089397089395, "grad_norm": 0.7031574249267578, "learning_rate": 4.8581364825086144e-05, "loss": 0.2611, "num_input_tokens_seen": 1365488, "step": 15160 }, { "epoch": 3.941008316008316, "grad_norm": 0.4901725947856903, "learning_rate": 4.857948151261108e-05, "loss": 0.2307, "num_input_tokens_seen": 1365984, "step": 15165 }, { "epoch": 3.9423076923076925, "grad_norm": 0.432049423456192, "learning_rate": 4.857759698742069e-05, "loss": 0.2063, "num_input_tokens_seen": 1366448, "step": 15170 }, { "epoch": 3.9436070686070686, "grad_norm": 0.5399405360221863, "learning_rate": 4.85757112496119e-05, "loss": 0.225, "num_input_tokens_seen": 1366928, "step": 15175 }, { "epoch": 3.944906444906445, "grad_norm": 0.6213440299034119, "learning_rate": 4.857382429928169e-05, "loss": 0.2087, "num_input_tokens_seen": 1367360, "step": 15180 }, { "epoch": 3.946205821205821, "grad_norm": 0.48469772934913635, "learning_rate": 4.857193613652711e-05, "loss": 0.411, "num_input_tokens_seen": 1367808, "step": 15185 }, { "epoch": 3.9475051975051976, "grad_norm": 2.167127847671509, "learning_rate": 4.8570046761445265e-05, "loss": 0.3174, "num_input_tokens_seen": 1368256, "step": 15190 }, { "epoch": 3.9488045738045736, "grad_norm": 0.2907269597053528, "learning_rate": 4.8568156174133325e-05, "loss": 0.2677, "num_input_tokens_seen": 1368688, "step": 15195 }, { "epoch": 3.95010395010395, "grad_norm": 0.2345796823501587, "learning_rate": 4.856626437468854e-05, "loss": 0.339, "num_input_tokens_seen": 1369152, "step": 15200 }, { "epoch": 3.9514033264033266, "grad_norm": 0.39438432455062866, "learning_rate": 4.856437136320821e-05, "loss": 0.2572, "num_input_tokens_seen": 1369584, "step": 15205 }, { "epoch": 3.9527027027027026, "grad_norm": 0.48566344380378723, "learning_rate": 4.856247713978966e-05, "loss": 0.2681, "num_input_tokens_seen": 1370016, "step": 15210 }, { "epoch": 3.954002079002079, "grad_norm": 0.277557909488678, "learning_rate": 4.8560581704530345e-05, "loss": 0.3142, "num_input_tokens_seen": 1370480, "step": 15215 }, { "epoch": 3.955301455301455, "grad_norm": 0.31733402609825134, "learning_rate": 4.855868505752774e-05, "loss": 0.2523, "num_input_tokens_seen": 1370960, "step": 15220 }, { "epoch": 3.9566008316008316, "grad_norm": 0.6156630516052246, "learning_rate": 4.8556787198879386e-05, "loss": 0.3128, "num_input_tokens_seen": 1371392, "step": 15225 }, { "epoch": 3.9579002079002077, "grad_norm": 0.7316226363182068, "learning_rate": 4.85548881286829e-05, "loss": 0.2674, "num_input_tokens_seen": 1371856, "step": 15230 }, { "epoch": 3.959199584199584, "grad_norm": 0.31971272826194763, "learning_rate": 4.8552987847035934e-05, "loss": 0.2398, "num_input_tokens_seen": 1372336, "step": 15235 }, { "epoch": 3.9604989604989607, "grad_norm": 0.471794068813324, "learning_rate": 4.855108635403624e-05, "loss": 0.2866, "num_input_tokens_seen": 1372800, "step": 15240 }, { "epoch": 3.9617983367983367, "grad_norm": 0.32510146498680115, "learning_rate": 4.8549183649781626e-05, "loss": 0.3484, "num_input_tokens_seen": 1373248, "step": 15245 }, { "epoch": 3.963097713097713, "grad_norm": 0.5000709891319275, "learning_rate": 4.854727973436992e-05, "loss": 0.1702, "num_input_tokens_seen": 1373696, "step": 15250 }, { "epoch": 3.9643970893970892, "grad_norm": 0.37281399965286255, "learning_rate": 4.854537460789906e-05, "loss": 0.2908, "num_input_tokens_seen": 1374144, "step": 15255 }, { "epoch": 3.9656964656964657, "grad_norm": 0.3756411075592041, "learning_rate": 4.854346827046702e-05, "loss": 0.2555, "num_input_tokens_seen": 1374592, "step": 15260 }, { "epoch": 3.9669958419958418, "grad_norm": 0.7703394293785095, "learning_rate": 4.8541560722171855e-05, "loss": 0.2867, "num_input_tokens_seen": 1375056, "step": 15265 }, { "epoch": 3.9682952182952183, "grad_norm": 0.41129565238952637, "learning_rate": 4.8539651963111655e-05, "loss": 0.3049, "num_input_tokens_seen": 1375488, "step": 15270 }, { "epoch": 3.9695945945945947, "grad_norm": 0.45728161931037903, "learning_rate": 4.853774199338461e-05, "loss": 0.237, "num_input_tokens_seen": 1375968, "step": 15275 }, { "epoch": 3.970893970893971, "grad_norm": 0.4581896960735321, "learning_rate": 4.8535830813088934e-05, "loss": 0.3063, "num_input_tokens_seen": 1376432, "step": 15280 }, { "epoch": 3.9721933471933473, "grad_norm": 0.24191443622112274, "learning_rate": 4.853391842232293e-05, "loss": 0.2415, "num_input_tokens_seen": 1376864, "step": 15285 }, { "epoch": 3.9734927234927238, "grad_norm": 0.26771071553230286, "learning_rate": 4.853200482118495e-05, "loss": 0.2266, "num_input_tokens_seen": 1377312, "step": 15290 }, { "epoch": 3.9747920997921, "grad_norm": 0.38980475068092346, "learning_rate": 4.853009000977342e-05, "loss": 0.2189, "num_input_tokens_seen": 1377760, "step": 15295 }, { "epoch": 3.976091476091476, "grad_norm": 0.23998253047466278, "learning_rate": 4.852817398818682e-05, "loss": 0.1538, "num_input_tokens_seen": 1378240, "step": 15300 }, { "epoch": 3.9773908523908523, "grad_norm": 0.4080977141857147, "learning_rate": 4.852625675652368e-05, "loss": 0.2128, "num_input_tokens_seen": 1378720, "step": 15305 }, { "epoch": 3.978690228690229, "grad_norm": 0.7677913308143616, "learning_rate": 4.852433831488261e-05, "loss": 0.4285, "num_input_tokens_seen": 1379168, "step": 15310 }, { "epoch": 3.979989604989605, "grad_norm": 0.3210486173629761, "learning_rate": 4.852241866336229e-05, "loss": 0.2885, "num_input_tokens_seen": 1379632, "step": 15315 }, { "epoch": 3.9812889812889813, "grad_norm": 0.29738983511924744, "learning_rate": 4.8520497802061436e-05, "loss": 0.2255, "num_input_tokens_seen": 1380048, "step": 15320 }, { "epoch": 3.982588357588358, "grad_norm": 0.3093140125274658, "learning_rate": 4.8518575731078844e-05, "loss": 0.266, "num_input_tokens_seen": 1380512, "step": 15325 }, { "epoch": 3.983887733887734, "grad_norm": 0.27244600653648376, "learning_rate": 4.851665245051337e-05, "loss": 0.3224, "num_input_tokens_seen": 1380960, "step": 15330 }, { "epoch": 3.98518711018711, "grad_norm": 0.25482937693595886, "learning_rate": 4.8514727960463926e-05, "loss": 0.3266, "num_input_tokens_seen": 1381440, "step": 15335 }, { "epoch": 3.9864864864864864, "grad_norm": 0.2780417203903198, "learning_rate": 4.8512802261029486e-05, "loss": 0.2828, "num_input_tokens_seen": 1381904, "step": 15340 }, { "epoch": 3.987785862785863, "grad_norm": 0.5268547534942627, "learning_rate": 4.8510875352309106e-05, "loss": 0.2969, "num_input_tokens_seen": 1382400, "step": 15345 }, { "epoch": 3.989085239085239, "grad_norm": 0.41195106506347656, "learning_rate": 4.8508947234401875e-05, "loss": 0.2503, "num_input_tokens_seen": 1382832, "step": 15350 }, { "epoch": 3.9903846153846154, "grad_norm": 0.20574435591697693, "learning_rate": 4.850701790740696e-05, "loss": 0.2689, "num_input_tokens_seen": 1383312, "step": 15355 }, { "epoch": 3.991683991683992, "grad_norm": 0.29124632477760315, "learning_rate": 4.85050873714236e-05, "loss": 0.2155, "num_input_tokens_seen": 1383760, "step": 15360 }, { "epoch": 3.992983367983368, "grad_norm": 0.28600600361824036, "learning_rate": 4.850315562655107e-05, "loss": 0.1709, "num_input_tokens_seen": 1384192, "step": 15365 }, { "epoch": 3.9942827442827444, "grad_norm": 0.5721672177314758, "learning_rate": 4.850122267288872e-05, "loss": 0.2844, "num_input_tokens_seen": 1384672, "step": 15370 }, { "epoch": 3.9955821205821205, "grad_norm": 0.4879542589187622, "learning_rate": 4.8499288510535975e-05, "loss": 0.2892, "num_input_tokens_seen": 1385152, "step": 15375 }, { "epoch": 3.996881496881497, "grad_norm": 0.3644934594631195, "learning_rate": 4.849735313959231e-05, "loss": 0.2602, "num_input_tokens_seen": 1385632, "step": 15380 }, { "epoch": 3.998180873180873, "grad_norm": 0.314426064491272, "learning_rate": 4.849541656015726e-05, "loss": 0.2136, "num_input_tokens_seen": 1386096, "step": 15385 }, { "epoch": 3.9994802494802495, "grad_norm": 0.3058297634124756, "learning_rate": 4.8493478772330414e-05, "loss": 0.2649, "num_input_tokens_seen": 1386544, "step": 15390 }, { "epoch": 4.0, "eval_loss": 0.2411372810602188, "eval_runtime": 13.2049, "eval_samples_per_second": 64.825, "eval_steps_per_second": 32.412, "num_input_tokens_seen": 1386696, "step": 15392 }, { "epoch": 4.000779625779626, "grad_norm": 0.2291429191827774, "learning_rate": 4.8491539776211453e-05, "loss": 0.2266, "num_input_tokens_seen": 1386968, "step": 15395 }, { "epoch": 4.002079002079002, "grad_norm": 0.2641370892524719, "learning_rate": 4.848959957190009e-05, "loss": 0.1759, "num_input_tokens_seen": 1387416, "step": 15400 }, { "epoch": 4.003378378378378, "grad_norm": 0.2786197066307068, "learning_rate": 4.848765815949611e-05, "loss": 0.3191, "num_input_tokens_seen": 1387832, "step": 15405 }, { "epoch": 4.004677754677755, "grad_norm": 0.23794615268707275, "learning_rate": 4.8485715539099374e-05, "loss": 0.2108, "num_input_tokens_seen": 1388280, "step": 15410 }, { "epoch": 4.005977130977131, "grad_norm": 0.23975491523742676, "learning_rate": 4.848377171080978e-05, "loss": 0.2174, "num_input_tokens_seen": 1388792, "step": 15415 }, { "epoch": 4.007276507276507, "grad_norm": 0.23622263967990875, "learning_rate": 4.848182667472731e-05, "loss": 0.1705, "num_input_tokens_seen": 1389224, "step": 15420 }, { "epoch": 4.008575883575884, "grad_norm": 0.29668548703193665, "learning_rate": 4.8479880430951995e-05, "loss": 0.2668, "num_input_tokens_seen": 1389688, "step": 15425 }, { "epoch": 4.00987525987526, "grad_norm": 0.3454650640487671, "learning_rate": 4.847793297958393e-05, "loss": 0.2078, "num_input_tokens_seen": 1390120, "step": 15430 }, { "epoch": 4.011174636174636, "grad_norm": 0.22306787967681885, "learning_rate": 4.847598432072327e-05, "loss": 0.2596, "num_input_tokens_seen": 1390600, "step": 15435 }, { "epoch": 4.012474012474012, "grad_norm": 0.21542230248451233, "learning_rate": 4.847403445447025e-05, "loss": 0.3094, "num_input_tokens_seen": 1391048, "step": 15440 }, { "epoch": 4.013773388773389, "grad_norm": 0.24297277629375458, "learning_rate": 4.847208338092515e-05, "loss": 0.209, "num_input_tokens_seen": 1391496, "step": 15445 }, { "epoch": 4.015072765072765, "grad_norm": 0.27632462978363037, "learning_rate": 4.84701311001883e-05, "loss": 0.2632, "num_input_tokens_seen": 1391976, "step": 15450 }, { "epoch": 4.016372141372141, "grad_norm": 0.2406025379896164, "learning_rate": 4.8468177612360126e-05, "loss": 0.2181, "num_input_tokens_seen": 1392408, "step": 15455 }, { "epoch": 4.017671517671518, "grad_norm": 0.2808411419391632, "learning_rate": 4.8466222917541095e-05, "loss": 0.3022, "num_input_tokens_seen": 1392840, "step": 15460 }, { "epoch": 4.018970893970894, "grad_norm": 0.29349783062934875, "learning_rate": 4.846426701583173e-05, "loss": 0.228, "num_input_tokens_seen": 1393272, "step": 15465 }, { "epoch": 4.02027027027027, "grad_norm": 0.30604153871536255, "learning_rate": 4.846230990733263e-05, "loss": 0.193, "num_input_tokens_seen": 1393736, "step": 15470 }, { "epoch": 4.021569646569646, "grad_norm": 0.24322515726089478, "learning_rate": 4.846035159214446e-05, "loss": 0.1615, "num_input_tokens_seen": 1394152, "step": 15475 }, { "epoch": 4.022869022869023, "grad_norm": 0.3616969585418701, "learning_rate": 4.845839207036792e-05, "loss": 0.2707, "num_input_tokens_seen": 1394616, "step": 15480 }, { "epoch": 4.024168399168399, "grad_norm": 0.20621125400066376, "learning_rate": 4.845643134210379e-05, "loss": 0.3604, "num_input_tokens_seen": 1395032, "step": 15485 }, { "epoch": 4.025467775467775, "grad_norm": 0.2559572160243988, "learning_rate": 4.845446940745294e-05, "loss": 0.2635, "num_input_tokens_seen": 1395480, "step": 15490 }, { "epoch": 4.026767151767152, "grad_norm": 0.2594328224658966, "learning_rate": 4.845250626651625e-05, "loss": 0.2547, "num_input_tokens_seen": 1395928, "step": 15495 }, { "epoch": 4.028066528066528, "grad_norm": 0.5419949889183044, "learning_rate": 4.8450541919394686e-05, "loss": 0.3138, "num_input_tokens_seen": 1396376, "step": 15500 }, { "epoch": 4.029365904365904, "grad_norm": 0.3262897729873657, "learning_rate": 4.844857636618928e-05, "loss": 0.2875, "num_input_tokens_seen": 1396808, "step": 15505 }, { "epoch": 4.03066528066528, "grad_norm": 0.21960097551345825, "learning_rate": 4.844660960700113e-05, "loss": 0.2644, "num_input_tokens_seen": 1397272, "step": 15510 }, { "epoch": 4.031964656964657, "grad_norm": 0.336704820394516, "learning_rate": 4.844464164193138e-05, "loss": 0.1868, "num_input_tokens_seen": 1397736, "step": 15515 }, { "epoch": 4.033264033264033, "grad_norm": 0.2649485170841217, "learning_rate": 4.844267247108125e-05, "loss": 0.225, "num_input_tokens_seen": 1398200, "step": 15520 }, { "epoch": 4.034563409563409, "grad_norm": 0.2203647494316101, "learning_rate": 4.8440702094552015e-05, "loss": 0.2154, "num_input_tokens_seen": 1398648, "step": 15525 }, { "epoch": 4.035862785862786, "grad_norm": 0.32343968749046326, "learning_rate": 4.843873051244501e-05, "loss": 0.3091, "num_input_tokens_seen": 1399112, "step": 15530 }, { "epoch": 4.037162162162162, "grad_norm": 0.29014286398887634, "learning_rate": 4.8436757724861624e-05, "loss": 0.2114, "num_input_tokens_seen": 1399560, "step": 15535 }, { "epoch": 4.038461538461538, "grad_norm": 0.29084086418151855, "learning_rate": 4.843478373190334e-05, "loss": 0.2097, "num_input_tokens_seen": 1400008, "step": 15540 }, { "epoch": 4.039760914760914, "grad_norm": 0.2607841491699219, "learning_rate": 4.843280853367168e-05, "loss": 0.2964, "num_input_tokens_seen": 1400504, "step": 15545 }, { "epoch": 4.041060291060291, "grad_norm": 0.3113051950931549, "learning_rate": 4.843083213026823e-05, "loss": 0.2687, "num_input_tokens_seen": 1400904, "step": 15550 }, { "epoch": 4.042359667359667, "grad_norm": 0.6298636794090271, "learning_rate": 4.842885452179462e-05, "loss": 0.3143, "num_input_tokens_seen": 1401352, "step": 15555 }, { "epoch": 4.043659043659043, "grad_norm": 0.4396342635154724, "learning_rate": 4.842687570835258e-05, "loss": 0.2365, "num_input_tokens_seen": 1401784, "step": 15560 }, { "epoch": 4.04495841995842, "grad_norm": 0.6636200547218323, "learning_rate": 4.842489569004388e-05, "loss": 0.335, "num_input_tokens_seen": 1402296, "step": 15565 }, { "epoch": 4.046257796257796, "grad_norm": 0.17371362447738647, "learning_rate": 4.842291446697034e-05, "loss": 0.2776, "num_input_tokens_seen": 1402792, "step": 15570 }, { "epoch": 4.047557172557172, "grad_norm": 0.6649243831634521, "learning_rate": 4.842093203923387e-05, "loss": 0.2872, "num_input_tokens_seen": 1403256, "step": 15575 }, { "epoch": 4.048856548856548, "grad_norm": 0.5913294553756714, "learning_rate": 4.841894840693642e-05, "loss": 0.2751, "num_input_tokens_seen": 1403688, "step": 15580 }, { "epoch": 4.050155925155925, "grad_norm": 0.20688611268997192, "learning_rate": 4.8416963570180025e-05, "loss": 0.2572, "num_input_tokens_seen": 1404152, "step": 15585 }, { "epoch": 4.051455301455301, "grad_norm": 0.34902164340019226, "learning_rate": 4.8414977529066754e-05, "loss": 0.2601, "num_input_tokens_seen": 1404632, "step": 15590 }, { "epoch": 4.0527546777546775, "grad_norm": 0.22698605060577393, "learning_rate": 4.841299028369874e-05, "loss": 0.236, "num_input_tokens_seen": 1405080, "step": 15595 }, { "epoch": 4.054054054054054, "grad_norm": 0.24313348531723022, "learning_rate": 4.841100183417822e-05, "loss": 0.2177, "num_input_tokens_seen": 1405512, "step": 15600 }, { "epoch": 4.05535343035343, "grad_norm": 0.260030061006546, "learning_rate": 4.840901218060744e-05, "loss": 0.2646, "num_input_tokens_seen": 1405928, "step": 15605 }, { "epoch": 4.0566528066528065, "grad_norm": 0.2459307610988617, "learning_rate": 4.840702132308873e-05, "loss": 0.262, "num_input_tokens_seen": 1406376, "step": 15610 }, { "epoch": 4.0579521829521825, "grad_norm": 0.23490531742572784, "learning_rate": 4.840502926172449e-05, "loss": 0.3513, "num_input_tokens_seen": 1406840, "step": 15615 }, { "epoch": 4.0592515592515594, "grad_norm": 0.271013081073761, "learning_rate": 4.840303599661716e-05, "loss": 0.2388, "num_input_tokens_seen": 1407240, "step": 15620 }, { "epoch": 4.0605509355509355, "grad_norm": 0.2931273877620697, "learning_rate": 4.840104152786927e-05, "loss": 0.1908, "num_input_tokens_seen": 1407672, "step": 15625 }, { "epoch": 4.0618503118503115, "grad_norm": 0.2688788175582886, "learning_rate": 4.8399045855583404e-05, "loss": 0.2943, "num_input_tokens_seen": 1408168, "step": 15630 }, { "epoch": 4.0631496881496885, "grad_norm": 0.19260361790657043, "learning_rate": 4.8397048979862176e-05, "loss": 0.2485, "num_input_tokens_seen": 1408600, "step": 15635 }, { "epoch": 4.0644490644490645, "grad_norm": 0.23904122412204742, "learning_rate": 4.83950509008083e-05, "loss": 0.2212, "num_input_tokens_seen": 1409064, "step": 15640 }, { "epoch": 4.0657484407484406, "grad_norm": 0.24962863326072693, "learning_rate": 4.8393051618524554e-05, "loss": 0.2535, "num_input_tokens_seen": 1409512, "step": 15645 }, { "epoch": 4.0670478170478175, "grad_norm": 0.25052058696746826, "learning_rate": 4.839105113311373e-05, "loss": 0.3013, "num_input_tokens_seen": 1409976, "step": 15650 }, { "epoch": 4.0683471933471935, "grad_norm": 0.2777552902698517, "learning_rate": 4.8389049444678746e-05, "loss": 0.2487, "num_input_tokens_seen": 1410424, "step": 15655 }, { "epoch": 4.06964656964657, "grad_norm": 0.3185277283191681, "learning_rate": 4.838704655332254e-05, "loss": 0.308, "num_input_tokens_seen": 1410856, "step": 15660 }, { "epoch": 4.070945945945946, "grad_norm": 0.16629882156848907, "learning_rate": 4.838504245914812e-05, "loss": 0.3029, "num_input_tokens_seen": 1411320, "step": 15665 }, { "epoch": 4.0722453222453225, "grad_norm": 0.33700552582740784, "learning_rate": 4.838303716225856e-05, "loss": 0.2025, "num_input_tokens_seen": 1411752, "step": 15670 }, { "epoch": 4.073544698544699, "grad_norm": 0.30202436447143555, "learning_rate": 4.8381030662756984e-05, "loss": 0.2566, "num_input_tokens_seen": 1412184, "step": 15675 }, { "epoch": 4.074844074844075, "grad_norm": 0.21063745021820068, "learning_rate": 4.837902296074661e-05, "loss": 0.3038, "num_input_tokens_seen": 1412648, "step": 15680 }, { "epoch": 4.076143451143452, "grad_norm": 0.1889752596616745, "learning_rate": 4.8377014056330686e-05, "loss": 0.2559, "num_input_tokens_seen": 1413096, "step": 15685 }, { "epoch": 4.077442827442828, "grad_norm": 0.23322716355323792, "learning_rate": 4.8375003949612525e-05, "loss": 0.3019, "num_input_tokens_seen": 1413544, "step": 15690 }, { "epoch": 4.078742203742204, "grad_norm": 0.38552671670913696, "learning_rate": 4.8372992640695515e-05, "loss": 0.2597, "num_input_tokens_seen": 1414024, "step": 15695 }, { "epoch": 4.08004158004158, "grad_norm": 0.27375420928001404, "learning_rate": 4.8370980129683096e-05, "loss": 0.262, "num_input_tokens_seen": 1414520, "step": 15700 }, { "epoch": 4.081340956340957, "grad_norm": 0.41203704476356506, "learning_rate": 4.836896641667878e-05, "loss": 0.2862, "num_input_tokens_seen": 1415048, "step": 15705 }, { "epoch": 4.082640332640333, "grad_norm": 0.213813915848732, "learning_rate": 4.836695150178613e-05, "loss": 0.2446, "num_input_tokens_seen": 1415512, "step": 15710 }, { "epoch": 4.083939708939709, "grad_norm": 0.17536042630672455, "learning_rate": 4.8364935385108765e-05, "loss": 0.2533, "num_input_tokens_seen": 1415928, "step": 15715 }, { "epoch": 4.085239085239086, "grad_norm": 0.5496242642402649, "learning_rate": 4.83629180667504e-05, "loss": 0.2507, "num_input_tokens_seen": 1416344, "step": 15720 }, { "epoch": 4.086538461538462, "grad_norm": 0.2702100872993469, "learning_rate": 4.836089954681477e-05, "loss": 0.2704, "num_input_tokens_seen": 1416792, "step": 15725 }, { "epoch": 4.087837837837838, "grad_norm": 0.2471284568309784, "learning_rate": 4.835887982540569e-05, "loss": 0.2582, "num_input_tokens_seen": 1417208, "step": 15730 }, { "epoch": 4.089137214137214, "grad_norm": 0.355597585439682, "learning_rate": 4.8356858902627036e-05, "loss": 0.2903, "num_input_tokens_seen": 1417640, "step": 15735 }, { "epoch": 4.090436590436591, "grad_norm": 0.29548507928848267, "learning_rate": 4.8354836778582756e-05, "loss": 0.2117, "num_input_tokens_seen": 1418072, "step": 15740 }, { "epoch": 4.091735966735967, "grad_norm": 0.494436115026474, "learning_rate": 4.8352813453376836e-05, "loss": 0.2805, "num_input_tokens_seen": 1418520, "step": 15745 }, { "epoch": 4.093035343035343, "grad_norm": 0.5941007733345032, "learning_rate": 4.835078892711336e-05, "loss": 0.2598, "num_input_tokens_seen": 1418984, "step": 15750 }, { "epoch": 4.09433471933472, "grad_norm": 0.4966264069080353, "learning_rate": 4.834876319989642e-05, "loss": 0.1994, "num_input_tokens_seen": 1419400, "step": 15755 }, { "epoch": 4.095634095634096, "grad_norm": 0.7812311053276062, "learning_rate": 4.8346736271830214e-05, "loss": 0.3949, "num_input_tokens_seen": 1419864, "step": 15760 }, { "epoch": 4.096933471933472, "grad_norm": 0.5469751358032227, "learning_rate": 4.8344708143019e-05, "loss": 0.2946, "num_input_tokens_seen": 1420280, "step": 15765 }, { "epoch": 4.098232848232848, "grad_norm": 0.7360925674438477, "learning_rate": 4.8342678813567076e-05, "loss": 0.2615, "num_input_tokens_seen": 1420744, "step": 15770 }, { "epoch": 4.099532224532225, "grad_norm": 0.7237881422042847, "learning_rate": 4.834064828357882e-05, "loss": 0.2325, "num_input_tokens_seen": 1421208, "step": 15775 }, { "epoch": 4.100831600831601, "grad_norm": 0.32185888290405273, "learning_rate": 4.8338616553158656e-05, "loss": 0.2113, "num_input_tokens_seen": 1421688, "step": 15780 }, { "epoch": 4.102130977130977, "grad_norm": 0.3072474002838135, "learning_rate": 4.833658362241108e-05, "loss": 0.314, "num_input_tokens_seen": 1422120, "step": 15785 }, { "epoch": 4.103430353430354, "grad_norm": 0.355844646692276, "learning_rate": 4.833454949144065e-05, "loss": 0.2156, "num_input_tokens_seen": 1422536, "step": 15790 }, { "epoch": 4.10472972972973, "grad_norm": 0.26077181100845337, "learning_rate": 4.833251416035198e-05, "loss": 0.1103, "num_input_tokens_seen": 1423032, "step": 15795 }, { "epoch": 4.106029106029106, "grad_norm": 0.5942691564559937, "learning_rate": 4.833047762924975e-05, "loss": 0.3645, "num_input_tokens_seen": 1423464, "step": 15800 }, { "epoch": 4.107328482328482, "grad_norm": 0.32095855474472046, "learning_rate": 4.83284398982387e-05, "loss": 0.2135, "num_input_tokens_seen": 1423880, "step": 15805 }, { "epoch": 4.108627858627859, "grad_norm": 0.3691795766353607, "learning_rate": 4.8326400967423636e-05, "loss": 0.2145, "num_input_tokens_seen": 1424344, "step": 15810 }, { "epoch": 4.109927234927235, "grad_norm": 0.45857706665992737, "learning_rate": 4.8324360836909425e-05, "loss": 0.25, "num_input_tokens_seen": 1424840, "step": 15815 }, { "epoch": 4.111226611226611, "grad_norm": 0.36511120200157166, "learning_rate": 4.832231950680097e-05, "loss": 0.2603, "num_input_tokens_seen": 1425256, "step": 15820 }, { "epoch": 4.112525987525988, "grad_norm": 0.38635775446891785, "learning_rate": 4.83202769772033e-05, "loss": 0.3562, "num_input_tokens_seen": 1425720, "step": 15825 }, { "epoch": 4.113825363825364, "grad_norm": 0.6117668151855469, "learning_rate": 4.8318233248221424e-05, "loss": 0.2374, "num_input_tokens_seen": 1426152, "step": 15830 }, { "epoch": 4.11512474012474, "grad_norm": 0.3249704837799072, "learning_rate": 4.831618831996048e-05, "loss": 0.2436, "num_input_tokens_seen": 1426648, "step": 15835 }, { "epoch": 4.116424116424117, "grad_norm": 0.33595797419548035, "learning_rate": 4.8314142192525615e-05, "loss": 0.2493, "num_input_tokens_seen": 1427048, "step": 15840 }, { "epoch": 4.117723492723493, "grad_norm": 0.5593876838684082, "learning_rate": 4.8312094866022096e-05, "loss": 0.25, "num_input_tokens_seen": 1427512, "step": 15845 }, { "epoch": 4.119022869022869, "grad_norm": 0.30859488248825073, "learning_rate": 4.8310046340555196e-05, "loss": 0.2869, "num_input_tokens_seen": 1427960, "step": 15850 }, { "epoch": 4.120322245322245, "grad_norm": 0.5227345824241638, "learning_rate": 4.830799661623027e-05, "loss": 0.2976, "num_input_tokens_seen": 1428392, "step": 15855 }, { "epoch": 4.121621621621622, "grad_norm": 0.4080590307712555, "learning_rate": 4.8305945693152754e-05, "loss": 0.2308, "num_input_tokens_seen": 1428840, "step": 15860 }, { "epoch": 4.122920997920998, "grad_norm": 0.3918842375278473, "learning_rate": 4.8303893571428116e-05, "loss": 0.2313, "num_input_tokens_seen": 1429256, "step": 15865 }, { "epoch": 4.124220374220374, "grad_norm": 0.5445548892021179, "learning_rate": 4.83018402511619e-05, "loss": 0.2644, "num_input_tokens_seen": 1429720, "step": 15870 }, { "epoch": 4.12551975051975, "grad_norm": 0.7697758078575134, "learning_rate": 4.829978573245972e-05, "loss": 0.2649, "num_input_tokens_seen": 1430152, "step": 15875 }, { "epoch": 4.126819126819127, "grad_norm": 0.32943952083587646, "learning_rate": 4.8297730015427233e-05, "loss": 0.1724, "num_input_tokens_seen": 1430616, "step": 15880 }, { "epoch": 4.128118503118503, "grad_norm": 0.3916146457195282, "learning_rate": 4.829567310017017e-05, "loss": 0.1832, "num_input_tokens_seen": 1431032, "step": 15885 }, { "epoch": 4.129417879417879, "grad_norm": 0.38623055815696716, "learning_rate": 4.829361498679432e-05, "loss": 0.3162, "num_input_tokens_seen": 1431528, "step": 15890 }, { "epoch": 4.130717255717256, "grad_norm": 0.4644460082054138, "learning_rate": 4.8291555675405526e-05, "loss": 0.3221, "num_input_tokens_seen": 1431976, "step": 15895 }, { "epoch": 4.132016632016632, "grad_norm": 0.26536980271339417, "learning_rate": 4.828949516610971e-05, "loss": 0.2058, "num_input_tokens_seen": 1432424, "step": 15900 }, { "epoch": 4.133316008316008, "grad_norm": 0.29752686619758606, "learning_rate": 4.8287433459012844e-05, "loss": 0.2906, "num_input_tokens_seen": 1432888, "step": 15905 }, { "epoch": 4.134615384615385, "grad_norm": 0.21773020923137665, "learning_rate": 4.828537055422096e-05, "loss": 0.3194, "num_input_tokens_seen": 1433352, "step": 15910 }, { "epoch": 4.135914760914761, "grad_norm": 0.46155449748039246, "learning_rate": 4.828330645184016e-05, "loss": 0.2257, "num_input_tokens_seen": 1433768, "step": 15915 }, { "epoch": 4.137214137214137, "grad_norm": 0.41360336542129517, "learning_rate": 4.8281241151976596e-05, "loss": 0.2755, "num_input_tokens_seen": 1434184, "step": 15920 }, { "epoch": 4.138513513513513, "grad_norm": 0.39314067363739014, "learning_rate": 4.82791746547365e-05, "loss": 0.2646, "num_input_tokens_seen": 1434616, "step": 15925 }, { "epoch": 4.13981288981289, "grad_norm": 0.39828982949256897, "learning_rate": 4.8277106960226136e-05, "loss": 0.2559, "num_input_tokens_seen": 1435080, "step": 15930 }, { "epoch": 4.141112266112266, "grad_norm": 0.6262319087982178, "learning_rate": 4.8275038068551866e-05, "loss": 0.2021, "num_input_tokens_seen": 1435512, "step": 15935 }, { "epoch": 4.142411642411642, "grad_norm": 0.466200590133667, "learning_rate": 4.827296797982008e-05, "loss": 0.2775, "num_input_tokens_seen": 1436008, "step": 15940 }, { "epoch": 4.143711018711019, "grad_norm": 0.47354617714881897, "learning_rate": 4.827089669413726e-05, "loss": 0.275, "num_input_tokens_seen": 1436440, "step": 15945 }, { "epoch": 4.145010395010395, "grad_norm": 0.265948623418808, "learning_rate": 4.826882421160992e-05, "loss": 0.2081, "num_input_tokens_seen": 1436904, "step": 15950 }, { "epoch": 4.146309771309771, "grad_norm": 0.29296761751174927, "learning_rate": 4.826675053234466e-05, "loss": 0.2574, "num_input_tokens_seen": 1437352, "step": 15955 }, { "epoch": 4.147609147609147, "grad_norm": 0.5815564393997192, "learning_rate": 4.8264675656448126e-05, "loss": 0.2592, "num_input_tokens_seen": 1437800, "step": 15960 }, { "epoch": 4.148908523908524, "grad_norm": 0.29634109139442444, "learning_rate": 4.826259958402703e-05, "loss": 0.2635, "num_input_tokens_seen": 1438280, "step": 15965 }, { "epoch": 4.1502079002079, "grad_norm": 0.39171677827835083, "learning_rate": 4.826052231518815e-05, "loss": 0.2325, "num_input_tokens_seen": 1438744, "step": 15970 }, { "epoch": 4.151507276507276, "grad_norm": 0.26712766289711, "learning_rate": 4.825844385003832e-05, "loss": 0.2438, "num_input_tokens_seen": 1439192, "step": 15975 }, { "epoch": 4.152806652806653, "grad_norm": 0.42340123653411865, "learning_rate": 4.825636418868443e-05, "loss": 0.2931, "num_input_tokens_seen": 1439640, "step": 15980 }, { "epoch": 4.154106029106029, "grad_norm": 0.42793166637420654, "learning_rate": 4.8254283331233464e-05, "loss": 0.2803, "num_input_tokens_seen": 1440104, "step": 15985 }, { "epoch": 4.155405405405405, "grad_norm": 0.3949640095233917, "learning_rate": 4.8252201277792405e-05, "loss": 0.2601, "num_input_tokens_seen": 1440568, "step": 15990 }, { "epoch": 4.156704781704781, "grad_norm": 0.40801379084587097, "learning_rate": 4.8250118028468374e-05, "loss": 0.2078, "num_input_tokens_seen": 1441048, "step": 15995 }, { "epoch": 4.158004158004158, "grad_norm": 0.34013456106185913, "learning_rate": 4.824803358336848e-05, "loss": 0.2263, "num_input_tokens_seen": 1441496, "step": 16000 }, { "epoch": 4.159303534303534, "grad_norm": 0.2401047646999359, "learning_rate": 4.8245947942599955e-05, "loss": 0.1919, "num_input_tokens_seen": 1441944, "step": 16005 }, { "epoch": 4.16060291060291, "grad_norm": 0.6430578827857971, "learning_rate": 4.824386110627005e-05, "loss": 0.1493, "num_input_tokens_seen": 1442424, "step": 16010 }, { "epoch": 4.161902286902287, "grad_norm": 0.6471669673919678, "learning_rate": 4.8241773074486094e-05, "loss": 0.2194, "num_input_tokens_seen": 1442872, "step": 16015 }, { "epoch": 4.163201663201663, "grad_norm": 0.24796485900878906, "learning_rate": 4.8239683847355475e-05, "loss": 0.2015, "num_input_tokens_seen": 1443352, "step": 16020 }, { "epoch": 4.164501039501039, "grad_norm": 0.4380836486816406, "learning_rate": 4.8237593424985664e-05, "loss": 0.3222, "num_input_tokens_seen": 1443784, "step": 16025 }, { "epoch": 4.165800415800415, "grad_norm": 0.5421971082687378, "learning_rate": 4.8235501807484144e-05, "loss": 0.1935, "num_input_tokens_seen": 1444232, "step": 16030 }, { "epoch": 4.167099792099792, "grad_norm": 0.6471447348594666, "learning_rate": 4.82334089949585e-05, "loss": 0.2373, "num_input_tokens_seen": 1444632, "step": 16035 }, { "epoch": 4.168399168399168, "grad_norm": 1.1188558340072632, "learning_rate": 4.823131498751638e-05, "loss": 0.3738, "num_input_tokens_seen": 1445128, "step": 16040 }, { "epoch": 4.169698544698544, "grad_norm": 0.7714798450469971, "learning_rate": 4.8229219785265476e-05, "loss": 0.223, "num_input_tokens_seen": 1445544, "step": 16045 }, { "epoch": 4.170997920997921, "grad_norm": 0.4790855348110199, "learning_rate": 4.8227123388313534e-05, "loss": 0.2646, "num_input_tokens_seen": 1446040, "step": 16050 }, { "epoch": 4.172297297297297, "grad_norm": 0.8732495307922363, "learning_rate": 4.822502579676839e-05, "loss": 0.3062, "num_input_tokens_seen": 1446472, "step": 16055 }, { "epoch": 4.173596673596673, "grad_norm": 0.4030246138572693, "learning_rate": 4.82229270107379e-05, "loss": 0.2725, "num_input_tokens_seen": 1446936, "step": 16060 }, { "epoch": 4.17489604989605, "grad_norm": 0.48866936564445496, "learning_rate": 4.822082703033003e-05, "loss": 0.2555, "num_input_tokens_seen": 1447352, "step": 16065 }, { "epoch": 4.176195426195426, "grad_norm": 0.3142084777355194, "learning_rate": 4.821872585565278e-05, "loss": 0.238, "num_input_tokens_seen": 1447800, "step": 16070 }, { "epoch": 4.177494802494802, "grad_norm": 0.30125191807746887, "learning_rate": 4.821662348681421e-05, "loss": 0.3193, "num_input_tokens_seen": 1448264, "step": 16075 }, { "epoch": 4.1787941787941785, "grad_norm": 0.36178502440452576, "learning_rate": 4.821451992392245e-05, "loss": 0.1954, "num_input_tokens_seen": 1448712, "step": 16080 }, { "epoch": 4.180093555093555, "grad_norm": 0.3619803190231323, "learning_rate": 4.821241516708568e-05, "loss": 0.177, "num_input_tokens_seen": 1449160, "step": 16085 }, { "epoch": 4.1813929313929314, "grad_norm": 0.2857784330844879, "learning_rate": 4.821030921641216e-05, "loss": 0.1811, "num_input_tokens_seen": 1449624, "step": 16090 }, { "epoch": 4.1826923076923075, "grad_norm": 0.7878798842430115, "learning_rate": 4.8208202072010204e-05, "loss": 0.1482, "num_input_tokens_seen": 1450088, "step": 16095 }, { "epoch": 4.183991683991684, "grad_norm": 0.25403982400894165, "learning_rate": 4.820609373398818e-05, "loss": 0.25, "num_input_tokens_seen": 1450552, "step": 16100 }, { "epoch": 4.1852910602910605, "grad_norm": 0.2335718423128128, "learning_rate": 4.820398420245451e-05, "loss": 0.1416, "num_input_tokens_seen": 1450968, "step": 16105 }, { "epoch": 4.1865904365904365, "grad_norm": 0.5165984630584717, "learning_rate": 4.820187347751771e-05, "loss": 0.2694, "num_input_tokens_seen": 1451400, "step": 16110 }, { "epoch": 4.1878898128898125, "grad_norm": 0.38484281301498413, "learning_rate": 4.819976155928631e-05, "loss": 0.392, "num_input_tokens_seen": 1451800, "step": 16115 }, { "epoch": 4.1891891891891895, "grad_norm": 0.8421950936317444, "learning_rate": 4.819764844786896e-05, "loss": 0.2821, "num_input_tokens_seen": 1452216, "step": 16120 }, { "epoch": 4.1904885654885655, "grad_norm": 0.6321636438369751, "learning_rate": 4.819553414337432e-05, "loss": 0.267, "num_input_tokens_seen": 1452664, "step": 16125 }, { "epoch": 4.191787941787942, "grad_norm": 0.2967955470085144, "learning_rate": 4.819341864591113e-05, "loss": 0.2633, "num_input_tokens_seen": 1453208, "step": 16130 }, { "epoch": 4.1930873180873185, "grad_norm": 0.26616087555885315, "learning_rate": 4.81913019555882e-05, "loss": 0.2535, "num_input_tokens_seen": 1453640, "step": 16135 }, { "epoch": 4.1943866943866945, "grad_norm": 0.7146945595741272, "learning_rate": 4.818918407251439e-05, "loss": 0.2688, "num_input_tokens_seen": 1454120, "step": 16140 }, { "epoch": 4.195686070686071, "grad_norm": 0.417484313249588, "learning_rate": 4.818706499679862e-05, "loss": 0.2854, "num_input_tokens_seen": 1454552, "step": 16145 }, { "epoch": 4.196985446985447, "grad_norm": 0.3633457124233246, "learning_rate": 4.818494472854988e-05, "loss": 0.215, "num_input_tokens_seen": 1455064, "step": 16150 }, { "epoch": 4.1982848232848236, "grad_norm": 0.3765411972999573, "learning_rate": 4.818282326787722e-05, "loss": 0.2585, "num_input_tokens_seen": 1455496, "step": 16155 }, { "epoch": 4.1995841995842, "grad_norm": 0.3650352656841278, "learning_rate": 4.818070061488975e-05, "loss": 0.2334, "num_input_tokens_seen": 1455944, "step": 16160 }, { "epoch": 4.200883575883576, "grad_norm": 0.3373618721961975, "learning_rate": 4.817857676969663e-05, "loss": 0.2588, "num_input_tokens_seen": 1456376, "step": 16165 }, { "epoch": 4.202182952182953, "grad_norm": 0.3770531117916107, "learning_rate": 4.8176451732407104e-05, "loss": 0.3519, "num_input_tokens_seen": 1456824, "step": 16170 }, { "epoch": 4.203482328482329, "grad_norm": 0.3314666450023651, "learning_rate": 4.8174325503130454e-05, "loss": 0.137, "num_input_tokens_seen": 1457256, "step": 16175 }, { "epoch": 4.204781704781705, "grad_norm": 0.26116254925727844, "learning_rate": 4.8172198081976046e-05, "loss": 0.2462, "num_input_tokens_seen": 1457720, "step": 16180 }, { "epoch": 4.206081081081081, "grad_norm": 0.3265712261199951, "learning_rate": 4.817006946905328e-05, "loss": 0.3069, "num_input_tokens_seen": 1458152, "step": 16185 }, { "epoch": 4.207380457380458, "grad_norm": 0.30119645595550537, "learning_rate": 4.816793966447165e-05, "loss": 0.1763, "num_input_tokens_seen": 1458632, "step": 16190 }, { "epoch": 4.208679833679834, "grad_norm": 0.3059956729412079, "learning_rate": 4.8165808668340675e-05, "loss": 0.2152, "num_input_tokens_seen": 1459064, "step": 16195 }, { "epoch": 4.20997920997921, "grad_norm": 0.3042071759700775, "learning_rate": 4.8163676480769974e-05, "loss": 0.3434, "num_input_tokens_seen": 1459528, "step": 16200 }, { "epoch": 4.211278586278587, "grad_norm": 0.3778424561023712, "learning_rate": 4.816154310186919e-05, "loss": 0.2571, "num_input_tokens_seen": 1459944, "step": 16205 }, { "epoch": 4.212577962577963, "grad_norm": 0.3794483244419098, "learning_rate": 4.8159408531748054e-05, "loss": 0.2659, "num_input_tokens_seen": 1460408, "step": 16210 }, { "epoch": 4.213877338877339, "grad_norm": 0.2768644094467163, "learning_rate": 4.815727277051634e-05, "loss": 0.1577, "num_input_tokens_seen": 1460840, "step": 16215 }, { "epoch": 4.215176715176715, "grad_norm": 0.2307029813528061, "learning_rate": 4.81551358182839e-05, "loss": 0.268, "num_input_tokens_seen": 1461304, "step": 16220 }, { "epoch": 4.216476091476092, "grad_norm": 0.2508695721626282, "learning_rate": 4.815299767516065e-05, "loss": 0.1525, "num_input_tokens_seen": 1461768, "step": 16225 }, { "epoch": 4.217775467775468, "grad_norm": 0.21042010188102722, "learning_rate": 4.8150858341256535e-05, "loss": 0.2449, "num_input_tokens_seen": 1462216, "step": 16230 }, { "epoch": 4.219074844074844, "grad_norm": 1.0656256675720215, "learning_rate": 4.81487178166816e-05, "loss": 0.4635, "num_input_tokens_seen": 1462680, "step": 16235 }, { "epoch": 4.220374220374221, "grad_norm": 0.2781343460083008, "learning_rate": 4.814657610154593e-05, "loss": 0.1286, "num_input_tokens_seen": 1463160, "step": 16240 }, { "epoch": 4.221673596673597, "grad_norm": 0.26804086565971375, "learning_rate": 4.8144433195959666e-05, "loss": 0.1832, "num_input_tokens_seen": 1463592, "step": 16245 }, { "epoch": 4.222972972972973, "grad_norm": 0.18834376335144043, "learning_rate": 4.814228910003303e-05, "loss": 0.1193, "num_input_tokens_seen": 1464088, "step": 16250 }, { "epoch": 4.224272349272349, "grad_norm": 0.18585968017578125, "learning_rate": 4.814014381387629e-05, "loss": 0.0903, "num_input_tokens_seen": 1464504, "step": 16255 }, { "epoch": 4.225571725571726, "grad_norm": 0.1627553403377533, "learning_rate": 4.8137997337599785e-05, "loss": 0.2891, "num_input_tokens_seen": 1464920, "step": 16260 }, { "epoch": 4.226871101871102, "grad_norm": 0.39992448687553406, "learning_rate": 4.8135849671313904e-05, "loss": 0.2822, "num_input_tokens_seen": 1465368, "step": 16265 }, { "epoch": 4.228170478170478, "grad_norm": 0.16871459782123566, "learning_rate": 4.813370081512911e-05, "loss": 0.1408, "num_input_tokens_seen": 1465800, "step": 16270 }, { "epoch": 4.229469854469855, "grad_norm": 0.35027116537094116, "learning_rate": 4.8131550769155906e-05, "loss": 0.4113, "num_input_tokens_seen": 1466248, "step": 16275 }, { "epoch": 4.230769230769231, "grad_norm": 0.221245676279068, "learning_rate": 4.812939953350489e-05, "loss": 0.2152, "num_input_tokens_seen": 1466680, "step": 16280 }, { "epoch": 4.232068607068607, "grad_norm": 0.26548343896865845, "learning_rate": 4.81272471082867e-05, "loss": 0.346, "num_input_tokens_seen": 1467128, "step": 16285 }, { "epoch": 4.233367983367984, "grad_norm": 0.34375354647636414, "learning_rate": 4.8125093493612026e-05, "loss": 0.2316, "num_input_tokens_seen": 1467560, "step": 16290 }, { "epoch": 4.23466735966736, "grad_norm": 0.24878962337970734, "learning_rate": 4.8122938689591644e-05, "loss": 0.2263, "num_input_tokens_seen": 1468008, "step": 16295 }, { "epoch": 4.235966735966736, "grad_norm": 0.2994513213634491, "learning_rate": 4.8120782696336366e-05, "loss": 0.2596, "num_input_tokens_seen": 1468472, "step": 16300 }, { "epoch": 4.237266112266112, "grad_norm": 0.255896657705307, "learning_rate": 4.8118625513957074e-05, "loss": 0.2977, "num_input_tokens_seen": 1468920, "step": 16305 }, { "epoch": 4.238565488565489, "grad_norm": 0.26032671332359314, "learning_rate": 4.811646714256473e-05, "loss": 0.1464, "num_input_tokens_seen": 1469352, "step": 16310 }, { "epoch": 4.239864864864865, "grad_norm": 0.27183255553245544, "learning_rate": 4.811430758227032e-05, "loss": 0.1635, "num_input_tokens_seen": 1469848, "step": 16315 }, { "epoch": 4.241164241164241, "grad_norm": 0.30102312564849854, "learning_rate": 4.8112146833184937e-05, "loss": 0.2499, "num_input_tokens_seen": 1470296, "step": 16320 }, { "epoch": 4.242463617463618, "grad_norm": 0.30291086435317993, "learning_rate": 4.810998489541969e-05, "loss": 0.3066, "num_input_tokens_seen": 1470728, "step": 16325 }, { "epoch": 4.243762993762994, "grad_norm": 0.20914816856384277, "learning_rate": 4.8107821769085775e-05, "loss": 0.2513, "num_input_tokens_seen": 1471160, "step": 16330 }, { "epoch": 4.24506237006237, "grad_norm": 0.24244658648967743, "learning_rate": 4.810565745429445e-05, "loss": 0.3589, "num_input_tokens_seen": 1471608, "step": 16335 }, { "epoch": 4.246361746361746, "grad_norm": 0.5095072388648987, "learning_rate": 4.810349195115702e-05, "loss": 0.2653, "num_input_tokens_seen": 1472040, "step": 16340 }, { "epoch": 4.247661122661123, "grad_norm": 0.3942359387874603, "learning_rate": 4.810132525978487e-05, "loss": 0.3084, "num_input_tokens_seen": 1472488, "step": 16345 }, { "epoch": 4.248960498960499, "grad_norm": 0.46316468715667725, "learning_rate": 4.809915738028942e-05, "loss": 0.2791, "num_input_tokens_seen": 1472952, "step": 16350 }, { "epoch": 4.250259875259875, "grad_norm": 0.5290759801864624, "learning_rate": 4.8096988312782174e-05, "loss": 0.2768, "num_input_tokens_seen": 1473416, "step": 16355 }, { "epoch": 4.251559251559252, "grad_norm": 0.4485546052455902, "learning_rate": 4.8094818057374686e-05, "loss": 0.2583, "num_input_tokens_seen": 1473848, "step": 16360 }, { "epoch": 4.252858627858628, "grad_norm": 0.2947385013103485, "learning_rate": 4.809264661417858e-05, "loss": 0.2759, "num_input_tokens_seen": 1474296, "step": 16365 }, { "epoch": 4.254158004158004, "grad_norm": 0.3167896866798401, "learning_rate": 4.8090473983305535e-05, "loss": 0.2504, "num_input_tokens_seen": 1474776, "step": 16370 }, { "epoch": 4.25545738045738, "grad_norm": 0.2128889411687851, "learning_rate": 4.8088300164867284e-05, "loss": 0.2638, "num_input_tokens_seen": 1475256, "step": 16375 }, { "epoch": 4.256756756756757, "grad_norm": 0.22031474113464355, "learning_rate": 4.808612515897564e-05, "loss": 0.2954, "num_input_tokens_seen": 1475752, "step": 16380 }, { "epoch": 4.258056133056133, "grad_norm": 0.2940511107444763, "learning_rate": 4.808394896574245e-05, "loss": 0.2122, "num_input_tokens_seen": 1476184, "step": 16385 }, { "epoch": 4.259355509355509, "grad_norm": 0.45230433344841003, "learning_rate": 4.808177158527965e-05, "loss": 0.1931, "num_input_tokens_seen": 1476616, "step": 16390 }, { "epoch": 4.260654885654886, "grad_norm": 0.6126343607902527, "learning_rate": 4.807959301769923e-05, "loss": 0.217, "num_input_tokens_seen": 1477048, "step": 16395 }, { "epoch": 4.261954261954262, "grad_norm": 0.628935694694519, "learning_rate": 4.807741326311321e-05, "loss": 0.2701, "num_input_tokens_seen": 1477480, "step": 16400 }, { "epoch": 4.263253638253638, "grad_norm": 0.30141788721084595, "learning_rate": 4.8075232321633736e-05, "loss": 0.1513, "num_input_tokens_seen": 1477912, "step": 16405 }, { "epoch": 4.264553014553014, "grad_norm": 0.47653481364250183, "learning_rate": 4.8073050193372935e-05, "loss": 0.2749, "num_input_tokens_seen": 1478344, "step": 16410 }, { "epoch": 4.265852390852391, "grad_norm": 0.8061456680297852, "learning_rate": 4.8070866878443065e-05, "loss": 0.2395, "num_input_tokens_seen": 1478824, "step": 16415 }, { "epoch": 4.267151767151767, "grad_norm": 0.33088040351867676, "learning_rate": 4.806868237695641e-05, "loss": 0.2645, "num_input_tokens_seen": 1479272, "step": 16420 }, { "epoch": 4.268451143451143, "grad_norm": 0.26749923825263977, "learning_rate": 4.806649668902531e-05, "loss": 0.2175, "num_input_tokens_seen": 1479720, "step": 16425 }, { "epoch": 4.26975051975052, "grad_norm": 0.35390380024909973, "learning_rate": 4.8064309814762176e-05, "loss": 0.2538, "num_input_tokens_seen": 1480136, "step": 16430 }, { "epoch": 4.271049896049896, "grad_norm": 0.3214409053325653, "learning_rate": 4.80621217542795e-05, "loss": 0.2884, "num_input_tokens_seen": 1480552, "step": 16435 }, { "epoch": 4.272349272349272, "grad_norm": 0.24549104273319244, "learning_rate": 4.805993250768981e-05, "loss": 0.2576, "num_input_tokens_seen": 1481032, "step": 16440 }, { "epoch": 4.273648648648648, "grad_norm": 0.3080499470233917, "learning_rate": 4.805774207510568e-05, "loss": 0.1529, "num_input_tokens_seen": 1481464, "step": 16445 }, { "epoch": 4.274948024948025, "grad_norm": 0.24598434567451477, "learning_rate": 4.8055550456639796e-05, "loss": 0.3222, "num_input_tokens_seen": 1481928, "step": 16450 }, { "epoch": 4.276247401247401, "grad_norm": 0.29312676191329956, "learning_rate": 4.805335765240486e-05, "loss": 0.3122, "num_input_tokens_seen": 1482360, "step": 16455 }, { "epoch": 4.277546777546777, "grad_norm": 0.27763715386390686, "learning_rate": 4.805116366251364e-05, "loss": 0.2232, "num_input_tokens_seen": 1482808, "step": 16460 }, { "epoch": 4.278846153846154, "grad_norm": 0.22766533493995667, "learning_rate": 4.8048968487079e-05, "loss": 0.2631, "num_input_tokens_seen": 1483256, "step": 16465 }, { "epoch": 4.28014553014553, "grad_norm": 0.2548733949661255, "learning_rate": 4.804677212621382e-05, "loss": 0.298, "num_input_tokens_seen": 1483736, "step": 16470 }, { "epoch": 4.281444906444906, "grad_norm": 0.29504233598709106, "learning_rate": 4.8044574580031065e-05, "loss": 0.2273, "num_input_tokens_seen": 1484152, "step": 16475 }, { "epoch": 4.282744282744282, "grad_norm": 0.5461616516113281, "learning_rate": 4.8042375848643764e-05, "loss": 0.2506, "num_input_tokens_seen": 1484616, "step": 16480 }, { "epoch": 4.284043659043659, "grad_norm": 0.27065473794937134, "learning_rate": 4.804017593216499e-05, "loss": 0.3266, "num_input_tokens_seen": 1485048, "step": 16485 }, { "epoch": 4.285343035343035, "grad_norm": 0.4840782582759857, "learning_rate": 4.80379748307079e-05, "loss": 0.2705, "num_input_tokens_seen": 1485496, "step": 16490 }, { "epoch": 4.286642411642411, "grad_norm": 0.1816481053829193, "learning_rate": 4.8035772544385685e-05, "loss": 0.2513, "num_input_tokens_seen": 1485896, "step": 16495 }, { "epoch": 4.287941787941788, "grad_norm": 0.3036896288394928, "learning_rate": 4.803356907331161e-05, "loss": 0.2363, "num_input_tokens_seen": 1486344, "step": 16500 }, { "epoch": 4.289241164241164, "grad_norm": 0.21515408158302307, "learning_rate": 4.803136441759902e-05, "loss": 0.256, "num_input_tokens_seen": 1486760, "step": 16505 }, { "epoch": 4.29054054054054, "grad_norm": 0.22219370305538177, "learning_rate": 4.8029158577361275e-05, "loss": 0.2269, "num_input_tokens_seen": 1487240, "step": 16510 }, { "epoch": 4.291839916839917, "grad_norm": 0.2682008445262909, "learning_rate": 4.802695155271185e-05, "loss": 0.223, "num_input_tokens_seen": 1487688, "step": 16515 }, { "epoch": 4.293139293139293, "grad_norm": 0.21805879473686218, "learning_rate": 4.802474334376425e-05, "loss": 0.2447, "num_input_tokens_seen": 1488136, "step": 16520 }, { "epoch": 4.294438669438669, "grad_norm": 0.2738482654094696, "learning_rate": 4.802253395063203e-05, "loss": 0.2683, "num_input_tokens_seen": 1488568, "step": 16525 }, { "epoch": 4.295738045738045, "grad_norm": 0.3019534945487976, "learning_rate": 4.802032337342882e-05, "loss": 0.2709, "num_input_tokens_seen": 1489016, "step": 16530 }, { "epoch": 4.297037422037422, "grad_norm": 0.2463444173336029, "learning_rate": 4.801811161226834e-05, "loss": 0.3348, "num_input_tokens_seen": 1489496, "step": 16535 }, { "epoch": 4.298336798336798, "grad_norm": 0.18965741991996765, "learning_rate": 4.8015898667264316e-05, "loss": 0.3023, "num_input_tokens_seen": 1489928, "step": 16540 }, { "epoch": 4.299636174636174, "grad_norm": 0.3285282552242279, "learning_rate": 4.8013684538530565e-05, "loss": 0.2082, "num_input_tokens_seen": 1490344, "step": 16545 }, { "epoch": 4.3009355509355505, "grad_norm": 0.4825505316257477, "learning_rate": 4.801146922618098e-05, "loss": 0.3049, "num_input_tokens_seen": 1490792, "step": 16550 }, { "epoch": 4.302234927234927, "grad_norm": 0.19693851470947266, "learning_rate": 4.8009252730329476e-05, "loss": 0.264, "num_input_tokens_seen": 1491272, "step": 16555 }, { "epoch": 4.303534303534303, "grad_norm": 0.1580193191766739, "learning_rate": 4.800703505109006e-05, "loss": 0.3043, "num_input_tokens_seen": 1491688, "step": 16560 }, { "epoch": 4.3048336798336795, "grad_norm": 0.11459221690893173, "learning_rate": 4.8004816188576783e-05, "loss": 0.2937, "num_input_tokens_seen": 1492120, "step": 16565 }, { "epoch": 4.306133056133056, "grad_norm": 0.23686327040195465, "learning_rate": 4.800259614290378e-05, "loss": 0.2601, "num_input_tokens_seen": 1492552, "step": 16570 }, { "epoch": 4.3074324324324325, "grad_norm": 0.48672235012054443, "learning_rate": 4.800037491418521e-05, "loss": 0.2988, "num_input_tokens_seen": 1492984, "step": 16575 }, { "epoch": 4.3087318087318085, "grad_norm": 0.41249918937683105, "learning_rate": 4.7998152502535316e-05, "loss": 0.2754, "num_input_tokens_seen": 1493416, "step": 16580 }, { "epoch": 4.310031185031185, "grad_norm": 0.38167011737823486, "learning_rate": 4.79959289080684e-05, "loss": 0.2717, "num_input_tokens_seen": 1493896, "step": 16585 }, { "epoch": 4.3113305613305615, "grad_norm": 0.15137416124343872, "learning_rate": 4.799370413089884e-05, "loss": 0.248, "num_input_tokens_seen": 1494360, "step": 16590 }, { "epoch": 4.3126299376299375, "grad_norm": 0.5218256711959839, "learning_rate": 4.799147817114104e-05, "loss": 0.2854, "num_input_tokens_seen": 1494824, "step": 16595 }, { "epoch": 4.313929313929314, "grad_norm": 0.2930223047733307, "learning_rate": 4.7989251028909476e-05, "loss": 0.1874, "num_input_tokens_seen": 1495288, "step": 16600 }, { "epoch": 4.3152286902286905, "grad_norm": 0.313973069190979, "learning_rate": 4.798702270431872e-05, "loss": 0.2628, "num_input_tokens_seen": 1495752, "step": 16605 }, { "epoch": 4.3165280665280665, "grad_norm": 0.3267359137535095, "learning_rate": 4.7984793197483356e-05, "loss": 0.1734, "num_input_tokens_seen": 1496168, "step": 16610 }, { "epoch": 4.317827442827443, "grad_norm": 0.3406093418598175, "learning_rate": 4.798256250851805e-05, "loss": 0.278, "num_input_tokens_seen": 1496664, "step": 16615 }, { "epoch": 4.3191268191268195, "grad_norm": 0.6739736199378967, "learning_rate": 4.798033063753754e-05, "loss": 0.4004, "num_input_tokens_seen": 1497112, "step": 16620 }, { "epoch": 4.3204261954261955, "grad_norm": 0.31809306144714355, "learning_rate": 4.79780975846566e-05, "loss": 0.3004, "num_input_tokens_seen": 1497528, "step": 16625 }, { "epoch": 4.321725571725572, "grad_norm": 0.3185933828353882, "learning_rate": 4.797586334999009e-05, "loss": 0.2323, "num_input_tokens_seen": 1498008, "step": 16630 }, { "epoch": 4.323024948024948, "grad_norm": 0.3411969542503357, "learning_rate": 4.797362793365291e-05, "loss": 0.1866, "num_input_tokens_seen": 1498440, "step": 16635 }, { "epoch": 4.324324324324325, "grad_norm": 0.9101939797401428, "learning_rate": 4.797139133576004e-05, "loss": 0.2902, "num_input_tokens_seen": 1498920, "step": 16640 }, { "epoch": 4.325623700623701, "grad_norm": 0.35803067684173584, "learning_rate": 4.79691535564265e-05, "loss": 0.2177, "num_input_tokens_seen": 1499384, "step": 16645 }, { "epoch": 4.326923076923077, "grad_norm": 0.3758173882961273, "learning_rate": 4.796691459576739e-05, "loss": 0.1674, "num_input_tokens_seen": 1499816, "step": 16650 }, { "epoch": 4.328222453222454, "grad_norm": 0.3552602529525757, "learning_rate": 4.796467445389784e-05, "loss": 0.2763, "num_input_tokens_seen": 1500296, "step": 16655 }, { "epoch": 4.32952182952183, "grad_norm": 1.093448519706726, "learning_rate": 4.7962433130933096e-05, "loss": 0.3459, "num_input_tokens_seen": 1500744, "step": 16660 }, { "epoch": 4.330821205821206, "grad_norm": 0.24718676507472992, "learning_rate": 4.7960190626988405e-05, "loss": 0.3002, "num_input_tokens_seen": 1501208, "step": 16665 }, { "epoch": 4.332120582120582, "grad_norm": 0.23096343874931335, "learning_rate": 4.795794694217911e-05, "loss": 0.2605, "num_input_tokens_seen": 1501672, "step": 16670 }, { "epoch": 4.333419958419959, "grad_norm": 0.3523135185241699, "learning_rate": 4.795570207662061e-05, "loss": 0.2328, "num_input_tokens_seen": 1502136, "step": 16675 }, { "epoch": 4.334719334719335, "grad_norm": 0.29351598024368286, "learning_rate": 4.795345603042836e-05, "loss": 0.2943, "num_input_tokens_seen": 1502568, "step": 16680 }, { "epoch": 4.336018711018711, "grad_norm": 0.39512330293655396, "learning_rate": 4.7951208803717876e-05, "loss": 0.276, "num_input_tokens_seen": 1502984, "step": 16685 }, { "epoch": 4.337318087318088, "grad_norm": 0.4492991268634796, "learning_rate": 4.794896039660472e-05, "loss": 0.3066, "num_input_tokens_seen": 1503400, "step": 16690 }, { "epoch": 4.338617463617464, "grad_norm": 0.4778492748737335, "learning_rate": 4.794671080920455e-05, "loss": 0.263, "num_input_tokens_seen": 1503848, "step": 16695 }, { "epoch": 4.33991683991684, "grad_norm": 0.5218383073806763, "learning_rate": 4.794446004163306e-05, "loss": 0.2885, "num_input_tokens_seen": 1504312, "step": 16700 }, { "epoch": 4.341216216216216, "grad_norm": 0.5149637460708618, "learning_rate": 4.7942208094006e-05, "loss": 0.2926, "num_input_tokens_seen": 1504728, "step": 16705 }, { "epoch": 4.342515592515593, "grad_norm": 0.823739230632782, "learning_rate": 4.7939954966439195e-05, "loss": 0.3248, "num_input_tokens_seen": 1505176, "step": 16710 }, { "epoch": 4.343814968814969, "grad_norm": 0.5025395750999451, "learning_rate": 4.7937700659048527e-05, "loss": 0.2785, "num_input_tokens_seen": 1505656, "step": 16715 }, { "epoch": 4.345114345114345, "grad_norm": 0.23637887835502625, "learning_rate": 4.7935445171949936e-05, "loss": 0.2842, "num_input_tokens_seen": 1506120, "step": 16720 }, { "epoch": 4.346413721413722, "grad_norm": 0.17268434166908264, "learning_rate": 4.793318850525943e-05, "loss": 0.2426, "num_input_tokens_seen": 1506584, "step": 16725 }, { "epoch": 4.347713097713098, "grad_norm": 0.3267146348953247, "learning_rate": 4.7930930659093054e-05, "loss": 0.2059, "num_input_tokens_seen": 1506984, "step": 16730 }, { "epoch": 4.349012474012474, "grad_norm": 0.4339400827884674, "learning_rate": 4.792867163356696e-05, "loss": 0.2564, "num_input_tokens_seen": 1507464, "step": 16735 }, { "epoch": 4.350311850311851, "grad_norm": 0.33901965618133545, "learning_rate": 4.79264114287973e-05, "loss": 0.2851, "num_input_tokens_seen": 1507912, "step": 16740 }, { "epoch": 4.351611226611227, "grad_norm": 0.3177605867385864, "learning_rate": 4.792415004490034e-05, "loss": 0.3157, "num_input_tokens_seen": 1508344, "step": 16745 }, { "epoch": 4.352910602910603, "grad_norm": 0.3344195485115051, "learning_rate": 4.792188748199237e-05, "loss": 0.1439, "num_input_tokens_seen": 1508776, "step": 16750 }, { "epoch": 4.354209979209979, "grad_norm": 0.3611336648464203, "learning_rate": 4.7919623740189774e-05, "loss": 0.295, "num_input_tokens_seen": 1509224, "step": 16755 }, { "epoch": 4.355509355509356, "grad_norm": 0.3627137839794159, "learning_rate": 4.791735881960896e-05, "loss": 0.2876, "num_input_tokens_seen": 1509688, "step": 16760 }, { "epoch": 4.356808731808732, "grad_norm": 0.3021560311317444, "learning_rate": 4.791509272036643e-05, "loss": 0.2446, "num_input_tokens_seen": 1510120, "step": 16765 }, { "epoch": 4.358108108108108, "grad_norm": 0.6605596542358398, "learning_rate": 4.791282544257872e-05, "loss": 0.2726, "num_input_tokens_seen": 1510568, "step": 16770 }, { "epoch": 4.359407484407485, "grad_norm": 0.33430543541908264, "learning_rate": 4.7910556986362455e-05, "loss": 0.2305, "num_input_tokens_seen": 1510968, "step": 16775 }, { "epoch": 4.360706860706861, "grad_norm": 0.2861432135105133, "learning_rate": 4.790828735183428e-05, "loss": 0.2611, "num_input_tokens_seen": 1511464, "step": 16780 }, { "epoch": 4.362006237006237, "grad_norm": 0.32685187458992004, "learning_rate": 4.790601653911094e-05, "loss": 0.2601, "num_input_tokens_seen": 1511912, "step": 16785 }, { "epoch": 4.363305613305613, "grad_norm": 0.30779287219047546, "learning_rate": 4.790374454830923e-05, "loss": 0.1362, "num_input_tokens_seen": 1512344, "step": 16790 }, { "epoch": 4.36460498960499, "grad_norm": 0.2895217537879944, "learning_rate": 4.790147137954598e-05, "loss": 0.305, "num_input_tokens_seen": 1512792, "step": 16795 }, { "epoch": 4.365904365904366, "grad_norm": 0.5020607709884644, "learning_rate": 4.7899197032938125e-05, "loss": 0.1521, "num_input_tokens_seen": 1513240, "step": 16800 }, { "epoch": 4.367203742203742, "grad_norm": 0.5231324434280396, "learning_rate": 4.7896921508602623e-05, "loss": 0.4078, "num_input_tokens_seen": 1513720, "step": 16805 }, { "epoch": 4.368503118503119, "grad_norm": 0.3204137682914734, "learning_rate": 4.7894644806656493e-05, "loss": 0.2071, "num_input_tokens_seen": 1514184, "step": 16810 }, { "epoch": 4.369802494802495, "grad_norm": 0.35205796360969543, "learning_rate": 4.789236692721686e-05, "loss": 0.2314, "num_input_tokens_seen": 1514632, "step": 16815 }, { "epoch": 4.371101871101871, "grad_norm": 0.38234245777130127, "learning_rate": 4.789008787040086e-05, "loss": 0.1856, "num_input_tokens_seen": 1515096, "step": 16820 }, { "epoch": 4.372401247401247, "grad_norm": 0.27538734674453735, "learning_rate": 4.788780763632571e-05, "loss": 0.2142, "num_input_tokens_seen": 1515544, "step": 16825 }, { "epoch": 4.373700623700624, "grad_norm": 0.3167732059955597, "learning_rate": 4.788552622510868e-05, "loss": 0.2977, "num_input_tokens_seen": 1515992, "step": 16830 }, { "epoch": 4.375, "grad_norm": 0.3098771274089813, "learning_rate": 4.788324363686711e-05, "loss": 0.2992, "num_input_tokens_seen": 1516456, "step": 16835 }, { "epoch": 4.376299376299376, "grad_norm": 0.2677319645881653, "learning_rate": 4.788095987171839e-05, "loss": 0.1776, "num_input_tokens_seen": 1516888, "step": 16840 }, { "epoch": 4.377598752598753, "grad_norm": 0.33470550179481506, "learning_rate": 4.787867492977999e-05, "loss": 0.1678, "num_input_tokens_seen": 1517336, "step": 16845 }, { "epoch": 4.378898128898129, "grad_norm": 0.22174721956253052, "learning_rate": 4.7876388811169405e-05, "loss": 0.3, "num_input_tokens_seen": 1517800, "step": 16850 }, { "epoch": 4.380197505197505, "grad_norm": 0.45608577132225037, "learning_rate": 4.787410151600423e-05, "loss": 0.3107, "num_input_tokens_seen": 1518264, "step": 16855 }, { "epoch": 4.381496881496881, "grad_norm": 0.4141707122325897, "learning_rate": 4.78718130444021e-05, "loss": 0.2787, "num_input_tokens_seen": 1518776, "step": 16860 }, { "epoch": 4.382796257796258, "grad_norm": 0.3337297737598419, "learning_rate": 4.786952339648071e-05, "loss": 0.2945, "num_input_tokens_seen": 1519240, "step": 16865 }, { "epoch": 4.384095634095634, "grad_norm": 0.37027451395988464, "learning_rate": 4.786723257235781e-05, "loss": 0.2334, "num_input_tokens_seen": 1519704, "step": 16870 }, { "epoch": 4.38539501039501, "grad_norm": 0.37350961565971375, "learning_rate": 4.786494057215124e-05, "loss": 0.2259, "num_input_tokens_seen": 1520152, "step": 16875 }, { "epoch": 4.386694386694387, "grad_norm": 0.2905241847038269, "learning_rate": 4.786264739597886e-05, "loss": 0.1402, "num_input_tokens_seen": 1520584, "step": 16880 }, { "epoch": 4.387993762993763, "grad_norm": 0.27777501940727234, "learning_rate": 4.786035304395862e-05, "loss": 0.3021, "num_input_tokens_seen": 1521032, "step": 16885 }, { "epoch": 4.389293139293139, "grad_norm": 0.472105473279953, "learning_rate": 4.785805751620852e-05, "loss": 0.3755, "num_input_tokens_seen": 1521496, "step": 16890 }, { "epoch": 4.390592515592515, "grad_norm": 0.3657895624637604, "learning_rate": 4.785576081284663e-05, "loss": 0.2244, "num_input_tokens_seen": 1521896, "step": 16895 }, { "epoch": 4.391891891891892, "grad_norm": 0.2748298645019531, "learning_rate": 4.7853462933991045e-05, "loss": 0.1832, "num_input_tokens_seen": 1522344, "step": 16900 }, { "epoch": 4.393191268191268, "grad_norm": 0.9075926542282104, "learning_rate": 4.7851163879759975e-05, "loss": 0.3454, "num_input_tokens_seen": 1522840, "step": 16905 }, { "epoch": 4.394490644490644, "grad_norm": 0.37421733140945435, "learning_rate": 4.7848863650271646e-05, "loss": 0.3308, "num_input_tokens_seen": 1523256, "step": 16910 }, { "epoch": 4.395790020790021, "grad_norm": 0.3248263895511627, "learning_rate": 4.784656224564438e-05, "loss": 0.2017, "num_input_tokens_seen": 1523720, "step": 16915 }, { "epoch": 4.397089397089397, "grad_norm": 0.3342113196849823, "learning_rate": 4.784425966599651e-05, "loss": 0.1961, "num_input_tokens_seen": 1524168, "step": 16920 }, { "epoch": 4.398388773388773, "grad_norm": 0.545335590839386, "learning_rate": 4.7841955911446487e-05, "loss": 0.4031, "num_input_tokens_seen": 1524648, "step": 16925 }, { "epoch": 4.399688149688149, "grad_norm": 0.3471650183200836, "learning_rate": 4.783965098211278e-05, "loss": 0.1663, "num_input_tokens_seen": 1525080, "step": 16930 }, { "epoch": 4.400987525987526, "grad_norm": 0.3792141377925873, "learning_rate": 4.7837344878113944e-05, "loss": 0.2312, "num_input_tokens_seen": 1525528, "step": 16935 }, { "epoch": 4.402286902286902, "grad_norm": 0.2513134181499481, "learning_rate": 4.783503759956858e-05, "loss": 0.3788, "num_input_tokens_seen": 1525992, "step": 16940 }, { "epoch": 4.403586278586278, "grad_norm": 0.2431616634130478, "learning_rate": 4.783272914659535e-05, "loss": 0.2554, "num_input_tokens_seen": 1526424, "step": 16945 }, { "epoch": 4.404885654885655, "grad_norm": 0.29520466923713684, "learning_rate": 4.783041951931297e-05, "loss": 0.2329, "num_input_tokens_seen": 1526872, "step": 16950 }, { "epoch": 4.406185031185031, "grad_norm": 0.638508141040802, "learning_rate": 4.7828108717840256e-05, "loss": 0.3028, "num_input_tokens_seen": 1527320, "step": 16955 }, { "epoch": 4.407484407484407, "grad_norm": 0.3310997188091278, "learning_rate": 4.782579674229604e-05, "loss": 0.2255, "num_input_tokens_seen": 1527768, "step": 16960 }, { "epoch": 4.408783783783784, "grad_norm": 0.34479284286499023, "learning_rate": 4.782348359279922e-05, "loss": 0.2279, "num_input_tokens_seen": 1528216, "step": 16965 }, { "epoch": 4.41008316008316, "grad_norm": 0.2979392111301422, "learning_rate": 4.782116926946877e-05, "loss": 0.2151, "num_input_tokens_seen": 1528664, "step": 16970 }, { "epoch": 4.411382536382536, "grad_norm": 0.31595978140830994, "learning_rate": 4.781885377242372e-05, "loss": 0.3549, "num_input_tokens_seen": 1529096, "step": 16975 }, { "epoch": 4.412681912681912, "grad_norm": 0.3262927532196045, "learning_rate": 4.781653710178315e-05, "loss": 0.1826, "num_input_tokens_seen": 1529528, "step": 16980 }, { "epoch": 4.413981288981289, "grad_norm": 0.34547245502471924, "learning_rate": 4.781421925766623e-05, "loss": 0.2603, "num_input_tokens_seen": 1529944, "step": 16985 }, { "epoch": 4.415280665280665, "grad_norm": 0.29362744092941284, "learning_rate": 4.781190024019214e-05, "loss": 0.2443, "num_input_tokens_seen": 1530376, "step": 16990 }, { "epoch": 4.416580041580041, "grad_norm": 0.32093775272369385, "learning_rate": 4.7809580049480164e-05, "loss": 0.2345, "num_input_tokens_seen": 1530840, "step": 16995 }, { "epoch": 4.417879417879418, "grad_norm": 0.4214405417442322, "learning_rate": 4.7807258685649634e-05, "loss": 0.2494, "num_input_tokens_seen": 1531304, "step": 17000 }, { "epoch": 4.419178794178794, "grad_norm": 0.32216036319732666, "learning_rate": 4.780493614881995e-05, "loss": 0.3262, "num_input_tokens_seen": 1531768, "step": 17005 }, { "epoch": 4.42047817047817, "grad_norm": 0.3273129463195801, "learning_rate": 4.780261243911053e-05, "loss": 0.1914, "num_input_tokens_seen": 1532200, "step": 17010 }, { "epoch": 4.421777546777546, "grad_norm": 0.3277895450592041, "learning_rate": 4.780028755664091e-05, "loss": 0.2386, "num_input_tokens_seen": 1532664, "step": 17015 }, { "epoch": 4.423076923076923, "grad_norm": 0.38533738255500793, "learning_rate": 4.779796150153065e-05, "loss": 0.2606, "num_input_tokens_seen": 1533144, "step": 17020 }, { "epoch": 4.424376299376299, "grad_norm": 1.4264655113220215, "learning_rate": 4.77956342738994e-05, "loss": 0.2409, "num_input_tokens_seen": 1533608, "step": 17025 }, { "epoch": 4.425675675675675, "grad_norm": 0.3993496596813202, "learning_rate": 4.7793305873866825e-05, "loss": 0.3651, "num_input_tokens_seen": 1534008, "step": 17030 }, { "epoch": 4.426975051975052, "grad_norm": 0.6423736214637756, "learning_rate": 4.779097630155269e-05, "loss": 0.2284, "num_input_tokens_seen": 1534504, "step": 17035 }, { "epoch": 4.428274428274428, "grad_norm": 0.517410933971405, "learning_rate": 4.778864555707681e-05, "loss": 0.2212, "num_input_tokens_seen": 1534936, "step": 17040 }, { "epoch": 4.4295738045738045, "grad_norm": 0.6952757835388184, "learning_rate": 4.778631364055906e-05, "loss": 0.2878, "num_input_tokens_seen": 1535400, "step": 17045 }, { "epoch": 4.4308731808731805, "grad_norm": 0.3200521469116211, "learning_rate": 4.778398055211936e-05, "loss": 0.2427, "num_input_tokens_seen": 1535896, "step": 17050 }, { "epoch": 4.432172557172557, "grad_norm": 0.8138808608055115, "learning_rate": 4.778164629187771e-05, "loss": 0.3627, "num_input_tokens_seen": 1536360, "step": 17055 }, { "epoch": 4.4334719334719335, "grad_norm": 0.5771310329437256, "learning_rate": 4.777931085995416e-05, "loss": 0.2855, "num_input_tokens_seen": 1536808, "step": 17060 }, { "epoch": 4.4347713097713095, "grad_norm": 0.6714667677879333, "learning_rate": 4.777697425646883e-05, "loss": 0.289, "num_input_tokens_seen": 1537272, "step": 17065 }, { "epoch": 4.436070686070686, "grad_norm": 0.20815935730934143, "learning_rate": 4.7774636481541897e-05, "loss": 0.2882, "num_input_tokens_seen": 1537752, "step": 17070 }, { "epoch": 4.4373700623700625, "grad_norm": 0.497113436460495, "learning_rate": 4.777229753529357e-05, "loss": 0.28, "num_input_tokens_seen": 1538184, "step": 17075 }, { "epoch": 4.4386694386694385, "grad_norm": 0.24814610183238983, "learning_rate": 4.776995741784418e-05, "loss": 0.2581, "num_input_tokens_seen": 1538648, "step": 17080 }, { "epoch": 4.439968814968815, "grad_norm": 0.3564527630805969, "learning_rate": 4.776761612931405e-05, "loss": 0.2059, "num_input_tokens_seen": 1539096, "step": 17085 }, { "epoch": 4.4412681912681915, "grad_norm": 0.42248061299324036, "learning_rate": 4.776527366982362e-05, "loss": 0.3009, "num_input_tokens_seen": 1539544, "step": 17090 }, { "epoch": 4.4425675675675675, "grad_norm": 0.3230780363082886, "learning_rate": 4.776293003949335e-05, "loss": 0.1644, "num_input_tokens_seen": 1540024, "step": 17095 }, { "epoch": 4.443866943866944, "grad_norm": 0.41295355558395386, "learning_rate": 4.776058523844377e-05, "loss": 0.3835, "num_input_tokens_seen": 1540488, "step": 17100 }, { "epoch": 4.4451663201663205, "grad_norm": 0.23537293076515198, "learning_rate": 4.775823926679548e-05, "loss": 0.3003, "num_input_tokens_seen": 1540936, "step": 17105 }, { "epoch": 4.446465696465697, "grad_norm": 0.19949796795845032, "learning_rate": 4.775589212466915e-05, "loss": 0.2657, "num_input_tokens_seen": 1541368, "step": 17110 }, { "epoch": 4.447765072765073, "grad_norm": 0.22524839639663696, "learning_rate": 4.775354381218548e-05, "loss": 0.2183, "num_input_tokens_seen": 1541784, "step": 17115 }, { "epoch": 4.4490644490644495, "grad_norm": 0.2999807298183441, "learning_rate": 4.775119432946525e-05, "loss": 0.2446, "num_input_tokens_seen": 1542216, "step": 17120 }, { "epoch": 4.450363825363826, "grad_norm": 0.28902965784072876, "learning_rate": 4.7748843676629295e-05, "loss": 0.1847, "num_input_tokens_seen": 1542664, "step": 17125 }, { "epoch": 4.451663201663202, "grad_norm": 0.34327638149261475, "learning_rate": 4.7746491853798514e-05, "loss": 0.2812, "num_input_tokens_seen": 1543112, "step": 17130 }, { "epoch": 4.452962577962578, "grad_norm": 0.28649964928627014, "learning_rate": 4.774413886109386e-05, "loss": 0.22, "num_input_tokens_seen": 1543576, "step": 17135 }, { "epoch": 4.454261954261955, "grad_norm": 0.28599926829338074, "learning_rate": 4.774178469863636e-05, "loss": 0.1136, "num_input_tokens_seen": 1544024, "step": 17140 }, { "epoch": 4.455561330561331, "grad_norm": 0.5017053484916687, "learning_rate": 4.7739429366547076e-05, "loss": 0.2181, "num_input_tokens_seen": 1544488, "step": 17145 }, { "epoch": 4.456860706860707, "grad_norm": 0.21766959130764008, "learning_rate": 4.773707286494715e-05, "loss": 0.3454, "num_input_tokens_seen": 1544984, "step": 17150 }, { "epoch": 4.458160083160083, "grad_norm": 0.2404458075761795, "learning_rate": 4.773471519395778e-05, "loss": 0.2137, "num_input_tokens_seen": 1545432, "step": 17155 }, { "epoch": 4.45945945945946, "grad_norm": 0.28298020362854004, "learning_rate": 4.7732356353700234e-05, "loss": 0.2703, "num_input_tokens_seen": 1545912, "step": 17160 }, { "epoch": 4.460758835758836, "grad_norm": 0.230453222990036, "learning_rate": 4.7729996344295814e-05, "loss": 0.27, "num_input_tokens_seen": 1546328, "step": 17165 }, { "epoch": 4.462058212058212, "grad_norm": 0.26464197039604187, "learning_rate": 4.77276351658659e-05, "loss": 0.2105, "num_input_tokens_seen": 1546776, "step": 17170 }, { "epoch": 4.463357588357589, "grad_norm": 0.28785577416419983, "learning_rate": 4.7725272818531934e-05, "loss": 0.2205, "num_input_tokens_seen": 1547256, "step": 17175 }, { "epoch": 4.464656964656965, "grad_norm": 0.22148333489894867, "learning_rate": 4.772290930241542e-05, "loss": 0.218, "num_input_tokens_seen": 1547720, "step": 17180 }, { "epoch": 4.465956340956341, "grad_norm": 0.5206027626991272, "learning_rate": 4.77205446176379e-05, "loss": 0.3562, "num_input_tokens_seen": 1548216, "step": 17185 }, { "epoch": 4.467255717255718, "grad_norm": 0.23539945483207703, "learning_rate": 4.7718178764321006e-05, "loss": 0.2623, "num_input_tokens_seen": 1548664, "step": 17190 }, { "epoch": 4.468555093555094, "grad_norm": 0.4770796597003937, "learning_rate": 4.7715811742586404e-05, "loss": 0.3693, "num_input_tokens_seen": 1549112, "step": 17195 }, { "epoch": 4.46985446985447, "grad_norm": 0.2750793993473053, "learning_rate": 4.7713443552555845e-05, "loss": 0.27, "num_input_tokens_seen": 1549528, "step": 17200 }, { "epoch": 4.471153846153846, "grad_norm": 0.5297673344612122, "learning_rate": 4.771107419435112e-05, "loss": 0.2462, "num_input_tokens_seen": 1549960, "step": 17205 }, { "epoch": 4.472453222453223, "grad_norm": 0.4783867597579956, "learning_rate": 4.770870366809409e-05, "loss": 0.2771, "num_input_tokens_seen": 1550408, "step": 17210 }, { "epoch": 4.473752598752599, "grad_norm": 0.35166430473327637, "learning_rate": 4.7706331973906664e-05, "loss": 0.2096, "num_input_tokens_seen": 1550840, "step": 17215 }, { "epoch": 4.475051975051975, "grad_norm": 0.40608036518096924, "learning_rate": 4.770395911191084e-05, "loss": 0.2241, "num_input_tokens_seen": 1551272, "step": 17220 }, { "epoch": 4.476351351351352, "grad_norm": 0.6978062391281128, "learning_rate": 4.7701585082228634e-05, "loss": 0.2623, "num_input_tokens_seen": 1551736, "step": 17225 }, { "epoch": 4.477650727650728, "grad_norm": 0.5416643023490906, "learning_rate": 4.769920988498216e-05, "loss": 0.3304, "num_input_tokens_seen": 1552200, "step": 17230 }, { "epoch": 4.478950103950104, "grad_norm": 0.30314919352531433, "learning_rate": 4.769683352029357e-05, "loss": 0.376, "num_input_tokens_seen": 1552632, "step": 17235 }, { "epoch": 4.48024948024948, "grad_norm": 0.5090392231941223, "learning_rate": 4.769445598828509e-05, "loss": 0.2187, "num_input_tokens_seen": 1553080, "step": 17240 }, { "epoch": 4.481548856548857, "grad_norm": 0.4433571696281433, "learning_rate": 4.7692077289078996e-05, "loss": 0.2591, "num_input_tokens_seen": 1553496, "step": 17245 }, { "epoch": 4.482848232848233, "grad_norm": 0.32115939259529114, "learning_rate": 4.768969742279762e-05, "loss": 0.2318, "num_input_tokens_seen": 1553944, "step": 17250 }, { "epoch": 4.484147609147609, "grad_norm": 0.2559366822242737, "learning_rate": 4.7687316389563365e-05, "loss": 0.2168, "num_input_tokens_seen": 1554376, "step": 17255 }, { "epoch": 4.485446985446986, "grad_norm": 0.24692051112651825, "learning_rate": 4.7684934189498686e-05, "loss": 0.2628, "num_input_tokens_seen": 1554792, "step": 17260 }, { "epoch": 4.486746361746362, "grad_norm": 0.32574328780174255, "learning_rate": 4.768255082272611e-05, "loss": 0.2717, "num_input_tokens_seen": 1555240, "step": 17265 }, { "epoch": 4.488045738045738, "grad_norm": 0.3045159578323364, "learning_rate": 4.768016628936821e-05, "loss": 0.3214, "num_input_tokens_seen": 1555720, "step": 17270 }, { "epoch": 4.489345114345114, "grad_norm": 0.23173648118972778, "learning_rate": 4.7677780589547635e-05, "loss": 0.2537, "num_input_tokens_seen": 1556152, "step": 17275 }, { "epoch": 4.490644490644491, "grad_norm": 0.35869917273521423, "learning_rate": 4.7675393723387064e-05, "loss": 0.2727, "num_input_tokens_seen": 1556632, "step": 17280 }, { "epoch": 4.491943866943867, "grad_norm": 0.26418232917785645, "learning_rate": 4.767300569100928e-05, "loss": 0.2048, "num_input_tokens_seen": 1557080, "step": 17285 }, { "epoch": 4.493243243243243, "grad_norm": 0.2916033864021301, "learning_rate": 4.7670616492537076e-05, "loss": 0.2156, "num_input_tokens_seen": 1557544, "step": 17290 }, { "epoch": 4.49454261954262, "grad_norm": 0.27158570289611816, "learning_rate": 4.7668226128093354e-05, "loss": 0.1632, "num_input_tokens_seen": 1557992, "step": 17295 }, { "epoch": 4.495841995841996, "grad_norm": 0.29636889696121216, "learning_rate": 4.766583459780104e-05, "loss": 0.2174, "num_input_tokens_seen": 1558472, "step": 17300 }, { "epoch": 4.497141372141372, "grad_norm": 0.2708413302898407, "learning_rate": 4.766344190178313e-05, "loss": 0.2176, "num_input_tokens_seen": 1558920, "step": 17305 }, { "epoch": 4.498440748440748, "grad_norm": 0.5841064453125, "learning_rate": 4.766104804016269e-05, "loss": 0.2162, "num_input_tokens_seen": 1559336, "step": 17310 }, { "epoch": 4.499740124740125, "grad_norm": 0.5152611136436462, "learning_rate": 4.765865301306284e-05, "loss": 0.2774, "num_input_tokens_seen": 1559816, "step": 17315 }, { "epoch": 4.501039501039501, "grad_norm": 0.3936282992362976, "learning_rate": 4.7656256820606745e-05, "loss": 0.3013, "num_input_tokens_seen": 1560280, "step": 17320 }, { "epoch": 4.502338877338877, "grad_norm": 0.408583402633667, "learning_rate": 4.765385946291767e-05, "loss": 0.3216, "num_input_tokens_seen": 1560792, "step": 17325 }, { "epoch": 4.503638253638254, "grad_norm": 0.8346330523490906, "learning_rate": 4.7651460940118887e-05, "loss": 0.2871, "num_input_tokens_seen": 1561240, "step": 17330 }, { "epoch": 4.50493762993763, "grad_norm": 0.35737863183021545, "learning_rate": 4.764906125233377e-05, "loss": 0.2922, "num_input_tokens_seen": 1561704, "step": 17335 }, { "epoch": 4.506237006237006, "grad_norm": 0.7466405630111694, "learning_rate": 4.764666039968572e-05, "loss": 0.2946, "num_input_tokens_seen": 1562136, "step": 17340 }, { "epoch": 4.507536382536383, "grad_norm": 0.42944592237472534, "learning_rate": 4.764425838229824e-05, "loss": 0.271, "num_input_tokens_seen": 1562552, "step": 17345 }, { "epoch": 4.508835758835759, "grad_norm": 0.29850271344184875, "learning_rate": 4.7641855200294844e-05, "loss": 0.3003, "num_input_tokens_seen": 1563000, "step": 17350 }, { "epoch": 4.510135135135135, "grad_norm": 0.31344762444496155, "learning_rate": 4.763945085379915e-05, "loss": 0.2605, "num_input_tokens_seen": 1563448, "step": 17355 }, { "epoch": 4.511434511434511, "grad_norm": 0.4550086259841919, "learning_rate": 4.7637045342934795e-05, "loss": 0.2516, "num_input_tokens_seen": 1563864, "step": 17360 }, { "epoch": 4.512733887733888, "grad_norm": 0.33956295251846313, "learning_rate": 4.763463866782551e-05, "loss": 0.3071, "num_input_tokens_seen": 1564344, "step": 17365 }, { "epoch": 4.514033264033264, "grad_norm": 0.30721306800842285, "learning_rate": 4.763223082859508e-05, "loss": 0.2447, "num_input_tokens_seen": 1564776, "step": 17370 }, { "epoch": 4.51533264033264, "grad_norm": 0.4477798044681549, "learning_rate": 4.7629821825367335e-05, "loss": 0.2385, "num_input_tokens_seen": 1565208, "step": 17375 }, { "epoch": 4.516632016632016, "grad_norm": 0.5126039981842041, "learning_rate": 4.762741165826616e-05, "loss": 0.2792, "num_input_tokens_seen": 1565656, "step": 17380 }, { "epoch": 4.517931392931393, "grad_norm": 0.3807888925075531, "learning_rate": 4.762500032741553e-05, "loss": 0.2082, "num_input_tokens_seen": 1566104, "step": 17385 }, { "epoch": 4.519230769230769, "grad_norm": 0.3826828896999359, "learning_rate": 4.7622587832939456e-05, "loss": 0.326, "num_input_tokens_seen": 1566552, "step": 17390 }, { "epoch": 4.520530145530145, "grad_norm": 0.32408249378204346, "learning_rate": 4.7620174174962005e-05, "loss": 0.2485, "num_input_tokens_seen": 1567016, "step": 17395 }, { "epoch": 4.521829521829522, "grad_norm": 0.22261923551559448, "learning_rate": 4.7617759353607325e-05, "loss": 0.3134, "num_input_tokens_seen": 1567448, "step": 17400 }, { "epoch": 4.523128898128898, "grad_norm": 0.5221038460731506, "learning_rate": 4.761534336899962e-05, "loss": 0.3008, "num_input_tokens_seen": 1567896, "step": 17405 }, { "epoch": 4.524428274428274, "grad_norm": 0.16145363450050354, "learning_rate": 4.761292622126313e-05, "loss": 0.2618, "num_input_tokens_seen": 1568424, "step": 17410 }, { "epoch": 4.525727650727651, "grad_norm": 0.4402347207069397, "learning_rate": 4.761050791052217e-05, "loss": 0.2573, "num_input_tokens_seen": 1568856, "step": 17415 }, { "epoch": 4.527027027027027, "grad_norm": 0.46798837184906006, "learning_rate": 4.7608088436901135e-05, "loss": 0.2289, "num_input_tokens_seen": 1569320, "step": 17420 }, { "epoch": 4.528326403326403, "grad_norm": 0.27832120656967163, "learning_rate": 4.760566780052445e-05, "loss": 0.1755, "num_input_tokens_seen": 1569720, "step": 17425 }, { "epoch": 4.529625779625779, "grad_norm": 0.2630334794521332, "learning_rate": 4.760324600151661e-05, "loss": 0.2966, "num_input_tokens_seen": 1570152, "step": 17430 }, { "epoch": 4.530925155925156, "grad_norm": 0.8289205431938171, "learning_rate": 4.7600823040002164e-05, "loss": 0.3764, "num_input_tokens_seen": 1570584, "step": 17435 }, { "epoch": 4.532224532224532, "grad_norm": 0.2779191732406616, "learning_rate": 4.7598398916105736e-05, "loss": 0.354, "num_input_tokens_seen": 1571048, "step": 17440 }, { "epoch": 4.533523908523908, "grad_norm": 0.23199766874313354, "learning_rate": 4.7595973629952e-05, "loss": 0.1997, "num_input_tokens_seen": 1571480, "step": 17445 }, { "epoch": 4.534823284823284, "grad_norm": 0.23107393085956573, "learning_rate": 4.759354718166569e-05, "loss": 0.3092, "num_input_tokens_seen": 1571912, "step": 17450 }, { "epoch": 4.536122661122661, "grad_norm": 0.39368966221809387, "learning_rate": 4.75911195713716e-05, "loss": 0.2126, "num_input_tokens_seen": 1572360, "step": 17455 }, { "epoch": 4.537422037422037, "grad_norm": 0.3511519432067871, "learning_rate": 4.7588690799194586e-05, "loss": 0.2504, "num_input_tokens_seen": 1572792, "step": 17460 }, { "epoch": 4.538721413721413, "grad_norm": 0.331350713968277, "learning_rate": 4.758626086525956e-05, "loss": 0.2504, "num_input_tokens_seen": 1573240, "step": 17465 }, { "epoch": 4.54002079002079, "grad_norm": 0.6766587495803833, "learning_rate": 4.7583829769691496e-05, "loss": 0.3045, "num_input_tokens_seen": 1573672, "step": 17470 }, { "epoch": 4.541320166320166, "grad_norm": 0.26524725556373596, "learning_rate": 4.7581397512615425e-05, "loss": 0.2203, "num_input_tokens_seen": 1574104, "step": 17475 }, { "epoch": 4.542619542619542, "grad_norm": 0.2720552682876587, "learning_rate": 4.7578964094156455e-05, "loss": 0.2581, "num_input_tokens_seen": 1574568, "step": 17480 }, { "epoch": 4.543918918918919, "grad_norm": 0.2543336749076843, "learning_rate": 4.7576529514439715e-05, "loss": 0.2203, "num_input_tokens_seen": 1574984, "step": 17485 }, { "epoch": 4.545218295218295, "grad_norm": 0.2808227241039276, "learning_rate": 4.757409377359043e-05, "loss": 0.2229, "num_input_tokens_seen": 1575400, "step": 17490 }, { "epoch": 4.546517671517671, "grad_norm": 0.2719227373600006, "learning_rate": 4.757165687173388e-05, "loss": 0.2694, "num_input_tokens_seen": 1575832, "step": 17495 }, { "epoch": 4.547817047817047, "grad_norm": 0.5865762233734131, "learning_rate": 4.756921880899539e-05, "loss": 0.324, "num_input_tokens_seen": 1576296, "step": 17500 }, { "epoch": 4.549116424116424, "grad_norm": 0.2059197574853897, "learning_rate": 4.7566779585500347e-05, "loss": 0.279, "num_input_tokens_seen": 1576744, "step": 17505 }, { "epoch": 4.5504158004158, "grad_norm": 0.32276299595832825, "learning_rate": 4.756433920137421e-05, "loss": 0.1626, "num_input_tokens_seen": 1577192, "step": 17510 }, { "epoch": 4.5517151767151764, "grad_norm": 0.49989357590675354, "learning_rate": 4.756189765674249e-05, "loss": 0.3062, "num_input_tokens_seen": 1577640, "step": 17515 }, { "epoch": 4.553014553014553, "grad_norm": 0.18908970057964325, "learning_rate": 4.755945495173075e-05, "loss": 0.2831, "num_input_tokens_seen": 1578072, "step": 17520 }, { "epoch": 4.554313929313929, "grad_norm": 0.27659115195274353, "learning_rate": 4.755701108646463e-05, "loss": 0.1907, "num_input_tokens_seen": 1578536, "step": 17525 }, { "epoch": 4.5556133056133055, "grad_norm": 0.21179820597171783, "learning_rate": 4.7554566061069816e-05, "loss": 0.2813, "num_input_tokens_seen": 1578968, "step": 17530 }, { "epoch": 4.5569126819126815, "grad_norm": 0.20770955085754395, "learning_rate": 4.755211987567206e-05, "loss": 0.2226, "num_input_tokens_seen": 1579400, "step": 17535 }, { "epoch": 4.558212058212058, "grad_norm": 0.23754213750362396, "learning_rate": 4.754967253039717e-05, "loss": 0.1769, "num_input_tokens_seen": 1579816, "step": 17540 }, { "epoch": 4.5595114345114345, "grad_norm": 0.2568390965461731, "learning_rate": 4.754722402537102e-05, "loss": 0.2194, "num_input_tokens_seen": 1580264, "step": 17545 }, { "epoch": 4.5608108108108105, "grad_norm": 0.24619174003601074, "learning_rate": 4.7544774360719526e-05, "loss": 0.4212, "num_input_tokens_seen": 1580728, "step": 17550 }, { "epoch": 4.5621101871101875, "grad_norm": 0.2548225224018097, "learning_rate": 4.754232353656869e-05, "loss": 0.2158, "num_input_tokens_seen": 1581128, "step": 17555 }, { "epoch": 4.5634095634095635, "grad_norm": 0.2061701864004135, "learning_rate": 4.753987155304456e-05, "loss": 0.2818, "num_input_tokens_seen": 1581544, "step": 17560 }, { "epoch": 4.5647089397089395, "grad_norm": 0.28881531953811646, "learning_rate": 4.753741841027323e-05, "loss": 0.2637, "num_input_tokens_seen": 1581960, "step": 17565 }, { "epoch": 4.5660083160083165, "grad_norm": 0.1773272156715393, "learning_rate": 4.753496410838088e-05, "loss": 0.2775, "num_input_tokens_seen": 1582424, "step": 17570 }, { "epoch": 4.5673076923076925, "grad_norm": 0.30507901310920715, "learning_rate": 4.753250864749373e-05, "loss": 0.2526, "num_input_tokens_seen": 1582888, "step": 17575 }, { "epoch": 4.5686070686070686, "grad_norm": 0.31248438358306885, "learning_rate": 4.753005202773808e-05, "loss": 0.2311, "num_input_tokens_seen": 1583368, "step": 17580 }, { "epoch": 4.569906444906445, "grad_norm": 0.24294444918632507, "learning_rate": 4.7527594249240264e-05, "loss": 0.1545, "num_input_tokens_seen": 1583832, "step": 17585 }, { "epoch": 4.5712058212058215, "grad_norm": 0.22620807588100433, "learning_rate": 4.752513531212669e-05, "loss": 0.3523, "num_input_tokens_seen": 1584296, "step": 17590 }, { "epoch": 4.572505197505198, "grad_norm": 0.23415036499500275, "learning_rate": 4.7522675216523824e-05, "loss": 0.3238, "num_input_tokens_seen": 1584728, "step": 17595 }, { "epoch": 4.573804573804574, "grad_norm": 0.25163012742996216, "learning_rate": 4.7520213962558194e-05, "loss": 0.2598, "num_input_tokens_seen": 1585192, "step": 17600 }, { "epoch": 4.57510395010395, "grad_norm": 0.18800170719623566, "learning_rate": 4.751775155035637e-05, "loss": 0.332, "num_input_tokens_seen": 1585640, "step": 17605 }, { "epoch": 4.576403326403327, "grad_norm": 0.30976247787475586, "learning_rate": 4.751528798004502e-05, "loss": 0.2502, "num_input_tokens_seen": 1586072, "step": 17610 }, { "epoch": 4.577702702702703, "grad_norm": 0.3345116078853607, "learning_rate": 4.7512823251750836e-05, "loss": 0.2166, "num_input_tokens_seen": 1586568, "step": 17615 }, { "epoch": 4.579002079002079, "grad_norm": 0.30214694142341614, "learning_rate": 4.7510357365600576e-05, "loss": 0.248, "num_input_tokens_seen": 1587032, "step": 17620 }, { "epoch": 4.580301455301456, "grad_norm": 0.23995868861675262, "learning_rate": 4.750789032172107e-05, "loss": 0.1873, "num_input_tokens_seen": 1587496, "step": 17625 }, { "epoch": 4.581600831600832, "grad_norm": 0.22692809998989105, "learning_rate": 4.750542212023921e-05, "loss": 0.2635, "num_input_tokens_seen": 1587928, "step": 17630 }, { "epoch": 4.582900207900208, "grad_norm": 0.23388203978538513, "learning_rate": 4.750295276128191e-05, "loss": 0.3153, "num_input_tokens_seen": 1588408, "step": 17635 }, { "epoch": 4.584199584199585, "grad_norm": 0.228934645652771, "learning_rate": 4.7500482244976193e-05, "loss": 0.1187, "num_input_tokens_seen": 1588840, "step": 17640 }, { "epoch": 4.585498960498961, "grad_norm": 0.2350911796092987, "learning_rate": 4.7498010571449117e-05, "loss": 0.3031, "num_input_tokens_seen": 1589272, "step": 17645 }, { "epoch": 4.586798336798337, "grad_norm": 0.21002589166164398, "learning_rate": 4.74955377408278e-05, "loss": 0.1147, "num_input_tokens_seen": 1589688, "step": 17650 }, { "epoch": 4.588097713097713, "grad_norm": 0.3113340437412262, "learning_rate": 4.749306375323943e-05, "loss": 0.3048, "num_input_tokens_seen": 1590120, "step": 17655 }, { "epoch": 4.58939708939709, "grad_norm": 0.27885502576828003, "learning_rate": 4.749058860881123e-05, "loss": 0.2489, "num_input_tokens_seen": 1590584, "step": 17660 }, { "epoch": 4.590696465696466, "grad_norm": 0.20403717458248138, "learning_rate": 4.748811230767051e-05, "loss": 0.2094, "num_input_tokens_seen": 1591048, "step": 17665 }, { "epoch": 4.591995841995842, "grad_norm": 0.22585046291351318, "learning_rate": 4.748563484994463e-05, "loss": 0.3648, "num_input_tokens_seen": 1591496, "step": 17670 }, { "epoch": 4.593295218295218, "grad_norm": 0.3196600079536438, "learning_rate": 4.748315623576101e-05, "loss": 0.3129, "num_input_tokens_seen": 1591992, "step": 17675 }, { "epoch": 4.594594594594595, "grad_norm": 0.5656000971794128, "learning_rate": 4.748067646524711e-05, "loss": 0.2694, "num_input_tokens_seen": 1592456, "step": 17680 }, { "epoch": 4.595893970893971, "grad_norm": 0.4864625334739685, "learning_rate": 4.747819553853049e-05, "loss": 0.3127, "num_input_tokens_seen": 1592904, "step": 17685 }, { "epoch": 4.597193347193347, "grad_norm": 0.6561311483383179, "learning_rate": 4.7475713455738724e-05, "loss": 0.2568, "num_input_tokens_seen": 1593320, "step": 17690 }, { "epoch": 4.598492723492724, "grad_norm": 0.305232971906662, "learning_rate": 4.747323021699949e-05, "loss": 0.2816, "num_input_tokens_seen": 1593784, "step": 17695 }, { "epoch": 4.5997920997921, "grad_norm": 0.6586288213729858, "learning_rate": 4.7470745822440487e-05, "loss": 0.2626, "num_input_tokens_seen": 1594232, "step": 17700 }, { "epoch": 4.601091476091476, "grad_norm": 0.2105463445186615, "learning_rate": 4.74682602721895e-05, "loss": 0.2886, "num_input_tokens_seen": 1594680, "step": 17705 }, { "epoch": 4.602390852390853, "grad_norm": 0.21347641944885254, "learning_rate": 4.746577356637436e-05, "loss": 0.239, "num_input_tokens_seen": 1595096, "step": 17710 }, { "epoch": 4.603690228690229, "grad_norm": 0.3628983795642853, "learning_rate": 4.746328570512296e-05, "loss": 0.2036, "num_input_tokens_seen": 1595512, "step": 17715 }, { "epoch": 4.604989604989605, "grad_norm": 0.2906492054462433, "learning_rate": 4.746079668856325e-05, "loss": 0.2529, "num_input_tokens_seen": 1595960, "step": 17720 }, { "epoch": 4.606288981288982, "grad_norm": 0.284280925989151, "learning_rate": 4.745830651682325e-05, "loss": 0.2168, "num_input_tokens_seen": 1596392, "step": 17725 }, { "epoch": 4.607588357588358, "grad_norm": 0.26397067308425903, "learning_rate": 4.745581519003102e-05, "loss": 0.203, "num_input_tokens_seen": 1596872, "step": 17730 }, { "epoch": 4.608887733887734, "grad_norm": 0.43832293152809143, "learning_rate": 4.74533227083147e-05, "loss": 0.1605, "num_input_tokens_seen": 1597320, "step": 17735 }, { "epoch": 4.61018711018711, "grad_norm": 0.24260906875133514, "learning_rate": 4.7450829071802484e-05, "loss": 0.2889, "num_input_tokens_seen": 1597768, "step": 17740 }, { "epoch": 4.611486486486487, "grad_norm": 0.2176164835691452, "learning_rate": 4.744833428062262e-05, "loss": 0.1463, "num_input_tokens_seen": 1598184, "step": 17745 }, { "epoch": 4.612785862785863, "grad_norm": 0.33443593978881836, "learning_rate": 4.74458383349034e-05, "loss": 0.2147, "num_input_tokens_seen": 1598616, "step": 17750 }, { "epoch": 4.614085239085239, "grad_norm": 0.4696332812309265, "learning_rate": 4.744334123477322e-05, "loss": 0.4283, "num_input_tokens_seen": 1599032, "step": 17755 }, { "epoch": 4.615384615384615, "grad_norm": 0.7205445766448975, "learning_rate": 4.7440842980360503e-05, "loss": 0.1265, "num_input_tokens_seen": 1599464, "step": 17760 }, { "epoch": 4.616683991683992, "grad_norm": 0.2630017399787903, "learning_rate": 4.743834357179372e-05, "loss": 0.2603, "num_input_tokens_seen": 1599880, "step": 17765 }, { "epoch": 4.617983367983368, "grad_norm": 0.23099873960018158, "learning_rate": 4.743584300920142e-05, "loss": 0.3138, "num_input_tokens_seen": 1600344, "step": 17770 }, { "epoch": 4.619282744282744, "grad_norm": 0.47103092074394226, "learning_rate": 4.743334129271223e-05, "loss": 0.2189, "num_input_tokens_seen": 1600792, "step": 17775 }, { "epoch": 4.620582120582121, "grad_norm": 0.2139245867729187, "learning_rate": 4.74308384224548e-05, "loss": 0.2568, "num_input_tokens_seen": 1601224, "step": 17780 }, { "epoch": 4.621881496881497, "grad_norm": 0.31073418259620667, "learning_rate": 4.7428334398557856e-05, "loss": 0.2415, "num_input_tokens_seen": 1601720, "step": 17785 }, { "epoch": 4.623180873180873, "grad_norm": 0.28920337557792664, "learning_rate": 4.742582922115019e-05, "loss": 0.2481, "num_input_tokens_seen": 1602152, "step": 17790 }, { "epoch": 4.62448024948025, "grad_norm": 0.2563111186027527, "learning_rate": 4.7423322890360625e-05, "loss": 0.2195, "num_input_tokens_seen": 1602600, "step": 17795 }, { "epoch": 4.625779625779626, "grad_norm": 0.23324419558048248, "learning_rate": 4.7420815406318086e-05, "loss": 0.1746, "num_input_tokens_seen": 1603032, "step": 17800 }, { "epoch": 4.627079002079002, "grad_norm": 0.22065553069114685, "learning_rate": 4.741830676915153e-05, "loss": 0.1122, "num_input_tokens_seen": 1603464, "step": 17805 }, { "epoch": 4.628378378378378, "grad_norm": 0.22695907950401306, "learning_rate": 4.741579697898998e-05, "loss": 0.2558, "num_input_tokens_seen": 1603928, "step": 17810 }, { "epoch": 4.629677754677755, "grad_norm": 0.18881142139434814, "learning_rate": 4.741328603596251e-05, "loss": 0.2511, "num_input_tokens_seen": 1604344, "step": 17815 }, { "epoch": 4.630977130977131, "grad_norm": 0.20582111179828644, "learning_rate": 4.741077394019826e-05, "loss": 0.1483, "num_input_tokens_seen": 1604744, "step": 17820 }, { "epoch": 4.632276507276507, "grad_norm": 0.21923291683197021, "learning_rate": 4.740826069182645e-05, "loss": 0.1385, "num_input_tokens_seen": 1605208, "step": 17825 }, { "epoch": 4.633575883575883, "grad_norm": 0.42843684554100037, "learning_rate": 4.74057462909763e-05, "loss": 0.2708, "num_input_tokens_seen": 1605656, "step": 17830 }, { "epoch": 4.63487525987526, "grad_norm": 0.4002547264099121, "learning_rate": 4.740323073777716e-05, "loss": 0.3964, "num_input_tokens_seen": 1606104, "step": 17835 }, { "epoch": 4.636174636174636, "grad_norm": 0.371703565120697, "learning_rate": 4.740071403235839e-05, "loss": 0.2662, "num_input_tokens_seen": 1606536, "step": 17840 }, { "epoch": 4.637474012474012, "grad_norm": 0.34078970551490784, "learning_rate": 4.739819617484945e-05, "loss": 0.2651, "num_input_tokens_seen": 1607000, "step": 17845 }, { "epoch": 4.638773388773389, "grad_norm": 0.2621719241142273, "learning_rate": 4.739567716537981e-05, "loss": 0.1373, "num_input_tokens_seen": 1607432, "step": 17850 }, { "epoch": 4.640072765072765, "grad_norm": 0.25960057973861694, "learning_rate": 4.7393157004079034e-05, "loss": 0.1696, "num_input_tokens_seen": 1607896, "step": 17855 }, { "epoch": 4.641372141372141, "grad_norm": 0.35646501183509827, "learning_rate": 4.7390635691076735e-05, "loss": 0.2606, "num_input_tokens_seen": 1608376, "step": 17860 }, { "epoch": 4.642671517671518, "grad_norm": 0.3927176296710968, "learning_rate": 4.73881132265026e-05, "loss": 0.2574, "num_input_tokens_seen": 1608824, "step": 17865 }, { "epoch": 4.643970893970894, "grad_norm": 0.27973315119743347, "learning_rate": 4.738558961048634e-05, "loss": 0.2687, "num_input_tokens_seen": 1609320, "step": 17870 }, { "epoch": 4.64527027027027, "grad_norm": 0.22421185672283173, "learning_rate": 4.7383064843157757e-05, "loss": 0.148, "num_input_tokens_seen": 1609784, "step": 17875 }, { "epoch": 4.646569646569646, "grad_norm": 0.2620384097099304, "learning_rate": 4.7380538924646714e-05, "loss": 0.3869, "num_input_tokens_seen": 1610232, "step": 17880 }, { "epoch": 4.647869022869023, "grad_norm": 0.3092893958091736, "learning_rate": 4.737801185508309e-05, "loss": 0.1823, "num_input_tokens_seen": 1610648, "step": 17885 }, { "epoch": 4.649168399168399, "grad_norm": 0.24025090038776398, "learning_rate": 4.7375483634596895e-05, "loss": 0.3228, "num_input_tokens_seen": 1611096, "step": 17890 }, { "epoch": 4.650467775467775, "grad_norm": 0.3784443140029907, "learning_rate": 4.7372954263318124e-05, "loss": 0.2299, "num_input_tokens_seen": 1611560, "step": 17895 }, { "epoch": 4.651767151767151, "grad_norm": 0.3837445378303528, "learning_rate": 4.737042374137689e-05, "loss": 0.2727, "num_input_tokens_seen": 1611960, "step": 17900 }, { "epoch": 4.653066528066528, "grad_norm": 0.40858423709869385, "learning_rate": 4.736789206890332e-05, "loss": 0.2781, "num_input_tokens_seen": 1612360, "step": 17905 }, { "epoch": 4.654365904365904, "grad_norm": 0.35834023356437683, "learning_rate": 4.736535924602763e-05, "loss": 0.2543, "num_input_tokens_seen": 1612808, "step": 17910 }, { "epoch": 4.65566528066528, "grad_norm": 0.6372369527816772, "learning_rate": 4.736282527288008e-05, "loss": 0.2606, "num_input_tokens_seen": 1613240, "step": 17915 }, { "epoch": 4.656964656964657, "grad_norm": 0.3469238579273224, "learning_rate": 4.736029014959101e-05, "loss": 0.2292, "num_input_tokens_seen": 1613736, "step": 17920 }, { "epoch": 4.658264033264033, "grad_norm": 0.27764248847961426, "learning_rate": 4.735775387629079e-05, "loss": 0.289, "num_input_tokens_seen": 1614200, "step": 17925 }, { "epoch": 4.659563409563409, "grad_norm": 0.25028812885284424, "learning_rate": 4.735521645310986e-05, "loss": 0.1775, "num_input_tokens_seen": 1614648, "step": 17930 }, { "epoch": 4.660862785862786, "grad_norm": 0.24540778994560242, "learning_rate": 4.735267788017874e-05, "loss": 0.2503, "num_input_tokens_seen": 1615080, "step": 17935 }, { "epoch": 4.662162162162162, "grad_norm": 0.27207091450691223, "learning_rate": 4.735013815762796e-05, "loss": 0.2104, "num_input_tokens_seen": 1615512, "step": 17940 }, { "epoch": 4.663461538461538, "grad_norm": 0.6338043808937073, "learning_rate": 4.734759728558819e-05, "loss": 0.3016, "num_input_tokens_seen": 1615944, "step": 17945 }, { "epoch": 4.664760914760915, "grad_norm": 0.26335904002189636, "learning_rate": 4.734505526419005e-05, "loss": 0.37, "num_input_tokens_seen": 1616392, "step": 17950 }, { "epoch": 4.666060291060291, "grad_norm": 0.30707937479019165, "learning_rate": 4.734251209356432e-05, "loss": 0.2228, "num_input_tokens_seen": 1616824, "step": 17955 }, { "epoch": 4.667359667359667, "grad_norm": 0.24773792922496796, "learning_rate": 4.7339967773841786e-05, "loss": 0.2181, "num_input_tokens_seen": 1617272, "step": 17960 }, { "epoch": 4.668659043659043, "grad_norm": 0.3155944347381592, "learning_rate": 4.73374223051533e-05, "loss": 0.2071, "num_input_tokens_seen": 1617736, "step": 17965 }, { "epoch": 4.66995841995842, "grad_norm": 0.31142884492874146, "learning_rate": 4.733487568762979e-05, "loss": 0.352, "num_input_tokens_seen": 1618184, "step": 17970 }, { "epoch": 4.671257796257796, "grad_norm": 0.29960718750953674, "learning_rate": 4.733232792140221e-05, "loss": 0.2316, "num_input_tokens_seen": 1618616, "step": 17975 }, { "epoch": 4.672557172557172, "grad_norm": 0.33089134097099304, "learning_rate": 4.732977900660162e-05, "loss": 0.2126, "num_input_tokens_seen": 1619064, "step": 17980 }, { "epoch": 4.673856548856548, "grad_norm": 0.5493955016136169, "learning_rate": 4.732722894335909e-05, "loss": 0.2726, "num_input_tokens_seen": 1619528, "step": 17985 }, { "epoch": 4.675155925155925, "grad_norm": 0.3798086941242218, "learning_rate": 4.732467773180579e-05, "loss": 0.2334, "num_input_tokens_seen": 1619976, "step": 17990 }, { "epoch": 4.676455301455301, "grad_norm": 0.3987483084201813, "learning_rate": 4.7322125372072924e-05, "loss": 0.2226, "num_input_tokens_seen": 1620440, "step": 17995 }, { "epoch": 4.6777546777546775, "grad_norm": 0.3348250389099121, "learning_rate": 4.731957186429176e-05, "loss": 0.2635, "num_input_tokens_seen": 1620936, "step": 18000 }, { "epoch": 4.679054054054054, "grad_norm": 0.2835632264614105, "learning_rate": 4.731701720859362e-05, "loss": 0.2142, "num_input_tokens_seen": 1621416, "step": 18005 }, { "epoch": 4.68035343035343, "grad_norm": 0.3219725787639618, "learning_rate": 4.731446140510991e-05, "loss": 0.0947, "num_input_tokens_seen": 1621864, "step": 18010 }, { "epoch": 4.6816528066528065, "grad_norm": 0.6781765818595886, "learning_rate": 4.731190445397207e-05, "loss": 0.3913, "num_input_tokens_seen": 1622344, "step": 18015 }, { "epoch": 4.682952182952183, "grad_norm": 0.40296486020088196, "learning_rate": 4.730934635531161e-05, "loss": 0.212, "num_input_tokens_seen": 1622808, "step": 18020 }, { "epoch": 4.6842515592515594, "grad_norm": 0.29329049587249756, "learning_rate": 4.7306787109260085e-05, "loss": 0.2517, "num_input_tokens_seen": 1623240, "step": 18025 }, { "epoch": 4.6855509355509355, "grad_norm": 0.6182325482368469, "learning_rate": 4.730422671594913e-05, "loss": 0.2451, "num_input_tokens_seen": 1623704, "step": 18030 }, { "epoch": 4.6868503118503115, "grad_norm": 0.3087700307369232, "learning_rate": 4.7301665175510416e-05, "loss": 0.3156, "num_input_tokens_seen": 1624200, "step": 18035 }, { "epoch": 4.6881496881496885, "grad_norm": 0.5422840118408203, "learning_rate": 4.72991024880757e-05, "loss": 0.3067, "num_input_tokens_seen": 1624648, "step": 18040 }, { "epoch": 4.6894490644490645, "grad_norm": 0.7568026781082153, "learning_rate": 4.729653865377678e-05, "loss": 0.3005, "num_input_tokens_seen": 1625080, "step": 18045 }, { "epoch": 4.6907484407484406, "grad_norm": 0.9852423667907715, "learning_rate": 4.729397367274551e-05, "loss": 0.28, "num_input_tokens_seen": 1625560, "step": 18050 }, { "epoch": 4.692047817047817, "grad_norm": 0.32764261960983276, "learning_rate": 4.729140754511381e-05, "loss": 0.2982, "num_input_tokens_seen": 1625992, "step": 18055 }, { "epoch": 4.6933471933471935, "grad_norm": 0.6782556772232056, "learning_rate": 4.728884027101367e-05, "loss": 0.2994, "num_input_tokens_seen": 1626456, "step": 18060 }, { "epoch": 4.69464656964657, "grad_norm": 0.3622226417064667, "learning_rate": 4.7286271850577105e-05, "loss": 0.2412, "num_input_tokens_seen": 1626920, "step": 18065 }, { "epoch": 4.695945945945946, "grad_norm": 0.4240908920764923, "learning_rate": 4.728370228393624e-05, "loss": 0.2193, "num_input_tokens_seen": 1627416, "step": 18070 }, { "epoch": 4.6972453222453225, "grad_norm": 0.2944318652153015, "learning_rate": 4.7281131571223206e-05, "loss": 0.2073, "num_input_tokens_seen": 1627832, "step": 18075 }, { "epoch": 4.698544698544699, "grad_norm": 0.7137175798416138, "learning_rate": 4.727855971257023e-05, "loss": 0.2203, "num_input_tokens_seen": 1628264, "step": 18080 }, { "epoch": 4.699844074844075, "grad_norm": 0.5731489658355713, "learning_rate": 4.727598670810958e-05, "loss": 0.3302, "num_input_tokens_seen": 1628728, "step": 18085 }, { "epoch": 4.701143451143452, "grad_norm": 0.30941489338874817, "learning_rate": 4.727341255797358e-05, "loss": 0.1687, "num_input_tokens_seen": 1629176, "step": 18090 }, { "epoch": 4.702442827442828, "grad_norm": 0.31406569480895996, "learning_rate": 4.7270837262294644e-05, "loss": 0.167, "num_input_tokens_seen": 1629640, "step": 18095 }, { "epoch": 4.703742203742204, "grad_norm": 0.36224111914634705, "learning_rate": 4.7268260821205205e-05, "loss": 0.2527, "num_input_tokens_seen": 1630104, "step": 18100 }, { "epoch": 4.70504158004158, "grad_norm": 0.774534285068512, "learning_rate": 4.7265683234837774e-05, "loss": 0.359, "num_input_tokens_seen": 1630584, "step": 18105 }, { "epoch": 4.706340956340957, "grad_norm": 0.4583212435245514, "learning_rate": 4.726310450332493e-05, "loss": 0.2101, "num_input_tokens_seen": 1631064, "step": 18110 }, { "epoch": 4.707640332640333, "grad_norm": 0.2791767418384552, "learning_rate": 4.726052462679928e-05, "loss": 0.2084, "num_input_tokens_seen": 1631544, "step": 18115 }, { "epoch": 4.708939708939709, "grad_norm": 0.36954525113105774, "learning_rate": 4.725794360539352e-05, "loss": 0.2082, "num_input_tokens_seen": 1632024, "step": 18120 }, { "epoch": 4.710239085239085, "grad_norm": 0.4811212122440338, "learning_rate": 4.7255361439240395e-05, "loss": 0.2751, "num_input_tokens_seen": 1632472, "step": 18125 }, { "epoch": 4.711538461538462, "grad_norm": 0.2976840138435364, "learning_rate": 4.725277812847271e-05, "loss": 0.2773, "num_input_tokens_seen": 1632952, "step": 18130 }, { "epoch": 4.712837837837838, "grad_norm": 0.32396721839904785, "learning_rate": 4.725019367322332e-05, "loss": 0.2591, "num_input_tokens_seen": 1633384, "step": 18135 }, { "epoch": 4.714137214137214, "grad_norm": 0.37454986572265625, "learning_rate": 4.7247608073625154e-05, "loss": 0.2338, "num_input_tokens_seen": 1633864, "step": 18140 }, { "epoch": 4.715436590436591, "grad_norm": 0.31528106331825256, "learning_rate": 4.724502132981119e-05, "loss": 0.2657, "num_input_tokens_seen": 1634328, "step": 18145 }, { "epoch": 4.716735966735967, "grad_norm": 0.26192164421081543, "learning_rate": 4.724243344191446e-05, "loss": 0.2563, "num_input_tokens_seen": 1634760, "step": 18150 }, { "epoch": 4.718035343035343, "grad_norm": 0.2985522150993347, "learning_rate": 4.7239844410068065e-05, "loss": 0.3037, "num_input_tokens_seen": 1635240, "step": 18155 }, { "epoch": 4.71933471933472, "grad_norm": 0.3036249876022339, "learning_rate": 4.7237254234405164e-05, "loss": 0.19, "num_input_tokens_seen": 1635704, "step": 18160 }, { "epoch": 4.720634095634096, "grad_norm": 0.2315470427274704, "learning_rate": 4.723466291505897e-05, "loss": 0.2546, "num_input_tokens_seen": 1636152, "step": 18165 }, { "epoch": 4.721933471933472, "grad_norm": 0.3008439540863037, "learning_rate": 4.7232070452162765e-05, "loss": 0.2673, "num_input_tokens_seen": 1636584, "step": 18170 }, { "epoch": 4.723232848232849, "grad_norm": 0.26316988468170166, "learning_rate": 4.722947684584987e-05, "loss": 0.2174, "num_input_tokens_seen": 1637048, "step": 18175 }, { "epoch": 4.724532224532225, "grad_norm": 0.2875228524208069, "learning_rate": 4.722688209625368e-05, "loss": 0.2613, "num_input_tokens_seen": 1637560, "step": 18180 }, { "epoch": 4.725831600831601, "grad_norm": 0.23547960817813873, "learning_rate": 4.722428620350765e-05, "loss": 0.3344, "num_input_tokens_seen": 1637976, "step": 18185 }, { "epoch": 4.727130977130977, "grad_norm": 0.2024010717868805, "learning_rate": 4.7221689167745286e-05, "loss": 0.2479, "num_input_tokens_seen": 1638408, "step": 18190 }, { "epoch": 4.728430353430354, "grad_norm": 0.38528865575790405, "learning_rate": 4.7219090989100155e-05, "loss": 0.2506, "num_input_tokens_seen": 1638840, "step": 18195 }, { "epoch": 4.72972972972973, "grad_norm": 0.5629755854606628, "learning_rate": 4.721649166770589e-05, "loss": 0.2984, "num_input_tokens_seen": 1639272, "step": 18200 }, { "epoch": 4.731029106029106, "grad_norm": 0.41504907608032227, "learning_rate": 4.7213891203696164e-05, "loss": 0.2102, "num_input_tokens_seen": 1639752, "step": 18205 }, { "epoch": 4.732328482328482, "grad_norm": 0.33723729848861694, "learning_rate": 4.7211289597204736e-05, "loss": 0.2509, "num_input_tokens_seen": 1640216, "step": 18210 }, { "epoch": 4.733627858627859, "grad_norm": 0.3140156865119934, "learning_rate": 4.7208686848365394e-05, "loss": 0.185, "num_input_tokens_seen": 1640680, "step": 18215 }, { "epoch": 4.734927234927235, "grad_norm": 0.2808302938938141, "learning_rate": 4.7206082957312015e-05, "loss": 0.2586, "num_input_tokens_seen": 1641128, "step": 18220 }, { "epoch": 4.736226611226611, "grad_norm": 0.2985164523124695, "learning_rate": 4.7203477924178506e-05, "loss": 0.1776, "num_input_tokens_seen": 1641608, "step": 18225 }, { "epoch": 4.737525987525988, "grad_norm": 0.22764025628566742, "learning_rate": 4.720087174909886e-05, "loss": 0.3874, "num_input_tokens_seen": 1642040, "step": 18230 }, { "epoch": 4.738825363825364, "grad_norm": 0.43379566073417664, "learning_rate": 4.7198264432207106e-05, "loss": 0.3458, "num_input_tokens_seen": 1642488, "step": 18235 }, { "epoch": 4.74012474012474, "grad_norm": 0.3712597191333771, "learning_rate": 4.7195655973637345e-05, "loss": 0.2328, "num_input_tokens_seen": 1642904, "step": 18240 }, { "epoch": 4.741424116424117, "grad_norm": 0.29607927799224854, "learning_rate": 4.719304637352373e-05, "loss": 0.2605, "num_input_tokens_seen": 1643368, "step": 18245 }, { "epoch": 4.742723492723493, "grad_norm": 0.3976448178291321, "learning_rate": 4.719043563200047e-05, "loss": 0.2689, "num_input_tokens_seen": 1643832, "step": 18250 }, { "epoch": 4.744022869022869, "grad_norm": 0.41126489639282227, "learning_rate": 4.718782374920184e-05, "loss": 0.2768, "num_input_tokens_seen": 1644296, "step": 18255 }, { "epoch": 4.745322245322245, "grad_norm": 0.3931798040866852, "learning_rate": 4.718521072526219e-05, "loss": 0.3092, "num_input_tokens_seen": 1644776, "step": 18260 }, { "epoch": 4.746621621621622, "grad_norm": 0.23840482532978058, "learning_rate": 4.7182596560315886e-05, "loss": 0.2614, "num_input_tokens_seen": 1645224, "step": 18265 }, { "epoch": 4.747920997920998, "grad_norm": 0.5166231989860535, "learning_rate": 4.7179981254497385e-05, "loss": 0.2615, "num_input_tokens_seen": 1645688, "step": 18270 }, { "epoch": 4.749220374220374, "grad_norm": 0.42900189757347107, "learning_rate": 4.7177364807941206e-05, "loss": 0.2734, "num_input_tokens_seen": 1646168, "step": 18275 }, { "epoch": 4.75051975051975, "grad_norm": 0.580555260181427, "learning_rate": 4.71747472207819e-05, "loss": 0.2998, "num_input_tokens_seen": 1646600, "step": 18280 }, { "epoch": 4.751819126819127, "grad_norm": 0.45644575357437134, "learning_rate": 4.7172128493154086e-05, "loss": 0.2863, "num_input_tokens_seen": 1647048, "step": 18285 }, { "epoch": 4.753118503118503, "grad_norm": 0.22442419826984406, "learning_rate": 4.7169508625192475e-05, "loss": 0.2272, "num_input_tokens_seen": 1647512, "step": 18290 }, { "epoch": 4.754417879417879, "grad_norm": 0.3242223262786865, "learning_rate": 4.716688761703179e-05, "loss": 0.2571, "num_input_tokens_seen": 1647928, "step": 18295 }, { "epoch": 4.755717255717256, "grad_norm": 0.29230910539627075, "learning_rate": 4.716426546880683e-05, "loss": 0.2252, "num_input_tokens_seen": 1648360, "step": 18300 }, { "epoch": 4.757016632016632, "grad_norm": 0.2501787841320038, "learning_rate": 4.7161642180652464e-05, "loss": 0.2116, "num_input_tokens_seen": 1648824, "step": 18305 }, { "epoch": 4.758316008316008, "grad_norm": 0.34735697507858276, "learning_rate": 4.715901775270361e-05, "loss": 0.2064, "num_input_tokens_seen": 1649224, "step": 18310 }, { "epoch": 4.759615384615385, "grad_norm": 0.4279111921787262, "learning_rate": 4.715639218509524e-05, "loss": 0.3805, "num_input_tokens_seen": 1649672, "step": 18315 }, { "epoch": 4.760914760914761, "grad_norm": 0.292683869600296, "learning_rate": 4.715376547796239e-05, "loss": 0.2134, "num_input_tokens_seen": 1650184, "step": 18320 }, { "epoch": 4.762214137214137, "grad_norm": 0.3592882752418518, "learning_rate": 4.715113763144015e-05, "loss": 0.3145, "num_input_tokens_seen": 1650648, "step": 18325 }, { "epoch": 4.763513513513513, "grad_norm": 0.261225163936615, "learning_rate": 4.714850864566368e-05, "loss": 0.2558, "num_input_tokens_seen": 1651096, "step": 18330 }, { "epoch": 4.76481288981289, "grad_norm": 0.40718507766723633, "learning_rate": 4.7145878520768194e-05, "loss": 0.1996, "num_input_tokens_seen": 1651560, "step": 18335 }, { "epoch": 4.766112266112266, "grad_norm": 0.33024394512176514, "learning_rate": 4.714324725688895e-05, "loss": 0.2504, "num_input_tokens_seen": 1652008, "step": 18340 }, { "epoch": 4.767411642411642, "grad_norm": 0.32216352224349976, "learning_rate": 4.7140614854161284e-05, "loss": 0.1877, "num_input_tokens_seen": 1652424, "step": 18345 }, { "epoch": 4.768711018711018, "grad_norm": 0.6877448558807373, "learning_rate": 4.713798131272058e-05, "loss": 0.2761, "num_input_tokens_seen": 1652904, "step": 18350 }, { "epoch": 4.770010395010395, "grad_norm": 0.3817863464355469, "learning_rate": 4.7135346632702293e-05, "loss": 0.3164, "num_input_tokens_seen": 1653352, "step": 18355 }, { "epoch": 4.771309771309771, "grad_norm": 0.6451156139373779, "learning_rate": 4.7132710814241914e-05, "loss": 0.3578, "num_input_tokens_seen": 1653848, "step": 18360 }, { "epoch": 4.772609147609147, "grad_norm": 0.2467864751815796, "learning_rate": 4.713007385747501e-05, "loss": 0.1945, "num_input_tokens_seen": 1654280, "step": 18365 }, { "epoch": 4.773908523908524, "grad_norm": 0.3123992383480072, "learning_rate": 4.712743576253721e-05, "loss": 0.1977, "num_input_tokens_seen": 1654744, "step": 18370 }, { "epoch": 4.7752079002079, "grad_norm": 0.2833242118358612, "learning_rate": 4.712479652956419e-05, "loss": 0.2605, "num_input_tokens_seen": 1655176, "step": 18375 }, { "epoch": 4.776507276507276, "grad_norm": 0.2615572512149811, "learning_rate": 4.712215615869167e-05, "loss": 0.159, "num_input_tokens_seen": 1655624, "step": 18380 }, { "epoch": 4.777806652806653, "grad_norm": 0.37932273745536804, "learning_rate": 4.7119514650055476e-05, "loss": 0.4049, "num_input_tokens_seen": 1656088, "step": 18385 }, { "epoch": 4.779106029106029, "grad_norm": 0.4100794494152069, "learning_rate": 4.711687200379144e-05, "loss": 0.2831, "num_input_tokens_seen": 1656552, "step": 18390 }, { "epoch": 4.780405405405405, "grad_norm": 0.14557000994682312, "learning_rate": 4.71142282200355e-05, "loss": 0.3143, "num_input_tokens_seen": 1657000, "step": 18395 }, { "epoch": 4.781704781704782, "grad_norm": 0.8620565533638, "learning_rate": 4.71115832989236e-05, "loss": 0.291, "num_input_tokens_seen": 1657464, "step": 18400 }, { "epoch": 4.783004158004158, "grad_norm": 0.27986830472946167, "learning_rate": 4.710893724059179e-05, "loss": 0.6146, "num_input_tokens_seen": 1657912, "step": 18405 }, { "epoch": 4.784303534303534, "grad_norm": 0.26756614446640015, "learning_rate": 4.710629004517615e-05, "loss": 0.261, "num_input_tokens_seen": 1658360, "step": 18410 }, { "epoch": 4.78560291060291, "grad_norm": 0.3802553415298462, "learning_rate": 4.710364171281283e-05, "loss": 0.247, "num_input_tokens_seen": 1658824, "step": 18415 }, { "epoch": 4.786902286902287, "grad_norm": 0.34390178322792053, "learning_rate": 4.710099224363804e-05, "loss": 0.2337, "num_input_tokens_seen": 1659320, "step": 18420 }, { "epoch": 4.788201663201663, "grad_norm": 0.3668959140777588, "learning_rate": 4.709834163778805e-05, "loss": 0.2628, "num_input_tokens_seen": 1659784, "step": 18425 }, { "epoch": 4.789501039501039, "grad_norm": 0.24788923561573029, "learning_rate": 4.709568989539917e-05, "loss": 0.3614, "num_input_tokens_seen": 1660248, "step": 18430 }, { "epoch": 4.790800415800415, "grad_norm": 0.2819145619869232, "learning_rate": 4.709303701660779e-05, "loss": 0.2613, "num_input_tokens_seen": 1660712, "step": 18435 }, { "epoch": 4.792099792099792, "grad_norm": 0.4380500316619873, "learning_rate": 4.709038300155034e-05, "loss": 0.2453, "num_input_tokens_seen": 1661208, "step": 18440 }, { "epoch": 4.793399168399168, "grad_norm": 0.5081170201301575, "learning_rate": 4.708772785036334e-05, "loss": 0.2587, "num_input_tokens_seen": 1661656, "step": 18445 }, { "epoch": 4.794698544698544, "grad_norm": 0.263471782207489, "learning_rate": 4.7085071563183325e-05, "loss": 0.2841, "num_input_tokens_seen": 1662136, "step": 18450 }, { "epoch": 4.795997920997921, "grad_norm": 0.2100389003753662, "learning_rate": 4.7082414140146914e-05, "loss": 0.1894, "num_input_tokens_seen": 1662600, "step": 18455 }, { "epoch": 4.797297297297297, "grad_norm": 0.6223077774047852, "learning_rate": 4.70797555813908e-05, "loss": 0.3316, "num_input_tokens_seen": 1663048, "step": 18460 }, { "epoch": 4.798596673596673, "grad_norm": 0.34639155864715576, "learning_rate": 4.7077095887051686e-05, "loss": 0.2758, "num_input_tokens_seen": 1663496, "step": 18465 }, { "epoch": 4.79989604989605, "grad_norm": 0.30136027932167053, "learning_rate": 4.707443505726639e-05, "loss": 0.2508, "num_input_tokens_seen": 1663944, "step": 18470 }, { "epoch": 4.801195426195426, "grad_norm": 0.30003538727760315, "learning_rate": 4.707177309217173e-05, "loss": 0.1737, "num_input_tokens_seen": 1664360, "step": 18475 }, { "epoch": 4.802494802494802, "grad_norm": 0.24867135286331177, "learning_rate": 4.706910999190465e-05, "loss": 0.2898, "num_input_tokens_seen": 1664808, "step": 18480 }, { "epoch": 4.8037941787941785, "grad_norm": 0.25113335251808167, "learning_rate": 4.706644575660209e-05, "loss": 0.126, "num_input_tokens_seen": 1665240, "step": 18485 }, { "epoch": 4.805093555093555, "grad_norm": 0.21600662171840668, "learning_rate": 4.706378038640108e-05, "loss": 0.3292, "num_input_tokens_seen": 1665688, "step": 18490 }, { "epoch": 4.8063929313929314, "grad_norm": 0.21562108397483826, "learning_rate": 4.70611138814387e-05, "loss": 0.2785, "num_input_tokens_seen": 1666120, "step": 18495 }, { "epoch": 4.8076923076923075, "grad_norm": 0.2260858714580536, "learning_rate": 4.705844624185211e-05, "loss": 0.2032, "num_input_tokens_seen": 1666552, "step": 18500 }, { "epoch": 4.8089916839916835, "grad_norm": 0.2306281477212906, "learning_rate": 4.705577746777849e-05, "loss": 0.3401, "num_input_tokens_seen": 1667032, "step": 18505 }, { "epoch": 4.8102910602910605, "grad_norm": 0.23577803373336792, "learning_rate": 4.705310755935509e-05, "loss": 0.2465, "num_input_tokens_seen": 1667496, "step": 18510 }, { "epoch": 4.8115904365904365, "grad_norm": 0.5032378435134888, "learning_rate": 4.705043651671924e-05, "loss": 0.328, "num_input_tokens_seen": 1667960, "step": 18515 }, { "epoch": 4.8128898128898125, "grad_norm": 0.34692177176475525, "learning_rate": 4.7047764340008324e-05, "loss": 0.2551, "num_input_tokens_seen": 1668424, "step": 18520 }, { "epoch": 4.8141891891891895, "grad_norm": 0.1859414130449295, "learning_rate": 4.704509102935976e-05, "loss": 0.272, "num_input_tokens_seen": 1668904, "step": 18525 }, { "epoch": 4.8154885654885655, "grad_norm": 0.17269538342952728, "learning_rate": 4.704241658491104e-05, "loss": 0.2671, "num_input_tokens_seen": 1669336, "step": 18530 }, { "epoch": 4.816787941787942, "grad_norm": 0.3326883018016815, "learning_rate": 4.703974100679971e-05, "loss": 0.2398, "num_input_tokens_seen": 1669784, "step": 18535 }, { "epoch": 4.8180873180873185, "grad_norm": 0.3274000883102417, "learning_rate": 4.703706429516339e-05, "loss": 0.2727, "num_input_tokens_seen": 1670232, "step": 18540 }, { "epoch": 4.8193866943866945, "grad_norm": 0.29278820753097534, "learning_rate": 4.7034386450139735e-05, "loss": 0.2673, "num_input_tokens_seen": 1670664, "step": 18545 }, { "epoch": 4.820686070686071, "grad_norm": 0.3030218183994293, "learning_rate": 4.703170747186647e-05, "loss": 0.1454, "num_input_tokens_seen": 1671096, "step": 18550 }, { "epoch": 4.821985446985447, "grad_norm": 0.3543890714645386, "learning_rate": 4.702902736048138e-05, "loss": 0.264, "num_input_tokens_seen": 1671576, "step": 18555 }, { "epoch": 4.8232848232848236, "grad_norm": 0.5014283061027527, "learning_rate": 4.702634611612231e-05, "loss": 0.2673, "num_input_tokens_seen": 1672024, "step": 18560 }, { "epoch": 4.8245841995842, "grad_norm": 0.24183164536952972, "learning_rate": 4.702366373892715e-05, "loss": 0.2797, "num_input_tokens_seen": 1672488, "step": 18565 }, { "epoch": 4.825883575883576, "grad_norm": 0.5009349584579468, "learning_rate": 4.702098022903386e-05, "loss": 0.3899, "num_input_tokens_seen": 1672952, "step": 18570 }, { "epoch": 4.827182952182953, "grad_norm": 0.2662062346935272, "learning_rate": 4.701829558658046e-05, "loss": 0.2641, "num_input_tokens_seen": 1673432, "step": 18575 }, { "epoch": 4.828482328482329, "grad_norm": 0.3201392889022827, "learning_rate": 4.701560981170503e-05, "loss": 0.2242, "num_input_tokens_seen": 1673848, "step": 18580 }, { "epoch": 4.829781704781705, "grad_norm": 0.3854338824748993, "learning_rate": 4.701292290454568e-05, "loss": 0.2688, "num_input_tokens_seen": 1674328, "step": 18585 }, { "epoch": 4.831081081081081, "grad_norm": 0.17318977415561676, "learning_rate": 4.7010234865240624e-05, "loss": 0.2706, "num_input_tokens_seen": 1674776, "step": 18590 }, { "epoch": 4.832380457380458, "grad_norm": 0.31939804553985596, "learning_rate": 4.700754569392809e-05, "loss": 0.2909, "num_input_tokens_seen": 1675256, "step": 18595 }, { "epoch": 4.833679833679834, "grad_norm": 0.3789345920085907, "learning_rate": 4.700485539074641e-05, "loss": 0.2594, "num_input_tokens_seen": 1675672, "step": 18600 }, { "epoch": 4.83497920997921, "grad_norm": 0.33632877469062805, "learning_rate": 4.700216395583391e-05, "loss": 0.2319, "num_input_tokens_seen": 1676088, "step": 18605 }, { "epoch": 4.836278586278587, "grad_norm": 0.30980777740478516, "learning_rate": 4.6999471389329055e-05, "loss": 0.2359, "num_input_tokens_seen": 1676552, "step": 18610 }, { "epoch": 4.837577962577963, "grad_norm": 0.37205037474632263, "learning_rate": 4.6996777691370295e-05, "loss": 0.2587, "num_input_tokens_seen": 1677032, "step": 18615 }, { "epoch": 4.838877338877339, "grad_norm": 0.37873971462249756, "learning_rate": 4.699408286209619e-05, "loss": 0.2685, "num_input_tokens_seen": 1677496, "step": 18620 }, { "epoch": 4.840176715176716, "grad_norm": 0.36559486389160156, "learning_rate": 4.699138690164533e-05, "loss": 0.2961, "num_input_tokens_seen": 1677976, "step": 18625 }, { "epoch": 4.841476091476092, "grad_norm": 0.425909161567688, "learning_rate": 4.698868981015637e-05, "loss": 0.1951, "num_input_tokens_seen": 1678392, "step": 18630 }, { "epoch": 4.842775467775468, "grad_norm": 0.4157494306564331, "learning_rate": 4.6985991587768016e-05, "loss": 0.2973, "num_input_tokens_seen": 1678824, "step": 18635 }, { "epoch": 4.844074844074844, "grad_norm": 0.29550814628601074, "learning_rate": 4.698329223461906e-05, "loss": 0.2479, "num_input_tokens_seen": 1679256, "step": 18640 }, { "epoch": 4.845374220374221, "grad_norm": 0.42585209012031555, "learning_rate": 4.6980591750848315e-05, "loss": 0.2974, "num_input_tokens_seen": 1679752, "step": 18645 }, { "epoch": 4.846673596673597, "grad_norm": 0.30883562564849854, "learning_rate": 4.697789013659468e-05, "loss": 1.357, "num_input_tokens_seen": 1680168, "step": 18650 }, { "epoch": 4.847972972972973, "grad_norm": 0.2248053103685379, "learning_rate": 4.697518739199709e-05, "loss": 0.2679, "num_input_tokens_seen": 1680632, "step": 18655 }, { "epoch": 4.849272349272349, "grad_norm": 0.3420407176017761, "learning_rate": 4.697248351719457e-05, "loss": 0.2857, "num_input_tokens_seen": 1681096, "step": 18660 }, { "epoch": 4.850571725571726, "grad_norm": 0.5018962025642395, "learning_rate": 4.6969778512326156e-05, "loss": 0.289, "num_input_tokens_seen": 1681544, "step": 18665 }, { "epoch": 4.851871101871102, "grad_norm": 0.32591482996940613, "learning_rate": 4.6967072377530983e-05, "loss": 0.2161, "num_input_tokens_seen": 1681992, "step": 18670 }, { "epoch": 4.853170478170478, "grad_norm": 0.18816356360912323, "learning_rate": 4.6964365112948237e-05, "loss": 0.2851, "num_input_tokens_seen": 1682424, "step": 18675 }, { "epoch": 4.854469854469855, "grad_norm": 0.30707696080207825, "learning_rate": 4.696165671871714e-05, "loss": 0.2723, "num_input_tokens_seen": 1682872, "step": 18680 }, { "epoch": 4.855769230769231, "grad_norm": 0.331194132566452, "learning_rate": 4.695894719497701e-05, "loss": 0.2446, "num_input_tokens_seen": 1683304, "step": 18685 }, { "epoch": 4.857068607068607, "grad_norm": 0.39733994007110596, "learning_rate": 4.695623654186717e-05, "loss": 0.2345, "num_input_tokens_seen": 1683720, "step": 18690 }, { "epoch": 4.858367983367984, "grad_norm": 0.2807588279247284, "learning_rate": 4.6953524759527054e-05, "loss": 0.2919, "num_input_tokens_seen": 1684152, "step": 18695 }, { "epoch": 4.85966735966736, "grad_norm": 0.2780364751815796, "learning_rate": 4.6950811848096124e-05, "loss": 0.2666, "num_input_tokens_seen": 1684600, "step": 18700 }, { "epoch": 4.860966735966736, "grad_norm": 0.32191693782806396, "learning_rate": 4.694809780771391e-05, "loss": 0.2208, "num_input_tokens_seen": 1685048, "step": 18705 }, { "epoch": 4.862266112266112, "grad_norm": 0.3131627142429352, "learning_rate": 4.694538263851999e-05, "loss": 0.2501, "num_input_tokens_seen": 1685528, "step": 18710 }, { "epoch": 4.863565488565489, "grad_norm": 0.2920820415019989, "learning_rate": 4.694266634065402e-05, "loss": 0.264, "num_input_tokens_seen": 1685992, "step": 18715 }, { "epoch": 4.864864864864865, "grad_norm": 0.30841970443725586, "learning_rate": 4.69399489142557e-05, "loss": 0.2208, "num_input_tokens_seen": 1686408, "step": 18720 }, { "epoch": 4.866164241164241, "grad_norm": 0.25115451216697693, "learning_rate": 4.6937230359464774e-05, "loss": 0.3608, "num_input_tokens_seen": 1686856, "step": 18725 }, { "epoch": 4.867463617463617, "grad_norm": 0.3711850643157959, "learning_rate": 4.6934510676421074e-05, "loss": 0.2438, "num_input_tokens_seen": 1687304, "step": 18730 }, { "epoch": 4.868762993762994, "grad_norm": 0.214581698179245, "learning_rate": 4.693178986526448e-05, "loss": 0.294, "num_input_tokens_seen": 1687768, "step": 18735 }, { "epoch": 4.87006237006237, "grad_norm": 0.5014727711677551, "learning_rate": 4.6929067926134915e-05, "loss": 0.2824, "num_input_tokens_seen": 1688264, "step": 18740 }, { "epoch": 4.871361746361746, "grad_norm": 0.2971351146697998, "learning_rate": 4.692634485917238e-05, "loss": 0.2073, "num_input_tokens_seen": 1688744, "step": 18745 }, { "epoch": 4.872661122661123, "grad_norm": 0.27211257815361023, "learning_rate": 4.692362066451691e-05, "loss": 0.2571, "num_input_tokens_seen": 1689208, "step": 18750 }, { "epoch": 4.873960498960499, "grad_norm": 0.5548259019851685, "learning_rate": 4.6920895342308626e-05, "loss": 0.2896, "num_input_tokens_seen": 1689640, "step": 18755 }, { "epoch": 4.875259875259875, "grad_norm": 0.2535400092601776, "learning_rate": 4.69181688926877e-05, "loss": 0.1763, "num_input_tokens_seen": 1690088, "step": 18760 }, { "epoch": 4.876559251559252, "grad_norm": 0.2783706784248352, "learning_rate": 4.691544131579434e-05, "loss": 0.259, "num_input_tokens_seen": 1690520, "step": 18765 }, { "epoch": 4.877858627858628, "grad_norm": 0.2541835606098175, "learning_rate": 4.691271261176883e-05, "loss": 0.3159, "num_input_tokens_seen": 1690984, "step": 18770 }, { "epoch": 4.879158004158004, "grad_norm": 0.26543155312538147, "learning_rate": 4.690998278075152e-05, "loss": 0.23, "num_input_tokens_seen": 1691416, "step": 18775 }, { "epoch": 4.88045738045738, "grad_norm": 0.5329368114471436, "learning_rate": 4.69072518228828e-05, "loss": 0.2847, "num_input_tokens_seen": 1691896, "step": 18780 }, { "epoch": 4.881756756756757, "grad_norm": 0.26098504662513733, "learning_rate": 4.690451973830313e-05, "loss": 0.1797, "num_input_tokens_seen": 1692360, "step": 18785 }, { "epoch": 4.883056133056133, "grad_norm": 0.3574253022670746, "learning_rate": 4.690178652715302e-05, "loss": 0.2279, "num_input_tokens_seen": 1692792, "step": 18790 }, { "epoch": 4.884355509355509, "grad_norm": 0.24561232328414917, "learning_rate": 4.689905218957305e-05, "loss": 0.241, "num_input_tokens_seen": 1693224, "step": 18795 }, { "epoch": 4.885654885654886, "grad_norm": 0.2533816397190094, "learning_rate": 4.6896316725703844e-05, "loss": 0.264, "num_input_tokens_seen": 1693640, "step": 18800 }, { "epoch": 4.886954261954262, "grad_norm": 0.283843994140625, "learning_rate": 4.689358013568608e-05, "loss": 0.2724, "num_input_tokens_seen": 1694136, "step": 18805 }, { "epoch": 4.888253638253638, "grad_norm": 0.41127705574035645, "learning_rate": 4.689084241966052e-05, "loss": 0.2921, "num_input_tokens_seen": 1694568, "step": 18810 }, { "epoch": 4.889553014553014, "grad_norm": 0.2634311616420746, "learning_rate": 4.688810357776795e-05, "loss": 0.3012, "num_input_tokens_seen": 1695032, "step": 18815 }, { "epoch": 4.890852390852391, "grad_norm": 0.3610597252845764, "learning_rate": 4.688536361014925e-05, "loss": 0.2655, "num_input_tokens_seen": 1695496, "step": 18820 }, { "epoch": 4.892151767151767, "grad_norm": 0.21224215626716614, "learning_rate": 4.688262251694533e-05, "loss": 0.2676, "num_input_tokens_seen": 1695960, "step": 18825 }, { "epoch": 4.893451143451143, "grad_norm": 0.29878902435302734, "learning_rate": 4.6879880298297164e-05, "loss": 0.2716, "num_input_tokens_seen": 1696408, "step": 18830 }, { "epoch": 4.89475051975052, "grad_norm": 0.3513087034225464, "learning_rate": 4.6877136954345785e-05, "loss": 0.2303, "num_input_tokens_seen": 1696856, "step": 18835 }, { "epoch": 4.896049896049896, "grad_norm": 0.2655498683452606, "learning_rate": 4.68743924852323e-05, "loss": 0.2019, "num_input_tokens_seen": 1697272, "step": 18840 }, { "epoch": 4.897349272349272, "grad_norm": 0.7275565266609192, "learning_rate": 4.6871646891097844e-05, "loss": 0.3934, "num_input_tokens_seen": 1697752, "step": 18845 }, { "epoch": 4.898648648648649, "grad_norm": 0.27486687898635864, "learning_rate": 4.686890017208363e-05, "loss": 0.2656, "num_input_tokens_seen": 1698184, "step": 18850 }, { "epoch": 4.899948024948025, "grad_norm": 0.20140935480594635, "learning_rate": 4.6866152328330933e-05, "loss": 0.1943, "num_input_tokens_seen": 1698616, "step": 18855 }, { "epoch": 4.901247401247401, "grad_norm": 0.25933581590652466, "learning_rate": 4.6863403359981065e-05, "loss": 0.2257, "num_input_tokens_seen": 1699064, "step": 18860 }, { "epoch": 4.902546777546777, "grad_norm": 0.22942176461219788, "learning_rate": 4.6860653267175416e-05, "loss": 0.2873, "num_input_tokens_seen": 1699528, "step": 18865 }, { "epoch": 4.903846153846154, "grad_norm": 0.24558576941490173, "learning_rate": 4.685790205005542e-05, "loss": 0.1754, "num_input_tokens_seen": 1699976, "step": 18870 }, { "epoch": 4.90514553014553, "grad_norm": 0.2539081573486328, "learning_rate": 4.685514970876259e-05, "loss": 0.2445, "num_input_tokens_seen": 1700408, "step": 18875 }, { "epoch": 4.906444906444906, "grad_norm": 0.2221328765153885, "learning_rate": 4.685239624343846e-05, "loss": 0.1946, "num_input_tokens_seen": 1700840, "step": 18880 }, { "epoch": 4.907744282744282, "grad_norm": 0.3117448091506958, "learning_rate": 4.684964165422466e-05, "loss": 0.3496, "num_input_tokens_seen": 1701288, "step": 18885 }, { "epoch": 4.909043659043659, "grad_norm": 0.27033138275146484, "learning_rate": 4.684688594126285e-05, "loss": 0.3149, "num_input_tokens_seen": 1701704, "step": 18890 }, { "epoch": 4.910343035343035, "grad_norm": 0.2655923366546631, "learning_rate": 4.684412910469476e-05, "loss": 0.1913, "num_input_tokens_seen": 1702168, "step": 18895 }, { "epoch": 4.911642411642411, "grad_norm": 0.2543151378631592, "learning_rate": 4.6841371144662185e-05, "loss": 0.2496, "num_input_tokens_seen": 1702616, "step": 18900 }, { "epoch": 4.912941787941788, "grad_norm": 0.5261064767837524, "learning_rate": 4.6838612061306966e-05, "loss": 0.2849, "num_input_tokens_seen": 1703080, "step": 18905 }, { "epoch": 4.914241164241164, "grad_norm": 0.25770846009254456, "learning_rate": 4.6835851854770996e-05, "loss": 0.2178, "num_input_tokens_seen": 1703544, "step": 18910 }, { "epoch": 4.91554054054054, "grad_norm": 0.23379413783550262, "learning_rate": 4.683309052519625e-05, "loss": 0.1753, "num_input_tokens_seen": 1703976, "step": 18915 }, { "epoch": 4.916839916839917, "grad_norm": 0.2406119853258133, "learning_rate": 4.683032807272474e-05, "loss": 0.3359, "num_input_tokens_seen": 1704456, "step": 18920 }, { "epoch": 4.918139293139293, "grad_norm": 0.24368903040885925, "learning_rate": 4.6827564497498534e-05, "loss": 0.2691, "num_input_tokens_seen": 1704888, "step": 18925 }, { "epoch": 4.919438669438669, "grad_norm": 0.27655360102653503, "learning_rate": 4.6824799799659773e-05, "loss": 0.2891, "num_input_tokens_seen": 1705320, "step": 18930 }, { "epoch": 4.920738045738045, "grad_norm": 0.28004810214042664, "learning_rate": 4.682203397935066e-05, "loss": 0.2382, "num_input_tokens_seen": 1705784, "step": 18935 }, { "epoch": 4.922037422037422, "grad_norm": 0.19575928151607513, "learning_rate": 4.681926703671341e-05, "loss": 0.3458, "num_input_tokens_seen": 1706216, "step": 18940 }, { "epoch": 4.923336798336798, "grad_norm": 0.3428828716278076, "learning_rate": 4.681649897189036e-05, "loss": 0.1824, "num_input_tokens_seen": 1706648, "step": 18945 }, { "epoch": 4.924636174636174, "grad_norm": 0.49437060952186584, "learning_rate": 4.681372978502386e-05, "loss": 0.2717, "num_input_tokens_seen": 1707112, "step": 18950 }, { "epoch": 4.9259355509355505, "grad_norm": 0.2719247341156006, "learning_rate": 4.681095947625635e-05, "loss": 0.1892, "num_input_tokens_seen": 1707560, "step": 18955 }, { "epoch": 4.927234927234927, "grad_norm": 0.5189912915229797, "learning_rate": 4.680818804573028e-05, "loss": 0.3551, "num_input_tokens_seen": 1708008, "step": 18960 }, { "epoch": 4.928534303534303, "grad_norm": 0.269384503364563, "learning_rate": 4.6805415493588215e-05, "loss": 0.2116, "num_input_tokens_seen": 1708456, "step": 18965 }, { "epoch": 4.9298336798336795, "grad_norm": 0.2115425318479538, "learning_rate": 4.680264181997273e-05, "loss": 0.2546, "num_input_tokens_seen": 1708904, "step": 18970 }, { "epoch": 4.931133056133056, "grad_norm": 0.25695353746414185, "learning_rate": 4.6799867025026485e-05, "loss": 0.2447, "num_input_tokens_seen": 1709368, "step": 18975 }, { "epoch": 4.9324324324324325, "grad_norm": 0.28304019570350647, "learning_rate": 4.679709110889219e-05, "loss": 0.2224, "num_input_tokens_seen": 1709816, "step": 18980 }, { "epoch": 4.9337318087318085, "grad_norm": 0.27728092670440674, "learning_rate": 4.679431407171262e-05, "loss": 0.2429, "num_input_tokens_seen": 1710280, "step": 18985 }, { "epoch": 4.935031185031185, "grad_norm": 0.27314257621765137, "learning_rate": 4.67915359136306e-05, "loss": 0.2627, "num_input_tokens_seen": 1710728, "step": 18990 }, { "epoch": 4.9363305613305615, "grad_norm": 0.31413334608078003, "learning_rate": 4.6788756634789e-05, "loss": 0.241, "num_input_tokens_seen": 1711176, "step": 18995 }, { "epoch": 4.9376299376299375, "grad_norm": 0.21769610047340393, "learning_rate": 4.678597623533077e-05, "loss": 0.1847, "num_input_tokens_seen": 1711624, "step": 19000 }, { "epoch": 4.938929313929314, "grad_norm": 0.5131176710128784, "learning_rate": 4.678319471539891e-05, "loss": 0.2503, "num_input_tokens_seen": 1712056, "step": 19005 }, { "epoch": 4.9402286902286905, "grad_norm": 0.7132238745689392, "learning_rate": 4.678041207513647e-05, "loss": 0.2741, "num_input_tokens_seen": 1712536, "step": 19010 }, { "epoch": 4.9415280665280665, "grad_norm": 0.7699716687202454, "learning_rate": 4.677762831468657e-05, "loss": 0.3289, "num_input_tokens_seen": 1713000, "step": 19015 }, { "epoch": 4.942827442827443, "grad_norm": 0.2993412911891937, "learning_rate": 4.6774843434192384e-05, "loss": 0.2625, "num_input_tokens_seen": 1713432, "step": 19020 }, { "epoch": 4.9441268191268195, "grad_norm": 0.39037245512008667, "learning_rate": 4.677205743379713e-05, "loss": 0.2386, "num_input_tokens_seen": 1713848, "step": 19025 }, { "epoch": 4.9454261954261955, "grad_norm": 0.7421100735664368, "learning_rate": 4.676927031364411e-05, "loss": 0.2591, "num_input_tokens_seen": 1714312, "step": 19030 }, { "epoch": 4.946725571725572, "grad_norm": 0.3892192840576172, "learning_rate": 4.676648207387665e-05, "loss": 0.2842, "num_input_tokens_seen": 1714728, "step": 19035 }, { "epoch": 4.948024948024948, "grad_norm": 0.357975572347641, "learning_rate": 4.6763692714638166e-05, "loss": 0.2792, "num_input_tokens_seen": 1715208, "step": 19040 }, { "epoch": 4.949324324324325, "grad_norm": 0.2621408998966217, "learning_rate": 4.6760902236072115e-05, "loss": 0.2542, "num_input_tokens_seen": 1715656, "step": 19045 }, { "epoch": 4.950623700623701, "grad_norm": 0.4418870210647583, "learning_rate": 4.6758110638322006e-05, "loss": 0.2757, "num_input_tokens_seen": 1716136, "step": 19050 }, { "epoch": 4.951923076923077, "grad_norm": 0.3465016782283783, "learning_rate": 4.675531792153143e-05, "loss": 0.2109, "num_input_tokens_seen": 1716552, "step": 19055 }, { "epoch": 4.953222453222454, "grad_norm": 0.3050096929073334, "learning_rate": 4.675252408584399e-05, "loss": 0.189, "num_input_tokens_seen": 1716984, "step": 19060 }, { "epoch": 4.95452182952183, "grad_norm": 0.8775444626808167, "learning_rate": 4.674972913140341e-05, "loss": 0.2685, "num_input_tokens_seen": 1717400, "step": 19065 }, { "epoch": 4.955821205821206, "grad_norm": 0.4745945632457733, "learning_rate": 4.6746933058353416e-05, "loss": 0.2756, "num_input_tokens_seen": 1717848, "step": 19070 }, { "epoch": 4.957120582120583, "grad_norm": 0.3105548024177551, "learning_rate": 4.674413586683781e-05, "loss": 0.3338, "num_input_tokens_seen": 1718296, "step": 19075 }, { "epoch": 4.958419958419959, "grad_norm": 0.2819850444793701, "learning_rate": 4.674133755700048e-05, "loss": 0.2488, "num_input_tokens_seen": 1718744, "step": 19080 }, { "epoch": 4.959719334719335, "grad_norm": 0.34641599655151367, "learning_rate": 4.673853812898531e-05, "loss": 0.2517, "num_input_tokens_seen": 1719224, "step": 19085 }, { "epoch": 4.961018711018711, "grad_norm": 0.5168090462684631, "learning_rate": 4.673573758293631e-05, "loss": 0.2023, "num_input_tokens_seen": 1719672, "step": 19090 }, { "epoch": 4.962318087318088, "grad_norm": 0.392520010471344, "learning_rate": 4.673293591899749e-05, "loss": 0.2958, "num_input_tokens_seen": 1720120, "step": 19095 }, { "epoch": 4.963617463617464, "grad_norm": 0.2910110354423523, "learning_rate": 4.673013313731296e-05, "loss": 0.2197, "num_input_tokens_seen": 1720552, "step": 19100 }, { "epoch": 4.96491683991684, "grad_norm": 0.2802918553352356, "learning_rate": 4.672732923802685e-05, "loss": 0.246, "num_input_tokens_seen": 1720968, "step": 19105 }, { "epoch": 4.966216216216216, "grad_norm": 0.3189212679862976, "learning_rate": 4.672452422128338e-05, "loss": 0.3214, "num_input_tokens_seen": 1721400, "step": 19110 }, { "epoch": 4.967515592515593, "grad_norm": 0.24482092261314392, "learning_rate": 4.672171808722683e-05, "loss": 0.3441, "num_input_tokens_seen": 1721848, "step": 19115 }, { "epoch": 4.968814968814969, "grad_norm": 0.19795508682727814, "learning_rate": 4.671891083600149e-05, "loss": 0.2823, "num_input_tokens_seen": 1722296, "step": 19120 }, { "epoch": 4.970114345114345, "grad_norm": 0.1402435153722763, "learning_rate": 4.671610246775176e-05, "loss": 0.2746, "num_input_tokens_seen": 1722792, "step": 19125 }, { "epoch": 4.971413721413722, "grad_norm": 0.14756444096565247, "learning_rate": 4.671329298262208e-05, "loss": 0.3113, "num_input_tokens_seen": 1723256, "step": 19130 }, { "epoch": 4.972713097713098, "grad_norm": 0.22275428473949432, "learning_rate": 4.6710482380756926e-05, "loss": 0.2775, "num_input_tokens_seen": 1723688, "step": 19135 }, { "epoch": 4.974012474012474, "grad_norm": 0.22060462832450867, "learning_rate": 4.6707670662300873e-05, "loss": 0.2526, "num_input_tokens_seen": 1724136, "step": 19140 }, { "epoch": 4.975311850311851, "grad_norm": 0.3609267473220825, "learning_rate": 4.670485782739851e-05, "loss": 0.2673, "num_input_tokens_seen": 1724568, "step": 19145 }, { "epoch": 4.976611226611227, "grad_norm": 0.34591951966285706, "learning_rate": 4.6702043876194515e-05, "loss": 0.2692, "num_input_tokens_seen": 1725032, "step": 19150 }, { "epoch": 4.977910602910603, "grad_norm": 0.3202543258666992, "learning_rate": 4.66992288088336e-05, "loss": 0.1908, "num_input_tokens_seen": 1725464, "step": 19155 }, { "epoch": 4.979209979209979, "grad_norm": 0.42565855383872986, "learning_rate": 4.6696412625460575e-05, "loss": 0.1814, "num_input_tokens_seen": 1725944, "step": 19160 }, { "epoch": 4.980509355509356, "grad_norm": 0.26203101873397827, "learning_rate": 4.6693595326220255e-05, "loss": 0.2743, "num_input_tokens_seen": 1726392, "step": 19165 }, { "epoch": 4.981808731808732, "grad_norm": 0.3178975284099579, "learning_rate": 4.669077691125753e-05, "loss": 0.3823, "num_input_tokens_seen": 1726824, "step": 19170 }, { "epoch": 4.983108108108108, "grad_norm": 0.30487698316574097, "learning_rate": 4.6687957380717376e-05, "loss": 0.3081, "num_input_tokens_seen": 1727272, "step": 19175 }, { "epoch": 4.984407484407484, "grad_norm": 0.34385281801223755, "learning_rate": 4.668513673474479e-05, "loss": 0.2852, "num_input_tokens_seen": 1727752, "step": 19180 }, { "epoch": 4.985706860706861, "grad_norm": 0.17789065837860107, "learning_rate": 4.668231497348484e-05, "loss": 0.2612, "num_input_tokens_seen": 1728200, "step": 19185 }, { "epoch": 4.987006237006237, "grad_norm": 0.33516237139701843, "learning_rate": 4.667949209708266e-05, "loss": 0.2427, "num_input_tokens_seen": 1728680, "step": 19190 }, { "epoch": 4.988305613305613, "grad_norm": 0.3457320034503937, "learning_rate": 4.667666810568343e-05, "loss": 0.2339, "num_input_tokens_seen": 1729144, "step": 19195 }, { "epoch": 4.98960498960499, "grad_norm": 0.7663844227790833, "learning_rate": 4.667384299943239e-05, "loss": 0.2437, "num_input_tokens_seen": 1729576, "step": 19200 }, { "epoch": 4.990904365904366, "grad_norm": 0.3755476474761963, "learning_rate": 4.667101677847484e-05, "loss": 0.2559, "num_input_tokens_seen": 1730008, "step": 19205 }, { "epoch": 4.992203742203742, "grad_norm": 0.31422916054725647, "learning_rate": 4.666818944295612e-05, "loss": 0.2052, "num_input_tokens_seen": 1730456, "step": 19210 }, { "epoch": 4.993503118503119, "grad_norm": 0.7119705080986023, "learning_rate": 4.666536099302167e-05, "loss": 0.3322, "num_input_tokens_seen": 1730872, "step": 19215 }, { "epoch": 4.994802494802495, "grad_norm": 0.45908084511756897, "learning_rate": 4.666253142881694e-05, "loss": 0.3408, "num_input_tokens_seen": 1731336, "step": 19220 }, { "epoch": 4.996101871101871, "grad_norm": 0.349413126707077, "learning_rate": 4.6659700750487464e-05, "loss": 0.2837, "num_input_tokens_seen": 1731784, "step": 19225 }, { "epoch": 4.997401247401247, "grad_norm": 0.45804721117019653, "learning_rate": 4.6656868958178825e-05, "loss": 0.2535, "num_input_tokens_seen": 1732216, "step": 19230 }, { "epoch": 4.998700623700624, "grad_norm": 0.2935500741004944, "learning_rate": 4.6654036052036665e-05, "loss": 0.2682, "num_input_tokens_seen": 1732696, "step": 19235 }, { "epoch": 5.0, "grad_norm": 0.38504672050476074, "learning_rate": 4.6651202032206676e-05, "loss": 0.2187, "num_input_tokens_seen": 1733072, "step": 19240 }, { "epoch": 5.0, "eval_loss": 0.2433796525001526, "eval_runtime": 13.2581, "eval_samples_per_second": 64.564, "eval_steps_per_second": 32.282, "num_input_tokens_seen": 1733072, "step": 19240 }, { "epoch": 5.001299376299376, "grad_norm": 0.3975268304347992, "learning_rate": 4.664836689883463e-05, "loss": 0.321, "num_input_tokens_seen": 1733520, "step": 19245 }, { "epoch": 5.002598752598753, "grad_norm": 0.3976019620895386, "learning_rate": 4.664553065206633e-05, "loss": 0.1913, "num_input_tokens_seen": 1733984, "step": 19250 }, { "epoch": 5.003898128898129, "grad_norm": 0.3677685260772705, "learning_rate": 4.6642693292047644e-05, "loss": 0.2558, "num_input_tokens_seen": 1734400, "step": 19255 }, { "epoch": 5.005197505197505, "grad_norm": 0.4949113130569458, "learning_rate": 4.663985481892451e-05, "loss": 0.1703, "num_input_tokens_seen": 1734848, "step": 19260 }, { "epoch": 5.006496881496881, "grad_norm": 0.28669312596321106, "learning_rate": 4.663701523284291e-05, "loss": 0.146, "num_input_tokens_seen": 1735296, "step": 19265 }, { "epoch": 5.007796257796258, "grad_norm": 0.5693766474723816, "learning_rate": 4.663417453394888e-05, "loss": 0.4655, "num_input_tokens_seen": 1735712, "step": 19270 }, { "epoch": 5.009095634095634, "grad_norm": 0.3086421489715576, "learning_rate": 4.663133272238853e-05, "loss": 0.3139, "num_input_tokens_seen": 1736176, "step": 19275 }, { "epoch": 5.01039501039501, "grad_norm": 0.2881488800048828, "learning_rate": 4.6628489798308006e-05, "loss": 0.1971, "num_input_tokens_seen": 1736624, "step": 19280 }, { "epoch": 5.011694386694387, "grad_norm": 0.2536989450454712, "learning_rate": 4.662564576185353e-05, "loss": 0.2002, "num_input_tokens_seen": 1737088, "step": 19285 }, { "epoch": 5.012993762993763, "grad_norm": 0.3000261187553406, "learning_rate": 4.6622800613171376e-05, "loss": 0.2572, "num_input_tokens_seen": 1737552, "step": 19290 }, { "epoch": 5.014293139293139, "grad_norm": 0.2481493502855301, "learning_rate": 4.6619954352407866e-05, "loss": 0.2551, "num_input_tokens_seen": 1738000, "step": 19295 }, { "epoch": 5.015592515592515, "grad_norm": 0.3323572874069214, "learning_rate": 4.6617106979709384e-05, "loss": 0.2214, "num_input_tokens_seen": 1738464, "step": 19300 }, { "epoch": 5.016891891891892, "grad_norm": 0.20992407202720642, "learning_rate": 4.6614258495222384e-05, "loss": 0.1573, "num_input_tokens_seen": 1738912, "step": 19305 }, { "epoch": 5.018191268191268, "grad_norm": 0.3868658244609833, "learning_rate": 4.6611408899093355e-05, "loss": 0.224, "num_input_tokens_seen": 1739312, "step": 19310 }, { "epoch": 5.019490644490644, "grad_norm": 0.2457854002714157, "learning_rate": 4.660855819146887e-05, "loss": 0.3791, "num_input_tokens_seen": 1739744, "step": 19315 }, { "epoch": 5.020790020790021, "grad_norm": 0.5575962066650391, "learning_rate": 4.6605706372495515e-05, "loss": 0.316, "num_input_tokens_seen": 1740192, "step": 19320 }, { "epoch": 5.022089397089397, "grad_norm": 0.25110921263694763, "learning_rate": 4.660285344231999e-05, "loss": 0.1885, "num_input_tokens_seen": 1740656, "step": 19325 }, { "epoch": 5.023388773388773, "grad_norm": 0.5726140737533569, "learning_rate": 4.659999940108901e-05, "loss": 0.2658, "num_input_tokens_seen": 1741136, "step": 19330 }, { "epoch": 5.024688149688149, "grad_norm": 0.2296750843524933, "learning_rate": 4.659714424894936e-05, "loss": 0.2827, "num_input_tokens_seen": 1741552, "step": 19335 }, { "epoch": 5.025987525987526, "grad_norm": 0.2708682417869568, "learning_rate": 4.65942879860479e-05, "loss": 0.2752, "num_input_tokens_seen": 1742000, "step": 19340 }, { "epoch": 5.027286902286902, "grad_norm": 0.3928663730621338, "learning_rate": 4.6591430612531515e-05, "loss": 0.2579, "num_input_tokens_seen": 1742480, "step": 19345 }, { "epoch": 5.028586278586278, "grad_norm": 0.19110757112503052, "learning_rate": 4.658857212854717e-05, "loss": 0.2546, "num_input_tokens_seen": 1742912, "step": 19350 }, { "epoch": 5.029885654885655, "grad_norm": 0.3298237919807434, "learning_rate": 4.6585712534241864e-05, "loss": 0.2591, "num_input_tokens_seen": 1743360, "step": 19355 }, { "epoch": 5.031185031185031, "grad_norm": 0.2517065405845642, "learning_rate": 4.658285182976269e-05, "loss": 0.2614, "num_input_tokens_seen": 1743824, "step": 19360 }, { "epoch": 5.032484407484407, "grad_norm": 0.202285498380661, "learning_rate": 4.657999001525676e-05, "loss": 0.3153, "num_input_tokens_seen": 1744256, "step": 19365 }, { "epoch": 5.033783783783784, "grad_norm": 0.352403461933136, "learning_rate": 4.6577127090871265e-05, "loss": 0.218, "num_input_tokens_seen": 1744704, "step": 19370 }, { "epoch": 5.03508316008316, "grad_norm": 0.2786487340927124, "learning_rate": 4.6574263056753455e-05, "loss": 0.2503, "num_input_tokens_seen": 1745184, "step": 19375 }, { "epoch": 5.036382536382536, "grad_norm": 0.3017931282520294, "learning_rate": 4.6571397913050625e-05, "loss": 0.2221, "num_input_tokens_seen": 1745664, "step": 19380 }, { "epoch": 5.037681912681912, "grad_norm": 0.3224921226501465, "learning_rate": 4.656853165991013e-05, "loss": 0.3214, "num_input_tokens_seen": 1746128, "step": 19385 }, { "epoch": 5.038981288981289, "grad_norm": 0.30233293771743774, "learning_rate": 4.656566429747938e-05, "loss": 0.2672, "num_input_tokens_seen": 1746576, "step": 19390 }, { "epoch": 5.040280665280665, "grad_norm": 0.27523496747016907, "learning_rate": 4.656279582590586e-05, "loss": 0.2975, "num_input_tokens_seen": 1747040, "step": 19395 }, { "epoch": 5.041580041580041, "grad_norm": 0.27091655135154724, "learning_rate": 4.655992624533708e-05, "loss": 0.2081, "num_input_tokens_seen": 1747520, "step": 19400 }, { "epoch": 5.042879417879418, "grad_norm": 0.32816842198371887, "learning_rate": 4.655705555592065e-05, "loss": 0.2444, "num_input_tokens_seen": 1748048, "step": 19405 }, { "epoch": 5.044178794178794, "grad_norm": 0.2972874641418457, "learning_rate": 4.655418375780419e-05, "loss": 0.1692, "num_input_tokens_seen": 1748480, "step": 19410 }, { "epoch": 5.04547817047817, "grad_norm": 0.3510720133781433, "learning_rate": 4.655131085113541e-05, "loss": 0.369, "num_input_tokens_seen": 1748992, "step": 19415 }, { "epoch": 5.046777546777546, "grad_norm": 0.33372941613197327, "learning_rate": 4.6548436836062056e-05, "loss": 0.2837, "num_input_tokens_seen": 1749488, "step": 19420 }, { "epoch": 5.048076923076923, "grad_norm": 0.2696203291416168, "learning_rate": 4.6545561712731954e-05, "loss": 0.2705, "num_input_tokens_seen": 1749952, "step": 19425 }, { "epoch": 5.049376299376299, "grad_norm": 0.504696249961853, "learning_rate": 4.654268548129297e-05, "loss": 0.2962, "num_input_tokens_seen": 1750368, "step": 19430 }, { "epoch": 5.050675675675675, "grad_norm": 0.43469861149787903, "learning_rate": 4.653980814189303e-05, "loss": 0.2214, "num_input_tokens_seen": 1750848, "step": 19435 }, { "epoch": 5.051975051975052, "grad_norm": 0.353533536195755, "learning_rate": 4.653692969468012e-05, "loss": 0.2545, "num_input_tokens_seen": 1751280, "step": 19440 }, { "epoch": 5.053274428274428, "grad_norm": 0.2717914879322052, "learning_rate": 4.6534050139802275e-05, "loss": 0.2954, "num_input_tokens_seen": 1751760, "step": 19445 }, { "epoch": 5.0545738045738045, "grad_norm": 0.328731894493103, "learning_rate": 4.65311694774076e-05, "loss": 0.2275, "num_input_tokens_seen": 1752224, "step": 19450 }, { "epoch": 5.0558731808731805, "grad_norm": 0.2819979786872864, "learning_rate": 4.6528287707644254e-05, "loss": 0.2943, "num_input_tokens_seen": 1752656, "step": 19455 }, { "epoch": 5.057172557172557, "grad_norm": 0.28424155712127686, "learning_rate": 4.6525404830660435e-05, "loss": 0.2619, "num_input_tokens_seen": 1753104, "step": 19460 }, { "epoch": 5.0584719334719335, "grad_norm": 0.3144875466823578, "learning_rate": 4.6522520846604425e-05, "loss": 0.2523, "num_input_tokens_seen": 1753552, "step": 19465 }, { "epoch": 5.0597713097713095, "grad_norm": 0.37334486842155457, "learning_rate": 4.651963575562455e-05, "loss": 0.2619, "num_input_tokens_seen": 1754016, "step": 19470 }, { "epoch": 5.061070686070686, "grad_norm": 0.3265777826309204, "learning_rate": 4.651674955786919e-05, "loss": 0.2886, "num_input_tokens_seen": 1754512, "step": 19475 }, { "epoch": 5.0623700623700625, "grad_norm": 0.2958661913871765, "learning_rate": 4.651386225348677e-05, "loss": 0.1919, "num_input_tokens_seen": 1754944, "step": 19480 }, { "epoch": 5.0636694386694385, "grad_norm": 0.2503194808959961, "learning_rate": 4.6510973842625816e-05, "loss": 0.2556, "num_input_tokens_seen": 1755408, "step": 19485 }, { "epoch": 5.064968814968815, "grad_norm": 0.2676774561405182, "learning_rate": 4.650808432543486e-05, "loss": 0.2643, "num_input_tokens_seen": 1755840, "step": 19490 }, { "epoch": 5.0662681912681915, "grad_norm": 0.6576443314552307, "learning_rate": 4.6505193702062524e-05, "loss": 0.2587, "num_input_tokens_seen": 1756272, "step": 19495 }, { "epoch": 5.0675675675675675, "grad_norm": 0.29217076301574707, "learning_rate": 4.650230197265746e-05, "loss": 0.2411, "num_input_tokens_seen": 1756736, "step": 19500 }, { "epoch": 5.068866943866944, "grad_norm": 0.2571257948875427, "learning_rate": 4.649940913736841e-05, "loss": 0.1804, "num_input_tokens_seen": 1757184, "step": 19505 }, { "epoch": 5.0701663201663205, "grad_norm": 0.2953794598579407, "learning_rate": 4.649651519634415e-05, "loss": 0.3021, "num_input_tokens_seen": 1757600, "step": 19510 }, { "epoch": 5.071465696465697, "grad_norm": 0.2398921251296997, "learning_rate": 4.649362014973352e-05, "loss": 0.3354, "num_input_tokens_seen": 1758016, "step": 19515 }, { "epoch": 5.072765072765073, "grad_norm": 0.18549659848213196, "learning_rate": 4.64907239976854e-05, "loss": 0.279, "num_input_tokens_seen": 1758528, "step": 19520 }, { "epoch": 5.074064449064449, "grad_norm": 0.32494163513183594, "learning_rate": 4.648782674034876e-05, "loss": 0.2083, "num_input_tokens_seen": 1758976, "step": 19525 }, { "epoch": 5.075363825363826, "grad_norm": 0.2007724940776825, "learning_rate": 4.648492837787261e-05, "loss": 0.2749, "num_input_tokens_seen": 1759424, "step": 19530 }, { "epoch": 5.076663201663202, "grad_norm": 0.20569352805614471, "learning_rate": 4.6482028910406e-05, "loss": 0.2611, "num_input_tokens_seen": 1759856, "step": 19535 }, { "epoch": 5.077962577962578, "grad_norm": 0.2941955626010895, "learning_rate": 4.647912833809805e-05, "loss": 0.2832, "num_input_tokens_seen": 1760336, "step": 19540 }, { "epoch": 5.079261954261955, "grad_norm": 0.309906005859375, "learning_rate": 4.647622666109796e-05, "loss": 0.2315, "num_input_tokens_seen": 1760832, "step": 19545 }, { "epoch": 5.080561330561331, "grad_norm": 0.26378485560417175, "learning_rate": 4.647332387955495e-05, "loss": 0.2191, "num_input_tokens_seen": 1761280, "step": 19550 }, { "epoch": 5.081860706860707, "grad_norm": 0.26855477690696716, "learning_rate": 4.647041999361833e-05, "loss": 0.2594, "num_input_tokens_seen": 1761744, "step": 19555 }, { "epoch": 5.083160083160083, "grad_norm": 0.5610151886940002, "learning_rate": 4.6467515003437425e-05, "loss": 0.3417, "num_input_tokens_seen": 1762208, "step": 19560 }, { "epoch": 5.08445945945946, "grad_norm": 0.2714335024356842, "learning_rate": 4.646460890916165e-05, "loss": 0.3458, "num_input_tokens_seen": 1762672, "step": 19565 }, { "epoch": 5.085758835758836, "grad_norm": 0.5032157897949219, "learning_rate": 4.646170171094049e-05, "loss": 0.2452, "num_input_tokens_seen": 1763104, "step": 19570 }, { "epoch": 5.087058212058212, "grad_norm": 0.18529877066612244, "learning_rate": 4.645879340892344e-05, "loss": 0.3018, "num_input_tokens_seen": 1763504, "step": 19575 }, { "epoch": 5.088357588357589, "grad_norm": 0.3622232973575592, "learning_rate": 4.645588400326008e-05, "loss": 0.2239, "num_input_tokens_seen": 1763936, "step": 19580 }, { "epoch": 5.089656964656965, "grad_norm": 0.21392908692359924, "learning_rate": 4.645297349410005e-05, "loss": 0.2576, "num_input_tokens_seen": 1764400, "step": 19585 }, { "epoch": 5.090956340956341, "grad_norm": 0.33170902729034424, "learning_rate": 4.645006188159304e-05, "loss": 0.2248, "num_input_tokens_seen": 1764816, "step": 19590 }, { "epoch": 5.092255717255718, "grad_norm": 0.26184728741645813, "learning_rate": 4.644714916588879e-05, "loss": 0.2243, "num_input_tokens_seen": 1765232, "step": 19595 }, { "epoch": 5.093555093555094, "grad_norm": 0.38116341829299927, "learning_rate": 4.6444235347137115e-05, "loss": 0.332, "num_input_tokens_seen": 1765680, "step": 19600 }, { "epoch": 5.09485446985447, "grad_norm": 0.3751718997955322, "learning_rate": 4.6441320425487865e-05, "loss": 0.2163, "num_input_tokens_seen": 1766160, "step": 19605 }, { "epoch": 5.096153846153846, "grad_norm": 0.2691768705844879, "learning_rate": 4.643840440109096e-05, "loss": 0.2502, "num_input_tokens_seen": 1766576, "step": 19610 }, { "epoch": 5.097453222453223, "grad_norm": 0.27140283584594727, "learning_rate": 4.643548727409638e-05, "loss": 0.1817, "num_input_tokens_seen": 1767024, "step": 19615 }, { "epoch": 5.098752598752599, "grad_norm": 0.3729926347732544, "learning_rate": 4.643256904465415e-05, "loss": 0.2614, "num_input_tokens_seen": 1767456, "step": 19620 }, { "epoch": 5.100051975051975, "grad_norm": 0.39016208052635193, "learning_rate": 4.642964971291436e-05, "loss": 0.1607, "num_input_tokens_seen": 1767888, "step": 19625 }, { "epoch": 5.101351351351352, "grad_norm": 0.2555716335773468, "learning_rate": 4.642672927902715e-05, "loss": 0.2533, "num_input_tokens_seen": 1768304, "step": 19630 }, { "epoch": 5.102650727650728, "grad_norm": 0.4391617178916931, "learning_rate": 4.642380774314272e-05, "loss": 0.3172, "num_input_tokens_seen": 1768752, "step": 19635 }, { "epoch": 5.103950103950104, "grad_norm": 0.38229280710220337, "learning_rate": 4.642088510541134e-05, "loss": 0.2467, "num_input_tokens_seen": 1769200, "step": 19640 }, { "epoch": 5.10524948024948, "grad_norm": 0.28606727719306946, "learning_rate": 4.64179613659833e-05, "loss": 0.1525, "num_input_tokens_seen": 1769712, "step": 19645 }, { "epoch": 5.106548856548857, "grad_norm": 0.6284445524215698, "learning_rate": 4.641503652500899e-05, "loss": 0.3581, "num_input_tokens_seen": 1770160, "step": 19650 }, { "epoch": 5.107848232848233, "grad_norm": 0.26289689540863037, "learning_rate": 4.641211058263884e-05, "loss": 0.2114, "num_input_tokens_seen": 1770592, "step": 19655 }, { "epoch": 5.109147609147609, "grad_norm": 0.28617388010025024, "learning_rate": 4.6409183539023324e-05, "loss": 0.283, "num_input_tokens_seen": 1771056, "step": 19660 }, { "epoch": 5.110446985446986, "grad_norm": 0.44546785950660706, "learning_rate": 4.640625539431298e-05, "loss": 0.2644, "num_input_tokens_seen": 1771520, "step": 19665 }, { "epoch": 5.111746361746362, "grad_norm": 0.3665326237678528, "learning_rate": 4.6403326148658424e-05, "loss": 0.2457, "num_input_tokens_seen": 1771952, "step": 19670 }, { "epoch": 5.113045738045738, "grad_norm": 0.399639368057251, "learning_rate": 4.6400395802210285e-05, "loss": 0.2707, "num_input_tokens_seen": 1772448, "step": 19675 }, { "epoch": 5.114345114345114, "grad_norm": 0.40563690662384033, "learning_rate": 4.639746435511928e-05, "loss": 0.2429, "num_input_tokens_seen": 1772880, "step": 19680 }, { "epoch": 5.115644490644491, "grad_norm": 0.3405209481716156, "learning_rate": 4.639453180753619e-05, "loss": 0.2273, "num_input_tokens_seen": 1773344, "step": 19685 }, { "epoch": 5.116943866943867, "grad_norm": 0.30660903453826904, "learning_rate": 4.6391598159611825e-05, "loss": 0.2376, "num_input_tokens_seen": 1773808, "step": 19690 }, { "epoch": 5.118243243243243, "grad_norm": 0.5295736789703369, "learning_rate": 4.638866341149708e-05, "loss": 0.2889, "num_input_tokens_seen": 1774272, "step": 19695 }, { "epoch": 5.11954261954262, "grad_norm": 0.7327246069908142, "learning_rate": 4.6385727563342876e-05, "loss": 0.3242, "num_input_tokens_seen": 1774720, "step": 19700 }, { "epoch": 5.120841995841996, "grad_norm": 0.8032816052436829, "learning_rate": 4.638279061530021e-05, "loss": 0.2724, "num_input_tokens_seen": 1775200, "step": 19705 }, { "epoch": 5.122141372141372, "grad_norm": 0.6064355373382568, "learning_rate": 4.637985256752013e-05, "loss": 0.2759, "num_input_tokens_seen": 1775648, "step": 19710 }, { "epoch": 5.123440748440748, "grad_norm": 0.36807918548583984, "learning_rate": 4.6376913420153755e-05, "loss": 0.2182, "num_input_tokens_seen": 1776080, "step": 19715 }, { "epoch": 5.124740124740125, "grad_norm": 0.36961132287979126, "learning_rate": 4.6373973173352245e-05, "loss": 0.2862, "num_input_tokens_seen": 1776560, "step": 19720 }, { "epoch": 5.126039501039501, "grad_norm": 0.3205464780330658, "learning_rate": 4.637103182726681e-05, "loss": 0.3235, "num_input_tokens_seen": 1777008, "step": 19725 }, { "epoch": 5.127338877338877, "grad_norm": 0.3806019723415375, "learning_rate": 4.6368089382048726e-05, "loss": 0.2388, "num_input_tokens_seen": 1777456, "step": 19730 }, { "epoch": 5.128638253638254, "grad_norm": 0.28707748651504517, "learning_rate": 4.636514583784933e-05, "loss": 0.256, "num_input_tokens_seen": 1777904, "step": 19735 }, { "epoch": 5.12993762993763, "grad_norm": 0.46811947226524353, "learning_rate": 4.636220119482003e-05, "loss": 0.2457, "num_input_tokens_seen": 1778368, "step": 19740 }, { "epoch": 5.131237006237006, "grad_norm": 0.38513097167015076, "learning_rate": 4.635925545311224e-05, "loss": 0.2416, "num_input_tokens_seen": 1778816, "step": 19745 }, { "epoch": 5.132536382536382, "grad_norm": 0.3887077271938324, "learning_rate": 4.6356308612877476e-05, "loss": 0.2319, "num_input_tokens_seen": 1779264, "step": 19750 }, { "epoch": 5.133835758835759, "grad_norm": 0.3631491959095001, "learning_rate": 4.6353360674267296e-05, "loss": 0.2832, "num_input_tokens_seen": 1779728, "step": 19755 }, { "epoch": 5.135135135135135, "grad_norm": 0.32570382952690125, "learning_rate": 4.635041163743333e-05, "loss": 0.2476, "num_input_tokens_seen": 1780176, "step": 19760 }, { "epoch": 5.136434511434511, "grad_norm": 0.2893189489841461, "learning_rate": 4.634746150252722e-05, "loss": 0.3229, "num_input_tokens_seen": 1780640, "step": 19765 }, { "epoch": 5.137733887733888, "grad_norm": 0.30121609568595886, "learning_rate": 4.634451026970072e-05, "loss": 0.2262, "num_input_tokens_seen": 1781104, "step": 19770 }, { "epoch": 5.139033264033264, "grad_norm": 0.33662259578704834, "learning_rate": 4.6341557939105604e-05, "loss": 0.1935, "num_input_tokens_seen": 1781552, "step": 19775 }, { "epoch": 5.14033264033264, "grad_norm": 0.29579630494117737, "learning_rate": 4.633860451089371e-05, "loss": 0.3191, "num_input_tokens_seen": 1782016, "step": 19780 }, { "epoch": 5.141632016632016, "grad_norm": 0.3179287612438202, "learning_rate": 4.633564998521696e-05, "loss": 0.2037, "num_input_tokens_seen": 1782464, "step": 19785 }, { "epoch": 5.142931392931393, "grad_norm": 0.36291927099227905, "learning_rate": 4.633269436222727e-05, "loss": 0.2158, "num_input_tokens_seen": 1782912, "step": 19790 }, { "epoch": 5.144230769230769, "grad_norm": 0.3410411775112152, "learning_rate": 4.632973764207666e-05, "loss": 0.213, "num_input_tokens_seen": 1783360, "step": 19795 }, { "epoch": 5.145530145530145, "grad_norm": 0.3335917592048645, "learning_rate": 4.632677982491723e-05, "loss": 0.2566, "num_input_tokens_seen": 1783808, "step": 19800 }, { "epoch": 5.146829521829522, "grad_norm": 0.6605656147003174, "learning_rate": 4.632382091090106e-05, "loss": 0.367, "num_input_tokens_seen": 1784304, "step": 19805 }, { "epoch": 5.148128898128898, "grad_norm": 0.3264617323875427, "learning_rate": 4.6320860900180364e-05, "loss": 0.2891, "num_input_tokens_seen": 1784800, "step": 19810 }, { "epoch": 5.149428274428274, "grad_norm": 0.34185367822647095, "learning_rate": 4.631789979290735e-05, "loss": 0.2646, "num_input_tokens_seen": 1785248, "step": 19815 }, { "epoch": 5.150727650727651, "grad_norm": 0.27570411562919617, "learning_rate": 4.631493758923433e-05, "loss": 0.2086, "num_input_tokens_seen": 1785712, "step": 19820 }, { "epoch": 5.152027027027027, "grad_norm": 0.5434854030609131, "learning_rate": 4.6311974289313646e-05, "loss": 0.2593, "num_input_tokens_seen": 1786144, "step": 19825 }, { "epoch": 5.153326403326403, "grad_norm": 0.38313964009284973, "learning_rate": 4.630900989329771e-05, "loss": 0.2809, "num_input_tokens_seen": 1786592, "step": 19830 }, { "epoch": 5.154625779625779, "grad_norm": 0.3781501352787018, "learning_rate": 4.630604440133896e-05, "loss": 0.2615, "num_input_tokens_seen": 1787024, "step": 19835 }, { "epoch": 5.155925155925156, "grad_norm": 0.40275445580482483, "learning_rate": 4.6303077813589956e-05, "loss": 0.2496, "num_input_tokens_seen": 1787456, "step": 19840 }, { "epoch": 5.157224532224532, "grad_norm": 0.4605545997619629, "learning_rate": 4.6300110130203236e-05, "loss": 0.2764, "num_input_tokens_seen": 1787872, "step": 19845 }, { "epoch": 5.158523908523908, "grad_norm": 0.5008760690689087, "learning_rate": 4.629714135133144e-05, "loss": 0.2673, "num_input_tokens_seen": 1788336, "step": 19850 }, { "epoch": 5.159823284823285, "grad_norm": 0.564010739326477, "learning_rate": 4.629417147712727e-05, "loss": 0.2405, "num_input_tokens_seen": 1788784, "step": 19855 }, { "epoch": 5.161122661122661, "grad_norm": 0.5306153893470764, "learning_rate": 4.6291200507743446e-05, "loss": 0.244, "num_input_tokens_seen": 1789264, "step": 19860 }, { "epoch": 5.162422037422037, "grad_norm": 0.3299996554851532, "learning_rate": 4.628822844333278e-05, "loss": 0.2885, "num_input_tokens_seen": 1789728, "step": 19865 }, { "epoch": 5.163721413721413, "grad_norm": 0.3706335425376892, "learning_rate": 4.6285255284048134e-05, "loss": 0.3026, "num_input_tokens_seen": 1790176, "step": 19870 }, { "epoch": 5.16502079002079, "grad_norm": 0.5538793802261353, "learning_rate": 4.628228103004242e-05, "loss": 0.2369, "num_input_tokens_seen": 1790656, "step": 19875 }, { "epoch": 5.166320166320166, "grad_norm": 0.300053209066391, "learning_rate": 4.627930568146859e-05, "loss": 0.253, "num_input_tokens_seen": 1791120, "step": 19880 }, { "epoch": 5.167619542619542, "grad_norm": 0.286065012216568, "learning_rate": 4.627632923847968e-05, "loss": 0.174, "num_input_tokens_seen": 1791568, "step": 19885 }, { "epoch": 5.168918918918919, "grad_norm": 0.6083042025566101, "learning_rate": 4.627335170122877e-05, "loss": 0.2213, "num_input_tokens_seen": 1792032, "step": 19890 }, { "epoch": 5.170218295218295, "grad_norm": 0.28077322244644165, "learning_rate": 4.627037306986901e-05, "loss": 0.3225, "num_input_tokens_seen": 1792496, "step": 19895 }, { "epoch": 5.171517671517671, "grad_norm": 0.721576452255249, "learning_rate": 4.626739334455357e-05, "loss": 0.3684, "num_input_tokens_seen": 1792960, "step": 19900 }, { "epoch": 5.172817047817047, "grad_norm": 0.2376977950334549, "learning_rate": 4.6264412525435716e-05, "loss": 0.301, "num_input_tokens_seen": 1793424, "step": 19905 }, { "epoch": 5.174116424116424, "grad_norm": 0.5330404043197632, "learning_rate": 4.626143061266876e-05, "loss": 0.2781, "num_input_tokens_seen": 1793920, "step": 19910 }, { "epoch": 5.1754158004158, "grad_norm": 0.39224326610565186, "learning_rate": 4.625844760640605e-05, "loss": 0.2702, "num_input_tokens_seen": 1794368, "step": 19915 }, { "epoch": 5.1767151767151764, "grad_norm": 0.34701278805732727, "learning_rate": 4.625546350680101e-05, "loss": 0.2648, "num_input_tokens_seen": 1794832, "step": 19920 }, { "epoch": 5.178014553014553, "grad_norm": 0.5670446157455444, "learning_rate": 4.625247831400712e-05, "loss": 0.298, "num_input_tokens_seen": 1795248, "step": 19925 }, { "epoch": 5.179313929313929, "grad_norm": 0.271679550409317, "learning_rate": 4.6249492028177896e-05, "loss": 0.2865, "num_input_tokens_seen": 1795712, "step": 19930 }, { "epoch": 5.1806133056133055, "grad_norm": 0.5686834454536438, "learning_rate": 4.624650464946695e-05, "loss": 0.2383, "num_input_tokens_seen": 1796192, "step": 19935 }, { "epoch": 5.1819126819126815, "grad_norm": 0.8670960664749146, "learning_rate": 4.62435161780279e-05, "loss": 0.2042, "num_input_tokens_seen": 1796608, "step": 19940 }, { "epoch": 5.183212058212058, "grad_norm": 0.39616596698760986, "learning_rate": 4.6240526614014465e-05, "loss": 0.2981, "num_input_tokens_seen": 1797072, "step": 19945 }, { "epoch": 5.1845114345114345, "grad_norm": 0.5199609398841858, "learning_rate": 4.6237535957580405e-05, "loss": 0.3281, "num_input_tokens_seen": 1797552, "step": 19950 }, { "epoch": 5.1858108108108105, "grad_norm": 0.4675224721431732, "learning_rate": 4.6234544208879503e-05, "loss": 0.2237, "num_input_tokens_seen": 1798016, "step": 19955 }, { "epoch": 5.1871101871101875, "grad_norm": 0.3207915127277374, "learning_rate": 4.623155136806566e-05, "loss": 0.2046, "num_input_tokens_seen": 1798448, "step": 19960 }, { "epoch": 5.1884095634095635, "grad_norm": 0.7102227210998535, "learning_rate": 4.622855743529278e-05, "loss": 0.3322, "num_input_tokens_seen": 1798880, "step": 19965 }, { "epoch": 5.1897089397089395, "grad_norm": 0.3602803349494934, "learning_rate": 4.6225562410714854e-05, "loss": 0.1797, "num_input_tokens_seen": 1799344, "step": 19970 }, { "epoch": 5.191008316008316, "grad_norm": 0.3368944525718689, "learning_rate": 4.622256629448591e-05, "loss": 0.2129, "num_input_tokens_seen": 1799760, "step": 19975 }, { "epoch": 5.1923076923076925, "grad_norm": 0.2937062382698059, "learning_rate": 4.621956908676005e-05, "loss": 0.2318, "num_input_tokens_seen": 1800192, "step": 19980 }, { "epoch": 5.1936070686070686, "grad_norm": 0.7046310305595398, "learning_rate": 4.6216570787691423e-05, "loss": 0.2679, "num_input_tokens_seen": 1800656, "step": 19985 }, { "epoch": 5.194906444906445, "grad_norm": 0.41615989804267883, "learning_rate": 4.621357139743423e-05, "loss": 0.2527, "num_input_tokens_seen": 1801104, "step": 19990 }, { "epoch": 5.1962058212058215, "grad_norm": 0.3372029960155487, "learning_rate": 4.621057091614274e-05, "loss": 0.2117, "num_input_tokens_seen": 1801504, "step": 19995 }, { "epoch": 5.197505197505198, "grad_norm": 0.44864848256111145, "learning_rate": 4.620756934397126e-05, "loss": 0.2495, "num_input_tokens_seen": 1801920, "step": 20000 }, { "epoch": 5.198804573804574, "grad_norm": 0.5805349946022034, "learning_rate": 4.620456668107417e-05, "loss": 0.1695, "num_input_tokens_seen": 1802352, "step": 20005 }, { "epoch": 5.20010395010395, "grad_norm": 0.4702690541744232, "learning_rate": 4.620156292760589e-05, "loss": 0.2846, "num_input_tokens_seen": 1802800, "step": 20010 }, { "epoch": 5.201403326403327, "grad_norm": 0.2765844464302063, "learning_rate": 4.619855808372092e-05, "loss": 0.2536, "num_input_tokens_seen": 1803264, "step": 20015 }, { "epoch": 5.202702702702703, "grad_norm": 0.2894502580165863, "learning_rate": 4.61955521495738e-05, "loss": 0.2141, "num_input_tokens_seen": 1803728, "step": 20020 }, { "epoch": 5.204002079002079, "grad_norm": 0.4010847210884094, "learning_rate": 4.619254512531912e-05, "loss": 0.2714, "num_input_tokens_seen": 1804176, "step": 20025 }, { "epoch": 5.205301455301456, "grad_norm": 0.3183976113796234, "learning_rate": 4.618953701111154e-05, "loss": 0.3275, "num_input_tokens_seen": 1804624, "step": 20030 }, { "epoch": 5.206600831600832, "grad_norm": 0.6495065093040466, "learning_rate": 4.6186527807105775e-05, "loss": 0.234, "num_input_tokens_seen": 1805088, "step": 20035 }, { "epoch": 5.207900207900208, "grad_norm": 0.35120171308517456, "learning_rate": 4.6183517513456576e-05, "loss": 0.2485, "num_input_tokens_seen": 1805504, "step": 20040 }, { "epoch": 5.209199584199585, "grad_norm": 0.6068681478500366, "learning_rate": 4.6180506130318776e-05, "loss": 0.3391, "num_input_tokens_seen": 1805968, "step": 20045 }, { "epoch": 5.210498960498961, "grad_norm": 0.6906151175498962, "learning_rate": 4.617749365784725e-05, "loss": 0.2875, "num_input_tokens_seen": 1806416, "step": 20050 }, { "epoch": 5.211798336798337, "grad_norm": 0.8848720192909241, "learning_rate": 4.617448009619694e-05, "loss": 0.3363, "num_input_tokens_seen": 1806864, "step": 20055 }, { "epoch": 5.213097713097713, "grad_norm": 0.42941486835479736, "learning_rate": 4.6171465445522824e-05, "loss": 0.2658, "num_input_tokens_seen": 1807296, "step": 20060 }, { "epoch": 5.21439708939709, "grad_norm": 0.2826007306575775, "learning_rate": 4.6168449705979956e-05, "loss": 0.2628, "num_input_tokens_seen": 1807744, "step": 20065 }, { "epoch": 5.215696465696466, "grad_norm": 0.354004442691803, "learning_rate": 4.6165432877723435e-05, "loss": 0.2299, "num_input_tokens_seen": 1808144, "step": 20070 }, { "epoch": 5.216995841995842, "grad_norm": 0.35640770196914673, "learning_rate": 4.616241496090843e-05, "loss": 0.2171, "num_input_tokens_seen": 1808576, "step": 20075 }, { "epoch": 5.218295218295219, "grad_norm": 0.2937390208244324, "learning_rate": 4.615939595569014e-05, "loss": 0.1602, "num_input_tokens_seen": 1809040, "step": 20080 }, { "epoch": 5.219594594594595, "grad_norm": 0.25390493869781494, "learning_rate": 4.615637586222384e-05, "loss": 0.2669, "num_input_tokens_seen": 1809488, "step": 20085 }, { "epoch": 5.220893970893971, "grad_norm": 0.2896406948566437, "learning_rate": 4.615335468066486e-05, "loss": 0.3704, "num_input_tokens_seen": 1809904, "step": 20090 }, { "epoch": 5.222193347193347, "grad_norm": 0.3956730365753174, "learning_rate": 4.615033241116858e-05, "loss": 0.3405, "num_input_tokens_seen": 1810384, "step": 20095 }, { "epoch": 5.223492723492724, "grad_norm": 0.404313862323761, "learning_rate": 4.614730905389044e-05, "loss": 0.2597, "num_input_tokens_seen": 1810832, "step": 20100 }, { "epoch": 5.2247920997921, "grad_norm": 0.45973458886146545, "learning_rate": 4.614428460898594e-05, "loss": 0.2739, "num_input_tokens_seen": 1811280, "step": 20105 }, { "epoch": 5.226091476091476, "grad_norm": 0.27123698592185974, "learning_rate": 4.614125907661061e-05, "loss": 0.2748, "num_input_tokens_seen": 1811728, "step": 20110 }, { "epoch": 5.227390852390853, "grad_norm": 0.3050244152545929, "learning_rate": 4.6138232456920084e-05, "loss": 0.2504, "num_input_tokens_seen": 1812160, "step": 20115 }, { "epoch": 5.228690228690229, "grad_norm": 0.8138343095779419, "learning_rate": 4.613520475006999e-05, "loss": 0.2534, "num_input_tokens_seen": 1812624, "step": 20120 }, { "epoch": 5.229989604989605, "grad_norm": 0.5041064023971558, "learning_rate": 4.613217595621608e-05, "loss": 0.1739, "num_input_tokens_seen": 1813056, "step": 20125 }, { "epoch": 5.231288981288981, "grad_norm": 0.2945524752140045, "learning_rate": 4.612914607551411e-05, "loss": 0.2566, "num_input_tokens_seen": 1813456, "step": 20130 }, { "epoch": 5.232588357588358, "grad_norm": 0.25495582818984985, "learning_rate": 4.612611510811991e-05, "loss": 0.2561, "num_input_tokens_seen": 1813920, "step": 20135 }, { "epoch": 5.233887733887734, "grad_norm": 0.30435267090797424, "learning_rate": 4.612308305418937e-05, "loss": 0.3021, "num_input_tokens_seen": 1814368, "step": 20140 }, { "epoch": 5.23518711018711, "grad_norm": 0.2684551179409027, "learning_rate": 4.612004991387843e-05, "loss": 0.1783, "num_input_tokens_seen": 1814832, "step": 20145 }, { "epoch": 5.236486486486487, "grad_norm": 0.29673483967781067, "learning_rate": 4.6117015687343086e-05, "loss": 0.3277, "num_input_tokens_seen": 1815248, "step": 20150 }, { "epoch": 5.237785862785863, "grad_norm": 0.31515878438949585, "learning_rate": 4.611398037473939e-05, "loss": 0.2988, "num_input_tokens_seen": 1815648, "step": 20155 }, { "epoch": 5.239085239085239, "grad_norm": 0.3584878444671631, "learning_rate": 4.611094397622345e-05, "loss": 0.2847, "num_input_tokens_seen": 1816112, "step": 20160 }, { "epoch": 5.240384615384615, "grad_norm": 0.3771243691444397, "learning_rate": 4.610790649195144e-05, "loss": 0.2125, "num_input_tokens_seen": 1816560, "step": 20165 }, { "epoch": 5.241683991683992, "grad_norm": 0.21759094297885895, "learning_rate": 4.6104867922079574e-05, "loss": 0.3219, "num_input_tokens_seen": 1817024, "step": 20170 }, { "epoch": 5.242983367983368, "grad_norm": 0.2367144227027893, "learning_rate": 4.6101828266764126e-05, "loss": 0.259, "num_input_tokens_seen": 1817456, "step": 20175 }, { "epoch": 5.244282744282744, "grad_norm": 0.5571315288543701, "learning_rate": 4.6098787526161435e-05, "loss": 0.2406, "num_input_tokens_seen": 1817920, "step": 20180 }, { "epoch": 5.245582120582121, "grad_norm": 0.317362517118454, "learning_rate": 4.609574570042788e-05, "loss": 0.216, "num_input_tokens_seen": 1818384, "step": 20185 }, { "epoch": 5.246881496881497, "grad_norm": 0.25792473554611206, "learning_rate": 4.609270278971991e-05, "loss": 0.2952, "num_input_tokens_seen": 1818800, "step": 20190 }, { "epoch": 5.248180873180873, "grad_norm": 0.30562683939933777, "learning_rate": 4.6089658794194035e-05, "loss": 0.3068, "num_input_tokens_seen": 1819232, "step": 20195 }, { "epoch": 5.24948024948025, "grad_norm": 0.23810474574565887, "learning_rate": 4.608661371400679e-05, "loss": 0.2183, "num_input_tokens_seen": 1819664, "step": 20200 }, { "epoch": 5.250779625779626, "grad_norm": 0.2361420840024948, "learning_rate": 4.608356754931481e-05, "loss": 0.2815, "num_input_tokens_seen": 1820128, "step": 20205 }, { "epoch": 5.252079002079002, "grad_norm": 0.23797380924224854, "learning_rate": 4.608052030027474e-05, "loss": 0.1953, "num_input_tokens_seen": 1820608, "step": 20210 }, { "epoch": 5.253378378378378, "grad_norm": 0.29080116748809814, "learning_rate": 4.607747196704332e-05, "loss": 0.1758, "num_input_tokens_seen": 1821088, "step": 20215 }, { "epoch": 5.254677754677755, "grad_norm": 0.21507926285266876, "learning_rate": 4.6074422549777306e-05, "loss": 0.2383, "num_input_tokens_seen": 1821552, "step": 20220 }, { "epoch": 5.255977130977131, "grad_norm": 0.39553102850914, "learning_rate": 4.6071372048633566e-05, "loss": 0.3207, "num_input_tokens_seen": 1821984, "step": 20225 }, { "epoch": 5.257276507276507, "grad_norm": 0.23629839718341827, "learning_rate": 4.606832046376896e-05, "loss": 0.1653, "num_input_tokens_seen": 1822384, "step": 20230 }, { "epoch": 5.258575883575883, "grad_norm": 0.5420928597450256, "learning_rate": 4.606526779534045e-05, "loss": 0.2685, "num_input_tokens_seen": 1822832, "step": 20235 }, { "epoch": 5.25987525987526, "grad_norm": 0.20418524742126465, "learning_rate": 4.6062214043505034e-05, "loss": 0.1026, "num_input_tokens_seen": 1823264, "step": 20240 }, { "epoch": 5.261174636174636, "grad_norm": 0.21990343928337097, "learning_rate": 4.6059159208419764e-05, "loss": 0.1974, "num_input_tokens_seen": 1823696, "step": 20245 }, { "epoch": 5.262474012474012, "grad_norm": 0.869846761226654, "learning_rate": 4.6056103290241764e-05, "loss": 0.3268, "num_input_tokens_seen": 1824144, "step": 20250 }, { "epoch": 5.263773388773389, "grad_norm": 0.3166965842247009, "learning_rate": 4.60530462891282e-05, "loss": 0.3153, "num_input_tokens_seen": 1824576, "step": 20255 }, { "epoch": 5.265072765072765, "grad_norm": 0.2820129990577698, "learning_rate": 4.604998820523628e-05, "loss": 0.2638, "num_input_tokens_seen": 1825008, "step": 20260 }, { "epoch": 5.266372141372141, "grad_norm": 0.22620916366577148, "learning_rate": 4.6046929038723304e-05, "loss": 0.2808, "num_input_tokens_seen": 1825472, "step": 20265 }, { "epoch": 5.267671517671518, "grad_norm": 0.30461350083351135, "learning_rate": 4.604386878974661e-05, "loss": 0.1509, "num_input_tokens_seen": 1825920, "step": 20270 }, { "epoch": 5.268970893970894, "grad_norm": 0.245607390999794, "learning_rate": 4.6040807458463576e-05, "loss": 0.2525, "num_input_tokens_seen": 1826352, "step": 20275 }, { "epoch": 5.27027027027027, "grad_norm": 0.3262728452682495, "learning_rate": 4.603774504503165e-05, "loss": 0.2514, "num_input_tokens_seen": 1826832, "step": 20280 }, { "epoch": 5.271569646569646, "grad_norm": 0.3068196177482605, "learning_rate": 4.6034681549608335e-05, "loss": 0.3098, "num_input_tokens_seen": 1827248, "step": 20285 }, { "epoch": 5.272869022869023, "grad_norm": 0.22555619478225708, "learning_rate": 4.60316169723512e-05, "loss": 0.2832, "num_input_tokens_seen": 1827680, "step": 20290 }, { "epoch": 5.274168399168399, "grad_norm": 0.3039511740207672, "learning_rate": 4.602855131341786e-05, "loss": 0.1825, "num_input_tokens_seen": 1828144, "step": 20295 }, { "epoch": 5.275467775467775, "grad_norm": 0.26665690541267395, "learning_rate": 4.602548457296597e-05, "loss": 0.2304, "num_input_tokens_seen": 1828624, "step": 20300 }, { "epoch": 5.276767151767152, "grad_norm": 0.35204437375068665, "learning_rate": 4.6022416751153255e-05, "loss": 0.3441, "num_input_tokens_seen": 1829104, "step": 20305 }, { "epoch": 5.278066528066528, "grad_norm": 0.2769361734390259, "learning_rate": 4.6019347848137516e-05, "loss": 0.2934, "num_input_tokens_seen": 1829536, "step": 20310 }, { "epoch": 5.279365904365904, "grad_norm": 0.3716367185115814, "learning_rate": 4.601627786407657e-05, "loss": 0.2435, "num_input_tokens_seen": 1829968, "step": 20315 }, { "epoch": 5.28066528066528, "grad_norm": 0.624792754650116, "learning_rate": 4.601320679912832e-05, "loss": 0.3092, "num_input_tokens_seen": 1830416, "step": 20320 }, { "epoch": 5.281964656964657, "grad_norm": 0.5566162467002869, "learning_rate": 4.601013465345071e-05, "loss": 0.2624, "num_input_tokens_seen": 1830848, "step": 20325 }, { "epoch": 5.283264033264033, "grad_norm": 0.5301048755645752, "learning_rate": 4.600706142720174e-05, "loss": 0.2656, "num_input_tokens_seen": 1831328, "step": 20330 }, { "epoch": 5.284563409563409, "grad_norm": 0.37371960282325745, "learning_rate": 4.6003987120539476e-05, "loss": 0.286, "num_input_tokens_seen": 1831808, "step": 20335 }, { "epoch": 5.285862785862786, "grad_norm": 0.2503002882003784, "learning_rate": 4.600091173362203e-05, "loss": 0.2556, "num_input_tokens_seen": 1832256, "step": 20340 }, { "epoch": 5.287162162162162, "grad_norm": 0.2223026603460312, "learning_rate": 4.599783526660757e-05, "loss": 0.2613, "num_input_tokens_seen": 1832688, "step": 20345 }, { "epoch": 5.288461538461538, "grad_norm": 0.33341118693351746, "learning_rate": 4.599475771965432e-05, "loss": 0.298, "num_input_tokens_seen": 1833152, "step": 20350 }, { "epoch": 5.289760914760914, "grad_norm": 0.23943330347537994, "learning_rate": 4.599167909292057e-05, "loss": 0.256, "num_input_tokens_seen": 1833584, "step": 20355 }, { "epoch": 5.291060291060291, "grad_norm": 0.5780152082443237, "learning_rate": 4.5988599386564654e-05, "loss": 0.3093, "num_input_tokens_seen": 1834032, "step": 20360 }, { "epoch": 5.292359667359667, "grad_norm": 0.16743823885917664, "learning_rate": 4.5985518600744956e-05, "loss": 0.2885, "num_input_tokens_seen": 1834496, "step": 20365 }, { "epoch": 5.293659043659043, "grad_norm": 0.7017781734466553, "learning_rate": 4.5982436735619926e-05, "loss": 0.3014, "num_input_tokens_seen": 1834928, "step": 20370 }, { "epoch": 5.29495841995842, "grad_norm": 0.48854565620422363, "learning_rate": 4.597935379134808e-05, "loss": 0.2696, "num_input_tokens_seen": 1835408, "step": 20375 }, { "epoch": 5.296257796257796, "grad_norm": 0.1987021565437317, "learning_rate": 4.5976269768087956e-05, "loss": 0.2564, "num_input_tokens_seen": 1835888, "step": 20380 }, { "epoch": 5.297557172557172, "grad_norm": 0.29014697670936584, "learning_rate": 4.5973184665998186e-05, "loss": 0.3111, "num_input_tokens_seen": 1836352, "step": 20385 }, { "epoch": 5.298856548856548, "grad_norm": 0.31476011872291565, "learning_rate": 4.597009848523744e-05, "loss": 0.2803, "num_input_tokens_seen": 1836848, "step": 20390 }, { "epoch": 5.300155925155925, "grad_norm": 0.7043251991271973, "learning_rate": 4.5967011225964415e-05, "loss": 0.2851, "num_input_tokens_seen": 1837312, "step": 20395 }, { "epoch": 5.301455301455301, "grad_norm": 0.3013265132904053, "learning_rate": 4.596392288833793e-05, "loss": 0.2071, "num_input_tokens_seen": 1837760, "step": 20400 }, { "epoch": 5.3027546777546775, "grad_norm": 0.24491947889328003, "learning_rate": 4.596083347251679e-05, "loss": 0.3043, "num_input_tokens_seen": 1838256, "step": 20405 }, { "epoch": 5.304054054054054, "grad_norm": 0.22984620928764343, "learning_rate": 4.595774297865992e-05, "loss": 0.2878, "num_input_tokens_seen": 1838688, "step": 20410 }, { "epoch": 5.30535343035343, "grad_norm": 0.5319976210594177, "learning_rate": 4.595465140692624e-05, "loss": 0.2887, "num_input_tokens_seen": 1839152, "step": 20415 }, { "epoch": 5.3066528066528065, "grad_norm": 0.4109744727611542, "learning_rate": 4.595155875747476e-05, "loss": 0.2197, "num_input_tokens_seen": 1839664, "step": 20420 }, { "epoch": 5.307952182952183, "grad_norm": 0.44625455141067505, "learning_rate": 4.5948465030464536e-05, "loss": 0.3274, "num_input_tokens_seen": 1840112, "step": 20425 }, { "epoch": 5.3092515592515594, "grad_norm": 0.5340493321418762, "learning_rate": 4.594537022605468e-05, "loss": 0.2732, "num_input_tokens_seen": 1840560, "step": 20430 }, { "epoch": 5.3105509355509355, "grad_norm": 0.3108513057231903, "learning_rate": 4.5942274344404367e-05, "loss": 0.2566, "num_input_tokens_seen": 1840992, "step": 20435 }, { "epoch": 5.3118503118503115, "grad_norm": 0.276942640542984, "learning_rate": 4.593917738567283e-05, "loss": 0.2574, "num_input_tokens_seen": 1841456, "step": 20440 }, { "epoch": 5.3131496881496885, "grad_norm": 0.3179682195186615, "learning_rate": 4.593607935001932e-05, "loss": 0.1997, "num_input_tokens_seen": 1841872, "step": 20445 }, { "epoch": 5.3144490644490645, "grad_norm": 0.3329450488090515, "learning_rate": 4.59329802376032e-05, "loss": 0.2915, "num_input_tokens_seen": 1842320, "step": 20450 }, { "epoch": 5.3157484407484406, "grad_norm": 0.3598553538322449, "learning_rate": 4.5929880048583846e-05, "loss": 0.2133, "num_input_tokens_seen": 1842736, "step": 20455 }, { "epoch": 5.317047817047817, "grad_norm": 0.3908247649669647, "learning_rate": 4.592677878312071e-05, "loss": 0.2134, "num_input_tokens_seen": 1843168, "step": 20460 }, { "epoch": 5.3183471933471935, "grad_norm": 0.5568287372589111, "learning_rate": 4.5923676441373287e-05, "loss": 0.0977, "num_input_tokens_seen": 1843568, "step": 20465 }, { "epoch": 5.31964656964657, "grad_norm": 0.41632014513015747, "learning_rate": 4.5920573023501134e-05, "loss": 0.2452, "num_input_tokens_seen": 1844016, "step": 20470 }, { "epoch": 5.320945945945946, "grad_norm": 2.4816038608551025, "learning_rate": 4.5917468529663866e-05, "loss": 0.2753, "num_input_tokens_seen": 1844448, "step": 20475 }, { "epoch": 5.3222453222453225, "grad_norm": 1.5896048545837402, "learning_rate": 4.591436296002115e-05, "loss": 0.395, "num_input_tokens_seen": 1844944, "step": 20480 }, { "epoch": 5.323544698544699, "grad_norm": 1.3869540691375732, "learning_rate": 4.591125631473271e-05, "loss": 0.3063, "num_input_tokens_seen": 1845360, "step": 20485 }, { "epoch": 5.324844074844075, "grad_norm": 0.3058362305164337, "learning_rate": 4.590814859395832e-05, "loss": 0.2813, "num_input_tokens_seen": 1845792, "step": 20490 }, { "epoch": 5.326143451143452, "grad_norm": 0.6083893179893494, "learning_rate": 4.590503979785782e-05, "loss": 0.3458, "num_input_tokens_seen": 1846256, "step": 20495 }, { "epoch": 5.327442827442828, "grad_norm": 0.5708713531494141, "learning_rate": 4.590192992659109e-05, "loss": 0.2485, "num_input_tokens_seen": 1846704, "step": 20500 }, { "epoch": 5.328742203742204, "grad_norm": 0.6080154776573181, "learning_rate": 4.589881898031807e-05, "loss": 0.2771, "num_input_tokens_seen": 1847136, "step": 20505 }, { "epoch": 5.33004158004158, "grad_norm": 0.4083256125450134, "learning_rate": 4.589570695919877e-05, "loss": 0.241, "num_input_tokens_seen": 1847600, "step": 20510 }, { "epoch": 5.331340956340957, "grad_norm": 0.49265220761299133, "learning_rate": 4.589259386339324e-05, "loss": 0.2401, "num_input_tokens_seen": 1848048, "step": 20515 }, { "epoch": 5.332640332640333, "grad_norm": 0.34344273805618286, "learning_rate": 4.588947969306159e-05, "loss": 0.1323, "num_input_tokens_seen": 1848464, "step": 20520 }, { "epoch": 5.333939708939709, "grad_norm": 0.6448827981948853, "learning_rate": 4.5886364448363985e-05, "loss": 0.2866, "num_input_tokens_seen": 1848912, "step": 20525 }, { "epoch": 5.335239085239086, "grad_norm": 0.5191292762756348, "learning_rate": 4.5883248129460644e-05, "loss": 0.3591, "num_input_tokens_seen": 1849376, "step": 20530 }, { "epoch": 5.336538461538462, "grad_norm": 0.35046783089637756, "learning_rate": 4.588013073651184e-05, "loss": 0.2025, "num_input_tokens_seen": 1849840, "step": 20535 }, { "epoch": 5.337837837837838, "grad_norm": 0.3432212769985199, "learning_rate": 4.587701226967791e-05, "loss": 0.3027, "num_input_tokens_seen": 1850304, "step": 20540 }, { "epoch": 5.339137214137214, "grad_norm": 0.34614259004592896, "learning_rate": 4.5873892729119225e-05, "loss": 0.2334, "num_input_tokens_seen": 1850752, "step": 20545 }, { "epoch": 5.340436590436591, "grad_norm": 0.28351956605911255, "learning_rate": 4.5870772114996254e-05, "loss": 0.2819, "num_input_tokens_seen": 1851232, "step": 20550 }, { "epoch": 5.341735966735967, "grad_norm": 0.40489229559898376, "learning_rate": 4.586765042746946e-05, "loss": 0.2809, "num_input_tokens_seen": 1851696, "step": 20555 }, { "epoch": 5.343035343035343, "grad_norm": 0.41276368498802185, "learning_rate": 4.586452766669942e-05, "loss": 0.2776, "num_input_tokens_seen": 1852176, "step": 20560 }, { "epoch": 5.34433471933472, "grad_norm": 0.3745267689228058, "learning_rate": 4.586140383284673e-05, "loss": 0.2944, "num_input_tokens_seen": 1852576, "step": 20565 }, { "epoch": 5.345634095634096, "grad_norm": 0.32100173830986023, "learning_rate": 4.585827892607204e-05, "loss": 0.2231, "num_input_tokens_seen": 1852976, "step": 20570 }, { "epoch": 5.346933471933472, "grad_norm": 0.2049732804298401, "learning_rate": 4.585515294653609e-05, "loss": 0.3127, "num_input_tokens_seen": 1853408, "step": 20575 }, { "epoch": 5.348232848232848, "grad_norm": 0.6704343557357788, "learning_rate": 4.585202589439964e-05, "loss": 0.185, "num_input_tokens_seen": 1853856, "step": 20580 }, { "epoch": 5.349532224532225, "grad_norm": 0.2584402859210968, "learning_rate": 4.5848897769823515e-05, "loss": 0.26, "num_input_tokens_seen": 1854320, "step": 20585 }, { "epoch": 5.350831600831601, "grad_norm": 0.28011855483055115, "learning_rate": 4.5845768572968607e-05, "loss": 0.1689, "num_input_tokens_seen": 1854736, "step": 20590 }, { "epoch": 5.352130977130977, "grad_norm": 0.5838616490364075, "learning_rate": 4.584263830399585e-05, "loss": 0.2903, "num_input_tokens_seen": 1855232, "step": 20595 }, { "epoch": 5.353430353430354, "grad_norm": 0.2321445196866989, "learning_rate": 4.5839506963066226e-05, "loss": 0.1629, "num_input_tokens_seen": 1855632, "step": 20600 }, { "epoch": 5.35472972972973, "grad_norm": 0.6757594347000122, "learning_rate": 4.58363745503408e-05, "loss": 0.4521, "num_input_tokens_seen": 1856080, "step": 20605 }, { "epoch": 5.356029106029106, "grad_norm": 0.33319321274757385, "learning_rate": 4.583324106598066e-05, "loss": 0.2318, "num_input_tokens_seen": 1856496, "step": 20610 }, { "epoch": 5.357328482328482, "grad_norm": 0.45297157764434814, "learning_rate": 4.5830106510146975e-05, "loss": 0.2483, "num_input_tokens_seen": 1856928, "step": 20615 }, { "epoch": 5.358627858627859, "grad_norm": 0.39229297637939453, "learning_rate": 4.582697088300095e-05, "loss": 0.2036, "num_input_tokens_seen": 1857360, "step": 20620 }, { "epoch": 5.359927234927235, "grad_norm": 0.2963487207889557, "learning_rate": 4.582383418470386e-05, "loss": 0.1689, "num_input_tokens_seen": 1857808, "step": 20625 }, { "epoch": 5.361226611226611, "grad_norm": 0.2549406588077545, "learning_rate": 4.582069641541702e-05, "loss": 0.2135, "num_input_tokens_seen": 1858224, "step": 20630 }, { "epoch": 5.362525987525988, "grad_norm": 0.23657569289207458, "learning_rate": 4.581755757530182e-05, "loss": 0.2137, "num_input_tokens_seen": 1858704, "step": 20635 }, { "epoch": 5.363825363825364, "grad_norm": 0.588407576084137, "learning_rate": 4.581441766451968e-05, "loss": 0.3787, "num_input_tokens_seen": 1859120, "step": 20640 }, { "epoch": 5.36512474012474, "grad_norm": 0.25038382411003113, "learning_rate": 4.5811276683232104e-05, "loss": 0.2054, "num_input_tokens_seen": 1859536, "step": 20645 }, { "epoch": 5.366424116424117, "grad_norm": 0.279628723859787, "learning_rate": 4.580813463160063e-05, "loss": 0.2295, "num_input_tokens_seen": 1860016, "step": 20650 }, { "epoch": 5.367723492723493, "grad_norm": 0.2874991297721863, "learning_rate": 4.580499150978685e-05, "loss": 0.2466, "num_input_tokens_seen": 1860512, "step": 20655 }, { "epoch": 5.369022869022869, "grad_norm": 0.24146494269371033, "learning_rate": 4.580184731795242e-05, "loss": 0.2167, "num_input_tokens_seen": 1860944, "step": 20660 }, { "epoch": 5.370322245322245, "grad_norm": 0.277474969625473, "learning_rate": 4.579870205625905e-05, "loss": 0.2905, "num_input_tokens_seen": 1861392, "step": 20665 }, { "epoch": 5.371621621621622, "grad_norm": 0.35630548000335693, "learning_rate": 4.579555572486851e-05, "loss": 0.2858, "num_input_tokens_seen": 1861840, "step": 20670 }, { "epoch": 5.372920997920998, "grad_norm": 0.352098673582077, "learning_rate": 4.5792408323942615e-05, "loss": 0.2526, "num_input_tokens_seen": 1862272, "step": 20675 }, { "epoch": 5.374220374220374, "grad_norm": 0.42650532722473145, "learning_rate": 4.5789259853643226e-05, "loss": 0.1911, "num_input_tokens_seen": 1862736, "step": 20680 }, { "epoch": 5.37551975051975, "grad_norm": 0.2661376893520355, "learning_rate": 4.57861103141323e-05, "loss": 0.1812, "num_input_tokens_seen": 1863248, "step": 20685 }, { "epoch": 5.376819126819127, "grad_norm": 0.48844993114471436, "learning_rate": 4.578295970557179e-05, "loss": 0.2167, "num_input_tokens_seen": 1863680, "step": 20690 }, { "epoch": 5.378118503118503, "grad_norm": 0.23156337440013885, "learning_rate": 4.577980802812376e-05, "loss": 0.1571, "num_input_tokens_seen": 1864112, "step": 20695 }, { "epoch": 5.379417879417879, "grad_norm": 0.21653281152248383, "learning_rate": 4.577665528195029e-05, "loss": 0.2283, "num_input_tokens_seen": 1864592, "step": 20700 }, { "epoch": 5.380717255717256, "grad_norm": 0.2065538763999939, "learning_rate": 4.577350146721353e-05, "loss": 0.219, "num_input_tokens_seen": 1865024, "step": 20705 }, { "epoch": 5.382016632016632, "grad_norm": 0.8209167718887329, "learning_rate": 4.577034658407568e-05, "loss": 0.2859, "num_input_tokens_seen": 1865456, "step": 20710 }, { "epoch": 5.383316008316008, "grad_norm": 0.2044382244348526, "learning_rate": 4.576719063269901e-05, "loss": 0.2345, "num_input_tokens_seen": 1865888, "step": 20715 }, { "epoch": 5.384615384615385, "grad_norm": 0.30974435806274414, "learning_rate": 4.5764033613245824e-05, "loss": 0.1502, "num_input_tokens_seen": 1866352, "step": 20720 }, { "epoch": 5.385914760914761, "grad_norm": 0.24187394976615906, "learning_rate": 4.57608755258785e-05, "loss": 0.2563, "num_input_tokens_seen": 1866784, "step": 20725 }, { "epoch": 5.387214137214137, "grad_norm": 0.27014750242233276, "learning_rate": 4.5757716370759455e-05, "loss": 0.2547, "num_input_tokens_seen": 1867200, "step": 20730 }, { "epoch": 5.388513513513513, "grad_norm": 0.3199710547924042, "learning_rate": 4.5754556148051165e-05, "loss": 0.252, "num_input_tokens_seen": 1867648, "step": 20735 }, { "epoch": 5.38981288981289, "grad_norm": 0.7691150307655334, "learning_rate": 4.575139485791616e-05, "loss": 0.2699, "num_input_tokens_seen": 1868064, "step": 20740 }, { "epoch": 5.391112266112266, "grad_norm": 0.8350960612297058, "learning_rate": 4.574823250051704e-05, "loss": 0.3227, "num_input_tokens_seen": 1868560, "step": 20745 }, { "epoch": 5.392411642411642, "grad_norm": 0.30058780312538147, "learning_rate": 4.574506907601644e-05, "loss": 0.204, "num_input_tokens_seen": 1868992, "step": 20750 }, { "epoch": 5.393711018711019, "grad_norm": 0.35605475306510925, "learning_rate": 4.5741904584577065e-05, "loss": 0.272, "num_input_tokens_seen": 1869424, "step": 20755 }, { "epoch": 5.395010395010395, "grad_norm": 1.065757155418396, "learning_rate": 4.573873902636167e-05, "loss": 0.2844, "num_input_tokens_seen": 1869888, "step": 20760 }, { "epoch": 5.396309771309771, "grad_norm": 0.39027783274650574, "learning_rate": 4.573557240153305e-05, "loss": 0.2057, "num_input_tokens_seen": 1870304, "step": 20765 }, { "epoch": 5.397609147609147, "grad_norm": 0.4139503240585327, "learning_rate": 4.573240471025406e-05, "loss": 0.2095, "num_input_tokens_seen": 1870736, "step": 20770 }, { "epoch": 5.398908523908524, "grad_norm": 0.49884912371635437, "learning_rate": 4.572923595268764e-05, "loss": 0.2598, "num_input_tokens_seen": 1871168, "step": 20775 }, { "epoch": 5.4002079002079, "grad_norm": 0.3588990569114685, "learning_rate": 4.5726066128996765e-05, "loss": 0.1591, "num_input_tokens_seen": 1871616, "step": 20780 }, { "epoch": 5.401507276507276, "grad_norm": 0.2865729033946991, "learning_rate": 4.5722895239344435e-05, "loss": 0.2186, "num_input_tokens_seen": 1872048, "step": 20785 }, { "epoch": 5.402806652806653, "grad_norm": 0.2254306823015213, "learning_rate": 4.5719723283893756e-05, "loss": 0.3849, "num_input_tokens_seen": 1872512, "step": 20790 }, { "epoch": 5.404106029106029, "grad_norm": 0.27421924471855164, "learning_rate": 4.5716550262807854e-05, "loss": 0.2328, "num_input_tokens_seen": 1872960, "step": 20795 }, { "epoch": 5.405405405405405, "grad_norm": 0.27948877215385437, "learning_rate": 4.571337617624992e-05, "loss": 0.2093, "num_input_tokens_seen": 1873408, "step": 20800 }, { "epoch": 5.406704781704781, "grad_norm": 0.2610570192337036, "learning_rate": 4.57102010243832e-05, "loss": 0.2304, "num_input_tokens_seen": 1873840, "step": 20805 }, { "epoch": 5.408004158004158, "grad_norm": 0.2658218741416931, "learning_rate": 4.5707024807371e-05, "loss": 0.2493, "num_input_tokens_seen": 1874240, "step": 20810 }, { "epoch": 5.409303534303534, "grad_norm": 0.251420259475708, "learning_rate": 4.570384752537668e-05, "loss": 0.2584, "num_input_tokens_seen": 1874704, "step": 20815 }, { "epoch": 5.41060291060291, "grad_norm": 0.2736320495605469, "learning_rate": 4.5700669178563635e-05, "loss": 0.2166, "num_input_tokens_seen": 1875136, "step": 20820 }, { "epoch": 5.411902286902287, "grad_norm": 0.2822780907154083, "learning_rate": 4.569748976709535e-05, "loss": 0.22, "num_input_tokens_seen": 1875568, "step": 20825 }, { "epoch": 5.413201663201663, "grad_norm": 0.2079193890094757, "learning_rate": 4.5694309291135326e-05, "loss": 0.3185, "num_input_tokens_seen": 1876032, "step": 20830 }, { "epoch": 5.414501039501039, "grad_norm": 0.42394983768463135, "learning_rate": 4.569112775084715e-05, "loss": 0.3416, "num_input_tokens_seen": 1876448, "step": 20835 }, { "epoch": 5.415800415800415, "grad_norm": 0.9037531018257141, "learning_rate": 4.568794514639445e-05, "loss": 0.2653, "num_input_tokens_seen": 1876880, "step": 20840 }, { "epoch": 5.417099792099792, "grad_norm": 0.31708595156669617, "learning_rate": 4.568476147794091e-05, "loss": 0.2534, "num_input_tokens_seen": 1877328, "step": 20845 }, { "epoch": 5.418399168399168, "grad_norm": 0.5573012828826904, "learning_rate": 4.568157674565027e-05, "loss": 0.2064, "num_input_tokens_seen": 1877776, "step": 20850 }, { "epoch": 5.419698544698544, "grad_norm": 0.5887853503227234, "learning_rate": 4.567839094968631e-05, "loss": 0.2386, "num_input_tokens_seen": 1878256, "step": 20855 }, { "epoch": 5.420997920997921, "grad_norm": 0.7803569436073303, "learning_rate": 4.5675204090212895e-05, "loss": 0.3003, "num_input_tokens_seen": 1878720, "step": 20860 }, { "epoch": 5.422297297297297, "grad_norm": 0.5772403478622437, "learning_rate": 4.567201616739393e-05, "loss": 0.2519, "num_input_tokens_seen": 1879168, "step": 20865 }, { "epoch": 5.423596673596673, "grad_norm": 0.9496750831604004, "learning_rate": 4.566882718139336e-05, "loss": 0.3422, "num_input_tokens_seen": 1879616, "step": 20870 }, { "epoch": 5.42489604989605, "grad_norm": 2.9116714000701904, "learning_rate": 4.56656371323752e-05, "loss": 0.2402, "num_input_tokens_seen": 1880048, "step": 20875 }, { "epoch": 5.426195426195426, "grad_norm": 0.7244580984115601, "learning_rate": 4.5662446020503535e-05, "loss": 0.2847, "num_input_tokens_seen": 1880496, "step": 20880 }, { "epoch": 5.427494802494802, "grad_norm": 0.5936213135719299, "learning_rate": 4.5659253845942473e-05, "loss": 0.8448, "num_input_tokens_seen": 1880960, "step": 20885 }, { "epoch": 5.4287941787941785, "grad_norm": 0.37542295455932617, "learning_rate": 4.5656060608856175e-05, "loss": 0.2172, "num_input_tokens_seen": 1881376, "step": 20890 }, { "epoch": 5.430093555093555, "grad_norm": 0.3273310959339142, "learning_rate": 4.56528663094089e-05, "loss": 0.3992, "num_input_tokens_seen": 1881856, "step": 20895 }, { "epoch": 5.4313929313929314, "grad_norm": 0.44311463832855225, "learning_rate": 4.564967094776492e-05, "loss": 0.2415, "num_input_tokens_seen": 1882320, "step": 20900 }, { "epoch": 5.4326923076923075, "grad_norm": 0.5608602166175842, "learning_rate": 4.564647452408858e-05, "loss": 0.2938, "num_input_tokens_seen": 1882752, "step": 20905 }, { "epoch": 5.4339916839916835, "grad_norm": 0.4474755525588989, "learning_rate": 4.5643277038544276e-05, "loss": 0.2772, "num_input_tokens_seen": 1883216, "step": 20910 }, { "epoch": 5.4352910602910605, "grad_norm": 0.5957719683647156, "learning_rate": 4.5640078491296447e-05, "loss": 0.3091, "num_input_tokens_seen": 1883664, "step": 20915 }, { "epoch": 5.4365904365904365, "grad_norm": 0.3576343357563019, "learning_rate": 4.5636878882509604e-05, "loss": 0.2598, "num_input_tokens_seen": 1884128, "step": 20920 }, { "epoch": 5.4378898128898125, "grad_norm": 0.5432608723640442, "learning_rate": 4.563367821234831e-05, "loss": 0.231, "num_input_tokens_seen": 1884592, "step": 20925 }, { "epoch": 5.4391891891891895, "grad_norm": 0.2936896085739136, "learning_rate": 4.563047648097717e-05, "loss": 0.2221, "num_input_tokens_seen": 1885056, "step": 20930 }, { "epoch": 5.4404885654885655, "grad_norm": 0.7518013715744019, "learning_rate": 4.562727368856087e-05, "loss": 0.3619, "num_input_tokens_seen": 1885472, "step": 20935 }, { "epoch": 5.441787941787942, "grad_norm": 0.5115635991096497, "learning_rate": 4.5624069835264106e-05, "loss": 0.2268, "num_input_tokens_seen": 1885984, "step": 20940 }, { "epoch": 5.4430873180873185, "grad_norm": 0.8723769783973694, "learning_rate": 4.562086492125167e-05, "loss": 0.2818, "num_input_tokens_seen": 1886464, "step": 20945 }, { "epoch": 5.4443866943866945, "grad_norm": 0.3083415627479553, "learning_rate": 4.56176589466884e-05, "loss": 0.2125, "num_input_tokens_seen": 1886912, "step": 20950 }, { "epoch": 5.445686070686071, "grad_norm": 0.45784834027290344, "learning_rate": 4.561445191173918e-05, "loss": 0.2193, "num_input_tokens_seen": 1887376, "step": 20955 }, { "epoch": 5.446985446985447, "grad_norm": 0.29872334003448486, "learning_rate": 4.561124381656894e-05, "loss": 0.2434, "num_input_tokens_seen": 1887808, "step": 20960 }, { "epoch": 5.4482848232848236, "grad_norm": 0.7626957893371582, "learning_rate": 4.560803466134268e-05, "loss": 0.2006, "num_input_tokens_seen": 1888272, "step": 20965 }, { "epoch": 5.4495841995842, "grad_norm": 0.2737632989883423, "learning_rate": 4.560482444622546e-05, "loss": 0.2028, "num_input_tokens_seen": 1888736, "step": 20970 }, { "epoch": 5.450883575883576, "grad_norm": 0.37637147307395935, "learning_rate": 4.560161317138236e-05, "loss": 0.3131, "num_input_tokens_seen": 1889168, "step": 20975 }, { "epoch": 5.452182952182953, "grad_norm": 0.5162674188613892, "learning_rate": 4.559840083697857e-05, "loss": 0.3052, "num_input_tokens_seen": 1889632, "step": 20980 }, { "epoch": 5.453482328482329, "grad_norm": 0.47598081827163696, "learning_rate": 4.559518744317929e-05, "loss": 0.2285, "num_input_tokens_seen": 1890096, "step": 20985 }, { "epoch": 5.454781704781705, "grad_norm": 0.29227080941200256, "learning_rate": 4.559197299014977e-05, "loss": 0.2241, "num_input_tokens_seen": 1890512, "step": 20990 }, { "epoch": 5.456081081081081, "grad_norm": 0.31815865635871887, "learning_rate": 4.558875747805537e-05, "loss": 0.2631, "num_input_tokens_seen": 1890944, "step": 20995 }, { "epoch": 5.457380457380458, "grad_norm": 0.2895435392856598, "learning_rate": 4.558554090706143e-05, "loss": 0.3014, "num_input_tokens_seen": 1891392, "step": 21000 }, { "epoch": 5.458679833679834, "grad_norm": 0.46491867303848267, "learning_rate": 4.55823232773334e-05, "loss": 0.2301, "num_input_tokens_seen": 1891840, "step": 21005 }, { "epoch": 5.45997920997921, "grad_norm": 0.6420420408248901, "learning_rate": 4.5579104589036764e-05, "loss": 0.2867, "num_input_tokens_seen": 1892304, "step": 21010 }, { "epoch": 5.461278586278587, "grad_norm": 0.31841808557510376, "learning_rate": 4.557588484233706e-05, "loss": 0.2231, "num_input_tokens_seen": 1892736, "step": 21015 }, { "epoch": 5.462577962577963, "grad_norm": 0.3792353570461273, "learning_rate": 4.5572664037399886e-05, "loss": 0.1881, "num_input_tokens_seen": 1893216, "step": 21020 }, { "epoch": 5.463877338877339, "grad_norm": 0.8165502548217773, "learning_rate": 4.556944217439088e-05, "loss": 0.2751, "num_input_tokens_seen": 1893680, "step": 21025 }, { "epoch": 5.465176715176715, "grad_norm": 0.23920239508152008, "learning_rate": 4.556621925347577e-05, "loss": 0.1993, "num_input_tokens_seen": 1894112, "step": 21030 }, { "epoch": 5.466476091476092, "grad_norm": 0.727807343006134, "learning_rate": 4.5562995274820284e-05, "loss": 0.2916, "num_input_tokens_seen": 1894544, "step": 21035 }, { "epoch": 5.467775467775468, "grad_norm": 0.31268441677093506, "learning_rate": 4.5559770238590264e-05, "loss": 0.3365, "num_input_tokens_seen": 1894960, "step": 21040 }, { "epoch": 5.469074844074844, "grad_norm": 0.41079244017601013, "learning_rate": 4.555654414495155e-05, "loss": 0.2238, "num_input_tokens_seen": 1895376, "step": 21045 }, { "epoch": 5.470374220374221, "grad_norm": 0.46131232380867004, "learning_rate": 4.5553316994070074e-05, "loss": 0.247, "num_input_tokens_seen": 1895840, "step": 21050 }, { "epoch": 5.471673596673597, "grad_norm": 0.30528780817985535, "learning_rate": 4.5550088786111814e-05, "loss": 0.159, "num_input_tokens_seen": 1896288, "step": 21055 }, { "epoch": 5.472972972972973, "grad_norm": 0.39077886939048767, "learning_rate": 4.55468595212428e-05, "loss": 0.314, "num_input_tokens_seen": 1896752, "step": 21060 }, { "epoch": 5.474272349272349, "grad_norm": 0.425170361995697, "learning_rate": 4.554362919962911e-05, "loss": 0.2323, "num_input_tokens_seen": 1897216, "step": 21065 }, { "epoch": 5.475571725571726, "grad_norm": 0.44439396262168884, "learning_rate": 4.5540397821436886e-05, "loss": 0.2178, "num_input_tokens_seen": 1897680, "step": 21070 }, { "epoch": 5.476871101871102, "grad_norm": 0.48947182297706604, "learning_rate": 4.553716538683232e-05, "loss": 0.2239, "num_input_tokens_seen": 1898128, "step": 21075 }, { "epoch": 5.478170478170478, "grad_norm": 0.31101807951927185, "learning_rate": 4.553393189598167e-05, "loss": 0.2562, "num_input_tokens_seen": 1898576, "step": 21080 }, { "epoch": 5.479469854469855, "grad_norm": 0.5415576100349426, "learning_rate": 4.553069734905122e-05, "loss": 0.2773, "num_input_tokens_seen": 1899024, "step": 21085 }, { "epoch": 5.480769230769231, "grad_norm": 0.2950344681739807, "learning_rate": 4.5527461746207337e-05, "loss": 0.2173, "num_input_tokens_seen": 1899472, "step": 21090 }, { "epoch": 5.482068607068607, "grad_norm": 0.3321733772754669, "learning_rate": 4.5524225087616426e-05, "loss": 0.2125, "num_input_tokens_seen": 1899888, "step": 21095 }, { "epoch": 5.483367983367984, "grad_norm": 0.34693479537963867, "learning_rate": 4.552098737344496e-05, "loss": 0.2984, "num_input_tokens_seen": 1900320, "step": 21100 }, { "epoch": 5.48466735966736, "grad_norm": 0.34254777431488037, "learning_rate": 4.5517748603859435e-05, "loss": 0.2142, "num_input_tokens_seen": 1900768, "step": 21105 }, { "epoch": 5.485966735966736, "grad_norm": 0.33032241463661194, "learning_rate": 4.5514508779026455e-05, "loss": 0.2257, "num_input_tokens_seen": 1901248, "step": 21110 }, { "epoch": 5.487266112266112, "grad_norm": 0.37268614768981934, "learning_rate": 4.551126789911263e-05, "loss": 0.3147, "num_input_tokens_seen": 1901680, "step": 21115 }, { "epoch": 5.488565488565489, "grad_norm": 0.5166509747505188, "learning_rate": 4.550802596428464e-05, "loss": 0.2322, "num_input_tokens_seen": 1902128, "step": 21120 }, { "epoch": 5.489864864864865, "grad_norm": 0.548220694065094, "learning_rate": 4.550478297470922e-05, "loss": 0.2682, "num_input_tokens_seen": 1902592, "step": 21125 }, { "epoch": 5.491164241164241, "grad_norm": 0.7626625895500183, "learning_rate": 4.550153893055317e-05, "loss": 0.2563, "num_input_tokens_seen": 1903008, "step": 21130 }, { "epoch": 5.492463617463617, "grad_norm": 0.4222646951675415, "learning_rate": 4.549829383198333e-05, "loss": 0.1402, "num_input_tokens_seen": 1903456, "step": 21135 }, { "epoch": 5.493762993762994, "grad_norm": 1.2597270011901855, "learning_rate": 4.54950476791666e-05, "loss": 0.2209, "num_input_tokens_seen": 1903936, "step": 21140 }, { "epoch": 5.49506237006237, "grad_norm": 0.2977670133113861, "learning_rate": 4.549180047226993e-05, "loss": 0.3232, "num_input_tokens_seen": 1904416, "step": 21145 }, { "epoch": 5.496361746361746, "grad_norm": 0.5779452919960022, "learning_rate": 4.5488552211460324e-05, "loss": 0.1255, "num_input_tokens_seen": 1904848, "step": 21150 }, { "epoch": 5.497661122661123, "grad_norm": 0.8710838556289673, "learning_rate": 4.5485302896904846e-05, "loss": 0.3799, "num_input_tokens_seen": 1905328, "step": 21155 }, { "epoch": 5.498960498960499, "grad_norm": 0.3649301528930664, "learning_rate": 4.5482052528770615e-05, "loss": 0.1919, "num_input_tokens_seen": 1905808, "step": 21160 }, { "epoch": 5.500259875259875, "grad_norm": 0.40661951899528503, "learning_rate": 4.54788011072248e-05, "loss": 0.3339, "num_input_tokens_seen": 1906256, "step": 21165 }, { "epoch": 5.501559251559252, "grad_norm": 0.47314420342445374, "learning_rate": 4.5475548632434616e-05, "loss": 0.1929, "num_input_tokens_seen": 1906688, "step": 21170 }, { "epoch": 5.502858627858628, "grad_norm": 0.5143239498138428, "learning_rate": 4.547229510456735e-05, "loss": 0.3118, "num_input_tokens_seen": 1907152, "step": 21175 }, { "epoch": 5.504158004158004, "grad_norm": 0.6820245385169983, "learning_rate": 4.546904052379033e-05, "loss": 0.2634, "num_input_tokens_seen": 1907648, "step": 21180 }, { "epoch": 5.50545738045738, "grad_norm": 0.8221606016159058, "learning_rate": 4.546578489027095e-05, "loss": 0.2625, "num_input_tokens_seen": 1908112, "step": 21185 }, { "epoch": 5.506756756756757, "grad_norm": 0.4633744955062866, "learning_rate": 4.546252820417664e-05, "loss": 0.2597, "num_input_tokens_seen": 1908560, "step": 21190 }, { "epoch": 5.508056133056133, "grad_norm": 0.5125693678855896, "learning_rate": 4.545927046567489e-05, "loss": 0.2277, "num_input_tokens_seen": 1908976, "step": 21195 }, { "epoch": 5.509355509355509, "grad_norm": 0.9928686022758484, "learning_rate": 4.5456011674933264e-05, "loss": 0.2003, "num_input_tokens_seen": 1909504, "step": 21200 }, { "epoch": 5.510654885654886, "grad_norm": 0.9160645604133606, "learning_rate": 4.545275183211936e-05, "loss": 0.4303, "num_input_tokens_seen": 1909968, "step": 21205 }, { "epoch": 5.511954261954262, "grad_norm": 0.3729301691055298, "learning_rate": 4.5449490937400824e-05, "loss": 0.1889, "num_input_tokens_seen": 1910400, "step": 21210 }, { "epoch": 5.513253638253638, "grad_norm": 0.4688933193683624, "learning_rate": 4.5446228990945385e-05, "loss": 0.2795, "num_input_tokens_seen": 1910832, "step": 21215 }, { "epoch": 5.514553014553014, "grad_norm": 0.6751311421394348, "learning_rate": 4.5442965992920796e-05, "loss": 0.2979, "num_input_tokens_seen": 1911296, "step": 21220 }, { "epoch": 5.515852390852391, "grad_norm": 0.39646148681640625, "learning_rate": 4.543970194349487e-05, "loss": 0.2686, "num_input_tokens_seen": 1911760, "step": 21225 }, { "epoch": 5.517151767151767, "grad_norm": 0.38350510597229004, "learning_rate": 4.5436436842835495e-05, "loss": 0.2993, "num_input_tokens_seen": 1912208, "step": 21230 }, { "epoch": 5.518451143451143, "grad_norm": 0.4056433439254761, "learning_rate": 4.5433170691110596e-05, "loss": 0.2735, "num_input_tokens_seen": 1912672, "step": 21235 }, { "epoch": 5.51975051975052, "grad_norm": 0.6527827978134155, "learning_rate": 4.542990348848814e-05, "loss": 0.2837, "num_input_tokens_seen": 1913104, "step": 21240 }, { "epoch": 5.521049896049896, "grad_norm": 0.17348523437976837, "learning_rate": 4.542663523513618e-05, "loss": 0.2674, "num_input_tokens_seen": 1913584, "step": 21245 }, { "epoch": 5.522349272349272, "grad_norm": 0.4560535252094269, "learning_rate": 4.542336593122279e-05, "loss": 0.277, "num_input_tokens_seen": 1914080, "step": 21250 }, { "epoch": 5.523648648648649, "grad_norm": 0.23260459303855896, "learning_rate": 4.542009557691614e-05, "loss": 0.2244, "num_input_tokens_seen": 1914496, "step": 21255 }, { "epoch": 5.524948024948025, "grad_norm": 0.8104182481765747, "learning_rate": 4.541682417238439e-05, "loss": 0.2466, "num_input_tokens_seen": 1914944, "step": 21260 }, { "epoch": 5.526247401247401, "grad_norm": 1.3985706567764282, "learning_rate": 4.541355171779582e-05, "loss": 0.3449, "num_input_tokens_seen": 1915376, "step": 21265 }, { "epoch": 5.527546777546777, "grad_norm": 0.5754616856575012, "learning_rate": 4.541027821331872e-05, "loss": 0.2288, "num_input_tokens_seen": 1915824, "step": 21270 }, { "epoch": 5.528846153846154, "grad_norm": 0.4652358293533325, "learning_rate": 4.540700365912146e-05, "loss": 0.2409, "num_input_tokens_seen": 1916272, "step": 21275 }, { "epoch": 5.53014553014553, "grad_norm": 0.401811808347702, "learning_rate": 4.540372805537245e-05, "loss": 0.9379, "num_input_tokens_seen": 1916752, "step": 21280 }, { "epoch": 5.531444906444906, "grad_norm": 0.34988969564437866, "learning_rate": 4.540045140224015e-05, "loss": 0.2182, "num_input_tokens_seen": 1917200, "step": 21285 }, { "epoch": 5.532744282744282, "grad_norm": 0.34407734870910645, "learning_rate": 4.539717369989309e-05, "loss": 0.225, "num_input_tokens_seen": 1917664, "step": 21290 }, { "epoch": 5.534043659043659, "grad_norm": 0.46493449807167053, "learning_rate": 4.539389494849985e-05, "loss": 0.2146, "num_input_tokens_seen": 1918176, "step": 21295 }, { "epoch": 5.535343035343035, "grad_norm": 0.3948802947998047, "learning_rate": 4.5390615148229044e-05, "loss": 0.3524, "num_input_tokens_seen": 1918624, "step": 21300 }, { "epoch": 5.536642411642411, "grad_norm": 0.4333606958389282, "learning_rate": 4.5387334299249366e-05, "loss": 0.2862, "num_input_tokens_seen": 1919088, "step": 21305 }, { "epoch": 5.537941787941788, "grad_norm": 0.3250482380390167, "learning_rate": 4.5384052401729546e-05, "loss": 0.1557, "num_input_tokens_seen": 1919520, "step": 21310 }, { "epoch": 5.539241164241164, "grad_norm": 0.26182031631469727, "learning_rate": 4.538076945583839e-05, "loss": 0.2576, "num_input_tokens_seen": 1919936, "step": 21315 }, { "epoch": 5.54054054054054, "grad_norm": 0.2704180181026459, "learning_rate": 4.537748546174473e-05, "loss": 0.1252, "num_input_tokens_seen": 1920368, "step": 21320 }, { "epoch": 5.541839916839917, "grad_norm": 0.3109146058559418, "learning_rate": 4.537420041961746e-05, "loss": 0.3532, "num_input_tokens_seen": 1920768, "step": 21325 }, { "epoch": 5.543139293139293, "grad_norm": 0.2181778848171234, "learning_rate": 4.537091432962555e-05, "loss": 0.1029, "num_input_tokens_seen": 1921232, "step": 21330 }, { "epoch": 5.544438669438669, "grad_norm": 0.19378186762332916, "learning_rate": 4.5367627191937994e-05, "loss": 0.1618, "num_input_tokens_seen": 1921696, "step": 21335 }, { "epoch": 5.545738045738045, "grad_norm": 0.20694907009601593, "learning_rate": 4.536433900672386e-05, "loss": 0.3252, "num_input_tokens_seen": 1922128, "step": 21340 }, { "epoch": 5.547037422037422, "grad_norm": 0.2035001665353775, "learning_rate": 4.5361049774152256e-05, "loss": 0.2561, "num_input_tokens_seen": 1922608, "step": 21345 }, { "epoch": 5.548336798336798, "grad_norm": 0.2430991679430008, "learning_rate": 4.535775949439235e-05, "loss": 0.3527, "num_input_tokens_seen": 1923024, "step": 21350 }, { "epoch": 5.549636174636174, "grad_norm": 0.5303938984870911, "learning_rate": 4.5354468167613366e-05, "loss": 0.192, "num_input_tokens_seen": 1923440, "step": 21355 }, { "epoch": 5.5509355509355505, "grad_norm": 0.2927713096141815, "learning_rate": 4.535117579398459e-05, "loss": 0.2289, "num_input_tokens_seen": 1923936, "step": 21360 }, { "epoch": 5.552234927234927, "grad_norm": 0.25853124260902405, "learning_rate": 4.534788237367533e-05, "loss": 0.2223, "num_input_tokens_seen": 1924384, "step": 21365 }, { "epoch": 5.553534303534303, "grad_norm": 0.21892067790031433, "learning_rate": 4.5344587906855e-05, "loss": 0.2226, "num_input_tokens_seen": 1924848, "step": 21370 }, { "epoch": 5.5548336798336795, "grad_norm": 0.2777785062789917, "learning_rate": 4.534129239369301e-05, "loss": 0.2313, "num_input_tokens_seen": 1925280, "step": 21375 }, { "epoch": 5.556133056133056, "grad_norm": 0.19419065117835999, "learning_rate": 4.533799583435886e-05, "loss": 0.2083, "num_input_tokens_seen": 1925760, "step": 21380 }, { "epoch": 5.5574324324324325, "grad_norm": 0.2538975477218628, "learning_rate": 4.5334698229022096e-05, "loss": 0.3571, "num_input_tokens_seen": 1926256, "step": 21385 }, { "epoch": 5.5587318087318085, "grad_norm": 0.5417584776878357, "learning_rate": 4.533139957785233e-05, "loss": 0.2491, "num_input_tokens_seen": 1926736, "step": 21390 }, { "epoch": 5.560031185031185, "grad_norm": 0.21639618277549744, "learning_rate": 4.53280998810192e-05, "loss": 0.2548, "num_input_tokens_seen": 1927200, "step": 21395 }, { "epoch": 5.5613305613305615, "grad_norm": 0.21563391387462616, "learning_rate": 4.532479913869241e-05, "loss": 0.2523, "num_input_tokens_seen": 1927648, "step": 21400 }, { "epoch": 5.5626299376299375, "grad_norm": 0.25197288393974304, "learning_rate": 4.532149735104173e-05, "loss": 0.2529, "num_input_tokens_seen": 1928080, "step": 21405 }, { "epoch": 5.563929313929314, "grad_norm": 0.22332003712654114, "learning_rate": 4.531819451823697e-05, "loss": 0.2738, "num_input_tokens_seen": 1928560, "step": 21410 }, { "epoch": 5.5652286902286905, "grad_norm": 0.22506889700889587, "learning_rate": 4.5314890640447996e-05, "loss": 0.2539, "num_input_tokens_seen": 1929024, "step": 21415 }, { "epoch": 5.5665280665280665, "grad_norm": 0.2674817442893982, "learning_rate": 4.531158571784473e-05, "loss": 0.2146, "num_input_tokens_seen": 1929440, "step": 21420 }, { "epoch": 5.567827442827443, "grad_norm": 0.19742503762245178, "learning_rate": 4.530827975059715e-05, "loss": 0.2866, "num_input_tokens_seen": 1929952, "step": 21425 }, { "epoch": 5.5691268191268195, "grad_norm": 0.2785181999206543, "learning_rate": 4.530497273887529e-05, "loss": 0.2585, "num_input_tokens_seen": 1930384, "step": 21430 }, { "epoch": 5.5704261954261955, "grad_norm": 0.2133316993713379, "learning_rate": 4.530166468284922e-05, "loss": 0.1873, "num_input_tokens_seen": 1930832, "step": 21435 }, { "epoch": 5.571725571725572, "grad_norm": 0.2305450439453125, "learning_rate": 4.5298355582689086e-05, "loss": 0.2176, "num_input_tokens_seen": 1931280, "step": 21440 }, { "epoch": 5.573024948024948, "grad_norm": 0.210023894906044, "learning_rate": 4.529504543856507e-05, "loss": 0.2161, "num_input_tokens_seen": 1931696, "step": 21445 }, { "epoch": 5.574324324324325, "grad_norm": 0.553979754447937, "learning_rate": 4.529173425064743e-05, "loss": 0.261, "num_input_tokens_seen": 1932160, "step": 21450 }, { "epoch": 5.575623700623701, "grad_norm": 0.177488312125206, "learning_rate": 4.5288422019106446e-05, "loss": 0.2126, "num_input_tokens_seen": 1932608, "step": 21455 }, { "epoch": 5.576923076923077, "grad_norm": 0.18838806450366974, "learning_rate": 4.528510874411248e-05, "loss": 0.1605, "num_input_tokens_seen": 1933072, "step": 21460 }, { "epoch": 5.578222453222454, "grad_norm": 0.2815096378326416, "learning_rate": 4.528179442583594e-05, "loss": 0.349, "num_input_tokens_seen": 1933584, "step": 21465 }, { "epoch": 5.57952182952183, "grad_norm": 0.2588048577308655, "learning_rate": 4.5278479064447274e-05, "loss": 0.2945, "num_input_tokens_seen": 1934000, "step": 21470 }, { "epoch": 5.580821205821206, "grad_norm": 0.215941920876503, "learning_rate": 4.527516266011701e-05, "loss": 0.2573, "num_input_tokens_seen": 1934464, "step": 21475 }, { "epoch": 5.582120582120583, "grad_norm": 0.2047392576932907, "learning_rate": 4.527184521301569e-05, "loss": 0.2181, "num_input_tokens_seen": 1934896, "step": 21480 }, { "epoch": 5.583419958419959, "grad_norm": 0.21584783494472504, "learning_rate": 4.526852672331396e-05, "loss": 0.2515, "num_input_tokens_seen": 1935328, "step": 21485 }, { "epoch": 5.584719334719335, "grad_norm": 0.2997661232948303, "learning_rate": 4.526520719118247e-05, "loss": 0.3201, "num_input_tokens_seen": 1935824, "step": 21490 }, { "epoch": 5.586018711018711, "grad_norm": 0.18496011197566986, "learning_rate": 4.5261886616791966e-05, "loss": 0.2602, "num_input_tokens_seen": 1936256, "step": 21495 }, { "epoch": 5.587318087318088, "grad_norm": 0.18128478527069092, "learning_rate": 4.525856500031321e-05, "loss": 0.2245, "num_input_tokens_seen": 1936736, "step": 21500 }, { "epoch": 5.588617463617464, "grad_norm": 0.19992317259311676, "learning_rate": 4.5255242341917055e-05, "loss": 0.313, "num_input_tokens_seen": 1937184, "step": 21505 }, { "epoch": 5.58991683991684, "grad_norm": 0.35806962847709656, "learning_rate": 4.525191864177437e-05, "loss": 0.272, "num_input_tokens_seen": 1937632, "step": 21510 }, { "epoch": 5.591216216216216, "grad_norm": 0.31803983449935913, "learning_rate": 4.524859390005611e-05, "loss": 0.2571, "num_input_tokens_seen": 1938064, "step": 21515 }, { "epoch": 5.592515592515593, "grad_norm": 0.2000257819890976, "learning_rate": 4.524526811693326e-05, "loss": 0.2345, "num_input_tokens_seen": 1938512, "step": 21520 }, { "epoch": 5.593814968814969, "grad_norm": 0.24401575326919556, "learning_rate": 4.524194129257688e-05, "loss": 0.1998, "num_input_tokens_seen": 1938944, "step": 21525 }, { "epoch": 5.595114345114345, "grad_norm": 0.2233847826719284, "learning_rate": 4.523861342715806e-05, "loss": 0.312, "num_input_tokens_seen": 1939392, "step": 21530 }, { "epoch": 5.596413721413722, "grad_norm": 0.23425190150737762, "learning_rate": 4.523528452084795e-05, "loss": 0.2471, "num_input_tokens_seen": 1939824, "step": 21535 }, { "epoch": 5.597713097713098, "grad_norm": 0.24530234932899475, "learning_rate": 4.5231954573817785e-05, "loss": 0.2641, "num_input_tokens_seen": 1940256, "step": 21540 }, { "epoch": 5.599012474012474, "grad_norm": 0.21787351369857788, "learning_rate": 4.5228623586238806e-05, "loss": 0.253, "num_input_tokens_seen": 1940704, "step": 21545 }, { "epoch": 5.600311850311851, "grad_norm": 0.2738027572631836, "learning_rate": 4.5225291558282334e-05, "loss": 0.222, "num_input_tokens_seen": 1941136, "step": 21550 }, { "epoch": 5.601611226611227, "grad_norm": 0.258028507232666, "learning_rate": 4.522195849011973e-05, "loss": 0.2922, "num_input_tokens_seen": 1941600, "step": 21555 }, { "epoch": 5.602910602910603, "grad_norm": 0.23120704293251038, "learning_rate": 4.521862438192244e-05, "loss": 0.1808, "num_input_tokens_seen": 1942048, "step": 21560 }, { "epoch": 5.604209979209979, "grad_norm": 0.5668759942054749, "learning_rate": 4.521528923386191e-05, "loss": 0.289, "num_input_tokens_seen": 1942448, "step": 21565 }, { "epoch": 5.605509355509356, "grad_norm": 0.24633274972438812, "learning_rate": 4.521195304610969e-05, "loss": 0.2862, "num_input_tokens_seen": 1942912, "step": 21570 }, { "epoch": 5.606808731808732, "grad_norm": 0.2375348061323166, "learning_rate": 4.520861581883736e-05, "loss": 0.2182, "num_input_tokens_seen": 1943328, "step": 21575 }, { "epoch": 5.608108108108108, "grad_norm": 0.23565976321697235, "learning_rate": 4.520527755221656e-05, "loss": 0.2557, "num_input_tokens_seen": 1943824, "step": 21580 }, { "epoch": 5.609407484407484, "grad_norm": 0.20814107358455658, "learning_rate": 4.5201938246418976e-05, "loss": 0.3212, "num_input_tokens_seen": 1944304, "step": 21585 }, { "epoch": 5.610706860706861, "grad_norm": 0.16477899253368378, "learning_rate": 4.519859790161634e-05, "loss": 0.3023, "num_input_tokens_seen": 1944736, "step": 21590 }, { "epoch": 5.612006237006237, "grad_norm": 0.15883223712444305, "learning_rate": 4.519525651798047e-05, "loss": 0.2673, "num_input_tokens_seen": 1945184, "step": 21595 }, { "epoch": 5.613305613305613, "grad_norm": 0.16076093912124634, "learning_rate": 4.519191409568321e-05, "loss": 0.2657, "num_input_tokens_seen": 1945632, "step": 21600 }, { "epoch": 5.61460498960499, "grad_norm": 0.15079356729984283, "learning_rate": 4.5188570634896454e-05, "loss": 0.2516, "num_input_tokens_seen": 1946080, "step": 21605 }, { "epoch": 5.615904365904366, "grad_norm": 0.29016363620758057, "learning_rate": 4.518522613579217e-05, "loss": 0.2557, "num_input_tokens_seen": 1946544, "step": 21610 }, { "epoch": 5.617203742203742, "grad_norm": 0.19682267308235168, "learning_rate": 4.518188059854236e-05, "loss": 0.3365, "num_input_tokens_seen": 1947008, "step": 21615 }, { "epoch": 5.618503118503119, "grad_norm": 0.3222457468509674, "learning_rate": 4.5178534023319096e-05, "loss": 0.237, "num_input_tokens_seen": 1947440, "step": 21620 }, { "epoch": 5.619802494802495, "grad_norm": 0.29890507459640503, "learning_rate": 4.5175186410294495e-05, "loss": 0.226, "num_input_tokens_seen": 1947856, "step": 21625 }, { "epoch": 5.621101871101871, "grad_norm": 0.21794407069683075, "learning_rate": 4.517183775964073e-05, "loss": 0.2357, "num_input_tokens_seen": 1948288, "step": 21630 }, { "epoch": 5.622401247401247, "grad_norm": 0.26090848445892334, "learning_rate": 4.5168488071530015e-05, "loss": 0.1352, "num_input_tokens_seen": 1948720, "step": 21635 }, { "epoch": 5.623700623700624, "grad_norm": 0.2539535462856293, "learning_rate": 4.5165137346134634e-05, "loss": 0.265, "num_input_tokens_seen": 1949184, "step": 21640 }, { "epoch": 5.625, "grad_norm": 0.21682019531726837, "learning_rate": 4.516178558362692e-05, "loss": 0.1525, "num_input_tokens_seen": 1949632, "step": 21645 }, { "epoch": 5.626299376299376, "grad_norm": 0.21020813286304474, "learning_rate": 4.515843278417925e-05, "loss": 0.2077, "num_input_tokens_seen": 1950064, "step": 21650 }, { "epoch": 5.627598752598753, "grad_norm": 0.41324669122695923, "learning_rate": 4.515507894796408e-05, "loss": 0.2837, "num_input_tokens_seen": 1950496, "step": 21655 }, { "epoch": 5.628898128898129, "grad_norm": 0.1897127628326416, "learning_rate": 4.515172407515388e-05, "loss": 0.2612, "num_input_tokens_seen": 1950928, "step": 21660 }, { "epoch": 5.630197505197505, "grad_norm": 0.19052650034427643, "learning_rate": 4.51483681659212e-05, "loss": 0.0948, "num_input_tokens_seen": 1951376, "step": 21665 }, { "epoch": 5.631496881496881, "grad_norm": 0.2929794490337372, "learning_rate": 4.514501122043864e-05, "loss": 0.3196, "num_input_tokens_seen": 1951792, "step": 21670 }, { "epoch": 5.632796257796258, "grad_norm": 0.2958095669746399, "learning_rate": 4.5141653238878856e-05, "loss": 0.2508, "num_input_tokens_seen": 1952224, "step": 21675 }, { "epoch": 5.634095634095634, "grad_norm": 0.24950405955314636, "learning_rate": 4.5138294221414546e-05, "loss": 0.2945, "num_input_tokens_seen": 1952672, "step": 21680 }, { "epoch": 5.63539501039501, "grad_norm": 0.23127153515815735, "learning_rate": 4.513493416821847e-05, "loss": 0.2953, "num_input_tokens_seen": 1953152, "step": 21685 }, { "epoch": 5.636694386694387, "grad_norm": 0.24917271733283997, "learning_rate": 4.5131573079463426e-05, "loss": 0.2269, "num_input_tokens_seen": 1953616, "step": 21690 }, { "epoch": 5.637993762993763, "grad_norm": 0.2689155042171478, "learning_rate": 4.51282109553223e-05, "loss": 0.1811, "num_input_tokens_seen": 1954064, "step": 21695 }, { "epoch": 5.639293139293139, "grad_norm": 0.22658661007881165, "learning_rate": 4.5124847795967995e-05, "loss": 0.3087, "num_input_tokens_seen": 1954480, "step": 21700 }, { "epoch": 5.640592515592516, "grad_norm": 0.2524161636829376, "learning_rate": 4.512148360157349e-05, "loss": 0.1766, "num_input_tokens_seen": 1954928, "step": 21705 }, { "epoch": 5.641891891891892, "grad_norm": 0.20550642907619476, "learning_rate": 4.51181183723118e-05, "loss": 0.3001, "num_input_tokens_seen": 1955344, "step": 21710 }, { "epoch": 5.643191268191268, "grad_norm": 0.2589370906352997, "learning_rate": 4.5114752108356004e-05, "loss": 0.3244, "num_input_tokens_seen": 1955792, "step": 21715 }, { "epoch": 5.644490644490644, "grad_norm": 0.18895335495471954, "learning_rate": 4.511138480987924e-05, "loss": 0.297, "num_input_tokens_seen": 1956240, "step": 21720 }, { "epoch": 5.645790020790021, "grad_norm": 0.4878731071949005, "learning_rate": 4.510801647705468e-05, "loss": 0.2512, "num_input_tokens_seen": 1956672, "step": 21725 }, { "epoch": 5.647089397089397, "grad_norm": 0.3303707540035248, "learning_rate": 4.510464711005557e-05, "loss": 0.2467, "num_input_tokens_seen": 1957152, "step": 21730 }, { "epoch": 5.648388773388773, "grad_norm": 0.1977829486131668, "learning_rate": 4.510127670905519e-05, "loss": 0.2591, "num_input_tokens_seen": 1957584, "step": 21735 }, { "epoch": 5.649688149688149, "grad_norm": 0.2843727171421051, "learning_rate": 4.509790527422689e-05, "loss": 0.2672, "num_input_tokens_seen": 1958032, "step": 21740 }, { "epoch": 5.650987525987526, "grad_norm": 0.1582498997449875, "learning_rate": 4.509453280574407e-05, "loss": 0.2989, "num_input_tokens_seen": 1958496, "step": 21745 }, { "epoch": 5.652286902286902, "grad_norm": 0.4523053765296936, "learning_rate": 4.5091159303780175e-05, "loss": 0.3009, "num_input_tokens_seen": 1958976, "step": 21750 }, { "epoch": 5.653586278586278, "grad_norm": 0.3593837022781372, "learning_rate": 4.50877847685087e-05, "loss": 0.2829, "num_input_tokens_seen": 1959424, "step": 21755 }, { "epoch": 5.654885654885655, "grad_norm": 0.13841043412685394, "learning_rate": 4.508440920010321e-05, "loss": 0.278, "num_input_tokens_seen": 1959840, "step": 21760 }, { "epoch": 5.656185031185031, "grad_norm": 0.42976322770118713, "learning_rate": 4.508103259873732e-05, "loss": 0.3012, "num_input_tokens_seen": 1960320, "step": 21765 }, { "epoch": 5.657484407484407, "grad_norm": 0.46585729718208313, "learning_rate": 4.507765496458467e-05, "loss": 0.2815, "num_input_tokens_seen": 1960800, "step": 21770 }, { "epoch": 5.658783783783784, "grad_norm": 0.4444565773010254, "learning_rate": 4.507427629781899e-05, "loss": 0.2742, "num_input_tokens_seen": 1961232, "step": 21775 }, { "epoch": 5.66008316008316, "grad_norm": 0.15507373213768005, "learning_rate": 4.5070896598614045e-05, "loss": 0.2663, "num_input_tokens_seen": 1961696, "step": 21780 }, { "epoch": 5.661382536382536, "grad_norm": 0.1439485251903534, "learning_rate": 4.506751586714366e-05, "loss": 0.258, "num_input_tokens_seen": 1962128, "step": 21785 }, { "epoch": 5.662681912681912, "grad_norm": 0.27630144357681274, "learning_rate": 4.506413410358171e-05, "loss": 0.2427, "num_input_tokens_seen": 1962592, "step": 21790 }, { "epoch": 5.663981288981289, "grad_norm": 0.23419404029846191, "learning_rate": 4.506075130810211e-05, "loss": 0.2433, "num_input_tokens_seen": 1963056, "step": 21795 }, { "epoch": 5.665280665280665, "grad_norm": 0.19617660343647003, "learning_rate": 4.5057367480878856e-05, "loss": 0.265, "num_input_tokens_seen": 1963536, "step": 21800 }, { "epoch": 5.666580041580041, "grad_norm": 0.2058057337999344, "learning_rate": 4.5053982622085964e-05, "loss": 0.3104, "num_input_tokens_seen": 1963984, "step": 21805 }, { "epoch": 5.667879417879417, "grad_norm": 0.18428608775138855, "learning_rate": 4.505059673189754e-05, "loss": 0.2873, "num_input_tokens_seen": 1964464, "step": 21810 }, { "epoch": 5.669178794178794, "grad_norm": 0.293957382440567, "learning_rate": 4.504720981048771e-05, "loss": 0.2264, "num_input_tokens_seen": 1964944, "step": 21815 }, { "epoch": 5.67047817047817, "grad_norm": 0.2930189371109009, "learning_rate": 4.5043821858030675e-05, "loss": 0.2041, "num_input_tokens_seen": 1965392, "step": 21820 }, { "epoch": 5.671777546777546, "grad_norm": 0.20533177256584167, "learning_rate": 4.504043287470068e-05, "loss": 0.3064, "num_input_tokens_seen": 1965840, "step": 21825 }, { "epoch": 5.673076923076923, "grad_norm": 0.35106444358825684, "learning_rate": 4.503704286067202e-05, "loss": 0.2678, "num_input_tokens_seen": 1966320, "step": 21830 }, { "epoch": 5.674376299376299, "grad_norm": 0.2517157793045044, "learning_rate": 4.503365181611904e-05, "loss": 0.2199, "num_input_tokens_seen": 1966736, "step": 21835 }, { "epoch": 5.675675675675675, "grad_norm": 0.20447753369808197, "learning_rate": 4.503025974121615e-05, "loss": 0.2995, "num_input_tokens_seen": 1967168, "step": 21840 }, { "epoch": 5.676975051975052, "grad_norm": 0.23697204887866974, "learning_rate": 4.502686663613782e-05, "loss": 0.2217, "num_input_tokens_seen": 1967648, "step": 21845 }, { "epoch": 5.678274428274428, "grad_norm": 0.21538476645946503, "learning_rate": 4.502347250105854e-05, "loss": 0.2377, "num_input_tokens_seen": 1968064, "step": 21850 }, { "epoch": 5.6795738045738045, "grad_norm": 0.25318005681037903, "learning_rate": 4.502007733615289e-05, "loss": 0.2952, "num_input_tokens_seen": 1968528, "step": 21855 }, { "epoch": 5.6808731808731805, "grad_norm": 0.18324166536331177, "learning_rate": 4.501668114159548e-05, "loss": 0.3048, "num_input_tokens_seen": 1968928, "step": 21860 }, { "epoch": 5.682172557172557, "grad_norm": 0.27814316749572754, "learning_rate": 4.5013283917560974e-05, "loss": 0.2376, "num_input_tokens_seen": 1969392, "step": 21865 }, { "epoch": 5.6834719334719335, "grad_norm": 0.2612737715244293, "learning_rate": 4.5009885664224104e-05, "loss": 0.1789, "num_input_tokens_seen": 1969840, "step": 21870 }, { "epoch": 5.6847713097713095, "grad_norm": 0.3202439546585083, "learning_rate": 4.500648638175965e-05, "loss": 0.2202, "num_input_tokens_seen": 1970272, "step": 21875 }, { "epoch": 5.686070686070686, "grad_norm": 0.3140151798725128, "learning_rate": 4.500308607034242e-05, "loss": 0.3259, "num_input_tokens_seen": 1970720, "step": 21880 }, { "epoch": 5.6873700623700625, "grad_norm": 0.3583523631095886, "learning_rate": 4.499968473014731e-05, "loss": 0.2452, "num_input_tokens_seen": 1971184, "step": 21885 }, { "epoch": 5.6886694386694385, "grad_norm": 0.3644208312034607, "learning_rate": 4.4996282361349255e-05, "loss": 0.2753, "num_input_tokens_seen": 1971664, "step": 21890 }, { "epoch": 5.689968814968815, "grad_norm": 0.40589943528175354, "learning_rate": 4.499287896412324e-05, "loss": 0.2667, "num_input_tokens_seen": 1972096, "step": 21895 }, { "epoch": 5.6912681912681915, "grad_norm": 0.4264415204524994, "learning_rate": 4.49894745386443e-05, "loss": 0.1538, "num_input_tokens_seen": 1972560, "step": 21900 }, { "epoch": 5.6925675675675675, "grad_norm": 0.749932050704956, "learning_rate": 4.498606908508754e-05, "loss": 0.262, "num_input_tokens_seen": 1973024, "step": 21905 }, { "epoch": 5.693866943866944, "grad_norm": 0.3961433172225952, "learning_rate": 4.498266260362808e-05, "loss": 0.3772, "num_input_tokens_seen": 1973440, "step": 21910 }, { "epoch": 5.6951663201663205, "grad_norm": 1.4327785968780518, "learning_rate": 4.4979255094441146e-05, "loss": 0.2781, "num_input_tokens_seen": 1973904, "step": 21915 }, { "epoch": 5.696465696465697, "grad_norm": 0.21888765692710876, "learning_rate": 4.497584655770198e-05, "loss": 0.222, "num_input_tokens_seen": 1974352, "step": 21920 }, { "epoch": 5.697765072765073, "grad_norm": 0.2224195897579193, "learning_rate": 4.497243699358588e-05, "loss": 0.221, "num_input_tokens_seen": 1974784, "step": 21925 }, { "epoch": 5.6990644490644495, "grad_norm": 0.28588417172431946, "learning_rate": 4.496902640226822e-05, "loss": 0.3095, "num_input_tokens_seen": 1975248, "step": 21930 }, { "epoch": 5.700363825363826, "grad_norm": 0.299090713262558, "learning_rate": 4.4965614783924385e-05, "loss": 0.2432, "num_input_tokens_seen": 1975712, "step": 21935 }, { "epoch": 5.701663201663202, "grad_norm": 0.23375943303108215, "learning_rate": 4.496220213872986e-05, "loss": 0.1955, "num_input_tokens_seen": 1976128, "step": 21940 }, { "epoch": 5.702962577962578, "grad_norm": 0.23195582628250122, "learning_rate": 4.4958788466860154e-05, "loss": 0.215, "num_input_tokens_seen": 1976544, "step": 21945 }, { "epoch": 5.704261954261955, "grad_norm": 0.22211484611034393, "learning_rate": 4.495537376849083e-05, "loss": 0.1665, "num_input_tokens_seen": 1976976, "step": 21950 }, { "epoch": 5.705561330561331, "grad_norm": 0.2127131074666977, "learning_rate": 4.49519580437975e-05, "loss": 0.32, "num_input_tokens_seen": 1977440, "step": 21955 }, { "epoch": 5.706860706860707, "grad_norm": 0.2367086112499237, "learning_rate": 4.494854129295587e-05, "loss": 0.4003, "num_input_tokens_seen": 1977904, "step": 21960 }, { "epoch": 5.708160083160083, "grad_norm": 0.22463896870613098, "learning_rate": 4.494512351614164e-05, "loss": 0.2617, "num_input_tokens_seen": 1978352, "step": 21965 }, { "epoch": 5.70945945945946, "grad_norm": 0.21163472533226013, "learning_rate": 4.49417047135306e-05, "loss": 0.2223, "num_input_tokens_seen": 1978800, "step": 21970 }, { "epoch": 5.710758835758836, "grad_norm": 0.24703019857406616, "learning_rate": 4.4938284885298575e-05, "loss": 0.2577, "num_input_tokens_seen": 1979232, "step": 21975 }, { "epoch": 5.712058212058212, "grad_norm": 0.2560591995716095, "learning_rate": 4.493486403162146e-05, "loss": 0.2622, "num_input_tokens_seen": 1979712, "step": 21980 }, { "epoch": 5.713357588357589, "grad_norm": 0.1960805505514145, "learning_rate": 4.4931442152675185e-05, "loss": 0.2934, "num_input_tokens_seen": 1980192, "step": 21985 }, { "epoch": 5.714656964656965, "grad_norm": 0.18178057670593262, "learning_rate": 4.492801924863575e-05, "loss": 0.3036, "num_input_tokens_seen": 1980608, "step": 21990 }, { "epoch": 5.715956340956341, "grad_norm": 0.27401435375213623, "learning_rate": 4.492459531967917e-05, "loss": 0.2021, "num_input_tokens_seen": 1981104, "step": 21995 }, { "epoch": 5.717255717255718, "grad_norm": 0.32847854495048523, "learning_rate": 4.492117036598158e-05, "loss": 0.2412, "num_input_tokens_seen": 1981568, "step": 22000 }, { "epoch": 5.718555093555094, "grad_norm": 0.2817251682281494, "learning_rate": 4.4917744387719105e-05, "loss": 0.2776, "num_input_tokens_seen": 1982016, "step": 22005 }, { "epoch": 5.71985446985447, "grad_norm": 0.2902454733848572, "learning_rate": 4.491431738506795e-05, "loss": 0.2222, "num_input_tokens_seen": 1982496, "step": 22010 }, { "epoch": 5.721153846153846, "grad_norm": 0.34630775451660156, "learning_rate": 4.491088935820437e-05, "loss": 0.2175, "num_input_tokens_seen": 1982912, "step": 22015 }, { "epoch": 5.722453222453223, "grad_norm": 0.5866519212722778, "learning_rate": 4.490746030730468e-05, "loss": 0.308, "num_input_tokens_seen": 1983408, "step": 22020 }, { "epoch": 5.723752598752599, "grad_norm": 0.265455424785614, "learning_rate": 4.490403023254522e-05, "loss": 0.2869, "num_input_tokens_seen": 1983872, "step": 22025 }, { "epoch": 5.725051975051975, "grad_norm": 0.2967299818992615, "learning_rate": 4.4900599134102425e-05, "loss": 0.2559, "num_input_tokens_seen": 1984288, "step": 22030 }, { "epoch": 5.726351351351351, "grad_norm": 0.3537215292453766, "learning_rate": 4.4897167012152745e-05, "loss": 0.2564, "num_input_tokens_seen": 1984720, "step": 22035 }, { "epoch": 5.727650727650728, "grad_norm": 0.25838014483451843, "learning_rate": 4.48937338668727e-05, "loss": 0.2985, "num_input_tokens_seen": 1985152, "step": 22040 }, { "epoch": 5.728950103950104, "grad_norm": 0.35456547141075134, "learning_rate": 4.489029969843886e-05, "loss": 0.1955, "num_input_tokens_seen": 1985584, "step": 22045 }, { "epoch": 5.73024948024948, "grad_norm": 0.3964572250843048, "learning_rate": 4.488686450702785e-05, "loss": 0.2415, "num_input_tokens_seen": 1986032, "step": 22050 }, { "epoch": 5.731548856548857, "grad_norm": 0.36759358644485474, "learning_rate": 4.488342829281635e-05, "loss": 0.2523, "num_input_tokens_seen": 1986464, "step": 22055 }, { "epoch": 5.732848232848233, "grad_norm": 0.3153584599494934, "learning_rate": 4.487999105598108e-05, "loss": 0.2807, "num_input_tokens_seen": 1986880, "step": 22060 }, { "epoch": 5.734147609147609, "grad_norm": 0.35291334986686707, "learning_rate": 4.487655279669881e-05, "loss": 0.3126, "num_input_tokens_seen": 1987360, "step": 22065 }, { "epoch": 5.735446985446986, "grad_norm": 0.3737371265888214, "learning_rate": 4.4873113515146395e-05, "loss": 0.2307, "num_input_tokens_seen": 1987792, "step": 22070 }, { "epoch": 5.736746361746362, "grad_norm": 0.325874388217926, "learning_rate": 4.4869673211500706e-05, "loss": 0.2761, "num_input_tokens_seen": 1988240, "step": 22075 }, { "epoch": 5.738045738045738, "grad_norm": 0.2874668538570404, "learning_rate": 4.4866231885938694e-05, "loss": 0.182, "num_input_tokens_seen": 1988672, "step": 22080 }, { "epoch": 5.739345114345114, "grad_norm": 0.717472493648529, "learning_rate": 4.486278953863733e-05, "loss": 0.2258, "num_input_tokens_seen": 1989136, "step": 22085 }, { "epoch": 5.740644490644491, "grad_norm": 0.2657448947429657, "learning_rate": 4.485934616977367e-05, "loss": 0.2914, "num_input_tokens_seen": 1989584, "step": 22090 }, { "epoch": 5.741943866943867, "grad_norm": 0.7579615116119385, "learning_rate": 4.4855901779524816e-05, "loss": 0.2854, "num_input_tokens_seen": 1990048, "step": 22095 }, { "epoch": 5.743243243243243, "grad_norm": 0.43855348229408264, "learning_rate": 4.4852456368067905e-05, "loss": 0.2145, "num_input_tokens_seen": 1990464, "step": 22100 }, { "epoch": 5.74454261954262, "grad_norm": 0.4456954002380371, "learning_rate": 4.484900993558014e-05, "loss": 0.2499, "num_input_tokens_seen": 1990896, "step": 22105 }, { "epoch": 5.745841995841996, "grad_norm": 0.3245380222797394, "learning_rate": 4.484556248223877e-05, "loss": 0.2687, "num_input_tokens_seen": 1991344, "step": 22110 }, { "epoch": 5.747141372141372, "grad_norm": 0.3388109505176544, "learning_rate": 4.484211400822111e-05, "loss": 0.2152, "num_input_tokens_seen": 1991808, "step": 22115 }, { "epoch": 5.748440748440748, "grad_norm": 0.38471099734306335, "learning_rate": 4.483866451370452e-05, "loss": 0.2173, "num_input_tokens_seen": 1992272, "step": 22120 }, { "epoch": 5.749740124740125, "grad_norm": 0.26823174953460693, "learning_rate": 4.4835213998866405e-05, "loss": 0.2251, "num_input_tokens_seen": 1992688, "step": 22125 }, { "epoch": 5.751039501039501, "grad_norm": 0.2997725009918213, "learning_rate": 4.483176246388423e-05, "loss": 0.2572, "num_input_tokens_seen": 1993136, "step": 22130 }, { "epoch": 5.752338877338877, "grad_norm": 0.648963212966919, "learning_rate": 4.482830990893551e-05, "loss": 0.2698, "num_input_tokens_seen": 1993584, "step": 22135 }, { "epoch": 5.753638253638254, "grad_norm": 0.5270437002182007, "learning_rate": 4.48248563341978e-05, "loss": 0.2179, "num_input_tokens_seen": 1994000, "step": 22140 }, { "epoch": 5.75493762993763, "grad_norm": 0.4667920172214508, "learning_rate": 4.482140173984875e-05, "loss": 0.1915, "num_input_tokens_seen": 1994464, "step": 22145 }, { "epoch": 5.756237006237006, "grad_norm": 0.6062102913856506, "learning_rate": 4.4817946126066e-05, "loss": 0.2039, "num_input_tokens_seen": 1994896, "step": 22150 }, { "epoch": 5.757536382536383, "grad_norm": 0.41417503356933594, "learning_rate": 4.48144894930273e-05, "loss": 0.1904, "num_input_tokens_seen": 1995312, "step": 22155 }, { "epoch": 5.758835758835759, "grad_norm": 0.34109991788864136, "learning_rate": 4.4811031840910424e-05, "loss": 0.0678, "num_input_tokens_seen": 1995760, "step": 22160 }, { "epoch": 5.760135135135135, "grad_norm": 2.5224263668060303, "learning_rate": 4.4807573169893193e-05, "loss": 0.3915, "num_input_tokens_seen": 1996240, "step": 22165 }, { "epoch": 5.761434511434511, "grad_norm": 2.5702202320098877, "learning_rate": 4.48041134801535e-05, "loss": 0.4249, "num_input_tokens_seen": 1996688, "step": 22170 }, { "epoch": 5.762733887733888, "grad_norm": 0.4661118686199188, "learning_rate": 4.480065277186927e-05, "loss": 0.2189, "num_input_tokens_seen": 1997104, "step": 22175 }, { "epoch": 5.764033264033264, "grad_norm": 0.7653627395629883, "learning_rate": 4.47971910452185e-05, "loss": 0.2654, "num_input_tokens_seen": 1997552, "step": 22180 }, { "epoch": 5.76533264033264, "grad_norm": 0.9284581542015076, "learning_rate": 4.479372830037922e-05, "loss": 0.3141, "num_input_tokens_seen": 1998000, "step": 22185 }, { "epoch": 5.766632016632016, "grad_norm": 0.5281217694282532, "learning_rate": 4.479026453752953e-05, "loss": 0.2635, "num_input_tokens_seen": 1998448, "step": 22190 }, { "epoch": 5.767931392931393, "grad_norm": 0.6534028053283691, "learning_rate": 4.478679975684759e-05, "loss": 0.2706, "num_input_tokens_seen": 1998912, "step": 22195 }, { "epoch": 5.769230769230769, "grad_norm": 0.5688006281852722, "learning_rate": 4.4783333958511555e-05, "loss": 0.1904, "num_input_tokens_seen": 1999376, "step": 22200 }, { "epoch": 5.770530145530145, "grad_norm": 0.33568045496940613, "learning_rate": 4.477986714269972e-05, "loss": 0.2303, "num_input_tokens_seen": 1999792, "step": 22205 }, { "epoch": 5.771829521829522, "grad_norm": 0.41721728444099426, "learning_rate": 4.477639930959034e-05, "loss": 0.1605, "num_input_tokens_seen": 2000272, "step": 22210 }, { "epoch": 5.773128898128898, "grad_norm": 0.24253393709659576, "learning_rate": 4.477293045936182e-05, "loss": 0.1367, "num_input_tokens_seen": 2000720, "step": 22215 }, { "epoch": 5.774428274428274, "grad_norm": 1.369971752166748, "learning_rate": 4.4769460592192524e-05, "loss": 0.3201, "num_input_tokens_seen": 2001216, "step": 22220 }, { "epoch": 5.775727650727651, "grad_norm": 0.9364058375358582, "learning_rate": 4.476598970826094e-05, "loss": 0.3065, "num_input_tokens_seen": 2001648, "step": 22225 }, { "epoch": 5.777027027027027, "grad_norm": 0.6330464482307434, "learning_rate": 4.4762517807745564e-05, "loss": 0.3575, "num_input_tokens_seen": 2002080, "step": 22230 }, { "epoch": 5.778326403326403, "grad_norm": 0.4301328659057617, "learning_rate": 4.4759044890824954e-05, "loss": 0.2735, "num_input_tokens_seen": 2002528, "step": 22235 }, { "epoch": 5.779625779625779, "grad_norm": 58.500221252441406, "learning_rate": 4.475557095767774e-05, "loss": 0.2872, "num_input_tokens_seen": 2002992, "step": 22240 }, { "epoch": 5.780925155925156, "grad_norm": 0.4263887405395508, "learning_rate": 4.475209600848258e-05, "loss": 0.207, "num_input_tokens_seen": 2003456, "step": 22245 }, { "epoch": 5.782224532224532, "grad_norm": 0.2792670428752899, "learning_rate": 4.474862004341819e-05, "loss": 0.2004, "num_input_tokens_seen": 2003952, "step": 22250 }, { "epoch": 5.783523908523908, "grad_norm": 0.5525648593902588, "learning_rate": 4.474514306266335e-05, "loss": 0.2693, "num_input_tokens_seen": 2004368, "step": 22255 }, { "epoch": 5.784823284823284, "grad_norm": 0.25519153475761414, "learning_rate": 4.47416650663969e-05, "loss": 0.2816, "num_input_tokens_seen": 2004832, "step": 22260 }, { "epoch": 5.786122661122661, "grad_norm": 0.2604213356971741, "learning_rate": 4.4738186054797685e-05, "loss": 0.2026, "num_input_tokens_seen": 2005280, "step": 22265 }, { "epoch": 5.787422037422037, "grad_norm": 0.31461620330810547, "learning_rate": 4.473470602804465e-05, "loss": 0.3163, "num_input_tokens_seen": 2005728, "step": 22270 }, { "epoch": 5.788721413721413, "grad_norm": 0.3704673945903778, "learning_rate": 4.4731224986316784e-05, "loss": 0.2244, "num_input_tokens_seen": 2006128, "step": 22275 }, { "epoch": 5.79002079002079, "grad_norm": 0.356543630361557, "learning_rate": 4.4727742929793105e-05, "loss": 0.2079, "num_input_tokens_seen": 2006560, "step": 22280 }, { "epoch": 5.791320166320166, "grad_norm": 0.3218972980976105, "learning_rate": 4.4724259858652704e-05, "loss": 0.316, "num_input_tokens_seen": 2007008, "step": 22285 }, { "epoch": 5.792619542619542, "grad_norm": 0.6657745242118835, "learning_rate": 4.472077577307472e-05, "loss": 0.2652, "num_input_tokens_seen": 2007472, "step": 22290 }, { "epoch": 5.793918918918919, "grad_norm": 0.48796510696411133, "learning_rate": 4.4717290673238347e-05, "loss": 0.2368, "num_input_tokens_seen": 2007952, "step": 22295 }, { "epoch": 5.795218295218295, "grad_norm": 0.36305901408195496, "learning_rate": 4.4713804559322814e-05, "loss": 0.2417, "num_input_tokens_seen": 2008416, "step": 22300 }, { "epoch": 5.796517671517671, "grad_norm": 0.36309927701950073, "learning_rate": 4.4710317431507434e-05, "loss": 0.2692, "num_input_tokens_seen": 2008864, "step": 22305 }, { "epoch": 5.797817047817047, "grad_norm": 0.3426045775413513, "learning_rate": 4.470682928997153e-05, "loss": 0.2834, "num_input_tokens_seen": 2009312, "step": 22310 }, { "epoch": 5.799116424116424, "grad_norm": 0.38344672322273254, "learning_rate": 4.470334013489452e-05, "loss": 0.1738, "num_input_tokens_seen": 2009776, "step": 22315 }, { "epoch": 5.8004158004158, "grad_norm": 0.3352816700935364, "learning_rate": 4.4699849966455854e-05, "loss": 0.2068, "num_input_tokens_seen": 2010224, "step": 22320 }, { "epoch": 5.8017151767151764, "grad_norm": 0.3624177873134613, "learning_rate": 4.4696358784835026e-05, "loss": 0.3161, "num_input_tokens_seen": 2010720, "step": 22325 }, { "epoch": 5.803014553014553, "grad_norm": 0.37725648283958435, "learning_rate": 4.469286659021159e-05, "loss": 0.2329, "num_input_tokens_seen": 2011136, "step": 22330 }, { "epoch": 5.804313929313929, "grad_norm": 0.7325058579444885, "learning_rate": 4.4689373382765155e-05, "loss": 0.2344, "num_input_tokens_seen": 2011632, "step": 22335 }, { "epoch": 5.8056133056133055, "grad_norm": 0.3671099841594696, "learning_rate": 4.468587916267539e-05, "loss": 0.2592, "num_input_tokens_seen": 2012096, "step": 22340 }, { "epoch": 5.8069126819126815, "grad_norm": 0.3639487624168396, "learning_rate": 4.4682383930121985e-05, "loss": 0.2031, "num_input_tokens_seen": 2012544, "step": 22345 }, { "epoch": 5.808212058212058, "grad_norm": 0.4045248329639435, "learning_rate": 4.467888768528472e-05, "loss": 0.1737, "num_input_tokens_seen": 2013008, "step": 22350 }, { "epoch": 5.8095114345114345, "grad_norm": 4.088603496551514, "learning_rate": 4.467539042834342e-05, "loss": 0.2756, "num_input_tokens_seen": 2013456, "step": 22355 }, { "epoch": 5.8108108108108105, "grad_norm": 0.4428025782108307, "learning_rate": 4.4671892159477927e-05, "loss": 0.1348, "num_input_tokens_seen": 2013920, "step": 22360 }, { "epoch": 5.8121101871101875, "grad_norm": 2.1461052894592285, "learning_rate": 4.466839287886817e-05, "loss": 0.2901, "num_input_tokens_seen": 2014368, "step": 22365 }, { "epoch": 5.8134095634095635, "grad_norm": 0.29936352372169495, "learning_rate": 4.4664892586694116e-05, "loss": 0.3892, "num_input_tokens_seen": 2014800, "step": 22370 }, { "epoch": 5.8147089397089395, "grad_norm": 0.3173549175262451, "learning_rate": 4.466139128313581e-05, "loss": 0.139, "num_input_tokens_seen": 2015248, "step": 22375 }, { "epoch": 5.8160083160083165, "grad_norm": 0.36596760153770447, "learning_rate": 4.4657888968373295e-05, "loss": 0.4182, "num_input_tokens_seen": 2015696, "step": 22380 }, { "epoch": 5.8173076923076925, "grad_norm": 0.516502857208252, "learning_rate": 4.465438564258673e-05, "loss": 0.2223, "num_input_tokens_seen": 2016144, "step": 22385 }, { "epoch": 5.8186070686070686, "grad_norm": 0.3527456223964691, "learning_rate": 4.4650881305956274e-05, "loss": 0.3051, "num_input_tokens_seen": 2016624, "step": 22390 }, { "epoch": 5.819906444906445, "grad_norm": 0.5063745379447937, "learning_rate": 4.464737595866216e-05, "loss": 0.2201, "num_input_tokens_seen": 2017056, "step": 22395 }, { "epoch": 5.8212058212058215, "grad_norm": 0.27577701210975647, "learning_rate": 4.4643869600884675e-05, "loss": 0.2925, "num_input_tokens_seen": 2017488, "step": 22400 }, { "epoch": 5.822505197505198, "grad_norm": 0.37820008397102356, "learning_rate": 4.4640362232804156e-05, "loss": 0.2097, "num_input_tokens_seen": 2017968, "step": 22405 }, { "epoch": 5.823804573804574, "grad_norm": 0.6419669389724731, "learning_rate": 4.463685385460099e-05, "loss": 0.2574, "num_input_tokens_seen": 2018448, "step": 22410 }, { "epoch": 5.82510395010395, "grad_norm": 0.6331944465637207, "learning_rate": 4.4633344466455615e-05, "loss": 0.3266, "num_input_tokens_seen": 2018880, "step": 22415 }, { "epoch": 5.826403326403327, "grad_norm": 0.5032756924629211, "learning_rate": 4.4629834068548524e-05, "loss": 0.2889, "num_input_tokens_seen": 2019312, "step": 22420 }, { "epoch": 5.827702702702703, "grad_norm": 0.4329284727573395, "learning_rate": 4.462632266106026e-05, "loss": 0.2147, "num_input_tokens_seen": 2019776, "step": 22425 }, { "epoch": 5.829002079002079, "grad_norm": 0.5623720288276672, "learning_rate": 4.4622810244171406e-05, "loss": 0.2716, "num_input_tokens_seen": 2020208, "step": 22430 }, { "epoch": 5.830301455301456, "grad_norm": 0.7573030591011047, "learning_rate": 4.4619296818062636e-05, "loss": 0.254, "num_input_tokens_seen": 2020672, "step": 22435 }, { "epoch": 5.831600831600832, "grad_norm": 0.31773555278778076, "learning_rate": 4.461578238291462e-05, "loss": 0.227, "num_input_tokens_seen": 2021088, "step": 22440 }, { "epoch": 5.832900207900208, "grad_norm": 0.6918585300445557, "learning_rate": 4.461226693890812e-05, "loss": 0.3446, "num_input_tokens_seen": 2021600, "step": 22445 }, { "epoch": 5.834199584199585, "grad_norm": 0.7867322564125061, "learning_rate": 4.460875048622395e-05, "loss": 0.2652, "num_input_tokens_seen": 2022016, "step": 22450 }, { "epoch": 5.835498960498961, "grad_norm": 1.2712267637252808, "learning_rate": 4.460523302504295e-05, "loss": 0.2634, "num_input_tokens_seen": 2022480, "step": 22455 }, { "epoch": 5.836798336798337, "grad_norm": 0.36796894669532776, "learning_rate": 4.460171455554603e-05, "loss": 0.2514, "num_input_tokens_seen": 2022960, "step": 22460 }, { "epoch": 5.838097713097713, "grad_norm": 0.7071883082389832, "learning_rate": 4.4598195077914145e-05, "loss": 0.2965, "num_input_tokens_seen": 2023392, "step": 22465 }, { "epoch": 5.83939708939709, "grad_norm": 0.7316250801086426, "learning_rate": 4.4594674592328314e-05, "loss": 0.2722, "num_input_tokens_seen": 2023840, "step": 22470 }, { "epoch": 5.840696465696466, "grad_norm": 0.4876120686531067, "learning_rate": 4.459115309896959e-05, "loss": 0.2697, "num_input_tokens_seen": 2024288, "step": 22475 }, { "epoch": 5.841995841995842, "grad_norm": 0.7405224442481995, "learning_rate": 4.458763059801909e-05, "loss": 0.2475, "num_input_tokens_seen": 2024704, "step": 22480 }, { "epoch": 5.843295218295218, "grad_norm": 0.46126872301101685, "learning_rate": 4.458410708965799e-05, "loss": 0.2452, "num_input_tokens_seen": 2025168, "step": 22485 }, { "epoch": 5.844594594594595, "grad_norm": 0.3386567533016205, "learning_rate": 4.458058257406749e-05, "loss": 0.3132, "num_input_tokens_seen": 2025616, "step": 22490 }, { "epoch": 5.845893970893971, "grad_norm": 0.57366943359375, "learning_rate": 4.4577057051428865e-05, "loss": 0.2458, "num_input_tokens_seen": 2026048, "step": 22495 }, { "epoch": 5.847193347193347, "grad_norm": 0.4620266258716583, "learning_rate": 4.4573530521923445e-05, "loss": 0.3862, "num_input_tokens_seen": 2026464, "step": 22500 }, { "epoch": 5.848492723492724, "grad_norm": 0.4447748064994812, "learning_rate": 4.457000298573259e-05, "loss": 0.2624, "num_input_tokens_seen": 2026880, "step": 22505 }, { "epoch": 5.8497920997921, "grad_norm": 0.3509677052497864, "learning_rate": 4.4566474443037733e-05, "loss": 0.2201, "num_input_tokens_seen": 2027328, "step": 22510 }, { "epoch": 5.851091476091476, "grad_norm": 1.0303692817687988, "learning_rate": 4.4562944894020345e-05, "loss": 0.2761, "num_input_tokens_seen": 2027792, "step": 22515 }, { "epoch": 5.852390852390853, "grad_norm": 0.29279208183288574, "learning_rate": 4.455941433886196e-05, "loss": 0.2249, "num_input_tokens_seen": 2028256, "step": 22520 }, { "epoch": 5.853690228690229, "grad_norm": 0.3236721456050873, "learning_rate": 4.455588277774416e-05, "loss": 0.2623, "num_input_tokens_seen": 2028688, "step": 22525 }, { "epoch": 5.854989604989605, "grad_norm": 0.24104449152946472, "learning_rate": 4.455235021084856e-05, "loss": 0.2527, "num_input_tokens_seen": 2029120, "step": 22530 }, { "epoch": 5.856288981288982, "grad_norm": 0.24055200815200806, "learning_rate": 4.454881663835686e-05, "loss": 0.2861, "num_input_tokens_seen": 2029568, "step": 22535 }, { "epoch": 5.857588357588358, "grad_norm": 0.31761664152145386, "learning_rate": 4.454528206045079e-05, "loss": 0.2252, "num_input_tokens_seen": 2030000, "step": 22540 }, { "epoch": 5.858887733887734, "grad_norm": 0.38519707322120667, "learning_rate": 4.454174647731213e-05, "loss": 0.2726, "num_input_tokens_seen": 2030448, "step": 22545 }, { "epoch": 5.86018711018711, "grad_norm": 0.32846012711524963, "learning_rate": 4.453820988912273e-05, "loss": 0.2217, "num_input_tokens_seen": 2030944, "step": 22550 }, { "epoch": 5.861486486486487, "grad_norm": 0.563916802406311, "learning_rate": 4.453467229606448e-05, "loss": 0.2884, "num_input_tokens_seen": 2031392, "step": 22555 }, { "epoch": 5.862785862785863, "grad_norm": 0.3061181604862213, "learning_rate": 4.4531133698319306e-05, "loss": 0.2433, "num_input_tokens_seen": 2031856, "step": 22560 }, { "epoch": 5.864085239085239, "grad_norm": 1.2344037294387817, "learning_rate": 4.452759409606922e-05, "loss": 0.3269, "num_input_tokens_seen": 2032288, "step": 22565 }, { "epoch": 5.865384615384615, "grad_norm": 0.250635027885437, "learning_rate": 4.4524053489496255e-05, "loss": 0.2194, "num_input_tokens_seen": 2032784, "step": 22570 }, { "epoch": 5.866683991683992, "grad_norm": 0.3063454031944275, "learning_rate": 4.452051187878251e-05, "loss": 0.3237, "num_input_tokens_seen": 2033232, "step": 22575 }, { "epoch": 5.867983367983368, "grad_norm": 0.35927456617355347, "learning_rate": 4.451696926411014e-05, "loss": 0.2384, "num_input_tokens_seen": 2033648, "step": 22580 }, { "epoch": 5.869282744282744, "grad_norm": 0.32532232999801636, "learning_rate": 4.451342564566134e-05, "loss": 0.2882, "num_input_tokens_seen": 2034112, "step": 22585 }, { "epoch": 5.870582120582121, "grad_norm": 0.4997604191303253, "learning_rate": 4.450988102361836e-05, "loss": 0.2337, "num_input_tokens_seen": 2034592, "step": 22590 }, { "epoch": 5.871881496881497, "grad_norm": 0.3672897219657898, "learning_rate": 4.450633539816351e-05, "loss": 0.1514, "num_input_tokens_seen": 2035040, "step": 22595 }, { "epoch": 5.873180873180873, "grad_norm": 0.22080576419830322, "learning_rate": 4.4502788769479136e-05, "loss": 0.217, "num_input_tokens_seen": 2035472, "step": 22600 }, { "epoch": 5.87448024948025, "grad_norm": 0.36502012610435486, "learning_rate": 4.449924113774765e-05, "loss": 0.2593, "num_input_tokens_seen": 2035888, "step": 22605 }, { "epoch": 5.875779625779626, "grad_norm": 0.1847895383834839, "learning_rate": 4.449569250315151e-05, "loss": 0.1899, "num_input_tokens_seen": 2036336, "step": 22610 }, { "epoch": 5.877079002079002, "grad_norm": 0.8369194865226746, "learning_rate": 4.4492142865873225e-05, "loss": 0.1998, "num_input_tokens_seen": 2036752, "step": 22615 }, { "epoch": 5.878378378378378, "grad_norm": 0.3710474669933319, "learning_rate": 4.448859222609536e-05, "loss": 0.4153, "num_input_tokens_seen": 2037232, "step": 22620 }, { "epoch": 5.879677754677755, "grad_norm": 0.22315606474876404, "learning_rate": 4.4485040584000514e-05, "loss": 0.3149, "num_input_tokens_seen": 2037680, "step": 22625 }, { "epoch": 5.880977130977131, "grad_norm": 0.26009759306907654, "learning_rate": 4.448148793977137e-05, "loss": 0.2525, "num_input_tokens_seen": 2038144, "step": 22630 }, { "epoch": 5.882276507276507, "grad_norm": 0.3852096199989319, "learning_rate": 4.447793429359063e-05, "loss": 0.2147, "num_input_tokens_seen": 2038576, "step": 22635 }, { "epoch": 5.883575883575883, "grad_norm": 0.36121541261672974, "learning_rate": 4.4474379645641064e-05, "loss": 0.2383, "num_input_tokens_seen": 2038992, "step": 22640 }, { "epoch": 5.88487525987526, "grad_norm": 0.23539002239704132, "learning_rate": 4.447082399610549e-05, "loss": 0.2568, "num_input_tokens_seen": 2039440, "step": 22645 }, { "epoch": 5.886174636174636, "grad_norm": 0.958733856678009, "learning_rate": 4.44672673451668e-05, "loss": 0.2142, "num_input_tokens_seen": 2039888, "step": 22650 }, { "epoch": 5.887474012474012, "grad_norm": 0.28608790040016174, "learning_rate": 4.4463709693007884e-05, "loss": 0.3103, "num_input_tokens_seen": 2040336, "step": 22655 }, { "epoch": 5.888773388773389, "grad_norm": 0.652525782585144, "learning_rate": 4.446015103981173e-05, "loss": 0.2715, "num_input_tokens_seen": 2040816, "step": 22660 }, { "epoch": 5.890072765072765, "grad_norm": 0.9045510292053223, "learning_rate": 4.445659138576136e-05, "loss": 0.2565, "num_input_tokens_seen": 2041296, "step": 22665 }, { "epoch": 5.891372141372141, "grad_norm": 0.37546640634536743, "learning_rate": 4.445303073103986e-05, "loss": 0.2203, "num_input_tokens_seen": 2041760, "step": 22670 }, { "epoch": 5.892671517671518, "grad_norm": 0.5104849338531494, "learning_rate": 4.4449469075830344e-05, "loss": 0.2323, "num_input_tokens_seen": 2042176, "step": 22675 }, { "epoch": 5.893970893970894, "grad_norm": 0.3849117159843445, "learning_rate": 4.4445906420316e-05, "loss": 0.2233, "num_input_tokens_seen": 2042608, "step": 22680 }, { "epoch": 5.89527027027027, "grad_norm": 0.2945794463157654, "learning_rate": 4.444234276468006e-05, "loss": 0.2176, "num_input_tokens_seen": 2043024, "step": 22685 }, { "epoch": 5.896569646569646, "grad_norm": 1.6614586114883423, "learning_rate": 4.44387781091058e-05, "loss": 0.2486, "num_input_tokens_seen": 2043488, "step": 22690 }, { "epoch": 5.897869022869023, "grad_norm": 0.4463253319263458, "learning_rate": 4.4435212453776553e-05, "loss": 0.3273, "num_input_tokens_seen": 2043968, "step": 22695 }, { "epoch": 5.899168399168399, "grad_norm": 0.6085691452026367, "learning_rate": 4.4431645798875715e-05, "loss": 0.2777, "num_input_tokens_seen": 2044400, "step": 22700 }, { "epoch": 5.900467775467775, "grad_norm": 0.6139006018638611, "learning_rate": 4.442807814458672e-05, "loss": 0.2603, "num_input_tokens_seen": 2044912, "step": 22705 }, { "epoch": 5.901767151767151, "grad_norm": 0.29871323704719543, "learning_rate": 4.442450949109304e-05, "loss": 0.2535, "num_input_tokens_seen": 2045344, "step": 22710 }, { "epoch": 5.903066528066528, "grad_norm": 0.3369613289833069, "learning_rate": 4.4420939838578224e-05, "loss": 0.36, "num_input_tokens_seen": 2045840, "step": 22715 }, { "epoch": 5.904365904365904, "grad_norm": 0.4582430124282837, "learning_rate": 4.441736918722587e-05, "loss": 0.2331, "num_input_tokens_seen": 2046352, "step": 22720 }, { "epoch": 5.90566528066528, "grad_norm": 0.36941471695899963, "learning_rate": 4.4413797537219605e-05, "loss": 0.2328, "num_input_tokens_seen": 2046768, "step": 22725 }, { "epoch": 5.906964656964657, "grad_norm": 0.3588760197162628, "learning_rate": 4.4410224888743136e-05, "loss": 0.2442, "num_input_tokens_seen": 2047232, "step": 22730 }, { "epoch": 5.908264033264033, "grad_norm": 0.4082864820957184, "learning_rate": 4.4406651241980205e-05, "loss": 0.2324, "num_input_tokens_seen": 2047648, "step": 22735 }, { "epoch": 5.909563409563409, "grad_norm": 0.23735204339027405, "learning_rate": 4.4403076597114605e-05, "loss": 0.1601, "num_input_tokens_seen": 2048096, "step": 22740 }, { "epoch": 5.910862785862786, "grad_norm": 0.8246674537658691, "learning_rate": 4.439950095433019e-05, "loss": 0.3001, "num_input_tokens_seen": 2048560, "step": 22745 }, { "epoch": 5.912162162162162, "grad_norm": 0.6108237504959106, "learning_rate": 4.4395924313810845e-05, "loss": 0.3543, "num_input_tokens_seen": 2049024, "step": 22750 }, { "epoch": 5.913461538461538, "grad_norm": 0.27715227007865906, "learning_rate": 4.4392346675740535e-05, "loss": 0.2513, "num_input_tokens_seen": 2049488, "step": 22755 }, { "epoch": 5.914760914760915, "grad_norm": 0.2653399109840393, "learning_rate": 4.438876804030325e-05, "loss": 0.2524, "num_input_tokens_seen": 2049904, "step": 22760 }, { "epoch": 5.916060291060291, "grad_norm": 0.4664188325405121, "learning_rate": 4.438518840768304e-05, "loss": 0.2611, "num_input_tokens_seen": 2050320, "step": 22765 }, { "epoch": 5.917359667359667, "grad_norm": 0.33595019578933716, "learning_rate": 4.438160777806403e-05, "loss": 0.2542, "num_input_tokens_seen": 2050768, "step": 22770 }, { "epoch": 5.918659043659043, "grad_norm": 1.4412641525268555, "learning_rate": 4.437802615163036e-05, "loss": 0.247, "num_input_tokens_seen": 2051200, "step": 22775 }, { "epoch": 5.91995841995842, "grad_norm": 0.18819715082645416, "learning_rate": 4.4374443528566236e-05, "loss": 0.3364, "num_input_tokens_seen": 2051664, "step": 22780 }, { "epoch": 5.921257796257796, "grad_norm": 0.49079257249832153, "learning_rate": 4.437085990905591e-05, "loss": 0.2952, "num_input_tokens_seen": 2052128, "step": 22785 }, { "epoch": 5.922557172557172, "grad_norm": 0.6146979331970215, "learning_rate": 4.4367275293283704e-05, "loss": 0.256, "num_input_tokens_seen": 2052592, "step": 22790 }, { "epoch": 5.923856548856548, "grad_norm": 0.34230971336364746, "learning_rate": 4.4363689681433974e-05, "loss": 0.2622, "num_input_tokens_seen": 2053024, "step": 22795 }, { "epoch": 5.925155925155925, "grad_norm": 0.47006842494010925, "learning_rate": 4.4360103073691125e-05, "loss": 0.28, "num_input_tokens_seen": 2053520, "step": 22800 }, { "epoch": 5.926455301455301, "grad_norm": 0.35302597284317017, "learning_rate": 4.435651547023963e-05, "loss": 0.2831, "num_input_tokens_seen": 2053968, "step": 22805 }, { "epoch": 5.9277546777546775, "grad_norm": 0.5515212416648865, "learning_rate": 4.4352926871264e-05, "loss": 0.2516, "num_input_tokens_seen": 2054400, "step": 22810 }, { "epoch": 5.929054054054054, "grad_norm": 0.578870952129364, "learning_rate": 4.434933727694879e-05, "loss": 0.1545, "num_input_tokens_seen": 2054848, "step": 22815 }, { "epoch": 5.93035343035343, "grad_norm": 2.1379945278167725, "learning_rate": 4.4345746687478636e-05, "loss": 0.2356, "num_input_tokens_seen": 2055280, "step": 22820 }, { "epoch": 5.9316528066528065, "grad_norm": 0.31868425011634827, "learning_rate": 4.4342155103038184e-05, "loss": 0.2263, "num_input_tokens_seen": 2055744, "step": 22825 }, { "epoch": 5.932952182952183, "grad_norm": 0.339687705039978, "learning_rate": 4.4338562523812166e-05, "loss": 0.2176, "num_input_tokens_seen": 2056208, "step": 22830 }, { "epoch": 5.9342515592515594, "grad_norm": 0.3047400712966919, "learning_rate": 4.4334968949985344e-05, "loss": 0.2598, "num_input_tokens_seen": 2056624, "step": 22835 }, { "epoch": 5.9355509355509355, "grad_norm": 1.2210357189178467, "learning_rate": 4.433137438174255e-05, "loss": 0.3443, "num_input_tokens_seen": 2057072, "step": 22840 }, { "epoch": 5.9368503118503115, "grad_norm": 0.5147764086723328, "learning_rate": 4.4327778819268636e-05, "loss": 0.2784, "num_input_tokens_seen": 2057520, "step": 22845 }, { "epoch": 5.9381496881496885, "grad_norm": 0.48841074109077454, "learning_rate": 4.432418226274856e-05, "loss": 0.1757, "num_input_tokens_seen": 2057952, "step": 22850 }, { "epoch": 5.9394490644490645, "grad_norm": 0.37588080763816833, "learning_rate": 4.432058471236726e-05, "loss": 0.1217, "num_input_tokens_seen": 2058496, "step": 22855 }, { "epoch": 5.9407484407484406, "grad_norm": 0.43826860189437866, "learning_rate": 4.4316986168309774e-05, "loss": 0.2847, "num_input_tokens_seen": 2058944, "step": 22860 }, { "epoch": 5.942047817047817, "grad_norm": 0.3024730682373047, "learning_rate": 4.431338663076119e-05, "loss": 0.261, "num_input_tokens_seen": 2059392, "step": 22865 }, { "epoch": 5.9433471933471935, "grad_norm": 0.30897000432014465, "learning_rate": 4.430978609990662e-05, "loss": 0.2071, "num_input_tokens_seen": 2059840, "step": 22870 }, { "epoch": 5.94464656964657, "grad_norm": 0.33669188618659973, "learning_rate": 4.430618457593125e-05, "loss": 0.1574, "num_input_tokens_seen": 2060272, "step": 22875 }, { "epoch": 5.945945945945946, "grad_norm": 0.26020941138267517, "learning_rate": 4.4302582059020316e-05, "loss": 0.245, "num_input_tokens_seen": 2060720, "step": 22880 }, { "epoch": 5.9472453222453225, "grad_norm": 0.28105250000953674, "learning_rate": 4.4298978549359085e-05, "loss": 0.1375, "num_input_tokens_seen": 2061232, "step": 22885 }, { "epoch": 5.948544698544699, "grad_norm": 0.4056209921836853, "learning_rate": 4.4295374047132896e-05, "loss": 0.3888, "num_input_tokens_seen": 2061680, "step": 22890 }, { "epoch": 5.949844074844075, "grad_norm": 0.4175223410129547, "learning_rate": 4.429176855252713e-05, "loss": 0.2666, "num_input_tokens_seen": 2062128, "step": 22895 }, { "epoch": 5.951143451143452, "grad_norm": 0.2861492335796356, "learning_rate": 4.4288162065727226e-05, "loss": 0.2771, "num_input_tokens_seen": 2062608, "step": 22900 }, { "epoch": 5.952442827442828, "grad_norm": 0.46177947521209717, "learning_rate": 4.428455458691866e-05, "loss": 0.2072, "num_input_tokens_seen": 2063088, "step": 22905 }, { "epoch": 5.953742203742204, "grad_norm": 0.528645396232605, "learning_rate": 4.428094611628698e-05, "loss": 0.2285, "num_input_tokens_seen": 2063536, "step": 22910 }, { "epoch": 5.95504158004158, "grad_norm": 0.34573236107826233, "learning_rate": 4.427733665401776e-05, "loss": 0.2257, "num_input_tokens_seen": 2063968, "step": 22915 }, { "epoch": 5.956340956340957, "grad_norm": 0.2756308317184448, "learning_rate": 4.427372620029666e-05, "loss": 0.2011, "num_input_tokens_seen": 2064448, "step": 22920 }, { "epoch": 5.957640332640333, "grad_norm": 0.217239111661911, "learning_rate": 4.427011475530934e-05, "loss": 0.2623, "num_input_tokens_seen": 2064912, "step": 22925 }, { "epoch": 5.958939708939709, "grad_norm": 0.2667585611343384, "learning_rate": 4.426650231924157e-05, "loss": 0.3263, "num_input_tokens_seen": 2065328, "step": 22930 }, { "epoch": 5.960239085239085, "grad_norm": 0.380886971950531, "learning_rate": 4.426288889227911e-05, "loss": 0.2762, "num_input_tokens_seen": 2065776, "step": 22935 }, { "epoch": 5.961538461538462, "grad_norm": 0.2854301631450653, "learning_rate": 4.425927447460782e-05, "loss": 0.2053, "num_input_tokens_seen": 2066256, "step": 22940 }, { "epoch": 5.962837837837838, "grad_norm": 0.4942880868911743, "learning_rate": 4.4255659066413595e-05, "loss": 0.2953, "num_input_tokens_seen": 2066704, "step": 22945 }, { "epoch": 5.964137214137214, "grad_norm": 0.2398216873407364, "learning_rate": 4.425204266788238e-05, "loss": 0.3111, "num_input_tokens_seen": 2067120, "step": 22950 }, { "epoch": 5.965436590436591, "grad_norm": 0.6892995834350586, "learning_rate": 4.424842527920015e-05, "loss": 0.2757, "num_input_tokens_seen": 2067600, "step": 22955 }, { "epoch": 5.966735966735967, "grad_norm": 0.5260847806930542, "learning_rate": 4.424480690055297e-05, "loss": 0.2612, "num_input_tokens_seen": 2068000, "step": 22960 }, { "epoch": 5.968035343035343, "grad_norm": 0.6317753195762634, "learning_rate": 4.424118753212694e-05, "loss": 0.2112, "num_input_tokens_seen": 2068480, "step": 22965 }, { "epoch": 5.96933471933472, "grad_norm": 0.4012170135974884, "learning_rate": 4.4237567174108186e-05, "loss": 0.2959, "num_input_tokens_seen": 2068976, "step": 22970 }, { "epoch": 5.970634095634096, "grad_norm": 0.9193636775016785, "learning_rate": 4.423394582668293e-05, "loss": 0.3073, "num_input_tokens_seen": 2069440, "step": 22975 }, { "epoch": 5.971933471933472, "grad_norm": 0.5759416222572327, "learning_rate": 4.4230323490037405e-05, "loss": 0.2917, "num_input_tokens_seen": 2069904, "step": 22980 }, { "epoch": 5.973232848232849, "grad_norm": 0.4505932629108429, "learning_rate": 4.422670016435792e-05, "loss": 0.2604, "num_input_tokens_seen": 2070368, "step": 22985 }, { "epoch": 5.974532224532225, "grad_norm": 0.12762175500392914, "learning_rate": 4.4223075849830824e-05, "loss": 0.3069, "num_input_tokens_seen": 2070816, "step": 22990 }, { "epoch": 5.975831600831601, "grad_norm": 0.4594983160495758, "learning_rate": 4.421945054664251e-05, "loss": 0.2448, "num_input_tokens_seen": 2071312, "step": 22995 }, { "epoch": 5.977130977130977, "grad_norm": 0.560385525226593, "learning_rate": 4.421582425497945e-05, "loss": 0.2937, "num_input_tokens_seen": 2071776, "step": 23000 }, { "epoch": 5.978430353430354, "grad_norm": 0.14138640463352203, "learning_rate": 4.421219697502814e-05, "loss": 0.2745, "num_input_tokens_seen": 2072208, "step": 23005 }, { "epoch": 5.97972972972973, "grad_norm": 0.1365634799003601, "learning_rate": 4.420856870697512e-05, "loss": 0.2884, "num_input_tokens_seen": 2072672, "step": 23010 }, { "epoch": 5.981029106029106, "grad_norm": 0.5506166815757751, "learning_rate": 4.420493945100702e-05, "loss": 0.2826, "num_input_tokens_seen": 2073136, "step": 23015 }, { "epoch": 5.982328482328482, "grad_norm": 0.4381242096424103, "learning_rate": 4.420130920731047e-05, "loss": 0.261, "num_input_tokens_seen": 2073568, "step": 23020 }, { "epoch": 5.983627858627859, "grad_norm": 0.4196803867816925, "learning_rate": 4.419767797607219e-05, "loss": 0.2645, "num_input_tokens_seen": 2074016, "step": 23025 }, { "epoch": 5.984927234927235, "grad_norm": 0.3076944053173065, "learning_rate": 4.419404575747894e-05, "loss": 0.2003, "num_input_tokens_seen": 2074480, "step": 23030 }, { "epoch": 5.986226611226611, "grad_norm": 0.29310664534568787, "learning_rate": 4.419041255171753e-05, "loss": 0.3067, "num_input_tokens_seen": 2074928, "step": 23035 }, { "epoch": 5.987525987525988, "grad_norm": 0.2567193806171417, "learning_rate": 4.4186778358974814e-05, "loss": 0.2611, "num_input_tokens_seen": 2075408, "step": 23040 }, { "epoch": 5.988825363825364, "grad_norm": 0.2621706426143646, "learning_rate": 4.41831431794377e-05, "loss": 0.2023, "num_input_tokens_seen": 2075888, "step": 23045 }, { "epoch": 5.99012474012474, "grad_norm": 0.5059919953346252, "learning_rate": 4.417950701329315e-05, "loss": 0.2962, "num_input_tokens_seen": 2076336, "step": 23050 }, { "epoch": 5.991424116424117, "grad_norm": 0.25267669558525085, "learning_rate": 4.417586986072819e-05, "loss": 0.2378, "num_input_tokens_seen": 2076768, "step": 23055 }, { "epoch": 5.992723492723493, "grad_norm": 0.5702773332595825, "learning_rate": 4.4172231721929856e-05, "loss": 0.2325, "num_input_tokens_seen": 2077216, "step": 23060 }, { "epoch": 5.994022869022869, "grad_norm": 0.24609914422035217, "learning_rate": 4.416859259708528e-05, "loss": 0.2872, "num_input_tokens_seen": 2077648, "step": 23065 }, { "epoch": 5.995322245322245, "grad_norm": 0.5409969091415405, "learning_rate": 4.4164952486381616e-05, "loss": 0.2476, "num_input_tokens_seen": 2078048, "step": 23070 }, { "epoch": 5.996621621621622, "grad_norm": 0.27482402324676514, "learning_rate": 4.4161311390006085e-05, "loss": 0.2422, "num_input_tokens_seen": 2078496, "step": 23075 }, { "epoch": 5.997920997920998, "grad_norm": 0.32163286209106445, "learning_rate": 4.415766930814595e-05, "loss": 0.2665, "num_input_tokens_seen": 2078928, "step": 23080 }, { "epoch": 5.999220374220374, "grad_norm": 0.28987976908683777, "learning_rate": 4.415402624098853e-05, "loss": 0.1872, "num_input_tokens_seen": 2079424, "step": 23085 }, { "epoch": 6.0, "eval_loss": 0.2394219934940338, "eval_runtime": 13.1705, "eval_samples_per_second": 64.994, "eval_steps_per_second": 32.497, "num_input_tokens_seen": 2079640, "step": 23088 }, { "epoch": 6.000519750519751, "grad_norm": 0.27582088112831116, "learning_rate": 4.4150382188721184e-05, "loss": 0.2024, "num_input_tokens_seen": 2079800, "step": 23090 }, { "epoch": 6.001819126819127, "grad_norm": 0.2738741338253021, "learning_rate": 4.414673715153134e-05, "loss": 0.2395, "num_input_tokens_seen": 2080248, "step": 23095 }, { "epoch": 6.003118503118503, "grad_norm": 0.23945991694927216, "learning_rate": 4.414309112960645e-05, "loss": 0.2066, "num_input_tokens_seen": 2080680, "step": 23100 }, { "epoch": 6.004417879417879, "grad_norm": 0.2561686635017395, "learning_rate": 4.413944412313405e-05, "loss": 0.1966, "num_input_tokens_seen": 2081176, "step": 23105 }, { "epoch": 6.005717255717256, "grad_norm": 0.3941679000854492, "learning_rate": 4.413579613230169e-05, "loss": 0.1575, "num_input_tokens_seen": 2081592, "step": 23110 }, { "epoch": 6.007016632016632, "grad_norm": 0.20915843546390533, "learning_rate": 4.4132147157297e-05, "loss": 0.1409, "num_input_tokens_seen": 2082024, "step": 23115 }, { "epoch": 6.008316008316008, "grad_norm": 0.21941009163856506, "learning_rate": 4.412849719830765e-05, "loss": 0.2236, "num_input_tokens_seen": 2082472, "step": 23120 }, { "epoch": 6.009615384615385, "grad_norm": 0.21307086944580078, "learning_rate": 4.412484625552137e-05, "loss": 0.2259, "num_input_tokens_seen": 2082888, "step": 23125 }, { "epoch": 6.010914760914761, "grad_norm": 0.8879602551460266, "learning_rate": 4.412119432912592e-05, "loss": 0.2525, "num_input_tokens_seen": 2083384, "step": 23130 }, { "epoch": 6.012214137214137, "grad_norm": 0.7133299708366394, "learning_rate": 4.411754141930912e-05, "loss": 0.3148, "num_input_tokens_seen": 2083832, "step": 23135 }, { "epoch": 6.013513513513513, "grad_norm": 0.20615534484386444, "learning_rate": 4.411388752625885e-05, "loss": 0.3056, "num_input_tokens_seen": 2084280, "step": 23140 }, { "epoch": 6.01481288981289, "grad_norm": 0.20421136915683746, "learning_rate": 4.411023265016303e-05, "loss": 0.2948, "num_input_tokens_seen": 2084712, "step": 23145 }, { "epoch": 6.016112266112266, "grad_norm": 0.46942806243896484, "learning_rate": 4.410657679120964e-05, "loss": 0.2354, "num_input_tokens_seen": 2085144, "step": 23150 }, { "epoch": 6.017411642411642, "grad_norm": 0.18818484246730804, "learning_rate": 4.410291994958669e-05, "loss": 0.2789, "num_input_tokens_seen": 2085608, "step": 23155 }, { "epoch": 6.018711018711019, "grad_norm": 0.2991642653942108, "learning_rate": 4.409926212548227e-05, "loss": 0.1912, "num_input_tokens_seen": 2086040, "step": 23160 }, { "epoch": 6.020010395010395, "grad_norm": 0.23250478506088257, "learning_rate": 4.409560331908449e-05, "loss": 0.2327, "num_input_tokens_seen": 2086504, "step": 23165 }, { "epoch": 6.021309771309771, "grad_norm": 0.6434574723243713, "learning_rate": 4.4091943530581534e-05, "loss": 0.2127, "num_input_tokens_seen": 2086952, "step": 23170 }, { "epoch": 6.022609147609147, "grad_norm": 0.7582514882087708, "learning_rate": 4.408828276016164e-05, "loss": 0.2621, "num_input_tokens_seen": 2087400, "step": 23175 }, { "epoch": 6.023908523908524, "grad_norm": 0.5131765007972717, "learning_rate": 4.408462100801307e-05, "loss": 0.3548, "num_input_tokens_seen": 2087832, "step": 23180 }, { "epoch": 6.0252079002079, "grad_norm": 0.26483362913131714, "learning_rate": 4.4080958274324155e-05, "loss": 0.267, "num_input_tokens_seen": 2088296, "step": 23185 }, { "epoch": 6.026507276507276, "grad_norm": 0.17765414714813232, "learning_rate": 4.4077294559283274e-05, "loss": 0.2688, "num_input_tokens_seen": 2088760, "step": 23190 }, { "epoch": 6.027806652806653, "grad_norm": 0.44793590903282166, "learning_rate": 4.407362986307886e-05, "loss": 0.2333, "num_input_tokens_seen": 2089176, "step": 23195 }, { "epoch": 6.029106029106029, "grad_norm": 0.4089392423629761, "learning_rate": 4.406996418589937e-05, "loss": 0.2642, "num_input_tokens_seen": 2089656, "step": 23200 }, { "epoch": 6.030405405405405, "grad_norm": 0.4200344383716583, "learning_rate": 4.406629752793336e-05, "loss": 0.2267, "num_input_tokens_seen": 2090136, "step": 23205 }, { "epoch": 6.031704781704781, "grad_norm": 0.42332443594932556, "learning_rate": 4.406262988936941e-05, "loss": 0.3327, "num_input_tokens_seen": 2090616, "step": 23210 }, { "epoch": 6.033004158004158, "grad_norm": 0.8177331686019897, "learning_rate": 4.4058961270396125e-05, "loss": 0.3163, "num_input_tokens_seen": 2091064, "step": 23215 }, { "epoch": 6.034303534303534, "grad_norm": 0.547313928604126, "learning_rate": 4.405529167120221e-05, "loss": 0.3137, "num_input_tokens_seen": 2091512, "step": 23220 }, { "epoch": 6.03560291060291, "grad_norm": 0.5993756055831909, "learning_rate": 4.405162109197638e-05, "loss": 0.2653, "num_input_tokens_seen": 2091944, "step": 23225 }, { "epoch": 6.036902286902287, "grad_norm": 0.36859026551246643, "learning_rate": 4.404794953290743e-05, "loss": 0.2776, "num_input_tokens_seen": 2092408, "step": 23230 }, { "epoch": 6.038201663201663, "grad_norm": 0.401405394077301, "learning_rate": 4.4044276994184175e-05, "loss": 0.2461, "num_input_tokens_seen": 2092856, "step": 23235 }, { "epoch": 6.039501039501039, "grad_norm": 0.338809996843338, "learning_rate": 4.404060347599551e-05, "loss": 0.1722, "num_input_tokens_seen": 2093320, "step": 23240 }, { "epoch": 6.040800415800415, "grad_norm": 0.5135236382484436, "learning_rate": 4.403692897853037e-05, "loss": 0.2554, "num_input_tokens_seen": 2093768, "step": 23245 }, { "epoch": 6.042099792099792, "grad_norm": 0.4790891408920288, "learning_rate": 4.403325350197773e-05, "loss": 0.141, "num_input_tokens_seen": 2094200, "step": 23250 }, { "epoch": 6.043399168399168, "grad_norm": 0.3271692395210266, "learning_rate": 4.402957704652662e-05, "loss": 0.218, "num_input_tokens_seen": 2094632, "step": 23255 }, { "epoch": 6.044698544698544, "grad_norm": 0.3759535849094391, "learning_rate": 4.4025899612366134e-05, "loss": 0.3434, "num_input_tokens_seen": 2095064, "step": 23260 }, { "epoch": 6.045997920997921, "grad_norm": 0.5096385478973389, "learning_rate": 4.40222211996854e-05, "loss": 0.2092, "num_input_tokens_seen": 2095528, "step": 23265 }, { "epoch": 6.047297297297297, "grad_norm": 0.43492740392684937, "learning_rate": 4.401854180867361e-05, "loss": 0.3067, "num_input_tokens_seen": 2095992, "step": 23270 }, { "epoch": 6.048596673596673, "grad_norm": 0.4183175265789032, "learning_rate": 4.4014861439519987e-05, "loss": 0.2197, "num_input_tokens_seen": 2096440, "step": 23275 }, { "epoch": 6.04989604989605, "grad_norm": 0.3296174108982086, "learning_rate": 4.401118009241382e-05, "loss": 0.1995, "num_input_tokens_seen": 2096840, "step": 23280 }, { "epoch": 6.051195426195426, "grad_norm": 0.2927597165107727, "learning_rate": 4.4007497767544435e-05, "loss": 0.2266, "num_input_tokens_seen": 2097288, "step": 23285 }, { "epoch": 6.052494802494802, "grad_norm": 0.5146132111549377, "learning_rate": 4.400381446510124e-05, "loss": 0.3374, "num_input_tokens_seen": 2097816, "step": 23290 }, { "epoch": 6.0537941787941785, "grad_norm": 0.44381213188171387, "learning_rate": 4.400013018527366e-05, "loss": 0.295, "num_input_tokens_seen": 2098296, "step": 23295 }, { "epoch": 6.055093555093555, "grad_norm": 0.34979066252708435, "learning_rate": 4.3996444928251174e-05, "loss": 0.1885, "num_input_tokens_seen": 2098744, "step": 23300 }, { "epoch": 6.0563929313929314, "grad_norm": 0.3139982223510742, "learning_rate": 4.3992758694223326e-05, "loss": 0.1835, "num_input_tokens_seen": 2099176, "step": 23305 }, { "epoch": 6.0576923076923075, "grad_norm": 0.2798828184604645, "learning_rate": 4.39890714833797e-05, "loss": 0.1527, "num_input_tokens_seen": 2099656, "step": 23310 }, { "epoch": 6.058991683991684, "grad_norm": 0.42389845848083496, "learning_rate": 4.398538329590993e-05, "loss": 0.2602, "num_input_tokens_seen": 2100152, "step": 23315 }, { "epoch": 6.0602910602910605, "grad_norm": 0.45178133249282837, "learning_rate": 4.398169413200371e-05, "loss": 0.2117, "num_input_tokens_seen": 2100584, "step": 23320 }, { "epoch": 6.0615904365904365, "grad_norm": 0.21342702209949493, "learning_rate": 4.397800399185077e-05, "loss": 0.0865, "num_input_tokens_seen": 2101000, "step": 23325 }, { "epoch": 6.0628898128898125, "grad_norm": 0.7106930017471313, "learning_rate": 4.397431287564091e-05, "loss": 0.3301, "num_input_tokens_seen": 2101432, "step": 23330 }, { "epoch": 6.0641891891891895, "grad_norm": 0.2703535556793213, "learning_rate": 4.397062078356395e-05, "loss": 0.2555, "num_input_tokens_seen": 2101896, "step": 23335 }, { "epoch": 6.0654885654885655, "grad_norm": 0.25415462255477905, "learning_rate": 4.396692771580979e-05, "loss": 0.1926, "num_input_tokens_seen": 2102344, "step": 23340 }, { "epoch": 6.066787941787942, "grad_norm": 0.28539779782295227, "learning_rate": 4.396323367256836e-05, "loss": 0.2936, "num_input_tokens_seen": 2102792, "step": 23345 }, { "epoch": 6.0680873180873185, "grad_norm": 0.27602264285087585, "learning_rate": 4.395953865402966e-05, "loss": 0.3096, "num_input_tokens_seen": 2103224, "step": 23350 }, { "epoch": 6.0693866943866945, "grad_norm": 0.3478419780731201, "learning_rate": 4.395584266038372e-05, "loss": 0.2176, "num_input_tokens_seen": 2103704, "step": 23355 }, { "epoch": 6.070686070686071, "grad_norm": 0.33031946420669556, "learning_rate": 4.395214569182062e-05, "loss": 0.2239, "num_input_tokens_seen": 2104120, "step": 23360 }, { "epoch": 6.071985446985447, "grad_norm": 0.6064859628677368, "learning_rate": 4.394844774853051e-05, "loss": 0.2178, "num_input_tokens_seen": 2104568, "step": 23365 }, { "epoch": 6.0732848232848236, "grad_norm": 0.26747366786003113, "learning_rate": 4.3944748830703586e-05, "loss": 0.2073, "num_input_tokens_seen": 2105000, "step": 23370 }, { "epoch": 6.0745841995842, "grad_norm": 0.5631616115570068, "learning_rate": 4.394104893853007e-05, "loss": 0.2762, "num_input_tokens_seen": 2105448, "step": 23375 }, { "epoch": 6.075883575883576, "grad_norm": 0.2729043662548065, "learning_rate": 4.393734807220026e-05, "loss": 0.2621, "num_input_tokens_seen": 2105880, "step": 23380 }, { "epoch": 6.077182952182953, "grad_norm": 0.30359575152397156, "learning_rate": 4.3933646231904504e-05, "loss": 0.3066, "num_input_tokens_seen": 2106360, "step": 23385 }, { "epoch": 6.078482328482329, "grad_norm": 0.32484546303749084, "learning_rate": 4.3929943417833166e-05, "loss": 0.3473, "num_input_tokens_seen": 2106776, "step": 23390 }, { "epoch": 6.079781704781705, "grad_norm": 0.43259936571121216, "learning_rate": 4.3926239630176703e-05, "loss": 0.2367, "num_input_tokens_seen": 2107208, "step": 23395 }, { "epoch": 6.081081081081081, "grad_norm": 0.45532694458961487, "learning_rate": 4.392253486912561e-05, "loss": 0.2526, "num_input_tokens_seen": 2107656, "step": 23400 }, { "epoch": 6.082380457380458, "grad_norm": 0.6060383915901184, "learning_rate": 4.391882913487041e-05, "loss": 0.2785, "num_input_tokens_seen": 2108088, "step": 23405 }, { "epoch": 6.083679833679834, "grad_norm": 0.3493083715438843, "learning_rate": 4.39151224276017e-05, "loss": 0.1797, "num_input_tokens_seen": 2108584, "step": 23410 }, { "epoch": 6.08497920997921, "grad_norm": 0.45910605788230896, "learning_rate": 4.3911414747510126e-05, "loss": 0.2531, "num_input_tokens_seen": 2109080, "step": 23415 }, { "epoch": 6.086278586278587, "grad_norm": 0.27323010563850403, "learning_rate": 4.3907706094786364e-05, "loss": 0.2251, "num_input_tokens_seen": 2109528, "step": 23420 }, { "epoch": 6.087577962577963, "grad_norm": 0.3357374966144562, "learning_rate": 4.390399646962117e-05, "loss": 0.2428, "num_input_tokens_seen": 2109992, "step": 23425 }, { "epoch": 6.088877338877339, "grad_norm": 0.33342245221138, "learning_rate": 4.390028587220531e-05, "loss": 0.2997, "num_input_tokens_seen": 2110440, "step": 23430 }, { "epoch": 6.090176715176715, "grad_norm": 0.27940186858177185, "learning_rate": 4.3896574302729655e-05, "loss": 0.1268, "num_input_tokens_seen": 2110840, "step": 23435 }, { "epoch": 6.091476091476092, "grad_norm": 0.7291778326034546, "learning_rate": 4.3892861761385076e-05, "loss": 0.4115, "num_input_tokens_seen": 2111272, "step": 23440 }, { "epoch": 6.092775467775468, "grad_norm": 0.2849505841732025, "learning_rate": 4.3889148248362506e-05, "loss": 0.2467, "num_input_tokens_seen": 2111704, "step": 23445 }, { "epoch": 6.094074844074844, "grad_norm": 0.36755773425102234, "learning_rate": 4.3885433763852945e-05, "loss": 0.2215, "num_input_tokens_seen": 2112136, "step": 23450 }, { "epoch": 6.095374220374221, "grad_norm": 0.7848150134086609, "learning_rate": 4.388171830804743e-05, "loss": 0.302, "num_input_tokens_seen": 2112600, "step": 23455 }, { "epoch": 6.096673596673597, "grad_norm": 0.4941909909248352, "learning_rate": 4.3878001881137054e-05, "loss": 0.23, "num_input_tokens_seen": 2113016, "step": 23460 }, { "epoch": 6.097972972972973, "grad_norm": 0.57611083984375, "learning_rate": 4.387428448331295e-05, "loss": 0.238, "num_input_tokens_seen": 2113448, "step": 23465 }, { "epoch": 6.099272349272349, "grad_norm": 0.5079919695854187, "learning_rate": 4.387056611476631e-05, "loss": 0.3056, "num_input_tokens_seen": 2113912, "step": 23470 }, { "epoch": 6.100571725571726, "grad_norm": 0.6524064540863037, "learning_rate": 4.386684677568838e-05, "loss": 0.3222, "num_input_tokens_seen": 2114360, "step": 23475 }, { "epoch": 6.101871101871102, "grad_norm": 0.36562079191207886, "learning_rate": 4.3863126466270436e-05, "loss": 0.2736, "num_input_tokens_seen": 2114824, "step": 23480 }, { "epoch": 6.103170478170478, "grad_norm": 0.7068233489990234, "learning_rate": 4.3859405186703825e-05, "loss": 0.2805, "num_input_tokens_seen": 2115288, "step": 23485 }, { "epoch": 6.104469854469855, "grad_norm": 0.9532825946807861, "learning_rate": 4.3855682937179945e-05, "loss": 0.3335, "num_input_tokens_seen": 2115752, "step": 23490 }, { "epoch": 6.105769230769231, "grad_norm": 0.32263755798339844, "learning_rate": 4.385195971789021e-05, "loss": 0.2332, "num_input_tokens_seen": 2116280, "step": 23495 }, { "epoch": 6.107068607068607, "grad_norm": 0.25623491406440735, "learning_rate": 4.384823552902613e-05, "loss": 0.3931, "num_input_tokens_seen": 2116744, "step": 23500 }, { "epoch": 6.108367983367984, "grad_norm": 0.23568996787071228, "learning_rate": 4.384451037077924e-05, "loss": 0.251, "num_input_tokens_seen": 2117192, "step": 23505 }, { "epoch": 6.10966735966736, "grad_norm": 0.2316833734512329, "learning_rate": 4.3840784243341126e-05, "loss": 0.2638, "num_input_tokens_seen": 2117672, "step": 23510 }, { "epoch": 6.110966735966736, "grad_norm": 0.18036232888698578, "learning_rate": 4.383705714690342e-05, "loss": 0.2739, "num_input_tokens_seen": 2118168, "step": 23515 }, { "epoch": 6.112266112266112, "grad_norm": 0.371203750371933, "learning_rate": 4.383332908165782e-05, "loss": 0.216, "num_input_tokens_seen": 2118616, "step": 23520 }, { "epoch": 6.113565488565489, "grad_norm": 0.29900628328323364, "learning_rate": 4.382960004779606e-05, "loss": 0.1962, "num_input_tokens_seen": 2119080, "step": 23525 }, { "epoch": 6.114864864864865, "grad_norm": 0.2403767853975296, "learning_rate": 4.382587004550993e-05, "loss": 0.2963, "num_input_tokens_seen": 2119576, "step": 23530 }, { "epoch": 6.116164241164241, "grad_norm": 0.32351163029670715, "learning_rate": 4.3822139074991264e-05, "loss": 0.3278, "num_input_tokens_seen": 2120088, "step": 23535 }, { "epoch": 6.117463617463618, "grad_norm": 0.31570151448249817, "learning_rate": 4.381840713643195e-05, "loss": 0.3555, "num_input_tokens_seen": 2120520, "step": 23540 }, { "epoch": 6.118762993762994, "grad_norm": 0.33943793177604675, "learning_rate": 4.381467423002392e-05, "loss": 0.2383, "num_input_tokens_seen": 2121032, "step": 23545 }, { "epoch": 6.12006237006237, "grad_norm": 0.32043570280075073, "learning_rate": 4.381094035595917e-05, "loss": 0.2074, "num_input_tokens_seen": 2121496, "step": 23550 }, { "epoch": 6.121361746361746, "grad_norm": 0.27442029118537903, "learning_rate": 4.3807205514429736e-05, "loss": 0.2293, "num_input_tokens_seen": 2121928, "step": 23555 }, { "epoch": 6.122661122661123, "grad_norm": 0.24790523946285248, "learning_rate": 4.380346970562771e-05, "loss": 0.2135, "num_input_tokens_seen": 2122376, "step": 23560 }, { "epoch": 6.123960498960499, "grad_norm": 0.20501022040843964, "learning_rate": 4.3799732929745214e-05, "loss": 0.129, "num_input_tokens_seen": 2122840, "step": 23565 }, { "epoch": 6.125259875259875, "grad_norm": 0.20054052770137787, "learning_rate": 4.379599518697444e-05, "loss": 0.2666, "num_input_tokens_seen": 2123304, "step": 23570 }, { "epoch": 6.126559251559252, "grad_norm": 0.3328365087509155, "learning_rate": 4.379225647750762e-05, "loss": 0.2711, "num_input_tokens_seen": 2123720, "step": 23575 }, { "epoch": 6.127858627858628, "grad_norm": 0.27776622772216797, "learning_rate": 4.378851680153705e-05, "loss": 0.3145, "num_input_tokens_seen": 2124184, "step": 23580 }, { "epoch": 6.129158004158004, "grad_norm": 0.21127626299858093, "learning_rate": 4.378477615925505e-05, "loss": 0.2083, "num_input_tokens_seen": 2124632, "step": 23585 }, { "epoch": 6.13045738045738, "grad_norm": 0.1947920322418213, "learning_rate": 4.378103455085402e-05, "loss": 0.2529, "num_input_tokens_seen": 2125128, "step": 23590 }, { "epoch": 6.131756756756757, "grad_norm": 0.2446983903646469, "learning_rate": 4.377729197652639e-05, "loss": 0.2102, "num_input_tokens_seen": 2125576, "step": 23595 }, { "epoch": 6.133056133056133, "grad_norm": 0.20319698750972748, "learning_rate": 4.377354843646464e-05, "loss": 0.1673, "num_input_tokens_seen": 2125976, "step": 23600 }, { "epoch": 6.134355509355509, "grad_norm": 0.2047802358865738, "learning_rate": 4.376980393086131e-05, "loss": 0.2091, "num_input_tokens_seen": 2126408, "step": 23605 }, { "epoch": 6.135654885654886, "grad_norm": 0.26231810450553894, "learning_rate": 4.376605845990897e-05, "loss": 0.3054, "num_input_tokens_seen": 2126856, "step": 23610 }, { "epoch": 6.136954261954262, "grad_norm": 0.2282942682504654, "learning_rate": 4.376231202380027e-05, "loss": 0.3873, "num_input_tokens_seen": 2127320, "step": 23615 }, { "epoch": 6.138253638253638, "grad_norm": 0.2702080011367798, "learning_rate": 4.375856462272788e-05, "loss": 0.2675, "num_input_tokens_seen": 2127768, "step": 23620 }, { "epoch": 6.139553014553014, "grad_norm": 0.18428702652454376, "learning_rate": 4.375481625688454e-05, "loss": 0.261, "num_input_tokens_seen": 2128216, "step": 23625 }, { "epoch": 6.140852390852391, "grad_norm": 0.31761521100997925, "learning_rate": 4.375106692646304e-05, "loss": 0.2065, "num_input_tokens_seen": 2128696, "step": 23630 }, { "epoch": 6.142151767151767, "grad_norm": 0.30409833788871765, "learning_rate": 4.374731663165619e-05, "loss": 0.2886, "num_input_tokens_seen": 2129144, "step": 23635 }, { "epoch": 6.143451143451143, "grad_norm": 0.3382900655269623, "learning_rate": 4.374356537265688e-05, "loss": 0.3216, "num_input_tokens_seen": 2129576, "step": 23640 }, { "epoch": 6.14475051975052, "grad_norm": 0.16828490793704987, "learning_rate": 4.373981314965805e-05, "loss": 0.2952, "num_input_tokens_seen": 2130040, "step": 23645 }, { "epoch": 6.146049896049896, "grad_norm": 0.16064348816871643, "learning_rate": 4.373605996285267e-05, "loss": 0.2766, "num_input_tokens_seen": 2130488, "step": 23650 }, { "epoch": 6.147349272349272, "grad_norm": 0.4159817099571228, "learning_rate": 4.3732305812433774e-05, "loss": 0.2446, "num_input_tokens_seen": 2130936, "step": 23655 }, { "epoch": 6.148648648648648, "grad_norm": 0.13162676990032196, "learning_rate": 4.372855069859445e-05, "loss": 0.2742, "num_input_tokens_seen": 2131400, "step": 23660 }, { "epoch": 6.149948024948025, "grad_norm": 0.15830715000629425, "learning_rate": 4.372479462152781e-05, "loss": 0.2382, "num_input_tokens_seen": 2131800, "step": 23665 }, { "epoch": 6.151247401247401, "grad_norm": 0.27486369013786316, "learning_rate": 4.372103758142705e-05, "loss": 0.2308, "num_input_tokens_seen": 2132216, "step": 23670 }, { "epoch": 6.152546777546777, "grad_norm": 0.22773653268814087, "learning_rate": 4.3717279578485385e-05, "loss": 0.177, "num_input_tokens_seen": 2132664, "step": 23675 }, { "epoch": 6.153846153846154, "grad_norm": 0.3913263976573944, "learning_rate": 4.371352061289609e-05, "loss": 0.217, "num_input_tokens_seen": 2133128, "step": 23680 }, { "epoch": 6.15514553014553, "grad_norm": 0.28203776478767395, "learning_rate": 4.370976068485251e-05, "loss": 0.3653, "num_input_tokens_seen": 2133608, "step": 23685 }, { "epoch": 6.156444906444906, "grad_norm": 0.2515411674976349, "learning_rate": 4.3705999794548e-05, "loss": 0.2107, "num_input_tokens_seen": 2134104, "step": 23690 }, { "epoch": 6.157744282744282, "grad_norm": 0.3647574186325073, "learning_rate": 4.3702237942176e-05, "loss": 0.2136, "num_input_tokens_seen": 2134536, "step": 23695 }, { "epoch": 6.159043659043659, "grad_norm": 0.5740609765052795, "learning_rate": 4.3698475127929995e-05, "loss": 0.2014, "num_input_tokens_seen": 2135000, "step": 23700 }, { "epoch": 6.160343035343035, "grad_norm": 0.2764245271682739, "learning_rate": 4.369471135200349e-05, "loss": 0.1834, "num_input_tokens_seen": 2135448, "step": 23705 }, { "epoch": 6.161642411642411, "grad_norm": 0.4158894121646881, "learning_rate": 4.369094661459007e-05, "loss": 0.3318, "num_input_tokens_seen": 2135912, "step": 23710 }, { "epoch": 6.162941787941788, "grad_norm": 0.6908010840415955, "learning_rate": 4.368718091588335e-05, "loss": 0.3261, "num_input_tokens_seen": 2136328, "step": 23715 }, { "epoch": 6.164241164241164, "grad_norm": 0.574958086013794, "learning_rate": 4.3683414256077014e-05, "loss": 0.2241, "num_input_tokens_seen": 2136792, "step": 23720 }, { "epoch": 6.16554054054054, "grad_norm": 0.716002881526947, "learning_rate": 4.367964663536479e-05, "loss": 0.2538, "num_input_tokens_seen": 2137240, "step": 23725 }, { "epoch": 6.166839916839917, "grad_norm": 0.6959664821624756, "learning_rate": 4.367587805394043e-05, "loss": 0.2454, "num_input_tokens_seen": 2137704, "step": 23730 }, { "epoch": 6.168139293139293, "grad_norm": 0.43339210748672485, "learning_rate": 4.367210851199778e-05, "loss": 0.2493, "num_input_tokens_seen": 2138136, "step": 23735 }, { "epoch": 6.169438669438669, "grad_norm": 0.748086154460907, "learning_rate": 4.366833800973068e-05, "loss": 0.2715, "num_input_tokens_seen": 2138600, "step": 23740 }, { "epoch": 6.170738045738045, "grad_norm": 1.0044001340866089, "learning_rate": 4.366456654733308e-05, "loss": 0.2261, "num_input_tokens_seen": 2139048, "step": 23745 }, { "epoch": 6.172037422037422, "grad_norm": 0.3782803416252136, "learning_rate": 4.366079412499894e-05, "loss": 0.2335, "num_input_tokens_seen": 2139544, "step": 23750 }, { "epoch": 6.173336798336798, "grad_norm": 0.5058063268661499, "learning_rate": 4.365702074292227e-05, "loss": 0.2052, "num_input_tokens_seen": 2139976, "step": 23755 }, { "epoch": 6.174636174636174, "grad_norm": 0.9716061353683472, "learning_rate": 4.365324640129716e-05, "loss": 0.3198, "num_input_tokens_seen": 2140408, "step": 23760 }, { "epoch": 6.175935550935551, "grad_norm": 0.3531540036201477, "learning_rate": 4.364947110031771e-05, "loss": 0.1631, "num_input_tokens_seen": 2140840, "step": 23765 }, { "epoch": 6.177234927234927, "grad_norm": 0.7536720633506775, "learning_rate": 4.3645694840178084e-05, "loss": 0.2292, "num_input_tokens_seen": 2141288, "step": 23770 }, { "epoch": 6.178534303534303, "grad_norm": 1.230222225189209, "learning_rate": 4.3641917621072515e-05, "loss": 0.1641, "num_input_tokens_seen": 2141704, "step": 23775 }, { "epoch": 6.1798336798336795, "grad_norm": 2.3602733612060547, "learning_rate": 4.3638139443195256e-05, "loss": 0.3162, "num_input_tokens_seen": 2142120, "step": 23780 }, { "epoch": 6.181133056133056, "grad_norm": 0.6732871532440186, "learning_rate": 4.3634360306740636e-05, "loss": 0.2835, "num_input_tokens_seen": 2142616, "step": 23785 }, { "epoch": 6.1824324324324325, "grad_norm": 0.5813425183296204, "learning_rate": 4.3630580211903006e-05, "loss": 0.2712, "num_input_tokens_seen": 2143096, "step": 23790 }, { "epoch": 6.1837318087318085, "grad_norm": 0.8244838118553162, "learning_rate": 4.362679915887679e-05, "loss": 0.2366, "num_input_tokens_seen": 2143544, "step": 23795 }, { "epoch": 6.185031185031185, "grad_norm": 0.4508058428764343, "learning_rate": 4.362301714785643e-05, "loss": 0.2784, "num_input_tokens_seen": 2143960, "step": 23800 }, { "epoch": 6.1863305613305615, "grad_norm": 0.5425960421562195, "learning_rate": 4.3619234179036463e-05, "loss": 0.2855, "num_input_tokens_seen": 2144424, "step": 23805 }, { "epoch": 6.1876299376299375, "grad_norm": 0.49317410588264465, "learning_rate": 4.361545025261145e-05, "loss": 0.2586, "num_input_tokens_seen": 2144888, "step": 23810 }, { "epoch": 6.188929313929314, "grad_norm": 1.013780951499939, "learning_rate": 4.3611665368775986e-05, "loss": 0.3149, "num_input_tokens_seen": 2145368, "step": 23815 }, { "epoch": 6.1902286902286905, "grad_norm": 0.9184024333953857, "learning_rate": 4.360787952772474e-05, "loss": 0.296, "num_input_tokens_seen": 2145832, "step": 23820 }, { "epoch": 6.1915280665280665, "grad_norm": 0.4735654294490814, "learning_rate": 4.360409272965242e-05, "loss": 0.2872, "num_input_tokens_seen": 2146264, "step": 23825 }, { "epoch": 6.192827442827443, "grad_norm": 0.40065813064575195, "learning_rate": 4.360030497475379e-05, "loss": 0.2476, "num_input_tokens_seen": 2146760, "step": 23830 }, { "epoch": 6.1941268191268195, "grad_norm": 0.5350281596183777, "learning_rate": 4.359651626322364e-05, "loss": 0.3113, "num_input_tokens_seen": 2147224, "step": 23835 }, { "epoch": 6.1954261954261955, "grad_norm": 0.5209065675735474, "learning_rate": 4.3592726595256854e-05, "loss": 0.2221, "num_input_tokens_seen": 2147656, "step": 23840 }, { "epoch": 6.196725571725572, "grad_norm": 0.30830708146095276, "learning_rate": 4.358893597104832e-05, "loss": 0.2369, "num_input_tokens_seen": 2148120, "step": 23845 }, { "epoch": 6.198024948024948, "grad_norm": 0.3728522062301636, "learning_rate": 4.3585144390793e-05, "loss": 0.2867, "num_input_tokens_seen": 2148536, "step": 23850 }, { "epoch": 6.199324324324325, "grad_norm": 0.29316797852516174, "learning_rate": 4.358135185468589e-05, "loss": 0.2202, "num_input_tokens_seen": 2149032, "step": 23855 }, { "epoch": 6.200623700623701, "grad_norm": 0.4309576451778412, "learning_rate": 4.357755836292207e-05, "loss": 0.3476, "num_input_tokens_seen": 2149480, "step": 23860 }, { "epoch": 6.201923076923077, "grad_norm": 0.716672420501709, "learning_rate": 4.3573763915696594e-05, "loss": 0.2643, "num_input_tokens_seen": 2149928, "step": 23865 }, { "epoch": 6.203222453222454, "grad_norm": 0.5810636878013611, "learning_rate": 4.356996851320466e-05, "loss": 0.2442, "num_input_tokens_seen": 2150424, "step": 23870 }, { "epoch": 6.20452182952183, "grad_norm": 0.3845231831073761, "learning_rate": 4.356617215564146e-05, "loss": 0.2647, "num_input_tokens_seen": 2150872, "step": 23875 }, { "epoch": 6.205821205821206, "grad_norm": 0.2752852737903595, "learning_rate": 4.356237484320222e-05, "loss": 0.2425, "num_input_tokens_seen": 2151320, "step": 23880 }, { "epoch": 6.207120582120582, "grad_norm": 0.42267805337905884, "learning_rate": 4.355857657608227e-05, "loss": 0.2491, "num_input_tokens_seen": 2151784, "step": 23885 }, { "epoch": 6.208419958419959, "grad_norm": 0.41343238949775696, "learning_rate": 4.355477735447693e-05, "loss": 0.2216, "num_input_tokens_seen": 2152216, "step": 23890 }, { "epoch": 6.209719334719335, "grad_norm": 0.7274555563926697, "learning_rate": 4.355097717858162e-05, "loss": 0.319, "num_input_tokens_seen": 2152696, "step": 23895 }, { "epoch": 6.211018711018711, "grad_norm": 0.4854191243648529, "learning_rate": 4.354717604859178e-05, "loss": 0.2525, "num_input_tokens_seen": 2153176, "step": 23900 }, { "epoch": 6.212318087318088, "grad_norm": 0.5076610445976257, "learning_rate": 4.3543373964702907e-05, "loss": 0.2266, "num_input_tokens_seen": 2153656, "step": 23905 }, { "epoch": 6.213617463617464, "grad_norm": 0.3739721477031708, "learning_rate": 4.353957092711054e-05, "loss": 0.2704, "num_input_tokens_seen": 2154120, "step": 23910 }, { "epoch": 6.21491683991684, "grad_norm": 0.35703611373901367, "learning_rate": 4.353576693601028e-05, "loss": 0.2375, "num_input_tokens_seen": 2154568, "step": 23915 }, { "epoch": 6.216216216216216, "grad_norm": 0.35376691818237305, "learning_rate": 4.353196199159776e-05, "loss": 0.2264, "num_input_tokens_seen": 2155128, "step": 23920 }, { "epoch": 6.217515592515593, "grad_norm": 0.35375311970710754, "learning_rate": 4.352815609406868e-05, "loss": 0.2875, "num_input_tokens_seen": 2155576, "step": 23925 }, { "epoch": 6.218814968814969, "grad_norm": 0.29355165362358093, "learning_rate": 4.3524349243618785e-05, "loss": 0.2025, "num_input_tokens_seen": 2156040, "step": 23930 }, { "epoch": 6.220114345114345, "grad_norm": 0.29733526706695557, "learning_rate": 4.3520541440443854e-05, "loss": 0.1594, "num_input_tokens_seen": 2156536, "step": 23935 }, { "epoch": 6.221413721413722, "grad_norm": 0.3191187083721161, "learning_rate": 4.351673268473974e-05, "loss": 0.3409, "num_input_tokens_seen": 2156984, "step": 23940 }, { "epoch": 6.222713097713098, "grad_norm": 0.33014991879463196, "learning_rate": 4.351292297670231e-05, "loss": 0.2184, "num_input_tokens_seen": 2157416, "step": 23945 }, { "epoch": 6.224012474012474, "grad_norm": 0.3712809085845947, "learning_rate": 4.350911231652751e-05, "loss": 0.2948, "num_input_tokens_seen": 2157832, "step": 23950 }, { "epoch": 6.225311850311851, "grad_norm": 0.6894217133522034, "learning_rate": 4.350530070441135e-05, "loss": 0.2034, "num_input_tokens_seen": 2158232, "step": 23955 }, { "epoch": 6.226611226611227, "grad_norm": 0.8121634125709534, "learning_rate": 4.350148814054982e-05, "loss": 0.2797, "num_input_tokens_seen": 2158664, "step": 23960 }, { "epoch": 6.227910602910603, "grad_norm": 0.41358065605163574, "learning_rate": 4.349767462513904e-05, "loss": 0.2949, "num_input_tokens_seen": 2159160, "step": 23965 }, { "epoch": 6.229209979209979, "grad_norm": 0.40814751386642456, "learning_rate": 4.3493860158375135e-05, "loss": 0.304, "num_input_tokens_seen": 2159624, "step": 23970 }, { "epoch": 6.230509355509356, "grad_norm": 0.5294715762138367, "learning_rate": 4.3490044740454274e-05, "loss": 0.2399, "num_input_tokens_seen": 2160072, "step": 23975 }, { "epoch": 6.231808731808732, "grad_norm": 0.3474424183368683, "learning_rate": 4.3486228371572694e-05, "loss": 0.1653, "num_input_tokens_seen": 2160520, "step": 23980 }, { "epoch": 6.233108108108108, "grad_norm": 0.3694816827774048, "learning_rate": 4.348241105192668e-05, "loss": 0.2605, "num_input_tokens_seen": 2160968, "step": 23985 }, { "epoch": 6.234407484407485, "grad_norm": 0.3394525647163391, "learning_rate": 4.347859278171256e-05, "loss": 0.3546, "num_input_tokens_seen": 2161432, "step": 23990 }, { "epoch": 6.235706860706861, "grad_norm": 0.3032163679599762, "learning_rate": 4.3474773561126705e-05, "loss": 0.268, "num_input_tokens_seen": 2161864, "step": 23995 }, { "epoch": 6.237006237006237, "grad_norm": 0.4163324236869812, "learning_rate": 4.3470953390365545e-05, "loss": 0.246, "num_input_tokens_seen": 2162312, "step": 24000 }, { "epoch": 6.238305613305613, "grad_norm": 0.41054436564445496, "learning_rate": 4.3467132269625546e-05, "loss": 0.2735, "num_input_tokens_seen": 2162776, "step": 24005 }, { "epoch": 6.23960498960499, "grad_norm": 0.37496256828308105, "learning_rate": 4.346331019910325e-05, "loss": 0.2671, "num_input_tokens_seen": 2163224, "step": 24010 }, { "epoch": 6.240904365904366, "grad_norm": 0.397089421749115, "learning_rate": 4.345948717899521e-05, "loss": 0.245, "num_input_tokens_seen": 2163672, "step": 24015 }, { "epoch": 6.242203742203742, "grad_norm": 0.5577801465988159, "learning_rate": 4.3455663209498065e-05, "loss": 0.2626, "num_input_tokens_seen": 2164104, "step": 24020 }, { "epoch": 6.243503118503119, "grad_norm": 0.38737207651138306, "learning_rate": 4.3451838290808475e-05, "loss": 0.2016, "num_input_tokens_seen": 2164600, "step": 24025 }, { "epoch": 6.244802494802495, "grad_norm": 0.8246592879295349, "learning_rate": 4.344801242312317e-05, "loss": 0.2279, "num_input_tokens_seen": 2165096, "step": 24030 }, { "epoch": 6.246101871101871, "grad_norm": 0.9100603461265564, "learning_rate": 4.34441856066389e-05, "loss": 0.2808, "num_input_tokens_seen": 2165560, "step": 24035 }, { "epoch": 6.247401247401247, "grad_norm": 0.6727311015129089, "learning_rate": 4.3440357841552496e-05, "loss": 0.1594, "num_input_tokens_seen": 2165992, "step": 24040 }, { "epoch": 6.248700623700624, "grad_norm": 0.4671415090560913, "learning_rate": 4.343652912806081e-05, "loss": 0.2899, "num_input_tokens_seen": 2166440, "step": 24045 }, { "epoch": 6.25, "grad_norm": 0.8215568661689758, "learning_rate": 4.343269946636078e-05, "loss": 0.3999, "num_input_tokens_seen": 2166904, "step": 24050 }, { "epoch": 6.251299376299376, "grad_norm": 1.238071084022522, "learning_rate": 4.342886885664935e-05, "loss": 0.3922, "num_input_tokens_seen": 2167368, "step": 24055 }, { "epoch": 6.252598752598753, "grad_norm": 0.6186638474464417, "learning_rate": 4.342503729912354e-05, "loss": 0.2692, "num_input_tokens_seen": 2167816, "step": 24060 }, { "epoch": 6.253898128898129, "grad_norm": 0.3015926480293274, "learning_rate": 4.34212047939804e-05, "loss": 0.2866, "num_input_tokens_seen": 2168264, "step": 24065 }, { "epoch": 6.255197505197505, "grad_norm": 0.40714508295059204, "learning_rate": 4.3417371341417056e-05, "loss": 0.29, "num_input_tokens_seen": 2168696, "step": 24070 }, { "epoch": 6.256496881496881, "grad_norm": 1.2516236305236816, "learning_rate": 4.341353694163065e-05, "loss": 0.2941, "num_input_tokens_seen": 2169160, "step": 24075 }, { "epoch": 6.257796257796258, "grad_norm": 0.7460029721260071, "learning_rate": 4.34097015948184e-05, "loss": 0.2988, "num_input_tokens_seen": 2169592, "step": 24080 }, { "epoch": 6.259095634095634, "grad_norm": 0.46444201469421387, "learning_rate": 4.3405865301177555e-05, "loss": 0.222, "num_input_tokens_seen": 2170040, "step": 24085 }, { "epoch": 6.26039501039501, "grad_norm": 0.4260518550872803, "learning_rate": 4.340202806090543e-05, "loss": 0.2639, "num_input_tokens_seen": 2170456, "step": 24090 }, { "epoch": 6.261694386694387, "grad_norm": 0.8161375522613525, "learning_rate": 4.339818987419936e-05, "loss": 0.4624, "num_input_tokens_seen": 2170952, "step": 24095 }, { "epoch": 6.262993762993763, "grad_norm": 0.4212983250617981, "learning_rate": 4.339435074125676e-05, "loss": 0.1986, "num_input_tokens_seen": 2171384, "step": 24100 }, { "epoch": 6.264293139293139, "grad_norm": 0.37779539823532104, "learning_rate": 4.3390510662275076e-05, "loss": 0.2034, "num_input_tokens_seen": 2171816, "step": 24105 }, { "epoch": 6.265592515592515, "grad_norm": 0.5757791996002197, "learning_rate": 4.3386669637451806e-05, "loss": 0.2037, "num_input_tokens_seen": 2172264, "step": 24110 }, { "epoch": 6.266891891891892, "grad_norm": 0.25582385063171387, "learning_rate": 4.33828276669845e-05, "loss": 0.1617, "num_input_tokens_seen": 2172712, "step": 24115 }, { "epoch": 6.268191268191268, "grad_norm": 0.5223694443702698, "learning_rate": 4.3378984751070764e-05, "loss": 0.287, "num_input_tokens_seen": 2173144, "step": 24120 }, { "epoch": 6.269490644490644, "grad_norm": 0.5573758482933044, "learning_rate": 4.337514088990822e-05, "loss": 0.3733, "num_input_tokens_seen": 2173608, "step": 24125 }, { "epoch": 6.270790020790021, "grad_norm": 0.39920371770858765, "learning_rate": 4.337129608369457e-05, "loss": 0.3719, "num_input_tokens_seen": 2174072, "step": 24130 }, { "epoch": 6.272089397089397, "grad_norm": 0.6127133369445801, "learning_rate": 4.3367450332627566e-05, "loss": 0.2845, "num_input_tokens_seen": 2174536, "step": 24135 }, { "epoch": 6.273388773388773, "grad_norm": 0.7390316128730774, "learning_rate": 4.3363603636905e-05, "loss": 0.2808, "num_input_tokens_seen": 2174968, "step": 24140 }, { "epoch": 6.274688149688149, "grad_norm": 0.2330932468175888, "learning_rate": 4.335975599672469e-05, "loss": 0.2731, "num_input_tokens_seen": 2175448, "step": 24145 }, { "epoch": 6.275987525987526, "grad_norm": 0.4513813257217407, "learning_rate": 4.335590741228455e-05, "loss": 0.2466, "num_input_tokens_seen": 2175880, "step": 24150 }, { "epoch": 6.277286902286902, "grad_norm": 0.8344014286994934, "learning_rate": 4.3352057883782505e-05, "loss": 0.2687, "num_input_tokens_seen": 2176312, "step": 24155 }, { "epoch": 6.278586278586278, "grad_norm": 0.5908873081207275, "learning_rate": 4.334820741141653e-05, "loss": 0.2163, "num_input_tokens_seen": 2176760, "step": 24160 }, { "epoch": 6.279885654885655, "grad_norm": 0.7211194038391113, "learning_rate": 4.3344355995384664e-05, "loss": 0.2775, "num_input_tokens_seen": 2177224, "step": 24165 }, { "epoch": 6.281185031185031, "grad_norm": 0.5658144354820251, "learning_rate": 4.3340503635885006e-05, "loss": 0.2684, "num_input_tokens_seen": 2177640, "step": 24170 }, { "epoch": 6.282484407484407, "grad_norm": 0.42625024914741516, "learning_rate": 4.333665033311566e-05, "loss": 0.3191, "num_input_tokens_seen": 2178072, "step": 24175 }, { "epoch": 6.283783783783784, "grad_norm": 0.3734930753707886, "learning_rate": 4.333279608727483e-05, "loss": 0.2455, "num_input_tokens_seen": 2178536, "step": 24180 }, { "epoch": 6.28508316008316, "grad_norm": 0.5352206230163574, "learning_rate": 4.332894089856072e-05, "loss": 0.2728, "num_input_tokens_seen": 2178968, "step": 24185 }, { "epoch": 6.286382536382536, "grad_norm": 0.5074172616004944, "learning_rate": 4.332508476717163e-05, "loss": 0.225, "num_input_tokens_seen": 2179416, "step": 24190 }, { "epoch": 6.287681912681912, "grad_norm": 0.5745322108268738, "learning_rate": 4.332122769330586e-05, "loss": 0.2832, "num_input_tokens_seen": 2179896, "step": 24195 }, { "epoch": 6.288981288981289, "grad_norm": 0.7692566514015198, "learning_rate": 4.33173696771618e-05, "loss": 0.2447, "num_input_tokens_seen": 2180344, "step": 24200 }, { "epoch": 6.290280665280665, "grad_norm": 0.802298367023468, "learning_rate": 4.331351071893787e-05, "loss": 0.2249, "num_input_tokens_seen": 2180744, "step": 24205 }, { "epoch": 6.291580041580041, "grad_norm": 0.6381521224975586, "learning_rate": 4.330965081883254e-05, "loss": 0.2569, "num_input_tokens_seen": 2181192, "step": 24210 }, { "epoch": 6.292879417879418, "grad_norm": 0.369627445936203, "learning_rate": 4.330578997704431e-05, "loss": 0.203, "num_input_tokens_seen": 2181608, "step": 24215 }, { "epoch": 6.294178794178794, "grad_norm": 1.343297004699707, "learning_rate": 4.3301928193771766e-05, "loss": 0.2319, "num_input_tokens_seen": 2182056, "step": 24220 }, { "epoch": 6.29547817047817, "grad_norm": 0.7508637309074402, "learning_rate": 4.329806546921353e-05, "loss": 0.1579, "num_input_tokens_seen": 2182520, "step": 24225 }, { "epoch": 6.296777546777546, "grad_norm": 0.4411088824272156, "learning_rate": 4.3294201803568243e-05, "loss": 0.3537, "num_input_tokens_seen": 2183000, "step": 24230 }, { "epoch": 6.298076923076923, "grad_norm": 0.9559813737869263, "learning_rate": 4.329033719703464e-05, "loss": 0.3787, "num_input_tokens_seen": 2183432, "step": 24235 }, { "epoch": 6.299376299376299, "grad_norm": 1.2281306982040405, "learning_rate": 4.328647164981146e-05, "loss": 0.2423, "num_input_tokens_seen": 2183880, "step": 24240 }, { "epoch": 6.300675675675675, "grad_norm": 0.40125882625579834, "learning_rate": 4.328260516209752e-05, "loss": 0.1697, "num_input_tokens_seen": 2184360, "step": 24245 }, { "epoch": 6.301975051975052, "grad_norm": 0.5910820364952087, "learning_rate": 4.327873773409169e-05, "loss": 0.3442, "num_input_tokens_seen": 2184792, "step": 24250 }, { "epoch": 6.303274428274428, "grad_norm": 0.49172013998031616, "learning_rate": 4.327486936599286e-05, "loss": 0.3295, "num_input_tokens_seen": 2185272, "step": 24255 }, { "epoch": 6.3045738045738045, "grad_norm": 0.5743730068206787, "learning_rate": 4.327100005799999e-05, "loss": 0.213, "num_input_tokens_seen": 2185736, "step": 24260 }, { "epoch": 6.3058731808731805, "grad_norm": 0.3716009855270386, "learning_rate": 4.3267129810312074e-05, "loss": 0.2499, "num_input_tokens_seen": 2186184, "step": 24265 }, { "epoch": 6.307172557172557, "grad_norm": 0.425155907869339, "learning_rate": 4.326325862312817e-05, "loss": 0.2897, "num_input_tokens_seen": 2186584, "step": 24270 }, { "epoch": 6.3084719334719335, "grad_norm": 0.39988991618156433, "learning_rate": 4.3259386496647384e-05, "loss": 0.184, "num_input_tokens_seen": 2187000, "step": 24275 }, { "epoch": 6.3097713097713095, "grad_norm": 0.3914678394794464, "learning_rate": 4.325551343106885e-05, "loss": 0.2651, "num_input_tokens_seen": 2187480, "step": 24280 }, { "epoch": 6.311070686070686, "grad_norm": 0.32053324580192566, "learning_rate": 4.325163942659177e-05, "loss": 0.2126, "num_input_tokens_seen": 2187912, "step": 24285 }, { "epoch": 6.3123700623700625, "grad_norm": 0.3279946446418762, "learning_rate": 4.324776448341538e-05, "loss": 0.3062, "num_input_tokens_seen": 2188376, "step": 24290 }, { "epoch": 6.3136694386694385, "grad_norm": 0.7219842076301575, "learning_rate": 4.3243888601738984e-05, "loss": 0.3074, "num_input_tokens_seen": 2188808, "step": 24295 }, { "epoch": 6.314968814968815, "grad_norm": 0.4030189514160156, "learning_rate": 4.324001178176191e-05, "loss": 0.2302, "num_input_tokens_seen": 2189304, "step": 24300 }, { "epoch": 6.3162681912681915, "grad_norm": 0.4073502719402313, "learning_rate": 4.323613402368357e-05, "loss": 0.2928, "num_input_tokens_seen": 2189720, "step": 24305 }, { "epoch": 6.3175675675675675, "grad_norm": 0.5441614389419556, "learning_rate": 4.323225532770337e-05, "loss": 0.2905, "num_input_tokens_seen": 2190136, "step": 24310 }, { "epoch": 6.318866943866944, "grad_norm": 0.3803219199180603, "learning_rate": 4.322837569402081e-05, "loss": 0.2482, "num_input_tokens_seen": 2190600, "step": 24315 }, { "epoch": 6.3201663201663205, "grad_norm": 0.5865766406059265, "learning_rate": 4.322449512283543e-05, "loss": 0.2735, "num_input_tokens_seen": 2191048, "step": 24320 }, { "epoch": 6.321465696465697, "grad_norm": 0.3948194086551666, "learning_rate": 4.32206136143468e-05, "loss": 0.2068, "num_input_tokens_seen": 2191496, "step": 24325 }, { "epoch": 6.322765072765073, "grad_norm": 0.3631890118122101, "learning_rate": 4.321673116875455e-05, "loss": 0.2714, "num_input_tokens_seen": 2191928, "step": 24330 }, { "epoch": 6.3240644490644495, "grad_norm": 0.26083147525787354, "learning_rate": 4.321284778625836e-05, "loss": 0.1692, "num_input_tokens_seen": 2192392, "step": 24335 }, { "epoch": 6.325363825363826, "grad_norm": 0.23943482339382172, "learning_rate": 4.320896346705797e-05, "loss": 0.2602, "num_input_tokens_seen": 2192840, "step": 24340 }, { "epoch": 6.326663201663202, "grad_norm": 0.2974209785461426, "learning_rate": 4.3205078211353135e-05, "loss": 0.1621, "num_input_tokens_seen": 2193304, "step": 24345 }, { "epoch": 6.327962577962578, "grad_norm": 0.38007915019989014, "learning_rate": 4.3201192019343685e-05, "loss": 0.3561, "num_input_tokens_seen": 2193752, "step": 24350 }, { "epoch": 6.329261954261955, "grad_norm": 0.39202409982681274, "learning_rate": 4.3197304891229485e-05, "loss": 0.3268, "num_input_tokens_seen": 2194232, "step": 24355 }, { "epoch": 6.330561330561331, "grad_norm": 0.33519044518470764, "learning_rate": 4.319341682721046e-05, "loss": 0.1752, "num_input_tokens_seen": 2194696, "step": 24360 }, { "epoch": 6.331860706860707, "grad_norm": 0.34539321064949036, "learning_rate": 4.3189527827486575e-05, "loss": 0.3357, "num_input_tokens_seen": 2195192, "step": 24365 }, { "epoch": 6.333160083160083, "grad_norm": 0.5841752290725708, "learning_rate": 4.318563789225785e-05, "loss": 0.2994, "num_input_tokens_seen": 2195624, "step": 24370 }, { "epoch": 6.33445945945946, "grad_norm": 0.23834453523159027, "learning_rate": 4.318174702172434e-05, "loss": 0.2588, "num_input_tokens_seen": 2196040, "step": 24375 }, { "epoch": 6.335758835758836, "grad_norm": 0.43495097756385803, "learning_rate": 4.317785521608616e-05, "loss": 0.2325, "num_input_tokens_seen": 2196472, "step": 24380 }, { "epoch": 6.337058212058212, "grad_norm": 0.3622337877750397, "learning_rate": 4.317396247554347e-05, "loss": 0.2026, "num_input_tokens_seen": 2196888, "step": 24385 }, { "epoch": 6.338357588357589, "grad_norm": 0.29304495453834534, "learning_rate": 4.317006880029648e-05, "loss": 0.2956, "num_input_tokens_seen": 2197368, "step": 24390 }, { "epoch": 6.339656964656965, "grad_norm": 0.44669651985168457, "learning_rate": 4.316617419054544e-05, "loss": 0.2628, "num_input_tokens_seen": 2197800, "step": 24395 }, { "epoch": 6.340956340956341, "grad_norm": 0.4244580864906311, "learning_rate": 4.316227864649065e-05, "loss": 0.263, "num_input_tokens_seen": 2198264, "step": 24400 }, { "epoch": 6.342255717255718, "grad_norm": 0.42291074991226196, "learning_rate": 4.315838216833247e-05, "loss": 0.1627, "num_input_tokens_seen": 2198744, "step": 24405 }, { "epoch": 6.343555093555094, "grad_norm": 0.2835083603858948, "learning_rate": 4.31544847562713e-05, "loss": 0.3115, "num_input_tokens_seen": 2199176, "step": 24410 }, { "epoch": 6.34485446985447, "grad_norm": 0.36904793977737427, "learning_rate": 4.315058641050758e-05, "loss": 0.1882, "num_input_tokens_seen": 2199608, "step": 24415 }, { "epoch": 6.346153846153846, "grad_norm": 0.3117590546607971, "learning_rate": 4.3146687131241815e-05, "loss": 0.2603, "num_input_tokens_seen": 2200008, "step": 24420 }, { "epoch": 6.347453222453223, "grad_norm": 0.5173304080963135, "learning_rate": 4.314278691867454e-05, "loss": 0.2678, "num_input_tokens_seen": 2200472, "step": 24425 }, { "epoch": 6.348752598752599, "grad_norm": 0.8683826923370361, "learning_rate": 4.313888577300635e-05, "loss": 0.2542, "num_input_tokens_seen": 2200936, "step": 24430 }, { "epoch": 6.350051975051975, "grad_norm": 0.2913972735404968, "learning_rate": 4.313498369443788e-05, "loss": 0.2245, "num_input_tokens_seen": 2201400, "step": 24435 }, { "epoch": 6.351351351351352, "grad_norm": 0.45806029438972473, "learning_rate": 4.313108068316983e-05, "loss": 0.2037, "num_input_tokens_seen": 2201832, "step": 24440 }, { "epoch": 6.352650727650728, "grad_norm": 0.31576672196388245, "learning_rate": 4.312717673940293e-05, "loss": 0.2232, "num_input_tokens_seen": 2202232, "step": 24445 }, { "epoch": 6.353950103950104, "grad_norm": 0.3733771741390228, "learning_rate": 4.3123271863337954e-05, "loss": 0.2639, "num_input_tokens_seen": 2202728, "step": 24450 }, { "epoch": 6.35524948024948, "grad_norm": 0.34333062171936035, "learning_rate": 4.3119366055175746e-05, "loss": 0.3312, "num_input_tokens_seen": 2203160, "step": 24455 }, { "epoch": 6.356548856548857, "grad_norm": 0.3679887354373932, "learning_rate": 4.311545931511718e-05, "loss": 0.3006, "num_input_tokens_seen": 2203592, "step": 24460 }, { "epoch": 6.357848232848233, "grad_norm": 0.4115658402442932, "learning_rate": 4.311155164336318e-05, "loss": 0.2531, "num_input_tokens_seen": 2203992, "step": 24465 }, { "epoch": 6.359147609147609, "grad_norm": 0.3150765299797058, "learning_rate": 4.310764304011473e-05, "loss": 0.2383, "num_input_tokens_seen": 2204408, "step": 24470 }, { "epoch": 6.360446985446986, "grad_norm": 0.32887154817581177, "learning_rate": 4.3103733505572854e-05, "loss": 0.1769, "num_input_tokens_seen": 2204888, "step": 24475 }, { "epoch": 6.361746361746362, "grad_norm": 0.5460582971572876, "learning_rate": 4.30998230399386e-05, "loss": 0.4159, "num_input_tokens_seen": 2205368, "step": 24480 }, { "epoch": 6.363045738045738, "grad_norm": 0.2967614233493805, "learning_rate": 4.3095911643413124e-05, "loss": 0.178, "num_input_tokens_seen": 2205832, "step": 24485 }, { "epoch": 6.364345114345114, "grad_norm": 0.5618082880973816, "learning_rate": 4.309199931619756e-05, "loss": 0.3371, "num_input_tokens_seen": 2206296, "step": 24490 }, { "epoch": 6.365644490644491, "grad_norm": 0.2838872969150543, "learning_rate": 4.308808605849314e-05, "loss": 0.3046, "num_input_tokens_seen": 2206744, "step": 24495 }, { "epoch": 6.366943866943867, "grad_norm": 0.27205759286880493, "learning_rate": 4.308417187050113e-05, "loss": 0.2105, "num_input_tokens_seen": 2207160, "step": 24500 }, { "epoch": 6.368243243243243, "grad_norm": 0.35067927837371826, "learning_rate": 4.308025675242282e-05, "loss": 0.2218, "num_input_tokens_seen": 2207592, "step": 24505 }, { "epoch": 6.36954261954262, "grad_norm": 0.36007368564605713, "learning_rate": 4.307634070445959e-05, "loss": 0.2143, "num_input_tokens_seen": 2208008, "step": 24510 }, { "epoch": 6.370841995841996, "grad_norm": 0.26035743951797485, "learning_rate": 4.307242372681284e-05, "loss": 0.2652, "num_input_tokens_seen": 2208472, "step": 24515 }, { "epoch": 6.372141372141372, "grad_norm": 0.4130478501319885, "learning_rate": 4.306850581968402e-05, "loss": 0.3359, "num_input_tokens_seen": 2208968, "step": 24520 }, { "epoch": 6.373440748440748, "grad_norm": 0.32639431953430176, "learning_rate": 4.306458698327463e-05, "loss": 0.2348, "num_input_tokens_seen": 2209416, "step": 24525 }, { "epoch": 6.374740124740125, "grad_norm": 0.3017222583293915, "learning_rate": 4.306066721778622e-05, "loss": 0.26, "num_input_tokens_seen": 2209880, "step": 24530 }, { "epoch": 6.376039501039501, "grad_norm": 0.5943330526351929, "learning_rate": 4.30567465234204e-05, "loss": 0.2686, "num_input_tokens_seen": 2210344, "step": 24535 }, { "epoch": 6.377338877338877, "grad_norm": 0.3772176206111908, "learning_rate": 4.30528249003788e-05, "loss": 0.2583, "num_input_tokens_seen": 2210760, "step": 24540 }, { "epoch": 6.378638253638254, "grad_norm": 0.577759325504303, "learning_rate": 4.3048902348863116e-05, "loss": 0.2677, "num_input_tokens_seen": 2211208, "step": 24545 }, { "epoch": 6.37993762993763, "grad_norm": 0.5271965861320496, "learning_rate": 4.3044978869075094e-05, "loss": 0.2133, "num_input_tokens_seen": 2211640, "step": 24550 }, { "epoch": 6.381237006237006, "grad_norm": 0.5089123249053955, "learning_rate": 4.304105446121651e-05, "loss": 0.2042, "num_input_tokens_seen": 2212088, "step": 24555 }, { "epoch": 6.382536382536383, "grad_norm": 0.67337965965271, "learning_rate": 4.303712912548922e-05, "loss": 0.3324, "num_input_tokens_seen": 2212568, "step": 24560 }, { "epoch": 6.383835758835759, "grad_norm": 0.44467657804489136, "learning_rate": 4.303320286209509e-05, "loss": 0.2015, "num_input_tokens_seen": 2213032, "step": 24565 }, { "epoch": 6.385135135135135, "grad_norm": 0.3393182158470154, "learning_rate": 4.302927567123606e-05, "loss": 0.2992, "num_input_tokens_seen": 2213448, "step": 24570 }, { "epoch": 6.386434511434511, "grad_norm": 0.566485583782196, "learning_rate": 4.302534755311411e-05, "loss": 0.2932, "num_input_tokens_seen": 2213896, "step": 24575 }, { "epoch": 6.387733887733888, "grad_norm": 0.8510429263114929, "learning_rate": 4.3021418507931266e-05, "loss": 0.3036, "num_input_tokens_seen": 2214344, "step": 24580 }, { "epoch": 6.389033264033264, "grad_norm": 0.505199134349823, "learning_rate": 4.30174885358896e-05, "loss": 0.2349, "num_input_tokens_seen": 2214776, "step": 24585 }, { "epoch": 6.39033264033264, "grad_norm": 0.3289508521556854, "learning_rate": 4.301355763719123e-05, "loss": 0.278, "num_input_tokens_seen": 2215208, "step": 24590 }, { "epoch": 6.391632016632016, "grad_norm": 0.42573627829551697, "learning_rate": 4.300962581203833e-05, "loss": 0.2562, "num_input_tokens_seen": 2215672, "step": 24595 }, { "epoch": 6.392931392931393, "grad_norm": 0.4863666296005249, "learning_rate": 4.300569306063312e-05, "loss": 0.2866, "num_input_tokens_seen": 2216136, "step": 24600 }, { "epoch": 6.394230769230769, "grad_norm": 0.45822325348854065, "learning_rate": 4.3001759383177864e-05, "loss": 0.2918, "num_input_tokens_seen": 2216584, "step": 24605 }, { "epoch": 6.395530145530145, "grad_norm": 0.6587094068527222, "learning_rate": 4.299782477987488e-05, "loss": 0.2832, "num_input_tokens_seen": 2217048, "step": 24610 }, { "epoch": 6.396829521829522, "grad_norm": 0.504884660243988, "learning_rate": 4.299388925092652e-05, "loss": 0.2855, "num_input_tokens_seen": 2217528, "step": 24615 }, { "epoch": 6.398128898128898, "grad_norm": 0.4089847505092621, "learning_rate": 4.2989952796535196e-05, "loss": 0.2607, "num_input_tokens_seen": 2217976, "step": 24620 }, { "epoch": 6.399428274428274, "grad_norm": 0.46595293283462524, "learning_rate": 4.298601541690336e-05, "loss": 0.2523, "num_input_tokens_seen": 2218456, "step": 24625 }, { "epoch": 6.400727650727651, "grad_norm": 0.8285461664199829, "learning_rate": 4.298207711223351e-05, "loss": 0.2855, "num_input_tokens_seen": 2218920, "step": 24630 }, { "epoch": 6.402027027027027, "grad_norm": 0.41809380054473877, "learning_rate": 4.297813788272822e-05, "loss": 0.1782, "num_input_tokens_seen": 2219368, "step": 24635 }, { "epoch": 6.403326403326403, "grad_norm": 0.6285358667373657, "learning_rate": 4.297419772859006e-05, "loss": 0.2574, "num_input_tokens_seen": 2219768, "step": 24640 }, { "epoch": 6.404625779625779, "grad_norm": 0.4418317675590515, "learning_rate": 4.29702566500217e-05, "loss": 0.3221, "num_input_tokens_seen": 2220216, "step": 24645 }, { "epoch": 6.405925155925156, "grad_norm": 0.6469711661338806, "learning_rate": 4.296631464722581e-05, "loss": 0.2533, "num_input_tokens_seen": 2220696, "step": 24650 }, { "epoch": 6.407224532224532, "grad_norm": 0.648216724395752, "learning_rate": 4.2962371720405155e-05, "loss": 0.2865, "num_input_tokens_seen": 2221128, "step": 24655 }, { "epoch": 6.408523908523908, "grad_norm": 0.4433237612247467, "learning_rate": 4.2958427869762506e-05, "loss": 0.2794, "num_input_tokens_seen": 2221560, "step": 24660 }, { "epoch": 6.409823284823285, "grad_norm": 0.30130380392074585, "learning_rate": 4.2954483095500705e-05, "loss": 0.2658, "num_input_tokens_seen": 2221992, "step": 24665 }, { "epoch": 6.411122661122661, "grad_norm": 0.5411226749420166, "learning_rate": 4.295053739782263e-05, "loss": 0.276, "num_input_tokens_seen": 2222456, "step": 24670 }, { "epoch": 6.412422037422037, "grad_norm": 0.45146089792251587, "learning_rate": 4.2946590776931226e-05, "loss": 0.2056, "num_input_tokens_seen": 2222904, "step": 24675 }, { "epoch": 6.413721413721413, "grad_norm": 0.36217737197875977, "learning_rate": 4.294264323302946e-05, "loss": 0.2134, "num_input_tokens_seen": 2223368, "step": 24680 }, { "epoch": 6.41502079002079, "grad_norm": 0.3351461589336395, "learning_rate": 4.2938694766320356e-05, "loss": 0.2756, "num_input_tokens_seen": 2223832, "step": 24685 }, { "epoch": 6.416320166320166, "grad_norm": 0.5419186949729919, "learning_rate": 4.2934745377007e-05, "loss": 0.2508, "num_input_tokens_seen": 2224296, "step": 24690 }, { "epoch": 6.417619542619542, "grad_norm": 1.1771589517593384, "learning_rate": 4.2930795065292503e-05, "loss": 0.3743, "num_input_tokens_seen": 2224728, "step": 24695 }, { "epoch": 6.418918918918919, "grad_norm": 0.398782879114151, "learning_rate": 4.292684383138003e-05, "loss": 0.2052, "num_input_tokens_seen": 2225128, "step": 24700 }, { "epoch": 6.420218295218295, "grad_norm": 0.3940727710723877, "learning_rate": 4.292289167547281e-05, "loss": 0.1511, "num_input_tokens_seen": 2225592, "step": 24705 }, { "epoch": 6.421517671517671, "grad_norm": 0.769594669342041, "learning_rate": 4.291893859777409e-05, "loss": 0.3005, "num_input_tokens_seen": 2226008, "step": 24710 }, { "epoch": 6.422817047817047, "grad_norm": 0.5911003351211548, "learning_rate": 4.2914984598487197e-05, "loss": 0.3518, "num_input_tokens_seen": 2226472, "step": 24715 }, { "epoch": 6.424116424116424, "grad_norm": 0.3488897681236267, "learning_rate": 4.291102967781547e-05, "loss": 0.2768, "num_input_tokens_seen": 2226904, "step": 24720 }, { "epoch": 6.4254158004158, "grad_norm": 0.4042448103427887, "learning_rate": 4.2907073835962336e-05, "loss": 0.1538, "num_input_tokens_seen": 2227336, "step": 24725 }, { "epoch": 6.4267151767151764, "grad_norm": 0.4165872037410736, "learning_rate": 4.2903117073131225e-05, "loss": 0.2572, "num_input_tokens_seen": 2227752, "step": 24730 }, { "epoch": 6.428014553014553, "grad_norm": 0.6112045049667358, "learning_rate": 4.2899159389525664e-05, "loss": 0.29, "num_input_tokens_seen": 2228200, "step": 24735 }, { "epoch": 6.429313929313929, "grad_norm": 0.504123866558075, "learning_rate": 4.289520078534918e-05, "loss": 0.3132, "num_input_tokens_seen": 2228616, "step": 24740 }, { "epoch": 6.4306133056133055, "grad_norm": 0.7743216753005981, "learning_rate": 4.2891241260805374e-05, "loss": 0.3019, "num_input_tokens_seen": 2229048, "step": 24745 }, { "epoch": 6.4319126819126815, "grad_norm": 0.5069879293441772, "learning_rate": 4.288728081609787e-05, "loss": 0.2645, "num_input_tokens_seen": 2229480, "step": 24750 }, { "epoch": 6.433212058212058, "grad_norm": 0.4036056697368622, "learning_rate": 4.28833194514304e-05, "loss": 0.2708, "num_input_tokens_seen": 2229928, "step": 24755 }, { "epoch": 6.4345114345114345, "grad_norm": 0.6210891008377075, "learning_rate": 4.2879357167006664e-05, "loss": 0.2701, "num_input_tokens_seen": 2230408, "step": 24760 }, { "epoch": 6.4358108108108105, "grad_norm": 0.7526944279670715, "learning_rate": 4.287539396303046e-05, "loss": 0.2908, "num_input_tokens_seen": 2230888, "step": 24765 }, { "epoch": 6.4371101871101875, "grad_norm": 0.38221868872642517, "learning_rate": 4.2871429839705614e-05, "loss": 0.2748, "num_input_tokens_seen": 2231336, "step": 24770 }, { "epoch": 6.4384095634095635, "grad_norm": 0.5348846912384033, "learning_rate": 4.286746479723601e-05, "loss": 0.228, "num_input_tokens_seen": 2231784, "step": 24775 }, { "epoch": 6.4397089397089395, "grad_norm": 0.47319498658180237, "learning_rate": 4.286349883582557e-05, "loss": 0.2076, "num_input_tokens_seen": 2232248, "step": 24780 }, { "epoch": 6.4410083160083165, "grad_norm": 0.46826425194740295, "learning_rate": 4.285953195567827e-05, "loss": 0.2013, "num_input_tokens_seen": 2232728, "step": 24785 }, { "epoch": 6.4423076923076925, "grad_norm": 0.3920513391494751, "learning_rate": 4.285556415699813e-05, "loss": 0.2184, "num_input_tokens_seen": 2233160, "step": 24790 }, { "epoch": 6.4436070686070686, "grad_norm": 0.41609352827072144, "learning_rate": 4.285159543998922e-05, "loss": 0.2331, "num_input_tokens_seen": 2233576, "step": 24795 }, { "epoch": 6.444906444906445, "grad_norm": 0.3377145826816559, "learning_rate": 4.284762580485565e-05, "loss": 0.0483, "num_input_tokens_seen": 2234008, "step": 24800 }, { "epoch": 6.4462058212058215, "grad_norm": 1.855547547340393, "learning_rate": 4.284365525180158e-05, "loss": 0.4348, "num_input_tokens_seen": 2234472, "step": 24805 }, { "epoch": 6.447505197505198, "grad_norm": 0.321059912443161, "learning_rate": 4.283968378103123e-05, "loss": 0.181, "num_input_tokens_seen": 2234904, "step": 24810 }, { "epoch": 6.448804573804574, "grad_norm": 0.3164685368537903, "learning_rate": 4.2835711392748846e-05, "loss": 0.2703, "num_input_tokens_seen": 2235368, "step": 24815 }, { "epoch": 6.45010395010395, "grad_norm": 0.5544242858886719, "learning_rate": 4.283173808715873e-05, "loss": 0.178, "num_input_tokens_seen": 2235800, "step": 24820 }, { "epoch": 6.451403326403327, "grad_norm": 0.47020530700683594, "learning_rate": 4.282776386446524e-05, "loss": 0.2492, "num_input_tokens_seen": 2236264, "step": 24825 }, { "epoch": 6.452702702702703, "grad_norm": 0.6582627296447754, "learning_rate": 4.282378872487278e-05, "loss": 0.2544, "num_input_tokens_seen": 2236696, "step": 24830 }, { "epoch": 6.454002079002079, "grad_norm": 0.3802679777145386, "learning_rate": 4.281981266858579e-05, "loss": 0.1619, "num_input_tokens_seen": 2237160, "step": 24835 }, { "epoch": 6.455301455301456, "grad_norm": 0.27871742844581604, "learning_rate": 4.2815835695808754e-05, "loss": 0.1331, "num_input_tokens_seen": 2237576, "step": 24840 }, { "epoch": 6.456600831600832, "grad_norm": 1.232435941696167, "learning_rate": 4.2811857806746215e-05, "loss": 0.4693, "num_input_tokens_seen": 2238008, "step": 24845 }, { "epoch": 6.457900207900208, "grad_norm": 0.3433036506175995, "learning_rate": 4.2807879001602766e-05, "loss": 0.1343, "num_input_tokens_seen": 2238456, "step": 24850 }, { "epoch": 6.459199584199585, "grad_norm": 0.418805330991745, "learning_rate": 4.2803899280583034e-05, "loss": 0.2427, "num_input_tokens_seen": 2238872, "step": 24855 }, { "epoch": 6.460498960498961, "grad_norm": 0.45577865839004517, "learning_rate": 4.279991864389171e-05, "loss": 0.2023, "num_input_tokens_seen": 2239320, "step": 24860 }, { "epoch": 6.461798336798337, "grad_norm": 0.7374633550643921, "learning_rate": 4.2795937091733515e-05, "loss": 0.1959, "num_input_tokens_seen": 2239752, "step": 24865 }, { "epoch": 6.463097713097713, "grad_norm": 1.0800771713256836, "learning_rate": 4.2791954624313224e-05, "loss": 0.3203, "num_input_tokens_seen": 2240200, "step": 24870 }, { "epoch": 6.46439708939709, "grad_norm": 0.7242153882980347, "learning_rate": 4.278797124183566e-05, "loss": 0.2669, "num_input_tokens_seen": 2240664, "step": 24875 }, { "epoch": 6.465696465696466, "grad_norm": 0.5122697949409485, "learning_rate": 4.278398694450568e-05, "loss": 0.2339, "num_input_tokens_seen": 2241096, "step": 24880 }, { "epoch": 6.466995841995842, "grad_norm": 1.056696891784668, "learning_rate": 4.2780001732528224e-05, "loss": 0.3475, "num_input_tokens_seen": 2241560, "step": 24885 }, { "epoch": 6.468295218295219, "grad_norm": 1.3018351793289185, "learning_rate": 4.277601560610824e-05, "loss": 0.3283, "num_input_tokens_seen": 2242008, "step": 24890 }, { "epoch": 6.469594594594595, "grad_norm": 0.38214078545570374, "learning_rate": 4.277202856545074e-05, "loss": 0.2795, "num_input_tokens_seen": 2242472, "step": 24895 }, { "epoch": 6.470893970893971, "grad_norm": 0.693324625492096, "learning_rate": 4.276804061076078e-05, "loss": 0.3028, "num_input_tokens_seen": 2242920, "step": 24900 }, { "epoch": 6.472193347193347, "grad_norm": 1.4262923002243042, "learning_rate": 4.276405174224347e-05, "loss": 0.29, "num_input_tokens_seen": 2243384, "step": 24905 }, { "epoch": 6.473492723492724, "grad_norm": 1.308893084526062, "learning_rate": 4.2760061960103956e-05, "loss": 0.2648, "num_input_tokens_seen": 2243880, "step": 24910 }, { "epoch": 6.4747920997921, "grad_norm": 0.8320661187171936, "learning_rate": 4.275607126454744e-05, "loss": 0.2701, "num_input_tokens_seen": 2244328, "step": 24915 }, { "epoch": 6.476091476091476, "grad_norm": 0.5163965225219727, "learning_rate": 4.2752079655779165e-05, "loss": 0.2887, "num_input_tokens_seen": 2244808, "step": 24920 }, { "epoch": 6.477390852390853, "grad_norm": 0.7961263656616211, "learning_rate": 4.274808713400443e-05, "loss": 0.2942, "num_input_tokens_seen": 2245304, "step": 24925 }, { "epoch": 6.478690228690229, "grad_norm": 0.8627550601959229, "learning_rate": 4.274409369942856e-05, "loss": 0.3145, "num_input_tokens_seen": 2245736, "step": 24930 }, { "epoch": 6.479989604989605, "grad_norm": 0.8829097151756287, "learning_rate": 4.274009935225696e-05, "loss": 0.2659, "num_input_tokens_seen": 2246216, "step": 24935 }, { "epoch": 6.481288981288981, "grad_norm": 0.3499319851398468, "learning_rate": 4.273610409269504e-05, "loss": 0.2664, "num_input_tokens_seen": 2246664, "step": 24940 }, { "epoch": 6.482588357588358, "grad_norm": 0.40542155504226685, "learning_rate": 4.27321079209483e-05, "loss": 0.265, "num_input_tokens_seen": 2247112, "step": 24945 }, { "epoch": 6.483887733887734, "grad_norm": 0.5265867710113525, "learning_rate": 4.2728110837222255e-05, "loss": 0.2642, "num_input_tokens_seen": 2247576, "step": 24950 }, { "epoch": 6.48518711018711, "grad_norm": 0.4629558324813843, "learning_rate": 4.2724112841722484e-05, "loss": 0.2717, "num_input_tokens_seen": 2248056, "step": 24955 }, { "epoch": 6.486486486486487, "grad_norm": 0.3805306553840637, "learning_rate": 4.272011393465461e-05, "loss": 0.3054, "num_input_tokens_seen": 2248504, "step": 24960 }, { "epoch": 6.487785862785863, "grad_norm": 0.373820960521698, "learning_rate": 4.271611411622429e-05, "loss": 0.2759, "num_input_tokens_seen": 2248904, "step": 24965 }, { "epoch": 6.489085239085239, "grad_norm": 0.2913658320903778, "learning_rate": 4.271211338663726e-05, "loss": 0.3172, "num_input_tokens_seen": 2249336, "step": 24970 }, { "epoch": 6.490384615384615, "grad_norm": 0.6712636351585388, "learning_rate": 4.270811174609926e-05, "loss": 0.2765, "num_input_tokens_seen": 2249752, "step": 24975 }, { "epoch": 6.491683991683992, "grad_norm": 0.473029762506485, "learning_rate": 4.270410919481611e-05, "loss": 0.2495, "num_input_tokens_seen": 2250216, "step": 24980 }, { "epoch": 6.492983367983368, "grad_norm": 0.3956630527973175, "learning_rate": 4.270010573299366e-05, "loss": 0.2218, "num_input_tokens_seen": 2250616, "step": 24985 }, { "epoch": 6.494282744282744, "grad_norm": 0.3142928183078766, "learning_rate": 4.2696101360837806e-05, "loss": 0.2248, "num_input_tokens_seen": 2251064, "step": 24990 }, { "epoch": 6.495582120582121, "grad_norm": 0.5280904769897461, "learning_rate": 4.26920960785545e-05, "loss": 0.2745, "num_input_tokens_seen": 2251496, "step": 24995 }, { "epoch": 6.496881496881497, "grad_norm": 0.5386884212493896, "learning_rate": 4.268808988634975e-05, "loss": 0.212, "num_input_tokens_seen": 2251944, "step": 25000 }, { "epoch": 6.498180873180873, "grad_norm": 0.5912686586380005, "learning_rate": 4.2684082784429593e-05, "loss": 0.3065, "num_input_tokens_seen": 2252408, "step": 25005 }, { "epoch": 6.49948024948025, "grad_norm": 0.2548704445362091, "learning_rate": 4.2680074773000106e-05, "loss": 0.1629, "num_input_tokens_seen": 2252824, "step": 25010 }, { "epoch": 6.500779625779626, "grad_norm": 0.4951118230819702, "learning_rate": 4.2676065852267435e-05, "loss": 0.2987, "num_input_tokens_seen": 2253288, "step": 25015 }, { "epoch": 6.502079002079002, "grad_norm": 0.38555335998535156, "learning_rate": 4.267205602243777e-05, "loss": 0.2391, "num_input_tokens_seen": 2253768, "step": 25020 }, { "epoch": 6.503378378378378, "grad_norm": 0.7623212337493896, "learning_rate": 4.266804528371732e-05, "loss": 0.2541, "num_input_tokens_seen": 2254232, "step": 25025 }, { "epoch": 6.504677754677755, "grad_norm": 0.3818588852882385, "learning_rate": 4.2664033636312374e-05, "loss": 0.2266, "num_input_tokens_seen": 2254664, "step": 25030 }, { "epoch": 6.505977130977131, "grad_norm": 0.42539751529693604, "learning_rate": 4.2660021080429253e-05, "loss": 0.2492, "num_input_tokens_seen": 2255096, "step": 25035 }, { "epoch": 6.507276507276507, "grad_norm": 0.3953514099121094, "learning_rate": 4.265600761627433e-05, "loss": 0.1902, "num_input_tokens_seen": 2255592, "step": 25040 }, { "epoch": 6.508575883575883, "grad_norm": 0.28728145360946655, "learning_rate": 4.265199324405401e-05, "loss": 0.2327, "num_input_tokens_seen": 2256024, "step": 25045 }, { "epoch": 6.50987525987526, "grad_norm": 0.4582498371601105, "learning_rate": 4.264797796397477e-05, "loss": 0.2436, "num_input_tokens_seen": 2256472, "step": 25050 }, { "epoch": 6.511174636174636, "grad_norm": 0.34123700857162476, "learning_rate": 4.264396177624312e-05, "loss": 0.1361, "num_input_tokens_seen": 2256936, "step": 25055 }, { "epoch": 6.512474012474012, "grad_norm": 0.26933300495147705, "learning_rate": 4.26399446810656e-05, "loss": 0.2948, "num_input_tokens_seen": 2257368, "step": 25060 }, { "epoch": 6.513773388773389, "grad_norm": 0.32161518931388855, "learning_rate": 4.263592667864883e-05, "loss": 0.2271, "num_input_tokens_seen": 2257832, "step": 25065 }, { "epoch": 6.515072765072765, "grad_norm": 0.32464203238487244, "learning_rate": 4.2631907769199456e-05, "loss": 0.2152, "num_input_tokens_seen": 2258328, "step": 25070 }, { "epoch": 6.516372141372141, "grad_norm": 0.47760748863220215, "learning_rate": 4.2627887952924165e-05, "loss": 0.3497, "num_input_tokens_seen": 2258776, "step": 25075 }, { "epoch": 6.517671517671518, "grad_norm": 0.5988714694976807, "learning_rate": 4.26238672300297e-05, "loss": 0.3182, "num_input_tokens_seen": 2259240, "step": 25080 }, { "epoch": 6.518970893970894, "grad_norm": 0.27751415967941284, "learning_rate": 4.261984560072287e-05, "loss": 0.1193, "num_input_tokens_seen": 2259688, "step": 25085 }, { "epoch": 6.52027027027027, "grad_norm": 0.3592124581336975, "learning_rate": 4.2615823065210494e-05, "loss": 0.213, "num_input_tokens_seen": 2260120, "step": 25090 }, { "epoch": 6.521569646569646, "grad_norm": 0.34179335832595825, "learning_rate": 4.261179962369946e-05, "loss": 0.1737, "num_input_tokens_seen": 2260584, "step": 25095 }, { "epoch": 6.522869022869023, "grad_norm": 0.8029624819755554, "learning_rate": 4.260777527639669e-05, "loss": 0.284, "num_input_tokens_seen": 2261048, "step": 25100 }, { "epoch": 6.524168399168399, "grad_norm": 0.2552807033061981, "learning_rate": 4.260375002350917e-05, "loss": 0.1046, "num_input_tokens_seen": 2261560, "step": 25105 }, { "epoch": 6.525467775467775, "grad_norm": 0.20477628707885742, "learning_rate": 4.2599723865243926e-05, "loss": 0.2038, "num_input_tokens_seen": 2262008, "step": 25110 }, { "epoch": 6.526767151767151, "grad_norm": 0.43832334876060486, "learning_rate": 4.2595696801808014e-05, "loss": 0.2824, "num_input_tokens_seen": 2262456, "step": 25115 }, { "epoch": 6.528066528066528, "grad_norm": 0.5677269101142883, "learning_rate": 4.259166883340856e-05, "loss": 0.2557, "num_input_tokens_seen": 2262872, "step": 25120 }, { "epoch": 6.529365904365904, "grad_norm": 0.4324057996273041, "learning_rate": 4.258763996025271e-05, "loss": 0.3749, "num_input_tokens_seen": 2263288, "step": 25125 }, { "epoch": 6.53066528066528, "grad_norm": 0.4713900685310364, "learning_rate": 4.258361018254769e-05, "loss": 0.1705, "num_input_tokens_seen": 2263784, "step": 25130 }, { "epoch": 6.531964656964657, "grad_norm": 0.44603538513183594, "learning_rate": 4.257957950050076e-05, "loss": 0.2424, "num_input_tokens_seen": 2264200, "step": 25135 }, { "epoch": 6.533264033264033, "grad_norm": 0.5341939330101013, "learning_rate": 4.2575547914319206e-05, "loss": 0.2576, "num_input_tokens_seen": 2264696, "step": 25140 }, { "epoch": 6.534563409563409, "grad_norm": 0.468258261680603, "learning_rate": 4.257151542421038e-05, "loss": 0.2122, "num_input_tokens_seen": 2265144, "step": 25145 }, { "epoch": 6.535862785862786, "grad_norm": 0.5950300693511963, "learning_rate": 4.256748203038169e-05, "loss": 0.2262, "num_input_tokens_seen": 2265576, "step": 25150 }, { "epoch": 6.537162162162162, "grad_norm": 0.42070937156677246, "learning_rate": 4.256344773304056e-05, "loss": 0.1713, "num_input_tokens_seen": 2266024, "step": 25155 }, { "epoch": 6.538461538461538, "grad_norm": 0.36318424344062805, "learning_rate": 4.2559412532394486e-05, "loss": 0.2705, "num_input_tokens_seen": 2266440, "step": 25160 }, { "epoch": 6.539760914760915, "grad_norm": 0.2853027284145355, "learning_rate": 4.2555376428651e-05, "loss": 0.2495, "num_input_tokens_seen": 2266904, "step": 25165 }, { "epoch": 6.541060291060291, "grad_norm": 0.3731009364128113, "learning_rate": 4.2551339422017686e-05, "loss": 0.1465, "num_input_tokens_seen": 2267352, "step": 25170 }, { "epoch": 6.542359667359667, "grad_norm": 0.31753265857696533, "learning_rate": 4.2547301512702166e-05, "loss": 0.2197, "num_input_tokens_seen": 2267768, "step": 25175 }, { "epoch": 6.543659043659043, "grad_norm": 0.6640284657478333, "learning_rate": 4.254326270091211e-05, "loss": 0.3273, "num_input_tokens_seen": 2268184, "step": 25180 }, { "epoch": 6.54495841995842, "grad_norm": 0.34604325890541077, "learning_rate": 4.253922298685525e-05, "loss": 0.282, "num_input_tokens_seen": 2268648, "step": 25185 }, { "epoch": 6.546257796257796, "grad_norm": 0.5372418761253357, "learning_rate": 4.2535182370739345e-05, "loss": 0.2872, "num_input_tokens_seen": 2269128, "step": 25190 }, { "epoch": 6.547557172557172, "grad_norm": 0.5997734665870667, "learning_rate": 4.253114085277221e-05, "loss": 0.2234, "num_input_tokens_seen": 2269544, "step": 25195 }, { "epoch": 6.548856548856548, "grad_norm": 0.3014754056930542, "learning_rate": 4.252709843316171e-05, "loss": 0.1779, "num_input_tokens_seen": 2269992, "step": 25200 }, { "epoch": 6.550155925155925, "grad_norm": 0.7384049892425537, "learning_rate": 4.252305511211574e-05, "loss": 0.2304, "num_input_tokens_seen": 2270456, "step": 25205 }, { "epoch": 6.551455301455301, "grad_norm": 0.6834322214126587, "learning_rate": 4.251901088984225e-05, "loss": 0.3535, "num_input_tokens_seen": 2270936, "step": 25210 }, { "epoch": 6.5527546777546775, "grad_norm": 0.48945000767707825, "learning_rate": 4.2514965766549245e-05, "loss": 0.218, "num_input_tokens_seen": 2271384, "step": 25215 }, { "epoch": 6.554054054054054, "grad_norm": 0.5583035945892334, "learning_rate": 4.251091974244478e-05, "loss": 0.2627, "num_input_tokens_seen": 2271800, "step": 25220 }, { "epoch": 6.55535343035343, "grad_norm": 0.7922645211219788, "learning_rate": 4.250687281773692e-05, "loss": 0.2267, "num_input_tokens_seen": 2272264, "step": 25225 }, { "epoch": 6.5566528066528065, "grad_norm": 0.5348442792892456, "learning_rate": 4.2502824992633826e-05, "loss": 0.3252, "num_input_tokens_seen": 2272712, "step": 25230 }, { "epoch": 6.557952182952183, "grad_norm": 0.8417960405349731, "learning_rate": 4.2498776267343664e-05, "loss": 0.2629, "num_input_tokens_seen": 2273144, "step": 25235 }, { "epoch": 6.5592515592515594, "grad_norm": 0.6168434619903564, "learning_rate": 4.249472664207468e-05, "loss": 0.2575, "num_input_tokens_seen": 2273592, "step": 25240 }, { "epoch": 6.5605509355509355, "grad_norm": 0.4415130615234375, "learning_rate": 4.249067611703514e-05, "loss": 0.2542, "num_input_tokens_seen": 2274072, "step": 25245 }, { "epoch": 6.5618503118503115, "grad_norm": 0.3555799424648285, "learning_rate": 4.248662469243336e-05, "loss": 0.2718, "num_input_tokens_seen": 2274520, "step": 25250 }, { "epoch": 6.5631496881496885, "grad_norm": 0.2859342694282532, "learning_rate": 4.248257236847774e-05, "loss": 0.193, "num_input_tokens_seen": 2274984, "step": 25255 }, { "epoch": 6.5644490644490645, "grad_norm": 0.25540032982826233, "learning_rate": 4.247851914537665e-05, "loss": 0.3085, "num_input_tokens_seen": 2275448, "step": 25260 }, { "epoch": 6.5657484407484406, "grad_norm": 0.2617979347705841, "learning_rate": 4.247446502333858e-05, "loss": 0.1641, "num_input_tokens_seen": 2275912, "step": 25265 }, { "epoch": 6.567047817047817, "grad_norm": 0.20220108330249786, "learning_rate": 4.247041000257203e-05, "loss": 0.2097, "num_input_tokens_seen": 2276360, "step": 25270 }, { "epoch": 6.5683471933471935, "grad_norm": 0.751994788646698, "learning_rate": 4.2466354083285556e-05, "loss": 0.4262, "num_input_tokens_seen": 2276808, "step": 25275 }, { "epoch": 6.56964656964657, "grad_norm": 0.9094239473342896, "learning_rate": 4.2462297265687754e-05, "loss": 0.2935, "num_input_tokens_seen": 2277272, "step": 25280 }, { "epoch": 6.570945945945946, "grad_norm": 0.5348286628723145, "learning_rate": 4.245823954998728e-05, "loss": 0.227, "num_input_tokens_seen": 2277704, "step": 25285 }, { "epoch": 6.5722453222453225, "grad_norm": 0.6191619038581848, "learning_rate": 4.2454180936392805e-05, "loss": 0.2282, "num_input_tokens_seen": 2278168, "step": 25290 }, { "epoch": 6.573544698544699, "grad_norm": 0.5318905115127563, "learning_rate": 4.245012142511309e-05, "loss": 0.3195, "num_input_tokens_seen": 2278632, "step": 25295 }, { "epoch": 6.574844074844075, "grad_norm": 0.7843266129493713, "learning_rate": 4.244606101635691e-05, "loss": 0.3015, "num_input_tokens_seen": 2279096, "step": 25300 }, { "epoch": 6.576143451143452, "grad_norm": 0.6540118455886841, "learning_rate": 4.2441999710333094e-05, "loss": 0.2594, "num_input_tokens_seen": 2279544, "step": 25305 }, { "epoch": 6.577442827442828, "grad_norm": 0.5014480352401733, "learning_rate": 4.243793750725052e-05, "loss": 0.2251, "num_input_tokens_seen": 2280008, "step": 25310 }, { "epoch": 6.578742203742204, "grad_norm": 0.45949587225914, "learning_rate": 4.243387440731811e-05, "loss": 0.1927, "num_input_tokens_seen": 2280456, "step": 25315 }, { "epoch": 6.58004158004158, "grad_norm": 0.7195258140563965, "learning_rate": 4.2429810410744835e-05, "loss": 0.2577, "num_input_tokens_seen": 2280920, "step": 25320 }, { "epoch": 6.581340956340957, "grad_norm": 0.5122546553611755, "learning_rate": 4.242574551773971e-05, "loss": 0.3257, "num_input_tokens_seen": 2281352, "step": 25325 }, { "epoch": 6.582640332640333, "grad_norm": 0.6825939416885376, "learning_rate": 4.24216797285118e-05, "loss": 0.2969, "num_input_tokens_seen": 2281768, "step": 25330 }, { "epoch": 6.583939708939709, "grad_norm": 0.6157715916633606, "learning_rate": 4.241761304327021e-05, "loss": 0.2405, "num_input_tokens_seen": 2282232, "step": 25335 }, { "epoch": 6.585239085239085, "grad_norm": 0.41870519518852234, "learning_rate": 4.241354546222408e-05, "loss": 0.2202, "num_input_tokens_seen": 2282696, "step": 25340 }, { "epoch": 6.586538461538462, "grad_norm": 0.606205940246582, "learning_rate": 4.2409476985582644e-05, "loss": 0.2834, "num_input_tokens_seen": 2283128, "step": 25345 }, { "epoch": 6.587837837837838, "grad_norm": 0.8382205367088318, "learning_rate": 4.240540761355512e-05, "loss": 0.4142, "num_input_tokens_seen": 2283592, "step": 25350 }, { "epoch": 6.589137214137214, "grad_norm": 0.6629148721694946, "learning_rate": 4.240133734635079e-05, "loss": 0.2302, "num_input_tokens_seen": 2284040, "step": 25355 }, { "epoch": 6.590436590436591, "grad_norm": 0.8028268814086914, "learning_rate": 4.2397266184179015e-05, "loss": 0.3075, "num_input_tokens_seen": 2284504, "step": 25360 }, { "epoch": 6.591735966735967, "grad_norm": 0.5518119931221008, "learning_rate": 4.2393194127249166e-05, "loss": 0.268, "num_input_tokens_seen": 2284952, "step": 25365 }, { "epoch": 6.593035343035343, "grad_norm": 0.7226668000221252, "learning_rate": 4.238912117577069e-05, "loss": 0.246, "num_input_tokens_seen": 2285400, "step": 25370 }, { "epoch": 6.59433471933472, "grad_norm": 0.5347800254821777, "learning_rate": 4.238504732995304e-05, "loss": 0.2709, "num_input_tokens_seen": 2285832, "step": 25375 }, { "epoch": 6.595634095634096, "grad_norm": 0.43019920587539673, "learning_rate": 4.238097259000575e-05, "loss": 0.1732, "num_input_tokens_seen": 2286264, "step": 25380 }, { "epoch": 6.596933471933472, "grad_norm": 0.4135931134223938, "learning_rate": 4.237689695613839e-05, "loss": 0.1955, "num_input_tokens_seen": 2286712, "step": 25385 }, { "epoch": 6.598232848232849, "grad_norm": 0.2719467282295227, "learning_rate": 4.237282042856057e-05, "loss": 0.1165, "num_input_tokens_seen": 2287192, "step": 25390 }, { "epoch": 6.599532224532225, "grad_norm": 0.21049562096595764, "learning_rate": 4.236874300748195e-05, "loss": 0.28, "num_input_tokens_seen": 2287640, "step": 25395 }, { "epoch": 6.600831600831601, "grad_norm": 0.27417996525764465, "learning_rate": 4.2364664693112234e-05, "loss": 0.1377, "num_input_tokens_seen": 2288072, "step": 25400 }, { "epoch": 6.602130977130977, "grad_norm": 0.22865106165409088, "learning_rate": 4.2360585485661175e-05, "loss": 0.3084, "num_input_tokens_seen": 2288504, "step": 25405 }, { "epoch": 6.603430353430354, "grad_norm": 0.7420570850372314, "learning_rate": 4.2356505385338565e-05, "loss": 0.2948, "num_input_tokens_seen": 2288968, "step": 25410 }, { "epoch": 6.60472972972973, "grad_norm": 0.6655909419059753, "learning_rate": 4.2352424392354264e-05, "loss": 0.1954, "num_input_tokens_seen": 2289400, "step": 25415 }, { "epoch": 6.606029106029106, "grad_norm": 0.5726633071899414, "learning_rate": 4.234834250691814e-05, "loss": 0.2148, "num_input_tokens_seen": 2289864, "step": 25420 }, { "epoch": 6.607328482328482, "grad_norm": 0.35239988565444946, "learning_rate": 4.234425972924014e-05, "loss": 0.2139, "num_input_tokens_seen": 2290312, "step": 25425 }, { "epoch": 6.608627858627859, "grad_norm": 0.6135677099227905, "learning_rate": 4.234017605953025e-05, "loss": 0.3051, "num_input_tokens_seen": 2290776, "step": 25430 }, { "epoch": 6.609927234927235, "grad_norm": 0.6091713905334473, "learning_rate": 4.2336091497998484e-05, "loss": 0.1891, "num_input_tokens_seen": 2291208, "step": 25435 }, { "epoch": 6.611226611226611, "grad_norm": 1.013588786125183, "learning_rate": 4.2332006044854925e-05, "loss": 0.3616, "num_input_tokens_seen": 2291656, "step": 25440 }, { "epoch": 6.612525987525988, "grad_norm": 0.5359441637992859, "learning_rate": 4.232791970030968e-05, "loss": 0.342, "num_input_tokens_seen": 2292104, "step": 25445 }, { "epoch": 6.613825363825364, "grad_norm": 0.7184536457061768, "learning_rate": 4.232383246457293e-05, "loss": 0.3065, "num_input_tokens_seen": 2292552, "step": 25450 }, { "epoch": 6.61512474012474, "grad_norm": 0.5274298787117004, "learning_rate": 4.231974433785488e-05, "loss": 0.2689, "num_input_tokens_seen": 2293016, "step": 25455 }, { "epoch": 6.616424116424117, "grad_norm": 0.4355689287185669, "learning_rate": 4.231565532036578e-05, "loss": 0.2734, "num_input_tokens_seen": 2293464, "step": 25460 }, { "epoch": 6.617723492723493, "grad_norm": 0.4464882016181946, "learning_rate": 4.2311565412315934e-05, "loss": 0.2839, "num_input_tokens_seen": 2293928, "step": 25465 }, { "epoch": 6.619022869022869, "grad_norm": 0.4099251329898834, "learning_rate": 4.2307474613915694e-05, "loss": 0.2085, "num_input_tokens_seen": 2294360, "step": 25470 }, { "epoch": 6.620322245322245, "grad_norm": 0.3524523675441742, "learning_rate": 4.230338292537545e-05, "loss": 0.2336, "num_input_tokens_seen": 2294776, "step": 25475 }, { "epoch": 6.621621621621622, "grad_norm": 0.47397875785827637, "learning_rate": 4.2299290346905646e-05, "loss": 0.3477, "num_input_tokens_seen": 2295256, "step": 25480 }, { "epoch": 6.622920997920998, "grad_norm": 0.504621684551239, "learning_rate": 4.229519687871676e-05, "loss": 0.2641, "num_input_tokens_seen": 2295704, "step": 25485 }, { "epoch": 6.624220374220374, "grad_norm": 0.4016132652759552, "learning_rate": 4.2291102521019335e-05, "loss": 0.1796, "num_input_tokens_seen": 2296152, "step": 25490 }, { "epoch": 6.62551975051975, "grad_norm": 0.309938907623291, "learning_rate": 4.228700727402393e-05, "loss": 0.2922, "num_input_tokens_seen": 2296568, "step": 25495 }, { "epoch": 6.626819126819127, "grad_norm": 0.4121169447898865, "learning_rate": 4.2282911137941184e-05, "loss": 0.3249, "num_input_tokens_seen": 2297048, "step": 25500 }, { "epoch": 6.628118503118503, "grad_norm": 0.31424638628959656, "learning_rate": 4.227881411298175e-05, "loss": 0.23, "num_input_tokens_seen": 2297496, "step": 25505 }, { "epoch": 6.629417879417879, "grad_norm": 0.3783904016017914, "learning_rate": 4.2274716199356354e-05, "loss": 0.2404, "num_input_tokens_seen": 2297960, "step": 25510 }, { "epoch": 6.630717255717256, "grad_norm": 0.37010499835014343, "learning_rate": 4.227061739727576e-05, "loss": 0.2632, "num_input_tokens_seen": 2298408, "step": 25515 }, { "epoch": 6.632016632016632, "grad_norm": 0.3673059940338135, "learning_rate": 4.226651770695076e-05, "loss": 0.2379, "num_input_tokens_seen": 2298856, "step": 25520 }, { "epoch": 6.633316008316008, "grad_norm": 0.2888638377189636, "learning_rate": 4.226241712859221e-05, "loss": 0.2146, "num_input_tokens_seen": 2299304, "step": 25525 }, { "epoch": 6.634615384615385, "grad_norm": 0.3355424702167511, "learning_rate": 4.2258315662411e-05, "loss": 0.199, "num_input_tokens_seen": 2299720, "step": 25530 }, { "epoch": 6.635914760914761, "grad_norm": 0.274827778339386, "learning_rate": 4.225421330861809e-05, "loss": 0.2757, "num_input_tokens_seen": 2300136, "step": 25535 }, { "epoch": 6.637214137214137, "grad_norm": 0.3356552720069885, "learning_rate": 4.225011006742445e-05, "loss": 0.2928, "num_input_tokens_seen": 2300568, "step": 25540 }, { "epoch": 6.638513513513513, "grad_norm": 0.2995404601097107, "learning_rate": 4.224600593904113e-05, "loss": 0.2055, "num_input_tokens_seen": 2301000, "step": 25545 }, { "epoch": 6.63981288981289, "grad_norm": 0.2721008062362671, "learning_rate": 4.2241900923679196e-05, "loss": 0.2572, "num_input_tokens_seen": 2301464, "step": 25550 }, { "epoch": 6.641112266112266, "grad_norm": 0.48291048407554626, "learning_rate": 4.2237795021549776e-05, "loss": 0.2675, "num_input_tokens_seen": 2301976, "step": 25555 }, { "epoch": 6.642411642411642, "grad_norm": 0.2774825692176819, "learning_rate": 4.223368823286404e-05, "loss": 0.1753, "num_input_tokens_seen": 2302440, "step": 25560 }, { "epoch": 6.643711018711018, "grad_norm": 0.3667412996292114, "learning_rate": 4.2229580557833204e-05, "loss": 0.1996, "num_input_tokens_seen": 2302872, "step": 25565 }, { "epoch": 6.645010395010395, "grad_norm": 0.31885233521461487, "learning_rate": 4.222547199666854e-05, "loss": 0.2958, "num_input_tokens_seen": 2303288, "step": 25570 }, { "epoch": 6.646309771309771, "grad_norm": 0.3723791837692261, "learning_rate": 4.2221362549581334e-05, "loss": 0.2593, "num_input_tokens_seen": 2303752, "step": 25575 }, { "epoch": 6.647609147609147, "grad_norm": 0.34761157631874084, "learning_rate": 4.221725221678296e-05, "loss": 0.3048, "num_input_tokens_seen": 2304216, "step": 25580 }, { "epoch": 6.648908523908524, "grad_norm": 0.3297596573829651, "learning_rate": 4.221314099848481e-05, "loss": 0.2255, "num_input_tokens_seen": 2304664, "step": 25585 }, { "epoch": 6.6502079002079, "grad_norm": 0.3677104413509369, "learning_rate": 4.220902889489832e-05, "loss": 0.262, "num_input_tokens_seen": 2305096, "step": 25590 }, { "epoch": 6.651507276507276, "grad_norm": 0.416832834482193, "learning_rate": 4.220491590623499e-05, "loss": 0.2685, "num_input_tokens_seen": 2305544, "step": 25595 }, { "epoch": 6.652806652806653, "grad_norm": 0.375582218170166, "learning_rate": 4.220080203270634e-05, "loss": 0.2753, "num_input_tokens_seen": 2306008, "step": 25600 }, { "epoch": 6.654106029106029, "grad_norm": 0.3679879903793335, "learning_rate": 4.219668727452397e-05, "loss": 0.2183, "num_input_tokens_seen": 2306440, "step": 25605 }, { "epoch": 6.655405405405405, "grad_norm": 0.2674594223499298, "learning_rate": 4.219257163189949e-05, "loss": 0.193, "num_input_tokens_seen": 2306888, "step": 25610 }, { "epoch": 6.656704781704782, "grad_norm": 0.33281007409095764, "learning_rate": 4.218845510504458e-05, "loss": 0.2538, "num_input_tokens_seen": 2307304, "step": 25615 }, { "epoch": 6.658004158004158, "grad_norm": 0.40367764234542847, "learning_rate": 4.218433769417096e-05, "loss": 0.4324, "num_input_tokens_seen": 2307720, "step": 25620 }, { "epoch": 6.659303534303534, "grad_norm": 0.8532606363296509, "learning_rate": 4.218021939949038e-05, "loss": 0.2447, "num_input_tokens_seen": 2308152, "step": 25625 }, { "epoch": 6.66060291060291, "grad_norm": 0.40057671070098877, "learning_rate": 4.2176100221214666e-05, "loss": 0.2359, "num_input_tokens_seen": 2308600, "step": 25630 }, { "epoch": 6.661902286902287, "grad_norm": 0.6490449905395508, "learning_rate": 4.2171980159555644e-05, "loss": 0.2607, "num_input_tokens_seen": 2309032, "step": 25635 }, { "epoch": 6.663201663201663, "grad_norm": 0.39554378390312195, "learning_rate": 4.216785921472524e-05, "loss": 0.2393, "num_input_tokens_seen": 2309480, "step": 25640 }, { "epoch": 6.664501039501039, "grad_norm": 0.5117676258087158, "learning_rate": 4.216373738693539e-05, "loss": 0.2065, "num_input_tokens_seen": 2309912, "step": 25645 }, { "epoch": 6.665800415800415, "grad_norm": 0.616237461566925, "learning_rate": 4.215961467639807e-05, "loss": 0.2721, "num_input_tokens_seen": 2310392, "step": 25650 }, { "epoch": 6.667099792099792, "grad_norm": 0.5985793471336365, "learning_rate": 4.215549108332533e-05, "loss": 0.223, "num_input_tokens_seen": 2310824, "step": 25655 }, { "epoch": 6.668399168399168, "grad_norm": 0.32878759503364563, "learning_rate": 4.2151366607929246e-05, "loss": 0.2847, "num_input_tokens_seen": 2311304, "step": 25660 }, { "epoch": 6.669698544698544, "grad_norm": 0.3659745454788208, "learning_rate": 4.2147241250421944e-05, "loss": 0.1842, "num_input_tokens_seen": 2311736, "step": 25665 }, { "epoch": 6.670997920997921, "grad_norm": 0.6592721939086914, "learning_rate": 4.2143115011015597e-05, "loss": 0.3353, "num_input_tokens_seen": 2312184, "step": 25670 }, { "epoch": 6.672297297297297, "grad_norm": 0.4210660755634308, "learning_rate": 4.213898788992242e-05, "loss": 0.2375, "num_input_tokens_seen": 2312648, "step": 25675 }, { "epoch": 6.673596673596673, "grad_norm": 0.44563668966293335, "learning_rate": 4.213485988735467e-05, "loss": 0.2038, "num_input_tokens_seen": 2313096, "step": 25680 }, { "epoch": 6.67489604989605, "grad_norm": 0.3252295255661011, "learning_rate": 4.213073100352466e-05, "loss": 0.2935, "num_input_tokens_seen": 2313528, "step": 25685 }, { "epoch": 6.676195426195426, "grad_norm": 0.2661955654621124, "learning_rate": 4.212660123864474e-05, "loss": 0.2553, "num_input_tokens_seen": 2313960, "step": 25690 }, { "epoch": 6.677494802494802, "grad_norm": 0.3849993646144867, "learning_rate": 4.212247059292731e-05, "loss": 0.2327, "num_input_tokens_seen": 2314472, "step": 25695 }, { "epoch": 6.6787941787941785, "grad_norm": 0.31714990735054016, "learning_rate": 4.211833906658481e-05, "loss": 0.2357, "num_input_tokens_seen": 2314936, "step": 25700 }, { "epoch": 6.680093555093555, "grad_norm": 0.43975964188575745, "learning_rate": 4.211420665982973e-05, "loss": 0.1721, "num_input_tokens_seen": 2315368, "step": 25705 }, { "epoch": 6.6813929313929314, "grad_norm": 0.6064899563789368, "learning_rate": 4.2110073372874604e-05, "loss": 0.2648, "num_input_tokens_seen": 2315816, "step": 25710 }, { "epoch": 6.6826923076923075, "grad_norm": 0.25685277581214905, "learning_rate": 4.210593920593201e-05, "loss": 0.2525, "num_input_tokens_seen": 2316280, "step": 25715 }, { "epoch": 6.6839916839916835, "grad_norm": 0.2659926116466522, "learning_rate": 4.2101804159214576e-05, "loss": 0.1471, "num_input_tokens_seen": 2316744, "step": 25720 }, { "epoch": 6.6852910602910605, "grad_norm": 0.370807021856308, "learning_rate": 4.209766823293496e-05, "loss": 0.3016, "num_input_tokens_seen": 2317192, "step": 25725 }, { "epoch": 6.6865904365904365, "grad_norm": 0.5546363592147827, "learning_rate": 4.2093531427305886e-05, "loss": 0.1893, "num_input_tokens_seen": 2317624, "step": 25730 }, { "epoch": 6.6878898128898125, "grad_norm": 0.35105279088020325, "learning_rate": 4.2089393742540115e-05, "loss": 0.2009, "num_input_tokens_seen": 2318104, "step": 25735 }, { "epoch": 6.6891891891891895, "grad_norm": 0.25959357619285583, "learning_rate": 4.2085255178850457e-05, "loss": 0.2749, "num_input_tokens_seen": 2318536, "step": 25740 }, { "epoch": 6.6904885654885655, "grad_norm": 0.4141900837421417, "learning_rate": 4.208111573644975e-05, "loss": 0.111, "num_input_tokens_seen": 2318984, "step": 25745 }, { "epoch": 6.691787941787942, "grad_norm": 0.5525358319282532, "learning_rate": 4.207697541555089e-05, "loss": 0.172, "num_input_tokens_seen": 2319448, "step": 25750 }, { "epoch": 6.6930873180873185, "grad_norm": 1.4015437364578247, "learning_rate": 4.207283421636682e-05, "loss": 0.3268, "num_input_tokens_seen": 2319992, "step": 25755 }, { "epoch": 6.6943866943866945, "grad_norm": 0.8882898688316345, "learning_rate": 4.2068692139110536e-05, "loss": 0.3229, "num_input_tokens_seen": 2320408, "step": 25760 }, { "epoch": 6.695686070686071, "grad_norm": 0.5286097526550293, "learning_rate": 4.2064549183995056e-05, "loss": 0.3063, "num_input_tokens_seen": 2320888, "step": 25765 }, { "epoch": 6.696985446985447, "grad_norm": 0.7792578935623169, "learning_rate": 4.206040535123346e-05, "loss": 0.2898, "num_input_tokens_seen": 2321336, "step": 25770 }, { "epoch": 6.6982848232848236, "grad_norm": 0.4393055737018585, "learning_rate": 4.2056260641038874e-05, "loss": 0.2281, "num_input_tokens_seen": 2321800, "step": 25775 }, { "epoch": 6.6995841995842, "grad_norm": 0.49756288528442383, "learning_rate": 4.205211505362446e-05, "loss": 0.2399, "num_input_tokens_seen": 2322248, "step": 25780 }, { "epoch": 6.700883575883576, "grad_norm": 0.5044428706169128, "learning_rate": 4.204796858920343e-05, "loss": 0.3105, "num_input_tokens_seen": 2322744, "step": 25785 }, { "epoch": 6.702182952182953, "grad_norm": 0.7697969675064087, "learning_rate": 4.204382124798904e-05, "loss": 0.29, "num_input_tokens_seen": 2323192, "step": 25790 }, { "epoch": 6.703482328482329, "grad_norm": 0.5765802264213562, "learning_rate": 4.203967303019459e-05, "loss": 0.2673, "num_input_tokens_seen": 2323640, "step": 25795 }, { "epoch": 6.704781704781705, "grad_norm": 0.6644737124443054, "learning_rate": 4.203552393603343e-05, "loss": 0.2589, "num_input_tokens_seen": 2324072, "step": 25800 }, { "epoch": 6.706081081081081, "grad_norm": 0.7178676128387451, "learning_rate": 4.203137396571896e-05, "loss": 0.2785, "num_input_tokens_seen": 2324504, "step": 25805 }, { "epoch": 6.707380457380458, "grad_norm": 0.731638491153717, "learning_rate": 4.20272231194646e-05, "loss": 0.2863, "num_input_tokens_seen": 2325032, "step": 25810 }, { "epoch": 6.708679833679834, "grad_norm": 0.4760722815990448, "learning_rate": 4.202307139748384e-05, "loss": 0.2451, "num_input_tokens_seen": 2325464, "step": 25815 }, { "epoch": 6.70997920997921, "grad_norm": 0.4288446009159088, "learning_rate": 4.2018918799990216e-05, "loss": 0.2322, "num_input_tokens_seen": 2325896, "step": 25820 }, { "epoch": 6.711278586278587, "grad_norm": 0.4981541037559509, "learning_rate": 4.201476532719728e-05, "loss": 0.2583, "num_input_tokens_seen": 2326328, "step": 25825 }, { "epoch": 6.712577962577963, "grad_norm": 0.4643099009990692, "learning_rate": 4.2010610979318665e-05, "loss": 0.1143, "num_input_tokens_seen": 2326792, "step": 25830 }, { "epoch": 6.713877338877339, "grad_norm": 0.5133453607559204, "learning_rate": 4.200645575656803e-05, "loss": 0.3442, "num_input_tokens_seen": 2327240, "step": 25835 }, { "epoch": 6.715176715176716, "grad_norm": 0.7231188416481018, "learning_rate": 4.2002299659159074e-05, "loss": 0.2155, "num_input_tokens_seen": 2327672, "step": 25840 }, { "epoch": 6.716476091476092, "grad_norm": 0.31274276971817017, "learning_rate": 4.199814268730556e-05, "loss": 0.2665, "num_input_tokens_seen": 2328104, "step": 25845 }, { "epoch": 6.717775467775468, "grad_norm": 0.30479440093040466, "learning_rate": 4.1993984841221286e-05, "loss": 0.1662, "num_input_tokens_seen": 2328536, "step": 25850 }, { "epoch": 6.719074844074844, "grad_norm": 0.33707088232040405, "learning_rate": 4.198982612112008e-05, "loss": 0.3045, "num_input_tokens_seen": 2328984, "step": 25855 }, { "epoch": 6.720374220374221, "grad_norm": 0.48435208201408386, "learning_rate": 4.1985666527215845e-05, "loss": 0.1973, "num_input_tokens_seen": 2329416, "step": 25860 }, { "epoch": 6.721673596673597, "grad_norm": 0.7170389890670776, "learning_rate": 4.1981506059722505e-05, "loss": 0.3103, "num_input_tokens_seen": 2329848, "step": 25865 }, { "epoch": 6.722972972972973, "grad_norm": 0.912333607673645, "learning_rate": 4.197734471885404e-05, "loss": 0.3101, "num_input_tokens_seen": 2330312, "step": 25870 }, { "epoch": 6.724272349272349, "grad_norm": 0.5621986389160156, "learning_rate": 4.197318250482446e-05, "loss": 0.2269, "num_input_tokens_seen": 2330776, "step": 25875 }, { "epoch": 6.725571725571726, "grad_norm": 0.4547116160392761, "learning_rate": 4.196901941784784e-05, "loss": 0.2874, "num_input_tokens_seen": 2331224, "step": 25880 }, { "epoch": 6.726871101871102, "grad_norm": 0.4946645498275757, "learning_rate": 4.19648554581383e-05, "loss": 0.2806, "num_input_tokens_seen": 2331720, "step": 25885 }, { "epoch": 6.728170478170478, "grad_norm": 0.6217547655105591, "learning_rate": 4.1960690625909994e-05, "loss": 0.2566, "num_input_tokens_seen": 2332152, "step": 25890 }, { "epoch": 6.729469854469855, "grad_norm": 0.5049225687980652, "learning_rate": 4.1956524921377106e-05, "loss": 0.2258, "num_input_tokens_seen": 2332584, "step": 25895 }, { "epoch": 6.730769230769231, "grad_norm": 1.1522794961929321, "learning_rate": 4.1952358344753895e-05, "loss": 0.2804, "num_input_tokens_seen": 2333032, "step": 25900 }, { "epoch": 6.732068607068607, "grad_norm": 0.3850127160549164, "learning_rate": 4.194819089625466e-05, "loss": 0.2637, "num_input_tokens_seen": 2333544, "step": 25905 }, { "epoch": 6.733367983367984, "grad_norm": 0.2940993309020996, "learning_rate": 4.194402257609372e-05, "loss": 0.2312, "num_input_tokens_seen": 2334040, "step": 25910 }, { "epoch": 6.73466735966736, "grad_norm": 0.43335068225860596, "learning_rate": 4.193985338448547e-05, "loss": 0.3499, "num_input_tokens_seen": 2334472, "step": 25915 }, { "epoch": 6.735966735966736, "grad_norm": 0.34428974986076355, "learning_rate": 4.193568332164433e-05, "loss": 0.2148, "num_input_tokens_seen": 2334920, "step": 25920 }, { "epoch": 6.737266112266112, "grad_norm": 0.45348814129829407, "learning_rate": 4.1931512387784764e-05, "loss": 0.2226, "num_input_tokens_seen": 2335384, "step": 25925 }, { "epoch": 6.738565488565489, "grad_norm": 0.477854460477829, "learning_rate": 4.1927340583121296e-05, "loss": 0.2555, "num_input_tokens_seen": 2335800, "step": 25930 }, { "epoch": 6.739864864864865, "grad_norm": 0.4235714077949524, "learning_rate": 4.1923167907868475e-05, "loss": 0.2243, "num_input_tokens_seen": 2336312, "step": 25935 }, { "epoch": 6.741164241164241, "grad_norm": 0.6556537747383118, "learning_rate": 4.1918994362240924e-05, "loss": 0.2855, "num_input_tokens_seen": 2336776, "step": 25940 }, { "epoch": 6.742463617463617, "grad_norm": 0.5160104632377625, "learning_rate": 4.191481994645329e-05, "loss": 0.2423, "num_input_tokens_seen": 2337240, "step": 25945 }, { "epoch": 6.743762993762994, "grad_norm": 0.4652421176433563, "learning_rate": 4.191064466072024e-05, "loss": 0.2616, "num_input_tokens_seen": 2337688, "step": 25950 }, { "epoch": 6.74506237006237, "grad_norm": 0.6816482543945312, "learning_rate": 4.1906468505256544e-05, "loss": 0.244, "num_input_tokens_seen": 2338120, "step": 25955 }, { "epoch": 6.746361746361746, "grad_norm": 0.5244298577308655, "learning_rate": 4.190229148027697e-05, "loss": 0.354, "num_input_tokens_seen": 2338568, "step": 25960 }, { "epoch": 6.747661122661123, "grad_norm": 0.6574078798294067, "learning_rate": 4.1898113585996346e-05, "loss": 0.2456, "num_input_tokens_seen": 2339016, "step": 25965 }, { "epoch": 6.748960498960499, "grad_norm": 0.5643046498298645, "learning_rate": 4.189393482262956e-05, "loss": 0.2877, "num_input_tokens_seen": 2339480, "step": 25970 }, { "epoch": 6.750259875259875, "grad_norm": 0.39154699444770813, "learning_rate": 4.188975519039151e-05, "loss": 0.1826, "num_input_tokens_seen": 2339912, "step": 25975 }, { "epoch": 6.751559251559252, "grad_norm": 0.29915061593055725, "learning_rate": 4.1885574689497165e-05, "loss": 0.2191, "num_input_tokens_seen": 2340344, "step": 25980 }, { "epoch": 6.752858627858628, "grad_norm": 0.3399790823459625, "learning_rate": 4.188139332016154e-05, "loss": 0.2142, "num_input_tokens_seen": 2340776, "step": 25985 }, { "epoch": 6.754158004158004, "grad_norm": 0.7343361377716064, "learning_rate": 4.187721108259969e-05, "loss": 0.3339, "num_input_tokens_seen": 2341224, "step": 25990 }, { "epoch": 6.75545738045738, "grad_norm": 0.3414819538593292, "learning_rate": 4.187302797702669e-05, "loss": 0.3366, "num_input_tokens_seen": 2341640, "step": 25995 }, { "epoch": 6.756756756756757, "grad_norm": 0.6118847727775574, "learning_rate": 4.18688440036577e-05, "loss": 0.2896, "num_input_tokens_seen": 2342088, "step": 26000 }, { "epoch": 6.758056133056133, "grad_norm": 0.3829435408115387, "learning_rate": 4.18646591627079e-05, "loss": 0.284, "num_input_tokens_seen": 2342520, "step": 26005 }, { "epoch": 6.759355509355509, "grad_norm": 0.5313363075256348, "learning_rate": 4.1860473454392515e-05, "loss": 0.2627, "num_input_tokens_seen": 2342952, "step": 26010 }, { "epoch": 6.760654885654886, "grad_norm": 0.4971773624420166, "learning_rate": 4.185628687892683e-05, "loss": 0.2996, "num_input_tokens_seen": 2343368, "step": 26015 }, { "epoch": 6.761954261954262, "grad_norm": 0.49730491638183594, "learning_rate": 4.185209943652616e-05, "loss": 0.2892, "num_input_tokens_seen": 2343800, "step": 26020 }, { "epoch": 6.763253638253638, "grad_norm": 0.4110676646232605, "learning_rate": 4.184791112740587e-05, "loss": 0.2425, "num_input_tokens_seen": 2344232, "step": 26025 }, { "epoch": 6.764553014553014, "grad_norm": 0.42184320092201233, "learning_rate": 4.184372195178137e-05, "loss": 0.2057, "num_input_tokens_seen": 2344632, "step": 26030 }, { "epoch": 6.765852390852391, "grad_norm": 0.34691792726516724, "learning_rate": 4.1839531909868104e-05, "loss": 0.195, "num_input_tokens_seen": 2345048, "step": 26035 }, { "epoch": 6.767151767151767, "grad_norm": 0.29803916811943054, "learning_rate": 4.183534100188158e-05, "loss": 0.1085, "num_input_tokens_seen": 2345480, "step": 26040 }, { "epoch": 6.768451143451143, "grad_norm": 0.7094222903251648, "learning_rate": 4.183114922803734e-05, "loss": 0.2715, "num_input_tokens_seen": 2345976, "step": 26045 }, { "epoch": 6.76975051975052, "grad_norm": 0.30220964550971985, "learning_rate": 4.182695658855096e-05, "loss": 0.1407, "num_input_tokens_seen": 2346408, "step": 26050 }, { "epoch": 6.771049896049896, "grad_norm": 0.2848660945892334, "learning_rate": 4.182276308363809e-05, "loss": 0.2986, "num_input_tokens_seen": 2346840, "step": 26055 }, { "epoch": 6.772349272349272, "grad_norm": 0.19123893976211548, "learning_rate": 4.181856871351439e-05, "loss": 0.0678, "num_input_tokens_seen": 2347304, "step": 26060 }, { "epoch": 6.773648648648649, "grad_norm": 0.24077995121479034, "learning_rate": 4.1814373478395586e-05, "loss": 0.2795, "num_input_tokens_seen": 2347720, "step": 26065 }, { "epoch": 6.774948024948025, "grad_norm": 0.694949746131897, "learning_rate": 4.181017737849745e-05, "loss": 0.3376, "num_input_tokens_seen": 2348136, "step": 26070 }, { "epoch": 6.776247401247401, "grad_norm": 0.3217350244522095, "learning_rate": 4.180598041403578e-05, "loss": 0.2134, "num_input_tokens_seen": 2348616, "step": 26075 }, { "epoch": 6.777546777546777, "grad_norm": 0.38600340485572815, "learning_rate": 4.1801782585226435e-05, "loss": 0.105, "num_input_tokens_seen": 2349064, "step": 26080 }, { "epoch": 6.778846153846154, "grad_norm": 0.3119024336338043, "learning_rate": 4.179758389228531e-05, "loss": 0.2024, "num_input_tokens_seen": 2349496, "step": 26085 }, { "epoch": 6.78014553014553, "grad_norm": 0.7400250434875488, "learning_rate": 4.1793384335428356e-05, "loss": 0.2546, "num_input_tokens_seen": 2349976, "step": 26090 }, { "epoch": 6.781444906444906, "grad_norm": 0.2956269383430481, "learning_rate": 4.178918391487155e-05, "loss": 0.2346, "num_input_tokens_seen": 2350424, "step": 26095 }, { "epoch": 6.782744282744282, "grad_norm": 0.3368988633155823, "learning_rate": 4.178498263083093e-05, "loss": 0.2183, "num_input_tokens_seen": 2350872, "step": 26100 }, { "epoch": 6.784043659043659, "grad_norm": 0.29474160075187683, "learning_rate": 4.1780780483522575e-05, "loss": 0.2525, "num_input_tokens_seen": 2351304, "step": 26105 }, { "epoch": 6.785343035343035, "grad_norm": 0.2747010588645935, "learning_rate": 4.177657747316259e-05, "loss": 0.1634, "num_input_tokens_seen": 2351704, "step": 26110 }, { "epoch": 6.786642411642411, "grad_norm": 0.21281938254833221, "learning_rate": 4.1772373599967165e-05, "loss": 0.0862, "num_input_tokens_seen": 2352184, "step": 26115 }, { "epoch": 6.787941787941788, "grad_norm": 0.6161040663719177, "learning_rate": 4.176816886415248e-05, "loss": 0.278, "num_input_tokens_seen": 2352616, "step": 26120 }, { "epoch": 6.789241164241164, "grad_norm": 0.5623556971549988, "learning_rate": 4.176396326593482e-05, "loss": 0.381, "num_input_tokens_seen": 2353064, "step": 26125 }, { "epoch": 6.79054054054054, "grad_norm": 0.526049792766571, "learning_rate": 4.175975680553046e-05, "loss": 0.2514, "num_input_tokens_seen": 2353512, "step": 26130 }, { "epoch": 6.791839916839917, "grad_norm": 0.3539784252643585, "learning_rate": 4.175554948315575e-05, "loss": 0.338, "num_input_tokens_seen": 2353976, "step": 26135 }, { "epoch": 6.793139293139293, "grad_norm": 0.3972538709640503, "learning_rate": 4.175134129902707e-05, "loss": 0.2363, "num_input_tokens_seen": 2354408, "step": 26140 }, { "epoch": 6.794438669438669, "grad_norm": 0.46494391560554504, "learning_rate": 4.174713225336086e-05, "loss": 0.2066, "num_input_tokens_seen": 2354872, "step": 26145 }, { "epoch": 6.795738045738045, "grad_norm": 0.40916937589645386, "learning_rate": 4.1742922346373595e-05, "loss": 0.2632, "num_input_tokens_seen": 2355320, "step": 26150 }, { "epoch": 6.797037422037422, "grad_norm": 0.7625343203544617, "learning_rate": 4.173871157828179e-05, "loss": 0.3236, "num_input_tokens_seen": 2355784, "step": 26155 }, { "epoch": 6.798336798336798, "grad_norm": 0.4807976186275482, "learning_rate": 4.173449994930201e-05, "loss": 0.2524, "num_input_tokens_seen": 2356184, "step": 26160 }, { "epoch": 6.799636174636174, "grad_norm": 0.2741120159626007, "learning_rate": 4.1730287459650866e-05, "loss": 0.2832, "num_input_tokens_seen": 2356616, "step": 26165 }, { "epoch": 6.8009355509355505, "grad_norm": 0.7087774276733398, "learning_rate": 4.1726074109545e-05, "loss": 0.2727, "num_input_tokens_seen": 2357080, "step": 26170 }, { "epoch": 6.802234927234927, "grad_norm": 0.22933867573738098, "learning_rate": 4.172185989920112e-05, "loss": 0.2724, "num_input_tokens_seen": 2357512, "step": 26175 }, { "epoch": 6.803534303534303, "grad_norm": 0.7151850461959839, "learning_rate": 4.1717644828835965e-05, "loss": 0.2586, "num_input_tokens_seen": 2357928, "step": 26180 }, { "epoch": 6.8048336798336795, "grad_norm": 0.4876198172569275, "learning_rate": 4.171342889866632e-05, "loss": 0.3165, "num_input_tokens_seen": 2358376, "step": 26185 }, { "epoch": 6.806133056133056, "grad_norm": 0.4959189295768738, "learning_rate": 4.170921210890901e-05, "loss": 0.2542, "num_input_tokens_seen": 2358824, "step": 26190 }, { "epoch": 6.8074324324324325, "grad_norm": 0.26149219274520874, "learning_rate": 4.1704994459780914e-05, "loss": 0.2796, "num_input_tokens_seen": 2359256, "step": 26195 }, { "epoch": 6.8087318087318085, "grad_norm": 0.6477872133255005, "learning_rate": 4.170077595149894e-05, "loss": 0.253, "num_input_tokens_seen": 2359752, "step": 26200 }, { "epoch": 6.810031185031185, "grad_norm": 0.39619091153144836, "learning_rate": 4.1696556584280064e-05, "loss": 0.2325, "num_input_tokens_seen": 2360200, "step": 26205 }, { "epoch": 6.8113305613305615, "grad_norm": 0.34289324283599854, "learning_rate": 4.1692336358341285e-05, "loss": 0.2693, "num_input_tokens_seen": 2360632, "step": 26210 }, { "epoch": 6.8126299376299375, "grad_norm": 0.31122729182243347, "learning_rate": 4.168811527389965e-05, "loss": 0.1758, "num_input_tokens_seen": 2361064, "step": 26215 }, { "epoch": 6.813929313929314, "grad_norm": 0.7824474573135376, "learning_rate": 4.168389333117226e-05, "loss": 0.3454, "num_input_tokens_seen": 2361528, "step": 26220 }, { "epoch": 6.8152286902286905, "grad_norm": 0.3397117257118225, "learning_rate": 4.1679670530376244e-05, "loss": 0.2371, "num_input_tokens_seen": 2361976, "step": 26225 }, { "epoch": 6.8165280665280665, "grad_norm": 0.3660895526409149, "learning_rate": 4.16754468717288e-05, "loss": 0.2112, "num_input_tokens_seen": 2362440, "step": 26230 }, { "epoch": 6.817827442827443, "grad_norm": 0.6819344162940979, "learning_rate": 4.167122235544714e-05, "loss": 0.2599, "num_input_tokens_seen": 2362888, "step": 26235 }, { "epoch": 6.8191268191268195, "grad_norm": 0.4852336645126343, "learning_rate": 4.166699698174854e-05, "loss": 0.2817, "num_input_tokens_seen": 2363368, "step": 26240 }, { "epoch": 6.8204261954261955, "grad_norm": 0.713406503200531, "learning_rate": 4.166277075085031e-05, "loss": 0.2737, "num_input_tokens_seen": 2363816, "step": 26245 }, { "epoch": 6.821725571725572, "grad_norm": 0.46138784289360046, "learning_rate": 4.165854366296983e-05, "loss": 0.1842, "num_input_tokens_seen": 2364280, "step": 26250 }, { "epoch": 6.823024948024948, "grad_norm": 0.3748614192008972, "learning_rate": 4.1654315718324475e-05, "loss": 0.121, "num_input_tokens_seen": 2364744, "step": 26255 }, { "epoch": 6.824324324324325, "grad_norm": 0.9009304642677307, "learning_rate": 4.165008691713171e-05, "loss": 0.223, "num_input_tokens_seen": 2365208, "step": 26260 }, { "epoch": 6.825623700623701, "grad_norm": 0.6369712352752686, "learning_rate": 4.164585725960902e-05, "loss": 0.3451, "num_input_tokens_seen": 2365672, "step": 26265 }, { "epoch": 6.826923076923077, "grad_norm": 0.4579910337924957, "learning_rate": 4.164162674597395e-05, "loss": 0.2811, "num_input_tokens_seen": 2366120, "step": 26270 }, { "epoch": 6.828222453222454, "grad_norm": 0.34393051266670227, "learning_rate": 4.163739537644406e-05, "loss": 0.1861, "num_input_tokens_seen": 2366568, "step": 26275 }, { "epoch": 6.82952182952183, "grad_norm": 0.2949966490268707, "learning_rate": 4.1633163151236986e-05, "loss": 0.3035, "num_input_tokens_seen": 2367032, "step": 26280 }, { "epoch": 6.830821205821206, "grad_norm": 0.7759683132171631, "learning_rate": 4.162893007057041e-05, "loss": 0.3085, "num_input_tokens_seen": 2367464, "step": 26285 }, { "epoch": 6.832120582120583, "grad_norm": 0.44964149594306946, "learning_rate": 4.1624696134662e-05, "loss": 0.2565, "num_input_tokens_seen": 2367912, "step": 26290 }, { "epoch": 6.833419958419959, "grad_norm": 0.5139687061309814, "learning_rate": 4.1620461343729554e-05, "loss": 0.2026, "num_input_tokens_seen": 2368376, "step": 26295 }, { "epoch": 6.834719334719335, "grad_norm": 0.4086044132709503, "learning_rate": 4.161622569799086e-05, "loss": 0.2767, "num_input_tokens_seen": 2368840, "step": 26300 }, { "epoch": 6.836018711018711, "grad_norm": 0.3984735310077667, "learning_rate": 4.161198919766375e-05, "loss": 0.252, "num_input_tokens_seen": 2369288, "step": 26305 }, { "epoch": 6.837318087318088, "grad_norm": 0.8239732384681702, "learning_rate": 4.160775184296612e-05, "loss": 0.2552, "num_input_tokens_seen": 2369720, "step": 26310 }, { "epoch": 6.838617463617464, "grad_norm": 0.7130942940711975, "learning_rate": 4.16035136341159e-05, "loss": 0.3173, "num_input_tokens_seen": 2370184, "step": 26315 }, { "epoch": 6.83991683991684, "grad_norm": 0.41398829221725464, "learning_rate": 4.159927457133107e-05, "loss": 0.3065, "num_input_tokens_seen": 2370616, "step": 26320 }, { "epoch": 6.841216216216216, "grad_norm": 0.46925434470176697, "learning_rate": 4.159503465482964e-05, "loss": 0.2012, "num_input_tokens_seen": 2371048, "step": 26325 }, { "epoch": 6.842515592515593, "grad_norm": 0.28359872102737427, "learning_rate": 4.159079388482968e-05, "loss": 0.2671, "num_input_tokens_seen": 2371528, "step": 26330 }, { "epoch": 6.843814968814969, "grad_norm": 0.3216305673122406, "learning_rate": 4.1586552261549295e-05, "loss": 0.1379, "num_input_tokens_seen": 2371976, "step": 26335 }, { "epoch": 6.845114345114345, "grad_norm": 0.5356093049049377, "learning_rate": 4.158230978520663e-05, "loss": 0.1984, "num_input_tokens_seen": 2372440, "step": 26340 }, { "epoch": 6.846413721413722, "grad_norm": 0.2930409014225006, "learning_rate": 4.157806645601988e-05, "loss": 0.1467, "num_input_tokens_seen": 2372936, "step": 26345 }, { "epoch": 6.847713097713098, "grad_norm": 0.809766411781311, "learning_rate": 4.15738222742073e-05, "loss": 0.3833, "num_input_tokens_seen": 2373432, "step": 26350 }, { "epoch": 6.849012474012474, "grad_norm": 0.3537856638431549, "learning_rate": 4.156957723998715e-05, "loss": 0.1467, "num_input_tokens_seen": 2373896, "step": 26355 }, { "epoch": 6.850311850311851, "grad_norm": 0.3741576373577118, "learning_rate": 4.156533135357777e-05, "loss": 0.3347, "num_input_tokens_seen": 2374376, "step": 26360 }, { "epoch": 6.851611226611227, "grad_norm": 0.20858296751976013, "learning_rate": 4.156108461519753e-05, "loss": 0.2582, "num_input_tokens_seen": 2374824, "step": 26365 }, { "epoch": 6.852910602910603, "grad_norm": 0.22482629120349884, "learning_rate": 4.155683702506483e-05, "loss": 0.2604, "num_input_tokens_seen": 2375256, "step": 26370 }, { "epoch": 6.854209979209979, "grad_norm": 0.30996474623680115, "learning_rate": 4.1552588583398143e-05, "loss": 0.2912, "num_input_tokens_seen": 2375720, "step": 26375 }, { "epoch": 6.855509355509356, "grad_norm": 0.20313848555088043, "learning_rate": 4.154833929041597e-05, "loss": 0.2929, "num_input_tokens_seen": 2376152, "step": 26380 }, { "epoch": 6.856808731808732, "grad_norm": 0.5902697443962097, "learning_rate": 4.154408914633685e-05, "loss": 0.3008, "num_input_tokens_seen": 2376584, "step": 26385 }, { "epoch": 6.858108108108108, "grad_norm": 0.4572834372520447, "learning_rate": 4.153983815137937e-05, "loss": 0.2492, "num_input_tokens_seen": 2377032, "step": 26390 }, { "epoch": 6.859407484407484, "grad_norm": 0.13916264474391937, "learning_rate": 4.153558630576217e-05, "loss": 0.278, "num_input_tokens_seen": 2377464, "step": 26395 }, { "epoch": 6.860706860706861, "grad_norm": 0.42452436685562134, "learning_rate": 4.153133360970392e-05, "loss": 0.2324, "num_input_tokens_seen": 2377928, "step": 26400 }, { "epoch": 6.862006237006237, "grad_norm": 0.2831100821495056, "learning_rate": 4.152708006342334e-05, "loss": 0.2291, "num_input_tokens_seen": 2378392, "step": 26405 }, { "epoch": 6.863305613305613, "grad_norm": 0.20766809582710266, "learning_rate": 4.1522825667139206e-05, "loss": 0.1805, "num_input_tokens_seen": 2378856, "step": 26410 }, { "epoch": 6.86460498960499, "grad_norm": 0.2691105902194977, "learning_rate": 4.151857042107031e-05, "loss": 0.2056, "num_input_tokens_seen": 2379320, "step": 26415 }, { "epoch": 6.865904365904366, "grad_norm": 0.21351872384548187, "learning_rate": 4.151431432543551e-05, "loss": 0.156, "num_input_tokens_seen": 2379736, "step": 26420 }, { "epoch": 6.867203742203742, "grad_norm": 0.4473270773887634, "learning_rate": 4.15100573804537e-05, "loss": 0.2178, "num_input_tokens_seen": 2380200, "step": 26425 }, { "epoch": 6.868503118503119, "grad_norm": 0.21058127284049988, "learning_rate": 4.150579958634382e-05, "loss": 0.2151, "num_input_tokens_seen": 2380664, "step": 26430 }, { "epoch": 6.869802494802495, "grad_norm": 0.19353096187114716, "learning_rate": 4.150154094332486e-05, "loss": 0.1341, "num_input_tokens_seen": 2381096, "step": 26435 }, { "epoch": 6.871101871101871, "grad_norm": 0.1763746589422226, "learning_rate": 4.1497281451615824e-05, "loss": 0.2172, "num_input_tokens_seen": 2381544, "step": 26440 }, { "epoch": 6.872401247401247, "grad_norm": 0.1969282329082489, "learning_rate": 4.1493021111435805e-05, "loss": 0.2862, "num_input_tokens_seen": 2381960, "step": 26445 }, { "epoch": 6.873700623700624, "grad_norm": 0.4029649794101715, "learning_rate": 4.1488759923003904e-05, "loss": 0.2676, "num_input_tokens_seen": 2382408, "step": 26450 }, { "epoch": 6.875, "grad_norm": 0.7135408520698547, "learning_rate": 4.1484497886539286e-05, "loss": 0.3344, "num_input_tokens_seen": 2382856, "step": 26455 }, { "epoch": 6.876299376299376, "grad_norm": 0.40646448731422424, "learning_rate": 4.148023500226115e-05, "loss": 0.1632, "num_input_tokens_seen": 2383288, "step": 26460 }, { "epoch": 6.877598752598753, "grad_norm": 1.1066014766693115, "learning_rate": 4.147597127038873e-05, "loss": 0.2822, "num_input_tokens_seen": 2383720, "step": 26465 }, { "epoch": 6.878898128898129, "grad_norm": 0.5030465126037598, "learning_rate": 4.147170669114132e-05, "loss": 0.2756, "num_input_tokens_seen": 2384168, "step": 26470 }, { "epoch": 6.880197505197505, "grad_norm": 0.5802006721496582, "learning_rate": 4.146744126473826e-05, "loss": 0.2449, "num_input_tokens_seen": 2384584, "step": 26475 }, { "epoch": 6.881496881496881, "grad_norm": 0.5647400617599487, "learning_rate": 4.1463174991398914e-05, "loss": 0.2719, "num_input_tokens_seen": 2385016, "step": 26480 }, { "epoch": 6.882796257796258, "grad_norm": 0.35664135217666626, "learning_rate": 4.1458907871342694e-05, "loss": 0.2009, "num_input_tokens_seen": 2385448, "step": 26485 }, { "epoch": 6.884095634095634, "grad_norm": 0.38206595182418823, "learning_rate": 4.145463990478908e-05, "loss": 0.2278, "num_input_tokens_seen": 2385928, "step": 26490 }, { "epoch": 6.88539501039501, "grad_norm": 0.506562352180481, "learning_rate": 4.1450371091957576e-05, "loss": 0.183, "num_input_tokens_seen": 2386376, "step": 26495 }, { "epoch": 6.886694386694387, "grad_norm": 0.3070504665374756, "learning_rate": 4.144610143306772e-05, "loss": 0.2203, "num_input_tokens_seen": 2386792, "step": 26500 }, { "epoch": 6.887993762993763, "grad_norm": 0.5155667662620544, "learning_rate": 4.144183092833911e-05, "loss": 0.2502, "num_input_tokens_seen": 2387208, "step": 26505 }, { "epoch": 6.889293139293139, "grad_norm": 0.547186017036438, "learning_rate": 4.1437559577991376e-05, "loss": 0.2488, "num_input_tokens_seen": 2387672, "step": 26510 }, { "epoch": 6.890592515592516, "grad_norm": 0.49878716468811035, "learning_rate": 4.14332873822442e-05, "loss": 0.2208, "num_input_tokens_seen": 2388072, "step": 26515 }, { "epoch": 6.891891891891892, "grad_norm": 0.3420043885707855, "learning_rate": 4.142901434131732e-05, "loss": 0.0835, "num_input_tokens_seen": 2388584, "step": 26520 }, { "epoch": 6.893191268191268, "grad_norm": 0.2641647458076477, "learning_rate": 4.1424740455430486e-05, "loss": 0.1382, "num_input_tokens_seen": 2389016, "step": 26525 }, { "epoch": 6.894490644490644, "grad_norm": 0.248973086476326, "learning_rate": 4.1420465724803516e-05, "loss": 0.2379, "num_input_tokens_seen": 2389464, "step": 26530 }, { "epoch": 6.895790020790021, "grad_norm": 0.34430623054504395, "learning_rate": 4.141619014965626e-05, "loss": 0.2369, "num_input_tokens_seen": 2389864, "step": 26535 }, { "epoch": 6.897089397089397, "grad_norm": 1.019827961921692, "learning_rate": 4.141191373020861e-05, "loss": 0.4794, "num_input_tokens_seen": 2390312, "step": 26540 }, { "epoch": 6.898388773388773, "grad_norm": 0.4307378828525543, "learning_rate": 4.140763646668052e-05, "loss": 0.3769, "num_input_tokens_seen": 2390776, "step": 26545 }, { "epoch": 6.899688149688149, "grad_norm": 0.25274378061294556, "learning_rate": 4.140335835929196e-05, "loss": 0.223, "num_input_tokens_seen": 2391224, "step": 26550 }, { "epoch": 6.900987525987526, "grad_norm": 0.38502559065818787, "learning_rate": 4.139907940826296e-05, "loss": 0.2532, "num_input_tokens_seen": 2391640, "step": 26555 }, { "epoch": 6.902286902286902, "grad_norm": 0.5167540907859802, "learning_rate": 4.1394799613813595e-05, "loss": 0.2656, "num_input_tokens_seen": 2392104, "step": 26560 }, { "epoch": 6.903586278586278, "grad_norm": 0.4359169900417328, "learning_rate": 4.139051897616397e-05, "loss": 0.2168, "num_input_tokens_seen": 2392552, "step": 26565 }, { "epoch": 6.904885654885655, "grad_norm": 0.36090123653411865, "learning_rate": 4.1386237495534266e-05, "loss": 0.3134, "num_input_tokens_seen": 2393000, "step": 26570 }, { "epoch": 6.906185031185031, "grad_norm": 0.5694021582603455, "learning_rate": 4.138195517214466e-05, "loss": 0.2288, "num_input_tokens_seen": 2393448, "step": 26575 }, { "epoch": 6.907484407484407, "grad_norm": 0.6736671924591064, "learning_rate": 4.13776720062154e-05, "loss": 0.2502, "num_input_tokens_seen": 2393896, "step": 26580 }, { "epoch": 6.908783783783784, "grad_norm": 0.5611766576766968, "learning_rate": 4.1373387997966775e-05, "loss": 0.2914, "num_input_tokens_seen": 2394344, "step": 26585 }, { "epoch": 6.91008316008316, "grad_norm": 0.8903584480285645, "learning_rate": 4.1369103147619125e-05, "loss": 0.3724, "num_input_tokens_seen": 2394808, "step": 26590 }, { "epoch": 6.911382536382536, "grad_norm": 0.3234331011772156, "learning_rate": 4.136481745539281e-05, "loss": 0.2406, "num_input_tokens_seen": 2395256, "step": 26595 }, { "epoch": 6.912681912681912, "grad_norm": 0.616379976272583, "learning_rate": 4.136053092150826e-05, "loss": 0.2993, "num_input_tokens_seen": 2395736, "step": 26600 }, { "epoch": 6.913981288981289, "grad_norm": 0.6469025611877441, "learning_rate": 4.135624354618592e-05, "loss": 0.2358, "num_input_tokens_seen": 2396232, "step": 26605 }, { "epoch": 6.915280665280665, "grad_norm": 0.4385020434856415, "learning_rate": 4.13519553296463e-05, "loss": 0.2713, "num_input_tokens_seen": 2396664, "step": 26610 }, { "epoch": 6.916580041580041, "grad_norm": 0.27600282430648804, "learning_rate": 4.1347666272109955e-05, "loss": 0.2934, "num_input_tokens_seen": 2397096, "step": 26615 }, { "epoch": 6.917879417879417, "grad_norm": 0.46153271198272705, "learning_rate": 4.134337637379747e-05, "loss": 0.1747, "num_input_tokens_seen": 2397528, "step": 26620 }, { "epoch": 6.919178794178794, "grad_norm": 0.28544381260871887, "learning_rate": 4.1339085634929485e-05, "loss": 0.1537, "num_input_tokens_seen": 2397976, "step": 26625 }, { "epoch": 6.92047817047817, "grad_norm": 0.8997297286987305, "learning_rate": 4.133479405572667e-05, "loss": 0.2004, "num_input_tokens_seen": 2398440, "step": 26630 }, { "epoch": 6.921777546777546, "grad_norm": 0.6501917243003845, "learning_rate": 4.133050163640974e-05, "loss": 0.2222, "num_input_tokens_seen": 2398856, "step": 26635 }, { "epoch": 6.923076923076923, "grad_norm": 0.7622374296188354, "learning_rate": 4.132620837719946e-05, "loss": 0.2512, "num_input_tokens_seen": 2399336, "step": 26640 }, { "epoch": 6.924376299376299, "grad_norm": 0.4466249942779541, "learning_rate": 4.132191427831664e-05, "loss": 0.2437, "num_input_tokens_seen": 2399768, "step": 26645 }, { "epoch": 6.925675675675675, "grad_norm": 0.47570160031318665, "learning_rate": 4.1317619339982136e-05, "loss": 0.2647, "num_input_tokens_seen": 2400232, "step": 26650 }, { "epoch": 6.926975051975052, "grad_norm": 0.3293202221393585, "learning_rate": 4.131332356241683e-05, "loss": 0.1384, "num_input_tokens_seen": 2400664, "step": 26655 }, { "epoch": 6.928274428274428, "grad_norm": 0.47231239080429077, "learning_rate": 4.130902694584167e-05, "loss": 0.3107, "num_input_tokens_seen": 2401144, "step": 26660 }, { "epoch": 6.9295738045738045, "grad_norm": 0.6767469048500061, "learning_rate": 4.1304729490477624e-05, "loss": 0.2986, "num_input_tokens_seen": 2401608, "step": 26665 }, { "epoch": 6.9308731808731805, "grad_norm": 0.39865633845329285, "learning_rate": 4.1300431196545715e-05, "loss": 0.3523, "num_input_tokens_seen": 2402072, "step": 26670 }, { "epoch": 6.932172557172557, "grad_norm": 1.068910002708435, "learning_rate": 4.129613206426701e-05, "loss": 0.267, "num_input_tokens_seen": 2402520, "step": 26675 }, { "epoch": 6.9334719334719335, "grad_norm": 0.3746998608112335, "learning_rate": 4.1291832093862625e-05, "loss": 0.2946, "num_input_tokens_seen": 2402952, "step": 26680 }, { "epoch": 6.9347713097713095, "grad_norm": 0.9008748531341553, "learning_rate": 4.12875312855537e-05, "loss": 0.2993, "num_input_tokens_seen": 2403416, "step": 26685 }, { "epoch": 6.936070686070686, "grad_norm": 0.6270624995231628, "learning_rate": 4.128322963956143e-05, "loss": 0.2735, "num_input_tokens_seen": 2403848, "step": 26690 }, { "epoch": 6.9373700623700625, "grad_norm": 0.4047134518623352, "learning_rate": 4.127892715610707e-05, "loss": 0.181, "num_input_tokens_seen": 2404280, "step": 26695 }, { "epoch": 6.9386694386694385, "grad_norm": 0.3201245963573456, "learning_rate": 4.127462383541188e-05, "loss": 0.2067, "num_input_tokens_seen": 2404760, "step": 26700 }, { "epoch": 6.939968814968815, "grad_norm": 0.24921812117099762, "learning_rate": 4.12703196776972e-05, "loss": 0.2216, "num_input_tokens_seen": 2405240, "step": 26705 }, { "epoch": 6.9412681912681915, "grad_norm": 1.0962144136428833, "learning_rate": 4.1266014683184384e-05, "loss": 0.3691, "num_input_tokens_seen": 2405688, "step": 26710 }, { "epoch": 6.9425675675675675, "grad_norm": 0.46220862865448, "learning_rate": 4.126170885209485e-05, "loss": 0.3145, "num_input_tokens_seen": 2406152, "step": 26715 }, { "epoch": 6.943866943866944, "grad_norm": 0.43402671813964844, "learning_rate": 4.125740218465005e-05, "loss": 0.3026, "num_input_tokens_seen": 2406600, "step": 26720 }, { "epoch": 6.9451663201663205, "grad_norm": 0.635645866394043, "learning_rate": 4.125309468107147e-05, "loss": 0.2274, "num_input_tokens_seen": 2407032, "step": 26725 }, { "epoch": 6.946465696465697, "grad_norm": 0.6279147863388062, "learning_rate": 4.124878634158067e-05, "loss": 0.2712, "num_input_tokens_seen": 2407448, "step": 26730 }, { "epoch": 6.947765072765073, "grad_norm": 0.6639412045478821, "learning_rate": 4.124447716639921e-05, "loss": 0.2788, "num_input_tokens_seen": 2407912, "step": 26735 }, { "epoch": 6.9490644490644495, "grad_norm": 0.5823150277137756, "learning_rate": 4.124016715574873e-05, "loss": 0.2778, "num_input_tokens_seen": 2408376, "step": 26740 }, { "epoch": 6.950363825363826, "grad_norm": 1.2327083349227905, "learning_rate": 4.123585630985088e-05, "loss": 0.2854, "num_input_tokens_seen": 2408840, "step": 26745 }, { "epoch": 6.951663201663202, "grad_norm": 0.5253025889396667, "learning_rate": 4.12315446289274e-05, "loss": 0.3243, "num_input_tokens_seen": 2409304, "step": 26750 }, { "epoch": 6.952962577962578, "grad_norm": 0.5250392556190491, "learning_rate": 4.122723211320002e-05, "loss": 0.2157, "num_input_tokens_seen": 2409736, "step": 26755 }, { "epoch": 6.954261954261955, "grad_norm": 0.8263199925422668, "learning_rate": 4.122291876289055e-05, "loss": 0.3048, "num_input_tokens_seen": 2410200, "step": 26760 }, { "epoch": 6.955561330561331, "grad_norm": 0.43968838453292847, "learning_rate": 4.121860457822082e-05, "loss": 0.2826, "num_input_tokens_seen": 2410696, "step": 26765 }, { "epoch": 6.956860706860707, "grad_norm": 0.5704309940338135, "learning_rate": 4.121428955941271e-05, "loss": 0.3082, "num_input_tokens_seen": 2411128, "step": 26770 }, { "epoch": 6.958160083160083, "grad_norm": 0.41575887799263, "learning_rate": 4.120997370668815e-05, "loss": 0.3294, "num_input_tokens_seen": 2411608, "step": 26775 }, { "epoch": 6.95945945945946, "grad_norm": 0.5638936161994934, "learning_rate": 4.1205657020269126e-05, "loss": 0.2663, "num_input_tokens_seen": 2412040, "step": 26780 }, { "epoch": 6.960758835758836, "grad_norm": 0.23087993264198303, "learning_rate": 4.120133950037763e-05, "loss": 0.2766, "num_input_tokens_seen": 2412440, "step": 26785 }, { "epoch": 6.962058212058212, "grad_norm": 0.7833049297332764, "learning_rate": 4.1197021147235715e-05, "loss": 0.271, "num_input_tokens_seen": 2412888, "step": 26790 }, { "epoch": 6.963357588357589, "grad_norm": 0.5173795223236084, "learning_rate": 4.119270196106549e-05, "loss": 0.2539, "num_input_tokens_seen": 2413368, "step": 26795 }, { "epoch": 6.964656964656965, "grad_norm": 0.8040151596069336, "learning_rate": 4.118838194208908e-05, "loss": 0.2562, "num_input_tokens_seen": 2413800, "step": 26800 }, { "epoch": 6.965956340956341, "grad_norm": 0.397962361574173, "learning_rate": 4.1184061090528683e-05, "loss": 0.2155, "num_input_tokens_seen": 2414264, "step": 26805 }, { "epoch": 6.967255717255718, "grad_norm": 0.8737629055976868, "learning_rate": 4.117973940660651e-05, "loss": 0.401, "num_input_tokens_seen": 2414712, "step": 26810 }, { "epoch": 6.968555093555094, "grad_norm": 0.695267379283905, "learning_rate": 4.1175416890544836e-05, "loss": 0.2381, "num_input_tokens_seen": 2415144, "step": 26815 }, { "epoch": 6.96985446985447, "grad_norm": 0.39533039927482605, "learning_rate": 4.1171093542565976e-05, "loss": 0.2953, "num_input_tokens_seen": 2415592, "step": 26820 }, { "epoch": 6.971153846153846, "grad_norm": 0.711142897605896, "learning_rate": 4.116676936289228e-05, "loss": 0.2841, "num_input_tokens_seen": 2416056, "step": 26825 }, { "epoch": 6.972453222453223, "grad_norm": 0.7762202620506287, "learning_rate": 4.116244435174615e-05, "loss": 0.2847, "num_input_tokens_seen": 2416504, "step": 26830 }, { "epoch": 6.973752598752599, "grad_norm": 0.30159974098205566, "learning_rate": 4.115811850935002e-05, "loss": 0.2354, "num_input_tokens_seen": 2416920, "step": 26835 }, { "epoch": 6.975051975051975, "grad_norm": 0.4339241683483124, "learning_rate": 4.115379183592637e-05, "loss": 0.2088, "num_input_tokens_seen": 2417352, "step": 26840 }, { "epoch": 6.976351351351351, "grad_norm": 0.44627025723457336, "learning_rate": 4.114946433169773e-05, "loss": 0.247, "num_input_tokens_seen": 2417816, "step": 26845 }, { "epoch": 6.977650727650728, "grad_norm": 0.3747621178627014, "learning_rate": 4.1145135996886665e-05, "loss": 0.2876, "num_input_tokens_seen": 2418264, "step": 26850 }, { "epoch": 6.978950103950104, "grad_norm": 0.3414180874824524, "learning_rate": 4.114080683171579e-05, "loss": 0.2531, "num_input_tokens_seen": 2418696, "step": 26855 }, { "epoch": 6.98024948024948, "grad_norm": 0.2876482903957367, "learning_rate": 4.113647683640774e-05, "loss": 0.2145, "num_input_tokens_seen": 2419128, "step": 26860 }, { "epoch": 6.981548856548857, "grad_norm": 0.29738378524780273, "learning_rate": 4.113214601118524e-05, "loss": 0.3036, "num_input_tokens_seen": 2419592, "step": 26865 }, { "epoch": 6.982848232848233, "grad_norm": 0.32379740476608276, "learning_rate": 4.112781435627101e-05, "loss": 0.2215, "num_input_tokens_seen": 2420024, "step": 26870 }, { "epoch": 6.984147609147609, "grad_norm": 0.4804321229457855, "learning_rate": 4.112348187188783e-05, "loss": 0.2485, "num_input_tokens_seen": 2420472, "step": 26875 }, { "epoch": 6.985446985446986, "grad_norm": 0.3486195206642151, "learning_rate": 4.111914855825853e-05, "loss": 0.1753, "num_input_tokens_seen": 2420920, "step": 26880 }, { "epoch": 6.986746361746362, "grad_norm": 0.5080465078353882, "learning_rate": 4.111481441560598e-05, "loss": 0.2261, "num_input_tokens_seen": 2421320, "step": 26885 }, { "epoch": 6.988045738045738, "grad_norm": 0.4508894085884094, "learning_rate": 4.1110479444153084e-05, "loss": 0.3989, "num_input_tokens_seen": 2421752, "step": 26890 }, { "epoch": 6.989345114345114, "grad_norm": 0.6458614468574524, "learning_rate": 4.110614364412278e-05, "loss": 0.2066, "num_input_tokens_seen": 2422216, "step": 26895 }, { "epoch": 6.990644490644491, "grad_norm": 0.32647621631622314, "learning_rate": 4.1101807015738095e-05, "loss": 0.2608, "num_input_tokens_seen": 2422680, "step": 26900 }, { "epoch": 6.991943866943867, "grad_norm": 0.6169292330741882, "learning_rate": 4.1097469559222034e-05, "loss": 0.265, "num_input_tokens_seen": 2423144, "step": 26905 }, { "epoch": 6.993243243243243, "grad_norm": 0.8843952417373657, "learning_rate": 4.1093131274797694e-05, "loss": 0.2791, "num_input_tokens_seen": 2423576, "step": 26910 }, { "epoch": 6.99454261954262, "grad_norm": 0.49162670969963074, "learning_rate": 4.108879216268819e-05, "loss": 0.2647, "num_input_tokens_seen": 2424024, "step": 26915 }, { "epoch": 6.995841995841996, "grad_norm": 1.2764607667922974, "learning_rate": 4.10844522231167e-05, "loss": 0.3123, "num_input_tokens_seen": 2424488, "step": 26920 }, { "epoch": 6.997141372141372, "grad_norm": 0.6499413847923279, "learning_rate": 4.108011145630642e-05, "loss": 0.2563, "num_input_tokens_seen": 2424936, "step": 26925 }, { "epoch": 6.998440748440748, "grad_norm": 0.41949182748794556, "learning_rate": 4.10757698624806e-05, "loss": 0.2252, "num_input_tokens_seen": 2425384, "step": 26930 }, { "epoch": 6.999740124740125, "grad_norm": 0.5113545060157776, "learning_rate": 4.107142744186252e-05, "loss": 0.2849, "num_input_tokens_seen": 2425880, "step": 26935 }, { "epoch": 7.0, "eval_loss": 0.2418590486049652, "eval_runtime": 13.1642, "eval_samples_per_second": 65.025, "eval_steps_per_second": 32.513, "num_input_tokens_seen": 2425920, "step": 26936 }, { "epoch": 7.001039501039501, "grad_norm": 0.48936381936073303, "learning_rate": 4.106708419467553e-05, "loss": 0.2311, "num_input_tokens_seen": 2426272, "step": 26940 }, { "epoch": 7.002338877338877, "grad_norm": 0.49697861075401306, "learning_rate": 4.1062740121143016e-05, "loss": 0.2141, "num_input_tokens_seen": 2426736, "step": 26945 }, { "epoch": 7.003638253638254, "grad_norm": 0.3132886290550232, "learning_rate": 4.1058395221488375e-05, "loss": 0.212, "num_input_tokens_seen": 2427168, "step": 26950 }, { "epoch": 7.00493762993763, "grad_norm": 0.3188696503639221, "learning_rate": 4.105404949593509e-05, "loss": 0.2139, "num_input_tokens_seen": 2427616, "step": 26955 }, { "epoch": 7.006237006237006, "grad_norm": 0.2523837387561798, "learning_rate": 4.104970294470666e-05, "loss": 0.1528, "num_input_tokens_seen": 2428064, "step": 26960 }, { "epoch": 7.007536382536382, "grad_norm": 0.5013003349304199, "learning_rate": 4.1045355568026625e-05, "loss": 0.2468, "num_input_tokens_seen": 2428480, "step": 26965 }, { "epoch": 7.008835758835759, "grad_norm": 0.23591968417167664, "learning_rate": 4.1041007366118575e-05, "loss": 0.1501, "num_input_tokens_seen": 2428912, "step": 26970 }, { "epoch": 7.010135135135135, "grad_norm": 1.0029211044311523, "learning_rate": 4.1036658339206146e-05, "loss": 0.3277, "num_input_tokens_seen": 2429360, "step": 26975 }, { "epoch": 7.011434511434511, "grad_norm": 0.8552395701408386, "learning_rate": 4.1032308487513023e-05, "loss": 0.3944, "num_input_tokens_seen": 2429840, "step": 26980 }, { "epoch": 7.012733887733888, "grad_norm": 0.30299559235572815, "learning_rate": 4.1027957811262905e-05, "loss": 0.1699, "num_input_tokens_seen": 2430240, "step": 26985 }, { "epoch": 7.014033264033264, "grad_norm": 0.7022998929023743, "learning_rate": 4.1023606310679563e-05, "loss": 0.3029, "num_input_tokens_seen": 2430704, "step": 26990 }, { "epoch": 7.01533264033264, "grad_norm": 0.30842125415802, "learning_rate": 4.101925398598679e-05, "loss": 0.1623, "num_input_tokens_seen": 2431120, "step": 26995 }, { "epoch": 7.016632016632016, "grad_norm": 0.4816957712173462, "learning_rate": 4.101490083740844e-05, "loss": 0.1789, "num_input_tokens_seen": 2431616, "step": 27000 }, { "epoch": 7.017931392931393, "grad_norm": 0.7503506541252136, "learning_rate": 4.10105468651684e-05, "loss": 0.3112, "num_input_tokens_seen": 2432032, "step": 27005 }, { "epoch": 7.019230769230769, "grad_norm": 0.7045421600341797, "learning_rate": 4.100619206949059e-05, "loss": 0.3535, "num_input_tokens_seen": 2432512, "step": 27010 }, { "epoch": 7.020530145530145, "grad_norm": 0.38900113105773926, "learning_rate": 4.1001836450598986e-05, "loss": 0.298, "num_input_tokens_seen": 2432944, "step": 27015 }, { "epoch": 7.021829521829522, "grad_norm": 0.2930707335472107, "learning_rate": 4.0997480008717596e-05, "loss": 0.2914, "num_input_tokens_seen": 2433360, "step": 27020 }, { "epoch": 7.023128898128898, "grad_norm": 0.38991159200668335, "learning_rate": 4.099312274407048e-05, "loss": 0.1871, "num_input_tokens_seen": 2433792, "step": 27025 }, { "epoch": 7.024428274428274, "grad_norm": 0.5524689555168152, "learning_rate": 4.098876465688175e-05, "loss": 0.2856, "num_input_tokens_seen": 2434288, "step": 27030 }, { "epoch": 7.025727650727651, "grad_norm": 0.48350319266319275, "learning_rate": 4.0984405747375524e-05, "loss": 0.1683, "num_input_tokens_seen": 2434752, "step": 27035 }, { "epoch": 7.027027027027027, "grad_norm": 0.2885388433933258, "learning_rate": 4.0980046015776e-05, "loss": 0.1524, "num_input_tokens_seen": 2435168, "step": 27040 }, { "epoch": 7.028326403326403, "grad_norm": 0.5075386166572571, "learning_rate": 4.097568546230739e-05, "loss": 0.2114, "num_input_tokens_seen": 2435632, "step": 27045 }, { "epoch": 7.029625779625779, "grad_norm": 0.2474309206008911, "learning_rate": 4.0971324087193976e-05, "loss": 0.3144, "num_input_tokens_seen": 2436096, "step": 27050 }, { "epoch": 7.030925155925156, "grad_norm": 0.2473406195640564, "learning_rate": 4.096696189066006e-05, "loss": 0.2261, "num_input_tokens_seen": 2436544, "step": 27055 }, { "epoch": 7.032224532224532, "grad_norm": 0.2337954193353653, "learning_rate": 4.096259887293e-05, "loss": 0.293, "num_input_tokens_seen": 2437008, "step": 27060 }, { "epoch": 7.033523908523908, "grad_norm": 0.6716122627258301, "learning_rate": 4.0958235034228174e-05, "loss": 0.3425, "num_input_tokens_seen": 2437488, "step": 27065 }, { "epoch": 7.034823284823285, "grad_norm": 0.3956882059574127, "learning_rate": 4.095387037477904e-05, "loss": 0.2607, "num_input_tokens_seen": 2437936, "step": 27070 }, { "epoch": 7.036122661122661, "grad_norm": 0.6929346919059753, "learning_rate": 4.094950489480706e-05, "loss": 0.2305, "num_input_tokens_seen": 2438384, "step": 27075 }, { "epoch": 7.037422037422037, "grad_norm": 0.8948375582695007, "learning_rate": 4.094513859453676e-05, "loss": 0.2745, "num_input_tokens_seen": 2438848, "step": 27080 }, { "epoch": 7.038721413721413, "grad_norm": 1.1153905391693115, "learning_rate": 4.094077147419271e-05, "loss": 0.2907, "num_input_tokens_seen": 2439280, "step": 27085 }, { "epoch": 7.04002079002079, "grad_norm": 0.6149652600288391, "learning_rate": 4.093640353399951e-05, "loss": 0.2038, "num_input_tokens_seen": 2439728, "step": 27090 }, { "epoch": 7.041320166320166, "grad_norm": 0.6037598848342896, "learning_rate": 4.0932034774181805e-05, "loss": 0.2193, "num_input_tokens_seen": 2440224, "step": 27095 }, { "epoch": 7.042619542619542, "grad_norm": 0.5960108637809753, "learning_rate": 4.092766519496428e-05, "loss": 0.2108, "num_input_tokens_seen": 2440688, "step": 27100 }, { "epoch": 7.043918918918919, "grad_norm": 3.6594901084899902, "learning_rate": 4.0923294796571676e-05, "loss": 0.3049, "num_input_tokens_seen": 2441120, "step": 27105 }, { "epoch": 7.045218295218295, "grad_norm": 1.0351625680923462, "learning_rate": 4.091892357922877e-05, "loss": 0.2545, "num_input_tokens_seen": 2441584, "step": 27110 }, { "epoch": 7.046517671517671, "grad_norm": 0.3992156386375427, "learning_rate": 4.0914551543160374e-05, "loss": 0.1482, "num_input_tokens_seen": 2442032, "step": 27115 }, { "epoch": 7.047817047817047, "grad_norm": 0.3443335294723511, "learning_rate": 4.0910178688591335e-05, "loss": 0.2708, "num_input_tokens_seen": 2442512, "step": 27120 }, { "epoch": 7.049116424116424, "grad_norm": 0.4529040455818176, "learning_rate": 4.0905805015746564e-05, "loss": 0.2047, "num_input_tokens_seen": 2442976, "step": 27125 }, { "epoch": 7.0504158004158, "grad_norm": 0.4121137857437134, "learning_rate": 4.090143052485099e-05, "loss": 0.296, "num_input_tokens_seen": 2443424, "step": 27130 }, { "epoch": 7.0517151767151764, "grad_norm": 0.4528880715370178, "learning_rate": 4.089705521612963e-05, "loss": 0.3314, "num_input_tokens_seen": 2443872, "step": 27135 }, { "epoch": 7.053014553014553, "grad_norm": 0.43225353956222534, "learning_rate": 4.089267908980747e-05, "loss": 0.2399, "num_input_tokens_seen": 2444320, "step": 27140 }, { "epoch": 7.054313929313929, "grad_norm": 0.4205876588821411, "learning_rate": 4.0888302146109604e-05, "loss": 0.2548, "num_input_tokens_seen": 2444784, "step": 27145 }, { "epoch": 7.0556133056133055, "grad_norm": 0.38469648361206055, "learning_rate": 4.0883924385261134e-05, "loss": 0.2597, "num_input_tokens_seen": 2445232, "step": 27150 }, { "epoch": 7.0569126819126815, "grad_norm": 0.3713020086288452, "learning_rate": 4.087954580748722e-05, "loss": 0.2735, "num_input_tokens_seen": 2445680, "step": 27155 }, { "epoch": 7.058212058212058, "grad_norm": 0.8800563216209412, "learning_rate": 4.087516641301304e-05, "loss": 0.3058, "num_input_tokens_seen": 2446112, "step": 27160 }, { "epoch": 7.0595114345114345, "grad_norm": 1.0789716243743896, "learning_rate": 4.0870786202063847e-05, "loss": 0.2981, "num_input_tokens_seen": 2446576, "step": 27165 }, { "epoch": 7.0608108108108105, "grad_norm": 0.7686265110969543, "learning_rate": 4.086640517486491e-05, "loss": 0.256, "num_input_tokens_seen": 2447024, "step": 27170 }, { "epoch": 7.0621101871101875, "grad_norm": 0.6717885136604309, "learning_rate": 4.086202333164155e-05, "loss": 0.2671, "num_input_tokens_seen": 2447472, "step": 27175 }, { "epoch": 7.0634095634095635, "grad_norm": 0.6531169414520264, "learning_rate": 4.085764067261914e-05, "loss": 0.2715, "num_input_tokens_seen": 2447968, "step": 27180 }, { "epoch": 7.0647089397089395, "grad_norm": 0.37927359342575073, "learning_rate": 4.085325719802307e-05, "loss": 0.2286, "num_input_tokens_seen": 2448384, "step": 27185 }, { "epoch": 7.066008316008316, "grad_norm": 0.41426634788513184, "learning_rate": 4.084887290807879e-05, "loss": 0.1685, "num_input_tokens_seen": 2448816, "step": 27190 }, { "epoch": 7.0673076923076925, "grad_norm": 1.2175551652908325, "learning_rate": 4.08444878030118e-05, "loss": 0.3704, "num_input_tokens_seen": 2449296, "step": 27195 }, { "epoch": 7.0686070686070686, "grad_norm": 0.38356441259384155, "learning_rate": 4.084010188304761e-05, "loss": 0.1664, "num_input_tokens_seen": 2449728, "step": 27200 }, { "epoch": 7.069906444906445, "grad_norm": 0.4374336898326874, "learning_rate": 4.0835715148411814e-05, "loss": 0.2041, "num_input_tokens_seen": 2450192, "step": 27205 }, { "epoch": 7.0712058212058215, "grad_norm": 1.0304501056671143, "learning_rate": 4.0831327599330004e-05, "loss": 0.2187, "num_input_tokens_seen": 2450624, "step": 27210 }, { "epoch": 7.072505197505198, "grad_norm": 0.4027014374732971, "learning_rate": 4.082693923602785e-05, "loss": 0.2076, "num_input_tokens_seen": 2451056, "step": 27215 }, { "epoch": 7.073804573804574, "grad_norm": 0.9778949618339539, "learning_rate": 4.082255005873104e-05, "loss": 0.3158, "num_input_tokens_seen": 2451520, "step": 27220 }, { "epoch": 7.07510395010395, "grad_norm": 0.6905168294906616, "learning_rate": 4.081816006766534e-05, "loss": 0.2807, "num_input_tokens_seen": 2451984, "step": 27225 }, { "epoch": 7.076403326403327, "grad_norm": 0.8470519185066223, "learning_rate": 4.08137692630565e-05, "loss": 0.2158, "num_input_tokens_seen": 2452400, "step": 27230 }, { "epoch": 7.077702702702703, "grad_norm": 0.5550942420959473, "learning_rate": 4.080937764513035e-05, "loss": 0.1362, "num_input_tokens_seen": 2452832, "step": 27235 }, { "epoch": 7.079002079002079, "grad_norm": 0.4233839511871338, "learning_rate": 4.080498521411277e-05, "loss": 0.1623, "num_input_tokens_seen": 2453264, "step": 27240 }, { "epoch": 7.080301455301456, "grad_norm": 0.2741113305091858, "learning_rate": 4.080059197022965e-05, "loss": 0.1291, "num_input_tokens_seen": 2453728, "step": 27245 }, { "epoch": 7.081600831600832, "grad_norm": 0.23880544304847717, "learning_rate": 4.079619791370695e-05, "loss": 0.1359, "num_input_tokens_seen": 2454208, "step": 27250 }, { "epoch": 7.082900207900208, "grad_norm": 2.44569730758667, "learning_rate": 4.079180304477065e-05, "loss": 0.7199, "num_input_tokens_seen": 2454640, "step": 27255 }, { "epoch": 7.084199584199585, "grad_norm": 0.33800214529037476, "learning_rate": 4.078740736364679e-05, "loss": 0.2677, "num_input_tokens_seen": 2455104, "step": 27260 }, { "epoch": 7.085498960498961, "grad_norm": 0.9180848598480225, "learning_rate": 4.078301087056144e-05, "loss": 0.3645, "num_input_tokens_seen": 2455600, "step": 27265 }, { "epoch": 7.086798336798337, "grad_norm": 0.44000110030174255, "learning_rate": 4.077861356574073e-05, "loss": 0.2309, "num_input_tokens_seen": 2456016, "step": 27270 }, { "epoch": 7.088097713097713, "grad_norm": 0.5082180500030518, "learning_rate": 4.0774215449410794e-05, "loss": 0.2768, "num_input_tokens_seen": 2456496, "step": 27275 }, { "epoch": 7.08939708939709, "grad_norm": 0.6740713119506836, "learning_rate": 4.076981652179785e-05, "loss": 0.2608, "num_input_tokens_seen": 2456928, "step": 27280 }, { "epoch": 7.090696465696466, "grad_norm": 0.6423798203468323, "learning_rate": 4.076541678312813e-05, "loss": 0.2099, "num_input_tokens_seen": 2457360, "step": 27285 }, { "epoch": 7.091995841995842, "grad_norm": 0.913262128829956, "learning_rate": 4.076101623362791e-05, "loss": 0.2585, "num_input_tokens_seen": 2457792, "step": 27290 }, { "epoch": 7.093295218295219, "grad_norm": 0.8683629631996155, "learning_rate": 4.075661487352354e-05, "loss": 0.2422, "num_input_tokens_seen": 2458208, "step": 27295 }, { "epoch": 7.094594594594595, "grad_norm": 0.8312925100326538, "learning_rate": 4.0752212703041356e-05, "loss": 0.3779, "num_input_tokens_seen": 2458640, "step": 27300 }, { "epoch": 7.095893970893971, "grad_norm": 0.5081321001052856, "learning_rate": 4.074780972240779e-05, "loss": 0.2996, "num_input_tokens_seen": 2459072, "step": 27305 }, { "epoch": 7.097193347193347, "grad_norm": 1.331343173980713, "learning_rate": 4.074340593184928e-05, "loss": 0.2678, "num_input_tokens_seen": 2459536, "step": 27310 }, { "epoch": 7.098492723492724, "grad_norm": 0.863731324672699, "learning_rate": 4.073900133159231e-05, "loss": 0.2568, "num_input_tokens_seen": 2459984, "step": 27315 }, { "epoch": 7.0997920997921, "grad_norm": 0.6866310834884644, "learning_rate": 4.073459592186343e-05, "loss": 0.2607, "num_input_tokens_seen": 2460416, "step": 27320 }, { "epoch": 7.101091476091476, "grad_norm": 0.8139947652816772, "learning_rate": 4.0730189702889205e-05, "loss": 0.2622, "num_input_tokens_seen": 2460864, "step": 27325 }, { "epoch": 7.102390852390853, "grad_norm": 0.6551228761672974, "learning_rate": 4.072578267489625e-05, "loss": 0.2408, "num_input_tokens_seen": 2461312, "step": 27330 }, { "epoch": 7.103690228690229, "grad_norm": 0.6715789437294006, "learning_rate": 4.072137483811122e-05, "loss": 0.2375, "num_input_tokens_seen": 2461776, "step": 27335 }, { "epoch": 7.104989604989605, "grad_norm": 0.42727959156036377, "learning_rate": 4.0716966192760816e-05, "loss": 0.1964, "num_input_tokens_seen": 2462256, "step": 27340 }, { "epoch": 7.106288981288981, "grad_norm": 0.5222122669219971, "learning_rate": 4.0712556739071795e-05, "loss": 0.2767, "num_input_tokens_seen": 2462752, "step": 27345 }, { "epoch": 7.107588357588358, "grad_norm": 0.9531799554824829, "learning_rate": 4.0708146477270916e-05, "loss": 0.2481, "num_input_tokens_seen": 2463216, "step": 27350 }, { "epoch": 7.108887733887734, "grad_norm": 1.2598153352737427, "learning_rate": 4.0703735407585014e-05, "loss": 0.2261, "num_input_tokens_seen": 2463664, "step": 27355 }, { "epoch": 7.11018711018711, "grad_norm": 0.31367257237434387, "learning_rate": 4.069932353024096e-05, "loss": 0.269, "num_input_tokens_seen": 2464144, "step": 27360 }, { "epoch": 7.111486486486487, "grad_norm": 0.46397385001182556, "learning_rate": 4.069491084546564e-05, "loss": 0.3297, "num_input_tokens_seen": 2464608, "step": 27365 }, { "epoch": 7.112785862785863, "grad_norm": 0.4016430974006653, "learning_rate": 4.069049735348603e-05, "loss": 0.3031, "num_input_tokens_seen": 2465040, "step": 27370 }, { "epoch": 7.114085239085239, "grad_norm": 0.5388334393501282, "learning_rate": 4.0686083054529105e-05, "loss": 0.2238, "num_input_tokens_seen": 2465456, "step": 27375 }, { "epoch": 7.115384615384615, "grad_norm": 0.8068399429321289, "learning_rate": 4.06816679488219e-05, "loss": 0.3219, "num_input_tokens_seen": 2465904, "step": 27380 }, { "epoch": 7.116683991683992, "grad_norm": 0.5455402731895447, "learning_rate": 4.0677252036591484e-05, "loss": 0.224, "num_input_tokens_seen": 2466352, "step": 27385 }, { "epoch": 7.117983367983368, "grad_norm": 0.6650122404098511, "learning_rate": 4.0672835318064975e-05, "loss": 0.2148, "num_input_tokens_seen": 2466784, "step": 27390 }, { "epoch": 7.119282744282744, "grad_norm": 0.32149651646614075, "learning_rate": 4.066841779346953e-05, "loss": 0.2532, "num_input_tokens_seen": 2467216, "step": 27395 }, { "epoch": 7.120582120582121, "grad_norm": 1.175368070602417, "learning_rate": 4.0663999463032336e-05, "loss": 0.2183, "num_input_tokens_seen": 2467648, "step": 27400 }, { "epoch": 7.121881496881497, "grad_norm": 0.35923904180526733, "learning_rate": 4.065958032698065e-05, "loss": 0.21, "num_input_tokens_seen": 2468096, "step": 27405 }, { "epoch": 7.123180873180873, "grad_norm": 0.4247687757015228, "learning_rate": 4.065516038554174e-05, "loss": 0.3246, "num_input_tokens_seen": 2468560, "step": 27410 }, { "epoch": 7.124480249480249, "grad_norm": 0.44183143973350525, "learning_rate": 4.065073963894294e-05, "loss": 0.2687, "num_input_tokens_seen": 2468992, "step": 27415 }, { "epoch": 7.125779625779626, "grad_norm": 0.5112112760543823, "learning_rate": 4.064631808741159e-05, "loss": 0.2889, "num_input_tokens_seen": 2469488, "step": 27420 }, { "epoch": 7.127079002079002, "grad_norm": 0.5208763480186462, "learning_rate": 4.064189573117512e-05, "loss": 0.2484, "num_input_tokens_seen": 2469888, "step": 27425 }, { "epoch": 7.128378378378378, "grad_norm": 0.4432801306247711, "learning_rate": 4.063747257046096e-05, "loss": 0.1451, "num_input_tokens_seen": 2470368, "step": 27430 }, { "epoch": 7.129677754677755, "grad_norm": 0.26571258902549744, "learning_rate": 4.06330486054966e-05, "loss": 0.1312, "num_input_tokens_seen": 2470800, "step": 27435 }, { "epoch": 7.130977130977131, "grad_norm": 0.3074236810207367, "learning_rate": 4.062862383650958e-05, "loss": 0.273, "num_input_tokens_seen": 2471232, "step": 27440 }, { "epoch": 7.132276507276507, "grad_norm": 0.6204419732093811, "learning_rate": 4.062419826372746e-05, "loss": 0.3106, "num_input_tokens_seen": 2471664, "step": 27445 }, { "epoch": 7.133575883575883, "grad_norm": 0.8937852382659912, "learning_rate": 4.0619771887377846e-05, "loss": 0.2667, "num_input_tokens_seen": 2472128, "step": 27450 }, { "epoch": 7.13487525987526, "grad_norm": 0.25898218154907227, "learning_rate": 4.061534470768841e-05, "loss": 0.3019, "num_input_tokens_seen": 2472576, "step": 27455 }, { "epoch": 7.136174636174636, "grad_norm": 0.3329523801803589, "learning_rate": 4.061091672488682e-05, "loss": 0.2716, "num_input_tokens_seen": 2473024, "step": 27460 }, { "epoch": 7.137474012474012, "grad_norm": 0.5143791437149048, "learning_rate": 4.0606487939200834e-05, "loss": 0.2602, "num_input_tokens_seen": 2473488, "step": 27465 }, { "epoch": 7.138773388773389, "grad_norm": 0.553300142288208, "learning_rate": 4.060205835085821e-05, "loss": 0.3187, "num_input_tokens_seen": 2473920, "step": 27470 }, { "epoch": 7.140072765072765, "grad_norm": 0.45859014987945557, "learning_rate": 4.059762796008679e-05, "loss": 0.1758, "num_input_tokens_seen": 2474384, "step": 27475 }, { "epoch": 7.141372141372141, "grad_norm": 0.4821738600730896, "learning_rate": 4.0593196767114405e-05, "loss": 0.2388, "num_input_tokens_seen": 2474800, "step": 27480 }, { "epoch": 7.142671517671518, "grad_norm": 0.539079487323761, "learning_rate": 4.058876477216898e-05, "loss": 0.3332, "num_input_tokens_seen": 2475216, "step": 27485 }, { "epoch": 7.143970893970894, "grad_norm": 0.31160789728164673, "learning_rate": 4.058433197547844e-05, "loss": 0.1834, "num_input_tokens_seen": 2475664, "step": 27490 }, { "epoch": 7.14527027027027, "grad_norm": 0.5620928406715393, "learning_rate": 4.0579898377270774e-05, "loss": 0.2672, "num_input_tokens_seen": 2476160, "step": 27495 }, { "epoch": 7.146569646569646, "grad_norm": 0.521671712398529, "learning_rate": 4.057546397777401e-05, "loss": 0.1746, "num_input_tokens_seen": 2476624, "step": 27500 }, { "epoch": 7.147869022869023, "grad_norm": 0.6495948433876038, "learning_rate": 4.0571028777216214e-05, "loss": 0.2021, "num_input_tokens_seen": 2477088, "step": 27505 }, { "epoch": 7.149168399168399, "grad_norm": 0.6827281713485718, "learning_rate": 4.0566592775825486e-05, "loss": 0.272, "num_input_tokens_seen": 2477536, "step": 27510 }, { "epoch": 7.150467775467775, "grad_norm": 1.0039143562316895, "learning_rate": 4.056215597382997e-05, "loss": 0.2359, "num_input_tokens_seen": 2478000, "step": 27515 }, { "epoch": 7.151767151767152, "grad_norm": 0.2856770157814026, "learning_rate": 4.055771837145787e-05, "loss": 0.0997, "num_input_tokens_seen": 2478432, "step": 27520 }, { "epoch": 7.153066528066528, "grad_norm": 0.5682768225669861, "learning_rate": 4.05532799689374e-05, "loss": 0.2856, "num_input_tokens_seen": 2478880, "step": 27525 }, { "epoch": 7.154365904365904, "grad_norm": 0.8762511014938354, "learning_rate": 4.054884076649684e-05, "loss": 0.2017, "num_input_tokens_seen": 2479344, "step": 27530 }, { "epoch": 7.15566528066528, "grad_norm": 0.2837176024913788, "learning_rate": 4.054440076436451e-05, "loss": 0.148, "num_input_tokens_seen": 2479824, "step": 27535 }, { "epoch": 7.156964656964657, "grad_norm": 0.2581818699836731, "learning_rate": 4.053995996276875e-05, "loss": 0.2812, "num_input_tokens_seen": 2480256, "step": 27540 }, { "epoch": 7.158264033264033, "grad_norm": 0.3996944725513458, "learning_rate": 4.053551836193795e-05, "loss": 0.1464, "num_input_tokens_seen": 2480720, "step": 27545 }, { "epoch": 7.159563409563409, "grad_norm": 0.26672446727752686, "learning_rate": 4.0531075962100564e-05, "loss": 0.2135, "num_input_tokens_seen": 2481168, "step": 27550 }, { "epoch": 7.160862785862786, "grad_norm": 0.2909160554409027, "learning_rate": 4.052663276348506e-05, "loss": 0.2362, "num_input_tokens_seen": 2481632, "step": 27555 }, { "epoch": 7.162162162162162, "grad_norm": 0.3161768913269043, "learning_rate": 4.0522188766319954e-05, "loss": 0.2299, "num_input_tokens_seen": 2482080, "step": 27560 }, { "epoch": 7.163461538461538, "grad_norm": 0.7400423884391785, "learning_rate": 4.051774397083381e-05, "loss": 0.26, "num_input_tokens_seen": 2482496, "step": 27565 }, { "epoch": 7.164760914760914, "grad_norm": 0.3379380404949188, "learning_rate": 4.0513298377255225e-05, "loss": 0.2538, "num_input_tokens_seen": 2482928, "step": 27570 }, { "epoch": 7.166060291060291, "grad_norm": 0.7508407235145569, "learning_rate": 4.050885198581283e-05, "loss": 0.1934, "num_input_tokens_seen": 2483376, "step": 27575 }, { "epoch": 7.167359667359667, "grad_norm": 0.6143977642059326, "learning_rate": 4.050440479673533e-05, "loss": 0.3698, "num_input_tokens_seen": 2483808, "step": 27580 }, { "epoch": 7.168659043659043, "grad_norm": 0.5794991254806519, "learning_rate": 4.049995681025143e-05, "loss": 0.2836, "num_input_tokens_seen": 2484256, "step": 27585 }, { "epoch": 7.16995841995842, "grad_norm": 1.2596445083618164, "learning_rate": 4.04955080265899e-05, "loss": 0.2887, "num_input_tokens_seen": 2484720, "step": 27590 }, { "epoch": 7.171257796257796, "grad_norm": 1.5982577800750732, "learning_rate": 4.0491058445979545e-05, "loss": 0.2927, "num_input_tokens_seen": 2485152, "step": 27595 }, { "epoch": 7.172557172557172, "grad_norm": 0.9875068068504333, "learning_rate": 4.0486608068649217e-05, "loss": 0.2558, "num_input_tokens_seen": 2485568, "step": 27600 }, { "epoch": 7.173856548856548, "grad_norm": 1.0997403860092163, "learning_rate": 4.0482156894827794e-05, "loss": 0.3015, "num_input_tokens_seen": 2486064, "step": 27605 }, { "epoch": 7.175155925155925, "grad_norm": 0.5055245757102966, "learning_rate": 4.0477704924744207e-05, "loss": 0.3042, "num_input_tokens_seen": 2486480, "step": 27610 }, { "epoch": 7.176455301455301, "grad_norm": 0.5204838514328003, "learning_rate": 4.047325215862743e-05, "loss": 0.2066, "num_input_tokens_seen": 2486896, "step": 27615 }, { "epoch": 7.1777546777546775, "grad_norm": 0.33699122071266174, "learning_rate": 4.046879859670646e-05, "loss": 0.2446, "num_input_tokens_seen": 2487344, "step": 27620 }, { "epoch": 7.179054054054054, "grad_norm": 0.3534998893737793, "learning_rate": 4.046434423921036e-05, "loss": 0.2699, "num_input_tokens_seen": 2487760, "step": 27625 }, { "epoch": 7.18035343035343, "grad_norm": 0.2871042788028717, "learning_rate": 4.045988908636822e-05, "loss": 0.1691, "num_input_tokens_seen": 2488192, "step": 27630 }, { "epoch": 7.1816528066528065, "grad_norm": 0.3498574495315552, "learning_rate": 4.045543313840917e-05, "loss": 0.1391, "num_input_tokens_seen": 2488688, "step": 27635 }, { "epoch": 7.182952182952183, "grad_norm": 0.30462154746055603, "learning_rate": 4.045097639556239e-05, "loss": 0.2602, "num_input_tokens_seen": 2489152, "step": 27640 }, { "epoch": 7.1842515592515594, "grad_norm": 0.3100115656852722, "learning_rate": 4.0446518858057083e-05, "loss": 0.2203, "num_input_tokens_seen": 2489568, "step": 27645 }, { "epoch": 7.1855509355509355, "grad_norm": 0.5813714265823364, "learning_rate": 4.0442060526122515e-05, "loss": 0.4037, "num_input_tokens_seen": 2490032, "step": 27650 }, { "epoch": 7.1868503118503115, "grad_norm": 0.4617728888988495, "learning_rate": 4.043760139998798e-05, "loss": 0.3147, "num_input_tokens_seen": 2490464, "step": 27655 }, { "epoch": 7.1881496881496885, "grad_norm": 0.5300977826118469, "learning_rate": 4.0433141479882806e-05, "loss": 0.328, "num_input_tokens_seen": 2490928, "step": 27660 }, { "epoch": 7.1894490644490645, "grad_norm": 0.3027913570404053, "learning_rate": 4.0428680766036384e-05, "loss": 0.2831, "num_input_tokens_seen": 2491376, "step": 27665 }, { "epoch": 7.1907484407484406, "grad_norm": 0.7162767648696899, "learning_rate": 4.0424219258678126e-05, "loss": 0.2969, "num_input_tokens_seen": 2491808, "step": 27670 }, { "epoch": 7.192047817047817, "grad_norm": 0.8287840485572815, "learning_rate": 4.041975695803749e-05, "loss": 0.2529, "num_input_tokens_seen": 2492224, "step": 27675 }, { "epoch": 7.1933471933471935, "grad_norm": 0.5308674573898315, "learning_rate": 4.041529386434398e-05, "loss": 0.2376, "num_input_tokens_seen": 2492640, "step": 27680 }, { "epoch": 7.19464656964657, "grad_norm": 0.769834578037262, "learning_rate": 4.0410829977827135e-05, "loss": 0.2909, "num_input_tokens_seen": 2493088, "step": 27685 }, { "epoch": 7.195945945945946, "grad_norm": 0.5560197234153748, "learning_rate": 4.040636529871654e-05, "loss": 0.2394, "num_input_tokens_seen": 2493520, "step": 27690 }, { "epoch": 7.1972453222453225, "grad_norm": 0.5445865988731384, "learning_rate": 4.040189982724182e-05, "loss": 0.2307, "num_input_tokens_seen": 2493936, "step": 27695 }, { "epoch": 7.198544698544699, "grad_norm": 0.2782667875289917, "learning_rate": 4.0397433563632634e-05, "loss": 0.1741, "num_input_tokens_seen": 2494352, "step": 27700 }, { "epoch": 7.199844074844075, "grad_norm": 0.46642395853996277, "learning_rate": 4.039296650811869e-05, "loss": 0.2732, "num_input_tokens_seen": 2494784, "step": 27705 }, { "epoch": 7.201143451143452, "grad_norm": 0.8244816064834595, "learning_rate": 4.038849866092972e-05, "loss": 0.251, "num_input_tokens_seen": 2495216, "step": 27710 }, { "epoch": 7.202442827442828, "grad_norm": 0.338346391916275, "learning_rate": 4.038403002229553e-05, "loss": 0.2546, "num_input_tokens_seen": 2495680, "step": 27715 }, { "epoch": 7.203742203742204, "grad_norm": 0.9016839861869812, "learning_rate": 4.037956059244592e-05, "loss": 0.2991, "num_input_tokens_seen": 2496144, "step": 27720 }, { "epoch": 7.20504158004158, "grad_norm": 0.3043974041938782, "learning_rate": 4.037509037161079e-05, "loss": 0.1264, "num_input_tokens_seen": 2496560, "step": 27725 }, { "epoch": 7.206340956340957, "grad_norm": 0.9352195262908936, "learning_rate": 4.037061936002002e-05, "loss": 0.3347, "num_input_tokens_seen": 2497040, "step": 27730 }, { "epoch": 7.207640332640333, "grad_norm": 0.4802286922931671, "learning_rate": 4.0366147557903565e-05, "loss": 0.2257, "num_input_tokens_seen": 2497488, "step": 27735 }, { "epoch": 7.208939708939709, "grad_norm": 0.4829723536968231, "learning_rate": 4.0361674965491426e-05, "loss": 0.1886, "num_input_tokens_seen": 2497968, "step": 27740 }, { "epoch": 7.210239085239086, "grad_norm": 0.5202610492706299, "learning_rate": 4.035720158301363e-05, "loss": 0.1979, "num_input_tokens_seen": 2498432, "step": 27745 }, { "epoch": 7.211538461538462, "grad_norm": 0.4647113084793091, "learning_rate": 4.035272741070023e-05, "loss": 0.2161, "num_input_tokens_seen": 2498864, "step": 27750 }, { "epoch": 7.212837837837838, "grad_norm": 0.22458595037460327, "learning_rate": 4.0348252448781356e-05, "loss": 0.1983, "num_input_tokens_seen": 2499312, "step": 27755 }, { "epoch": 7.214137214137214, "grad_norm": 0.8986942768096924, "learning_rate": 4.0343776697487145e-05, "loss": 0.3258, "num_input_tokens_seen": 2499760, "step": 27760 }, { "epoch": 7.215436590436591, "grad_norm": 0.4157329499721527, "learning_rate": 4.0339300157047805e-05, "loss": 0.2158, "num_input_tokens_seen": 2500224, "step": 27765 }, { "epoch": 7.216735966735967, "grad_norm": 0.6733846068382263, "learning_rate": 4.0334822827693565e-05, "loss": 0.3131, "num_input_tokens_seen": 2500688, "step": 27770 }, { "epoch": 7.218035343035343, "grad_norm": 0.6783133745193481, "learning_rate": 4.033034470965468e-05, "loss": 0.2538, "num_input_tokens_seen": 2501120, "step": 27775 }, { "epoch": 7.21933471933472, "grad_norm": 0.8590250611305237, "learning_rate": 4.032586580316149e-05, "loss": 0.3655, "num_input_tokens_seen": 2501536, "step": 27780 }, { "epoch": 7.220634095634096, "grad_norm": 0.754985511302948, "learning_rate": 4.0321386108444325e-05, "loss": 0.2008, "num_input_tokens_seen": 2502000, "step": 27785 }, { "epoch": 7.221933471933472, "grad_norm": 0.36633649468421936, "learning_rate": 4.03169056257336e-05, "loss": 0.2164, "num_input_tokens_seen": 2502432, "step": 27790 }, { "epoch": 7.223232848232848, "grad_norm": 0.6455479264259338, "learning_rate": 4.031242435525974e-05, "loss": 0.2098, "num_input_tokens_seen": 2502848, "step": 27795 }, { "epoch": 7.224532224532225, "grad_norm": 0.5344406366348267, "learning_rate": 4.0307942297253225e-05, "loss": 0.2435, "num_input_tokens_seen": 2503280, "step": 27800 }, { "epoch": 7.225831600831601, "grad_norm": 0.6340008974075317, "learning_rate": 4.030345945194457e-05, "loss": 0.2952, "num_input_tokens_seen": 2503696, "step": 27805 }, { "epoch": 7.227130977130977, "grad_norm": 0.8270812034606934, "learning_rate": 4.029897581956433e-05, "loss": 0.2917, "num_input_tokens_seen": 2504144, "step": 27810 }, { "epoch": 7.228430353430354, "grad_norm": 0.4685575067996979, "learning_rate": 4.0294491400343116e-05, "loss": 0.2827, "num_input_tokens_seen": 2504608, "step": 27815 }, { "epoch": 7.22972972972973, "grad_norm": 0.6062761545181274, "learning_rate": 4.029000619451154e-05, "loss": 0.2856, "num_input_tokens_seen": 2505120, "step": 27820 }, { "epoch": 7.231029106029106, "grad_norm": 0.6199995875358582, "learning_rate": 4.028552020230031e-05, "loss": 0.2517, "num_input_tokens_seen": 2505552, "step": 27825 }, { "epoch": 7.232328482328482, "grad_norm": 0.5772632956504822, "learning_rate": 4.028103342394012e-05, "loss": 0.2273, "num_input_tokens_seen": 2505984, "step": 27830 }, { "epoch": 7.233627858627859, "grad_norm": 0.5396027565002441, "learning_rate": 4.0276545859661744e-05, "loss": 0.1749, "num_input_tokens_seen": 2506384, "step": 27835 }, { "epoch": 7.234927234927235, "grad_norm": 0.8587715029716492, "learning_rate": 4.027205750969597e-05, "loss": 0.3109, "num_input_tokens_seen": 2506848, "step": 27840 }, { "epoch": 7.236226611226611, "grad_norm": 0.5606780648231506, "learning_rate": 4.0267568374273644e-05, "loss": 0.2562, "num_input_tokens_seen": 2507296, "step": 27845 }, { "epoch": 7.237525987525988, "grad_norm": 0.7663915157318115, "learning_rate": 4.026307845362566e-05, "loss": 0.2213, "num_input_tokens_seen": 2507760, "step": 27850 }, { "epoch": 7.238825363825364, "grad_norm": 0.3984088599681854, "learning_rate": 4.025858774798292e-05, "loss": 0.1738, "num_input_tokens_seen": 2508176, "step": 27855 }, { "epoch": 7.24012474012474, "grad_norm": 1.6038907766342163, "learning_rate": 4.025409625757639e-05, "loss": 0.2443, "num_input_tokens_seen": 2508656, "step": 27860 }, { "epoch": 7.241424116424117, "grad_norm": 0.451397567987442, "learning_rate": 4.024960398263708e-05, "loss": 0.2844, "num_input_tokens_seen": 2509104, "step": 27865 }, { "epoch": 7.242723492723493, "grad_norm": 0.5357276797294617, "learning_rate": 4.024511092339602e-05, "loss": 0.2766, "num_input_tokens_seen": 2509568, "step": 27870 }, { "epoch": 7.244022869022869, "grad_norm": 0.40067169070243835, "learning_rate": 4.02406170800843e-05, "loss": 0.1678, "num_input_tokens_seen": 2510032, "step": 27875 }, { "epoch": 7.245322245322245, "grad_norm": 0.4826951324939728, "learning_rate": 4.023612245293304e-05, "loss": 0.1666, "num_input_tokens_seen": 2510464, "step": 27880 }, { "epoch": 7.246621621621622, "grad_norm": 0.3629143536090851, "learning_rate": 4.02316270421734e-05, "loss": 0.1488, "num_input_tokens_seen": 2510912, "step": 27885 }, { "epoch": 7.247920997920998, "grad_norm": 0.26360082626342773, "learning_rate": 4.0227130848036595e-05, "loss": 0.2583, "num_input_tokens_seen": 2511360, "step": 27890 }, { "epoch": 7.249220374220374, "grad_norm": 0.9140949249267578, "learning_rate": 4.0222633870753855e-05, "loss": 0.2214, "num_input_tokens_seen": 2511792, "step": 27895 }, { "epoch": 7.25051975051975, "grad_norm": 0.30122268199920654, "learning_rate": 4.0218136110556474e-05, "loss": 0.2522, "num_input_tokens_seen": 2512272, "step": 27900 }, { "epoch": 7.251819126819127, "grad_norm": 0.40883877873420715, "learning_rate": 4.0213637567675774e-05, "loss": 0.1653, "num_input_tokens_seen": 2512720, "step": 27905 }, { "epoch": 7.253118503118503, "grad_norm": 0.3057047426700592, "learning_rate": 4.0209138242343104e-05, "loss": 0.1293, "num_input_tokens_seen": 2513168, "step": 27910 }, { "epoch": 7.254417879417879, "grad_norm": 1.0013848543167114, "learning_rate": 4.020463813478989e-05, "loss": 0.3695, "num_input_tokens_seen": 2513584, "step": 27915 }, { "epoch": 7.255717255717256, "grad_norm": 0.31467553973197937, "learning_rate": 4.020013724524757e-05, "loss": 0.2788, "num_input_tokens_seen": 2514048, "step": 27920 }, { "epoch": 7.257016632016632, "grad_norm": 0.6083568930625916, "learning_rate": 4.019563557394762e-05, "loss": 0.2546, "num_input_tokens_seen": 2514496, "step": 27925 }, { "epoch": 7.258316008316008, "grad_norm": 0.4565292000770569, "learning_rate": 4.0191133121121584e-05, "loss": 0.2387, "num_input_tokens_seen": 2514944, "step": 27930 }, { "epoch": 7.259615384615385, "grad_norm": 1.2571214437484741, "learning_rate": 4.0186629887001e-05, "loss": 0.3206, "num_input_tokens_seen": 2515392, "step": 27935 }, { "epoch": 7.260914760914761, "grad_norm": 0.7922497391700745, "learning_rate": 4.018212587181751e-05, "loss": 0.2894, "num_input_tokens_seen": 2515808, "step": 27940 }, { "epoch": 7.262214137214137, "grad_norm": 1.9978001117706299, "learning_rate": 4.0177621075802724e-05, "loss": 0.2807, "num_input_tokens_seen": 2516272, "step": 27945 }, { "epoch": 7.263513513513513, "grad_norm": 0.9569900035858154, "learning_rate": 4.0173115499188355e-05, "loss": 0.279, "num_input_tokens_seen": 2516720, "step": 27950 }, { "epoch": 7.26481288981289, "grad_norm": 0.7543973326683044, "learning_rate": 4.01686091422061e-05, "loss": 0.2805, "num_input_tokens_seen": 2517184, "step": 27955 }, { "epoch": 7.266112266112266, "grad_norm": 1.3686699867248535, "learning_rate": 4.016410200508776e-05, "loss": 0.2633, "num_input_tokens_seen": 2517616, "step": 27960 }, { "epoch": 7.267411642411642, "grad_norm": 0.6891204714775085, "learning_rate": 4.0159594088065126e-05, "loss": 0.249, "num_input_tokens_seen": 2518032, "step": 27965 }, { "epoch": 7.268711018711019, "grad_norm": 0.6250878572463989, "learning_rate": 4.015508539137003e-05, "loss": 0.2773, "num_input_tokens_seen": 2518464, "step": 27970 }, { "epoch": 7.270010395010395, "grad_norm": 0.7241123914718628, "learning_rate": 4.015057591523438e-05, "loss": 0.2215, "num_input_tokens_seen": 2518912, "step": 27975 }, { "epoch": 7.271309771309771, "grad_norm": 1.8998090028762817, "learning_rate": 4.014606565989009e-05, "loss": 0.1955, "num_input_tokens_seen": 2519376, "step": 27980 }, { "epoch": 7.272609147609147, "grad_norm": 1.6252161264419556, "learning_rate": 4.0141554625569125e-05, "loss": 0.3761, "num_input_tokens_seen": 2519792, "step": 27985 }, { "epoch": 7.273908523908524, "grad_norm": 1.4809176921844482, "learning_rate": 4.013704281250351e-05, "loss": 0.3514, "num_input_tokens_seen": 2520288, "step": 27990 }, { "epoch": 7.2752079002079, "grad_norm": 0.8105494976043701, "learning_rate": 4.0132530220925266e-05, "loss": 0.2588, "num_input_tokens_seen": 2520752, "step": 27995 }, { "epoch": 7.276507276507276, "grad_norm": 0.9748122096061707, "learning_rate": 4.012801685106651e-05, "loss": 0.1821, "num_input_tokens_seen": 2521184, "step": 28000 }, { "epoch": 7.277806652806653, "grad_norm": 0.625241756439209, "learning_rate": 4.012350270315934e-05, "loss": 0.1757, "num_input_tokens_seen": 2521648, "step": 28005 }, { "epoch": 7.279106029106029, "grad_norm": 0.3587626814842224, "learning_rate": 4.011898777743594e-05, "loss": 0.1217, "num_input_tokens_seen": 2522080, "step": 28010 }, { "epoch": 7.280405405405405, "grad_norm": 0.39469578862190247, "learning_rate": 4.011447207412851e-05, "loss": 0.1414, "num_input_tokens_seen": 2522544, "step": 28015 }, { "epoch": 7.281704781704781, "grad_norm": 0.39396435022354126, "learning_rate": 4.01099555934693e-05, "loss": 0.2288, "num_input_tokens_seen": 2523008, "step": 28020 }, { "epoch": 7.283004158004158, "grad_norm": 0.37056615948677063, "learning_rate": 4.0105438335690597e-05, "loss": 0.4001, "num_input_tokens_seen": 2523440, "step": 28025 }, { "epoch": 7.284303534303534, "grad_norm": 0.4339316785335541, "learning_rate": 4.010092030102473e-05, "loss": 0.2992, "num_input_tokens_seen": 2523856, "step": 28030 }, { "epoch": 7.28560291060291, "grad_norm": 0.34462934732437134, "learning_rate": 4.009640148970406e-05, "loss": 0.217, "num_input_tokens_seen": 2524304, "step": 28035 }, { "epoch": 7.286902286902287, "grad_norm": 0.8728686571121216, "learning_rate": 4.0091881901961e-05, "loss": 0.3153, "num_input_tokens_seen": 2524752, "step": 28040 }, { "epoch": 7.288201663201663, "grad_norm": 0.5766733288764954, "learning_rate": 4.008736153802799e-05, "loss": 0.2853, "num_input_tokens_seen": 2525184, "step": 28045 }, { "epoch": 7.289501039501039, "grad_norm": 0.7263427972793579, "learning_rate": 4.0082840398137514e-05, "loss": 0.299, "num_input_tokens_seen": 2525632, "step": 28050 }, { "epoch": 7.290800415800415, "grad_norm": 1.0388182401657104, "learning_rate": 4.007831848252211e-05, "loss": 0.2908, "num_input_tokens_seen": 2526080, "step": 28055 }, { "epoch": 7.292099792099792, "grad_norm": 0.7129603028297424, "learning_rate": 4.007379579141434e-05, "loss": 0.2618, "num_input_tokens_seen": 2526528, "step": 28060 }, { "epoch": 7.293399168399168, "grad_norm": 0.9656732082366943, "learning_rate": 4.0069272325046816e-05, "loss": 0.2811, "num_input_tokens_seen": 2526960, "step": 28065 }, { "epoch": 7.294698544698544, "grad_norm": 0.9820788502693176, "learning_rate": 4.006474808365217e-05, "loss": 0.2571, "num_input_tokens_seen": 2527392, "step": 28070 }, { "epoch": 7.295997920997921, "grad_norm": 0.5776952505111694, "learning_rate": 4.0060223067463094e-05, "loss": 0.3215, "num_input_tokens_seen": 2527824, "step": 28075 }, { "epoch": 7.297297297297297, "grad_norm": 0.6785297393798828, "learning_rate": 4.005569727671231e-05, "loss": 0.2761, "num_input_tokens_seen": 2528272, "step": 28080 }, { "epoch": 7.298596673596673, "grad_norm": 1.1010019779205322, "learning_rate": 4.00511707116326e-05, "loss": 0.2526, "num_input_tokens_seen": 2528688, "step": 28085 }, { "epoch": 7.29989604989605, "grad_norm": 0.6470034718513489, "learning_rate": 4.004664337245676e-05, "loss": 0.2375, "num_input_tokens_seen": 2529168, "step": 28090 }, { "epoch": 7.301195426195426, "grad_norm": 0.6282581686973572, "learning_rate": 4.004211525941763e-05, "loss": 0.2899, "num_input_tokens_seen": 2529616, "step": 28095 }, { "epoch": 7.302494802494802, "grad_norm": 0.5668028593063354, "learning_rate": 4.00375863727481e-05, "loss": 0.3207, "num_input_tokens_seen": 2530048, "step": 28100 }, { "epoch": 7.3037941787941785, "grad_norm": 0.596917986869812, "learning_rate": 4.00330567126811e-05, "loss": 0.2048, "num_input_tokens_seen": 2530512, "step": 28105 }, { "epoch": 7.305093555093555, "grad_norm": 0.8540268540382385, "learning_rate": 4.002852627944958e-05, "loss": 0.2599, "num_input_tokens_seen": 2530960, "step": 28110 }, { "epoch": 7.3063929313929314, "grad_norm": 0.7419785261154175, "learning_rate": 4.002399507328656e-05, "loss": 0.2004, "num_input_tokens_seen": 2531376, "step": 28115 }, { "epoch": 7.3076923076923075, "grad_norm": 0.4402478039264679, "learning_rate": 4.001946309442508e-05, "loss": 0.2068, "num_input_tokens_seen": 2531824, "step": 28120 }, { "epoch": 7.3089916839916835, "grad_norm": 1.1518003940582275, "learning_rate": 4.0014930343098214e-05, "loss": 0.2435, "num_input_tokens_seen": 2532304, "step": 28125 }, { "epoch": 7.3102910602910605, "grad_norm": 0.929987907409668, "learning_rate": 4.00103968195391e-05, "loss": 0.3021, "num_input_tokens_seen": 2532784, "step": 28130 }, { "epoch": 7.3115904365904365, "grad_norm": 1.2564812898635864, "learning_rate": 4.000586252398089e-05, "loss": 0.2216, "num_input_tokens_seen": 2533232, "step": 28135 }, { "epoch": 7.3128898128898125, "grad_norm": 0.944151759147644, "learning_rate": 4.00013274566568e-05, "loss": 0.2282, "num_input_tokens_seen": 2533696, "step": 28140 }, { "epoch": 7.3141891891891895, "grad_norm": 0.4225441813468933, "learning_rate": 3.999679161780005e-05, "loss": 0.2058, "num_input_tokens_seen": 2534160, "step": 28145 }, { "epoch": 7.3154885654885655, "grad_norm": 0.9554732441902161, "learning_rate": 3.999225500764396e-05, "loss": 0.2736, "num_input_tokens_seen": 2534608, "step": 28150 }, { "epoch": 7.316787941787942, "grad_norm": 0.5336652398109436, "learning_rate": 3.998771762642182e-05, "loss": 0.2051, "num_input_tokens_seen": 2535040, "step": 28155 }, { "epoch": 7.3180873180873185, "grad_norm": 1.129451036453247, "learning_rate": 3.998317947436701e-05, "loss": 0.2561, "num_input_tokens_seen": 2535488, "step": 28160 }, { "epoch": 7.3193866943866945, "grad_norm": 0.6326776742935181, "learning_rate": 3.997864055171291e-05, "loss": 0.1335, "num_input_tokens_seen": 2535952, "step": 28165 }, { "epoch": 7.320686070686071, "grad_norm": 2.4469823837280273, "learning_rate": 3.997410085869297e-05, "loss": 0.1983, "num_input_tokens_seen": 2536416, "step": 28170 }, { "epoch": 7.321985446985447, "grad_norm": 1.8773691654205322, "learning_rate": 3.9969560395540685e-05, "loss": 0.2151, "num_input_tokens_seen": 2536832, "step": 28175 }, { "epoch": 7.3232848232848236, "grad_norm": 0.30129754543304443, "learning_rate": 3.996501916248957e-05, "loss": 0.1373, "num_input_tokens_seen": 2537280, "step": 28180 }, { "epoch": 7.3245841995842, "grad_norm": 0.2585851550102234, "learning_rate": 3.996047715977318e-05, "loss": 0.4448, "num_input_tokens_seen": 2537728, "step": 28185 }, { "epoch": 7.325883575883576, "grad_norm": 1.5134820938110352, "learning_rate": 3.995593438762511e-05, "loss": 0.2937, "num_input_tokens_seen": 2538176, "step": 28190 }, { "epoch": 7.327182952182953, "grad_norm": 0.38046279549598694, "learning_rate": 3.9951390846279004e-05, "loss": 0.2028, "num_input_tokens_seen": 2538656, "step": 28195 }, { "epoch": 7.328482328482329, "grad_norm": 0.31770122051239014, "learning_rate": 3.994684653596854e-05, "loss": 0.1893, "num_input_tokens_seen": 2539104, "step": 28200 }, { "epoch": 7.329781704781705, "grad_norm": 1.0972193479537964, "learning_rate": 3.994230145692744e-05, "loss": 0.2626, "num_input_tokens_seen": 2539568, "step": 28205 }, { "epoch": 7.331081081081081, "grad_norm": 0.6698467135429382, "learning_rate": 3.993775560938946e-05, "loss": 0.2247, "num_input_tokens_seen": 2540000, "step": 28210 }, { "epoch": 7.332380457380458, "grad_norm": 0.831179678440094, "learning_rate": 3.993320899358839e-05, "loss": 0.193, "num_input_tokens_seen": 2540464, "step": 28215 }, { "epoch": 7.333679833679834, "grad_norm": 0.3081294298171997, "learning_rate": 3.9928661609758076e-05, "loss": 0.2006, "num_input_tokens_seen": 2540928, "step": 28220 }, { "epoch": 7.33497920997921, "grad_norm": 0.3619738519191742, "learning_rate": 3.99241134581324e-05, "loss": 0.2539, "num_input_tokens_seen": 2541392, "step": 28225 }, { "epoch": 7.336278586278587, "grad_norm": 0.33981412649154663, "learning_rate": 3.991956453894526e-05, "loss": 0.206, "num_input_tokens_seen": 2541808, "step": 28230 }, { "epoch": 7.337577962577963, "grad_norm": 0.549746572971344, "learning_rate": 3.9915014852430626e-05, "loss": 0.2302, "num_input_tokens_seen": 2542272, "step": 28235 }, { "epoch": 7.338877338877339, "grad_norm": 0.32232701778411865, "learning_rate": 3.991046439882249e-05, "loss": 0.1607, "num_input_tokens_seen": 2542704, "step": 28240 }, { "epoch": 7.340176715176715, "grad_norm": 0.47296491265296936, "learning_rate": 3.990591317835488e-05, "loss": 0.2746, "num_input_tokens_seen": 2543152, "step": 28245 }, { "epoch": 7.341476091476092, "grad_norm": 0.5680856704711914, "learning_rate": 3.9901361191261866e-05, "loss": 0.2151, "num_input_tokens_seen": 2543616, "step": 28250 }, { "epoch": 7.342775467775468, "grad_norm": 0.31301045417785645, "learning_rate": 3.989680843777757e-05, "loss": 0.2127, "num_input_tokens_seen": 2544064, "step": 28255 }, { "epoch": 7.344074844074844, "grad_norm": 0.574872612953186, "learning_rate": 3.989225491813615e-05, "loss": 0.3127, "num_input_tokens_seen": 2544544, "step": 28260 }, { "epoch": 7.345374220374221, "grad_norm": 0.3900495171546936, "learning_rate": 3.988770063257179e-05, "loss": 0.389, "num_input_tokens_seen": 2544960, "step": 28265 }, { "epoch": 7.346673596673597, "grad_norm": 1.3205926418304443, "learning_rate": 3.988314558131871e-05, "loss": 0.288, "num_input_tokens_seen": 2545440, "step": 28270 }, { "epoch": 7.347972972972973, "grad_norm": 0.31131404638290405, "learning_rate": 3.9878589764611205e-05, "loss": 0.281, "num_input_tokens_seen": 2545904, "step": 28275 }, { "epoch": 7.349272349272349, "grad_norm": 0.6750195622444153, "learning_rate": 3.987403318268357e-05, "loss": 0.2358, "num_input_tokens_seen": 2546416, "step": 28280 }, { "epoch": 7.350571725571726, "grad_norm": 1.2284622192382812, "learning_rate": 3.9869475835770153e-05, "loss": 0.256, "num_input_tokens_seen": 2546880, "step": 28285 }, { "epoch": 7.351871101871102, "grad_norm": 0.36290422081947327, "learning_rate": 3.986491772410535e-05, "loss": 0.1466, "num_input_tokens_seen": 2547296, "step": 28290 }, { "epoch": 7.353170478170478, "grad_norm": 0.3229653835296631, "learning_rate": 3.9860358847923586e-05, "loss": 0.3562, "num_input_tokens_seen": 2547760, "step": 28295 }, { "epoch": 7.354469854469855, "grad_norm": 0.4732034206390381, "learning_rate": 3.9855799207459324e-05, "loss": 0.2508, "num_input_tokens_seen": 2548272, "step": 28300 }, { "epoch": 7.355769230769231, "grad_norm": 1.0285426378250122, "learning_rate": 3.985123880294708e-05, "loss": 0.2435, "num_input_tokens_seen": 2548752, "step": 28305 }, { "epoch": 7.357068607068607, "grad_norm": 0.9701297283172607, "learning_rate": 3.984667763462139e-05, "loss": 0.2644, "num_input_tokens_seen": 2549216, "step": 28310 }, { "epoch": 7.358367983367984, "grad_norm": 0.43193763494491577, "learning_rate": 3.984211570271684e-05, "loss": 0.1988, "num_input_tokens_seen": 2549664, "step": 28315 }, { "epoch": 7.35966735966736, "grad_norm": 0.6681077480316162, "learning_rate": 3.9837553007468063e-05, "loss": 0.242, "num_input_tokens_seen": 2550096, "step": 28320 }, { "epoch": 7.360966735966736, "grad_norm": 1.0509240627288818, "learning_rate": 3.983298954910972e-05, "loss": 0.3194, "num_input_tokens_seen": 2550512, "step": 28325 }, { "epoch": 7.362266112266112, "grad_norm": 0.7462378144264221, "learning_rate": 3.982842532787651e-05, "loss": 0.2717, "num_input_tokens_seen": 2550976, "step": 28330 }, { "epoch": 7.363565488565489, "grad_norm": 0.5140407681465149, "learning_rate": 3.982386034400318e-05, "loss": 0.2657, "num_input_tokens_seen": 2551440, "step": 28335 }, { "epoch": 7.364864864864865, "grad_norm": 0.3886635899543762, "learning_rate": 3.9819294597724504e-05, "loss": 0.3024, "num_input_tokens_seen": 2551872, "step": 28340 }, { "epoch": 7.366164241164241, "grad_norm": 0.5561324954032898, "learning_rate": 3.981472808927531e-05, "loss": 0.251, "num_input_tokens_seen": 2552352, "step": 28345 }, { "epoch": 7.367463617463617, "grad_norm": 0.4713996648788452, "learning_rate": 3.981016081889046e-05, "loss": 0.2205, "num_input_tokens_seen": 2552800, "step": 28350 }, { "epoch": 7.368762993762994, "grad_norm": 0.5202950835227966, "learning_rate": 3.980559278680485e-05, "loss": 0.2268, "num_input_tokens_seen": 2553296, "step": 28355 }, { "epoch": 7.37006237006237, "grad_norm": 0.741445779800415, "learning_rate": 3.980102399325341e-05, "loss": 0.2813, "num_input_tokens_seen": 2553776, "step": 28360 }, { "epoch": 7.371361746361746, "grad_norm": 0.60337895154953, "learning_rate": 3.979645443847112e-05, "loss": 0.1669, "num_input_tokens_seen": 2554240, "step": 28365 }, { "epoch": 7.372661122661123, "grad_norm": 0.28745490312576294, "learning_rate": 3.979188412269301e-05, "loss": 0.2365, "num_input_tokens_seen": 2554672, "step": 28370 }, { "epoch": 7.373960498960499, "grad_norm": 0.38469356298446655, "learning_rate": 3.978731304615413e-05, "loss": 0.3342, "num_input_tokens_seen": 2555136, "step": 28375 }, { "epoch": 7.375259875259875, "grad_norm": 0.28192418813705444, "learning_rate": 3.978274120908956e-05, "loss": 0.2685, "num_input_tokens_seen": 2555568, "step": 28380 }, { "epoch": 7.376559251559252, "grad_norm": 0.4320954382419586, "learning_rate": 3.9778168611734456e-05, "loss": 0.1804, "num_input_tokens_seen": 2555984, "step": 28385 }, { "epoch": 7.377858627858628, "grad_norm": 0.30136796832084656, "learning_rate": 3.977359525432397e-05, "loss": 0.1616, "num_input_tokens_seen": 2556400, "step": 28390 }, { "epoch": 7.379158004158004, "grad_norm": 0.4397607445716858, "learning_rate": 3.9769021137093335e-05, "loss": 0.2291, "num_input_tokens_seen": 2556896, "step": 28395 }, { "epoch": 7.38045738045738, "grad_norm": 0.3099246323108673, "learning_rate": 3.976444626027779e-05, "loss": 0.2333, "num_input_tokens_seen": 2557360, "step": 28400 }, { "epoch": 7.381756756756757, "grad_norm": 0.7090809941291809, "learning_rate": 3.9759870624112615e-05, "loss": 0.2479, "num_input_tokens_seen": 2557808, "step": 28405 }, { "epoch": 7.383056133056133, "grad_norm": 0.9869657754898071, "learning_rate": 3.975529422883316e-05, "loss": 0.3806, "num_input_tokens_seen": 2558256, "step": 28410 }, { "epoch": 7.384355509355509, "grad_norm": 0.8101493120193481, "learning_rate": 3.9750717074674796e-05, "loss": 0.2744, "num_input_tokens_seen": 2558752, "step": 28415 }, { "epoch": 7.385654885654886, "grad_norm": 1.5017645359039307, "learning_rate": 3.974613916187291e-05, "loss": 0.297, "num_input_tokens_seen": 2559232, "step": 28420 }, { "epoch": 7.386954261954262, "grad_norm": 0.5900455117225647, "learning_rate": 3.9741560490662954e-05, "loss": 0.2836, "num_input_tokens_seen": 2559696, "step": 28425 }, { "epoch": 7.388253638253638, "grad_norm": 1.0258280038833618, "learning_rate": 3.973698106128042e-05, "loss": 0.271, "num_input_tokens_seen": 2560160, "step": 28430 }, { "epoch": 7.389553014553014, "grad_norm": 0.821957528591156, "learning_rate": 3.973240087396083e-05, "loss": 0.2621, "num_input_tokens_seen": 2560560, "step": 28435 }, { "epoch": 7.390852390852391, "grad_norm": 0.7646477222442627, "learning_rate": 3.972781992893975e-05, "loss": 0.2798, "num_input_tokens_seen": 2560992, "step": 28440 }, { "epoch": 7.392151767151767, "grad_norm": 1.0603177547454834, "learning_rate": 3.972323822645278e-05, "loss": 0.2467, "num_input_tokens_seen": 2561424, "step": 28445 }, { "epoch": 7.393451143451143, "grad_norm": 0.6466362476348877, "learning_rate": 3.971865576673556e-05, "loss": 0.2173, "num_input_tokens_seen": 2561872, "step": 28450 }, { "epoch": 7.39475051975052, "grad_norm": 0.6195075511932373, "learning_rate": 3.971407255002376e-05, "loss": 0.3136, "num_input_tokens_seen": 2562320, "step": 28455 }, { "epoch": 7.396049896049896, "grad_norm": 1.0208879709243774, "learning_rate": 3.9709488576553125e-05, "loss": 0.1844, "num_input_tokens_seen": 2562768, "step": 28460 }, { "epoch": 7.397349272349272, "grad_norm": 0.5609393119812012, "learning_rate": 3.970490384655939e-05, "loss": 0.1921, "num_input_tokens_seen": 2563232, "step": 28465 }, { "epoch": 7.398648648648648, "grad_norm": 0.6975537538528442, "learning_rate": 3.970031836027837e-05, "loss": 0.2037, "num_input_tokens_seen": 2563664, "step": 28470 }, { "epoch": 7.399948024948025, "grad_norm": 0.950673520565033, "learning_rate": 3.9695732117945876e-05, "loss": 0.215, "num_input_tokens_seen": 2564112, "step": 28475 }, { "epoch": 7.401247401247401, "grad_norm": 0.5337761640548706, "learning_rate": 3.9691145119797814e-05, "loss": 0.2885, "num_input_tokens_seen": 2564560, "step": 28480 }, { "epoch": 7.402546777546777, "grad_norm": 0.979495644569397, "learning_rate": 3.968655736607007e-05, "loss": 0.2164, "num_input_tokens_seen": 2565056, "step": 28485 }, { "epoch": 7.403846153846154, "grad_norm": 0.9721000790596008, "learning_rate": 3.968196885699862e-05, "loss": 0.2613, "num_input_tokens_seen": 2565488, "step": 28490 }, { "epoch": 7.40514553014553, "grad_norm": 0.6210526823997498, "learning_rate": 3.967737959281944e-05, "loss": 0.3641, "num_input_tokens_seen": 2566000, "step": 28495 }, { "epoch": 7.406444906444906, "grad_norm": 0.532821774482727, "learning_rate": 3.967278957376856e-05, "loss": 0.2937, "num_input_tokens_seen": 2566464, "step": 28500 }, { "epoch": 7.407744282744282, "grad_norm": 0.40260425209999084, "learning_rate": 3.9668198800082055e-05, "loss": 0.235, "num_input_tokens_seen": 2566896, "step": 28505 }, { "epoch": 7.409043659043659, "grad_norm": 0.6637915372848511, "learning_rate": 3.9663607271996025e-05, "loss": 0.2306, "num_input_tokens_seen": 2567312, "step": 28510 }, { "epoch": 7.410343035343035, "grad_norm": 0.3754558265209198, "learning_rate": 3.9659014989746634e-05, "loss": 0.1737, "num_input_tokens_seen": 2567760, "step": 28515 }, { "epoch": 7.411642411642411, "grad_norm": 0.4432363212108612, "learning_rate": 3.9654421953570056e-05, "loss": 0.272, "num_input_tokens_seen": 2568208, "step": 28520 }, { "epoch": 7.412941787941788, "grad_norm": 0.4615253210067749, "learning_rate": 3.964982816370252e-05, "loss": 0.3132, "num_input_tokens_seen": 2568672, "step": 28525 }, { "epoch": 7.414241164241164, "grad_norm": 0.42456409335136414, "learning_rate": 3.9645233620380274e-05, "loss": 0.3112, "num_input_tokens_seen": 2569152, "step": 28530 }, { "epoch": 7.41554054054054, "grad_norm": 0.4340350329875946, "learning_rate": 3.964063832383963e-05, "loss": 0.2639, "num_input_tokens_seen": 2569600, "step": 28535 }, { "epoch": 7.416839916839917, "grad_norm": 0.669490396976471, "learning_rate": 3.963604227431693e-05, "loss": 0.2679, "num_input_tokens_seen": 2570032, "step": 28540 }, { "epoch": 7.418139293139293, "grad_norm": 0.7575657367706299, "learning_rate": 3.963144547204856e-05, "loss": 0.2763, "num_input_tokens_seen": 2570480, "step": 28545 }, { "epoch": 7.419438669438669, "grad_norm": 0.586055338382721, "learning_rate": 3.9626847917270925e-05, "loss": 0.2679, "num_input_tokens_seen": 2570944, "step": 28550 }, { "epoch": 7.420738045738045, "grad_norm": 0.47402429580688477, "learning_rate": 3.962224961022049e-05, "loss": 0.2285, "num_input_tokens_seen": 2571456, "step": 28555 }, { "epoch": 7.422037422037422, "grad_norm": 0.6484946012496948, "learning_rate": 3.9617650551133746e-05, "loss": 0.1681, "num_input_tokens_seen": 2571872, "step": 28560 }, { "epoch": 7.423336798336798, "grad_norm": 0.9236800670623779, "learning_rate": 3.9613050740247224e-05, "loss": 0.241, "num_input_tokens_seen": 2572336, "step": 28565 }, { "epoch": 7.424636174636174, "grad_norm": 1.4100877046585083, "learning_rate": 3.960845017779751e-05, "loss": 0.248, "num_input_tokens_seen": 2572816, "step": 28570 }, { "epoch": 7.4259355509355505, "grad_norm": 0.20450875163078308, "learning_rate": 3.9603848864021196e-05, "loss": 0.3131, "num_input_tokens_seen": 2573232, "step": 28575 }, { "epoch": 7.427234927234927, "grad_norm": 0.30792778730392456, "learning_rate": 3.959924679915494e-05, "loss": 0.156, "num_input_tokens_seen": 2573648, "step": 28580 }, { "epoch": 7.428534303534303, "grad_norm": 0.8509684801101685, "learning_rate": 3.959464398343544e-05, "loss": 0.2438, "num_input_tokens_seen": 2574096, "step": 28585 }, { "epoch": 7.4298336798336795, "grad_norm": 0.8103195428848267, "learning_rate": 3.9590040417099415e-05, "loss": 0.2964, "num_input_tokens_seen": 2574544, "step": 28590 }, { "epoch": 7.431133056133056, "grad_norm": 0.48024702072143555, "learning_rate": 3.958543610038363e-05, "loss": 0.1153, "num_input_tokens_seen": 2575024, "step": 28595 }, { "epoch": 7.4324324324324325, "grad_norm": 1.1018928289413452, "learning_rate": 3.95808310335249e-05, "loss": 0.279, "num_input_tokens_seen": 2575440, "step": 28600 }, { "epoch": 7.4337318087318085, "grad_norm": 0.3149776756763458, "learning_rate": 3.957622521676005e-05, "loss": 0.2897, "num_input_tokens_seen": 2575872, "step": 28605 }, { "epoch": 7.435031185031185, "grad_norm": 0.5958376526832581, "learning_rate": 3.957161865032597e-05, "loss": 0.2515, "num_input_tokens_seen": 2576320, "step": 28610 }, { "epoch": 7.4363305613305615, "grad_norm": 0.508858323097229, "learning_rate": 3.9567011334459585e-05, "loss": 0.2434, "num_input_tokens_seen": 2576768, "step": 28615 }, { "epoch": 7.4376299376299375, "grad_norm": 0.6183465123176575, "learning_rate": 3.9562403269397854e-05, "loss": 0.2603, "num_input_tokens_seen": 2577280, "step": 28620 }, { "epoch": 7.438929313929314, "grad_norm": 0.6812925338745117, "learning_rate": 3.955779445537776e-05, "loss": 0.1978, "num_input_tokens_seen": 2577680, "step": 28625 }, { "epoch": 7.4402286902286905, "grad_norm": 0.6927912831306458, "learning_rate": 3.955318489263635e-05, "loss": 0.2964, "num_input_tokens_seen": 2578112, "step": 28630 }, { "epoch": 7.4415280665280665, "grad_norm": 0.6408953070640564, "learning_rate": 3.9548574581410705e-05, "loss": 0.258, "num_input_tokens_seen": 2578592, "step": 28635 }, { "epoch": 7.442827442827443, "grad_norm": 0.42134347558021545, "learning_rate": 3.954396352193792e-05, "loss": 0.3101, "num_input_tokens_seen": 2579040, "step": 28640 }, { "epoch": 7.4441268191268195, "grad_norm": 0.7032097578048706, "learning_rate": 3.953935171445516e-05, "loss": 0.2473, "num_input_tokens_seen": 2579472, "step": 28645 }, { "epoch": 7.4454261954261955, "grad_norm": 0.30426570773124695, "learning_rate": 3.953473915919962e-05, "loss": 0.2631, "num_input_tokens_seen": 2579904, "step": 28650 }, { "epoch": 7.446725571725572, "grad_norm": 0.6358968019485474, "learning_rate": 3.953012585640851e-05, "loss": 0.252, "num_input_tokens_seen": 2580304, "step": 28655 }, { "epoch": 7.448024948024948, "grad_norm": 0.4215839207172394, "learning_rate": 3.952551180631909e-05, "loss": 0.293, "num_input_tokens_seen": 2580752, "step": 28660 }, { "epoch": 7.449324324324325, "grad_norm": 0.4139849841594696, "learning_rate": 3.95208970091687e-05, "loss": 0.2539, "num_input_tokens_seen": 2581232, "step": 28665 }, { "epoch": 7.450623700623701, "grad_norm": 0.48055368661880493, "learning_rate": 3.951628146519466e-05, "loss": 0.2338, "num_input_tokens_seen": 2581680, "step": 28670 }, { "epoch": 7.451923076923077, "grad_norm": 0.34592390060424805, "learning_rate": 3.951166517463435e-05, "loss": 0.1823, "num_input_tokens_seen": 2582128, "step": 28675 }, { "epoch": 7.453222453222454, "grad_norm": 0.3491158187389374, "learning_rate": 3.95070481377252e-05, "loss": 0.2195, "num_input_tokens_seen": 2582592, "step": 28680 }, { "epoch": 7.45452182952183, "grad_norm": 0.24902990460395813, "learning_rate": 3.950243035470467e-05, "loss": 0.264, "num_input_tokens_seen": 2583072, "step": 28685 }, { "epoch": 7.455821205821206, "grad_norm": 1.164336085319519, "learning_rate": 3.9497811825810235e-05, "loss": 0.3398, "num_input_tokens_seen": 2583520, "step": 28690 }, { "epoch": 7.457120582120582, "grad_norm": 0.6157727241516113, "learning_rate": 3.949319255127946e-05, "loss": 0.1645, "num_input_tokens_seen": 2583952, "step": 28695 }, { "epoch": 7.458419958419959, "grad_norm": 0.34629982709884644, "learning_rate": 3.9488572531349904e-05, "loss": 0.2588, "num_input_tokens_seen": 2584400, "step": 28700 }, { "epoch": 7.459719334719335, "grad_norm": 1.0300536155700684, "learning_rate": 3.948395176625918e-05, "loss": 0.2362, "num_input_tokens_seen": 2584864, "step": 28705 }, { "epoch": 7.461018711018711, "grad_norm": 0.3817208409309387, "learning_rate": 3.947933025624493e-05, "loss": 0.1636, "num_input_tokens_seen": 2585296, "step": 28710 }, { "epoch": 7.462318087318088, "grad_norm": 0.649124264717102, "learning_rate": 3.947470800154486e-05, "loss": 0.1739, "num_input_tokens_seen": 2585728, "step": 28715 }, { "epoch": 7.463617463617464, "grad_norm": 1.3068969249725342, "learning_rate": 3.947008500239669e-05, "loss": 0.3664, "num_input_tokens_seen": 2586176, "step": 28720 }, { "epoch": 7.46491683991684, "grad_norm": 0.3413121998310089, "learning_rate": 3.9465461259038176e-05, "loss": 0.353, "num_input_tokens_seen": 2586624, "step": 28725 }, { "epoch": 7.466216216216216, "grad_norm": 0.387199729681015, "learning_rate": 3.946083677170713e-05, "loss": 0.1943, "num_input_tokens_seen": 2587104, "step": 28730 }, { "epoch": 7.467515592515593, "grad_norm": 0.8237054944038391, "learning_rate": 3.9456211540641405e-05, "loss": 0.2806, "num_input_tokens_seen": 2587568, "step": 28735 }, { "epoch": 7.468814968814969, "grad_norm": 0.34329864382743835, "learning_rate": 3.945158556607886e-05, "loss": 0.1514, "num_input_tokens_seen": 2588016, "step": 28740 }, { "epoch": 7.470114345114345, "grad_norm": 0.8561919927597046, "learning_rate": 3.944695884825742e-05, "loss": 0.2611, "num_input_tokens_seen": 2588464, "step": 28745 }, { "epoch": 7.471413721413722, "grad_norm": 0.263515442609787, "learning_rate": 3.9442331387415054e-05, "loss": 0.2474, "num_input_tokens_seen": 2588928, "step": 28750 }, { "epoch": 7.472713097713098, "grad_norm": 0.38451075553894043, "learning_rate": 3.943770318378974e-05, "loss": 0.116, "num_input_tokens_seen": 2589376, "step": 28755 }, { "epoch": 7.474012474012474, "grad_norm": 0.5977644920349121, "learning_rate": 3.943307423761952e-05, "loss": 0.3491, "num_input_tokens_seen": 2589808, "step": 28760 }, { "epoch": 7.475311850311851, "grad_norm": 0.752581000328064, "learning_rate": 3.942844454914246e-05, "loss": 0.4449, "num_input_tokens_seen": 2590224, "step": 28765 }, { "epoch": 7.476611226611227, "grad_norm": 0.4768357574939728, "learning_rate": 3.9423814118596666e-05, "loss": 0.3583, "num_input_tokens_seen": 2590672, "step": 28770 }, { "epoch": 7.477910602910603, "grad_norm": 0.3323616087436676, "learning_rate": 3.94191829462203e-05, "loss": 0.32, "num_input_tokens_seen": 2591168, "step": 28775 }, { "epoch": 7.479209979209979, "grad_norm": 0.804267168045044, "learning_rate": 3.941455103225154e-05, "loss": 0.2447, "num_input_tokens_seen": 2591600, "step": 28780 }, { "epoch": 7.480509355509356, "grad_norm": 0.4428192377090454, "learning_rate": 3.9409918376928604e-05, "loss": 0.2759, "num_input_tokens_seen": 2592096, "step": 28785 }, { "epoch": 7.481808731808732, "grad_norm": 0.5383554100990295, "learning_rate": 3.940528498048977e-05, "loss": 0.2686, "num_input_tokens_seen": 2592592, "step": 28790 }, { "epoch": 7.483108108108108, "grad_norm": 0.5620923638343811, "learning_rate": 3.9400650843173317e-05, "loss": 0.2886, "num_input_tokens_seen": 2593040, "step": 28795 }, { "epoch": 7.484407484407485, "grad_norm": 0.4812561571598053, "learning_rate": 3.939601596521759e-05, "loss": 0.299, "num_input_tokens_seen": 2593536, "step": 28800 }, { "epoch": 7.485706860706861, "grad_norm": 0.4729587137699127, "learning_rate": 3.939138034686097e-05, "loss": 0.286, "num_input_tokens_seen": 2593984, "step": 28805 }, { "epoch": 7.487006237006237, "grad_norm": 0.5161852836608887, "learning_rate": 3.938674398834188e-05, "loss": 0.2409, "num_input_tokens_seen": 2594432, "step": 28810 }, { "epoch": 7.488305613305613, "grad_norm": 0.5772103667259216, "learning_rate": 3.938210688989876e-05, "loss": 0.2654, "num_input_tokens_seen": 2594896, "step": 28815 }, { "epoch": 7.48960498960499, "grad_norm": 0.490416020154953, "learning_rate": 3.9377469051770094e-05, "loss": 0.2701, "num_input_tokens_seen": 2595392, "step": 28820 }, { "epoch": 7.490904365904366, "grad_norm": 0.5615676641464233, "learning_rate": 3.937283047419442e-05, "loss": 0.2435, "num_input_tokens_seen": 2595824, "step": 28825 }, { "epoch": 7.492203742203742, "grad_norm": 0.49905794858932495, "learning_rate": 3.936819115741031e-05, "loss": 0.2368, "num_input_tokens_seen": 2596288, "step": 28830 }, { "epoch": 7.493503118503119, "grad_norm": 0.5240632891654968, "learning_rate": 3.936355110165636e-05, "loss": 0.2192, "num_input_tokens_seen": 2596736, "step": 28835 }, { "epoch": 7.494802494802495, "grad_norm": 0.5733780860900879, "learning_rate": 3.935891030717121e-05, "loss": 0.2585, "num_input_tokens_seen": 2597200, "step": 28840 }, { "epoch": 7.496101871101871, "grad_norm": 0.6226598024368286, "learning_rate": 3.935426877419355e-05, "loss": 0.2169, "num_input_tokens_seen": 2597648, "step": 28845 }, { "epoch": 7.497401247401247, "grad_norm": 0.5269359350204468, "learning_rate": 3.9349626502962084e-05, "loss": 0.1837, "num_input_tokens_seen": 2598112, "step": 28850 }, { "epoch": 7.498700623700624, "grad_norm": 1.195717215538025, "learning_rate": 3.934498349371558e-05, "loss": 0.2797, "num_input_tokens_seen": 2598560, "step": 28855 }, { "epoch": 7.5, "grad_norm": 1.432667851448059, "learning_rate": 3.9340339746692835e-05, "loss": 0.3113, "num_input_tokens_seen": 2598960, "step": 28860 }, { "epoch": 7.501299376299376, "grad_norm": 1.0430142879486084, "learning_rate": 3.933569526213268e-05, "loss": 0.2633, "num_input_tokens_seen": 2599424, "step": 28865 }, { "epoch": 7.502598752598753, "grad_norm": 0.6708252429962158, "learning_rate": 3.9331050040273974e-05, "loss": 0.2948, "num_input_tokens_seen": 2599856, "step": 28870 }, { "epoch": 7.503898128898129, "grad_norm": 1.0195953845977783, "learning_rate": 3.932640408135564e-05, "loss": 0.2501, "num_input_tokens_seen": 2600288, "step": 28875 }, { "epoch": 7.505197505197505, "grad_norm": 0.6039809584617615, "learning_rate": 3.932175738561662e-05, "loss": 0.2399, "num_input_tokens_seen": 2600736, "step": 28880 }, { "epoch": 7.506496881496881, "grad_norm": 0.5739884376525879, "learning_rate": 3.931710995329588e-05, "loss": 0.2087, "num_input_tokens_seen": 2601168, "step": 28885 }, { "epoch": 7.507796257796258, "grad_norm": 0.4666866362094879, "learning_rate": 3.931246178463247e-05, "loss": 0.2218, "num_input_tokens_seen": 2601632, "step": 28890 }, { "epoch": 7.509095634095634, "grad_norm": 0.6063982248306274, "learning_rate": 3.930781287986543e-05, "loss": 0.309, "num_input_tokens_seen": 2602096, "step": 28895 }, { "epoch": 7.51039501039501, "grad_norm": 0.5400211215019226, "learning_rate": 3.930316323923387e-05, "loss": 0.2958, "num_input_tokens_seen": 2602544, "step": 28900 }, { "epoch": 7.511694386694387, "grad_norm": 0.4312880337238312, "learning_rate": 3.9298512862976915e-05, "loss": 0.2114, "num_input_tokens_seen": 2603008, "step": 28905 }, { "epoch": 7.512993762993763, "grad_norm": 0.5560147762298584, "learning_rate": 3.929386175133374e-05, "loss": 0.2592, "num_input_tokens_seen": 2603440, "step": 28910 }, { "epoch": 7.514293139293139, "grad_norm": 0.5049778819084167, "learning_rate": 3.9289209904543565e-05, "loss": 0.2043, "num_input_tokens_seen": 2603872, "step": 28915 }, { "epoch": 7.515592515592516, "grad_norm": 0.45949700474739075, "learning_rate": 3.928455732284564e-05, "loss": 0.2247, "num_input_tokens_seen": 2604336, "step": 28920 }, { "epoch": 7.516891891891892, "grad_norm": 0.4295653700828552, "learning_rate": 3.927990400647924e-05, "loss": 0.222, "num_input_tokens_seen": 2604832, "step": 28925 }, { "epoch": 7.518191268191268, "grad_norm": 0.419271320104599, "learning_rate": 3.927524995568369e-05, "loss": 0.1845, "num_input_tokens_seen": 2605296, "step": 28930 }, { "epoch": 7.519490644490644, "grad_norm": 0.36915430426597595, "learning_rate": 3.927059517069836e-05, "loss": 0.2721, "num_input_tokens_seen": 2605760, "step": 28935 }, { "epoch": 7.520790020790021, "grad_norm": 0.6739271283149719, "learning_rate": 3.926593965176265e-05, "loss": 0.2701, "num_input_tokens_seen": 2606192, "step": 28940 }, { "epoch": 7.522089397089397, "grad_norm": 0.6876218318939209, "learning_rate": 3.926128339911599e-05, "loss": 0.2479, "num_input_tokens_seen": 2606688, "step": 28945 }, { "epoch": 7.523388773388773, "grad_norm": 0.44495663046836853, "learning_rate": 3.925662641299787e-05, "loss": 0.3498, "num_input_tokens_seen": 2607152, "step": 28950 }, { "epoch": 7.524688149688149, "grad_norm": 0.5692443251609802, "learning_rate": 3.925196869364778e-05, "loss": 0.3223, "num_input_tokens_seen": 2607616, "step": 28955 }, { "epoch": 7.525987525987526, "grad_norm": 0.41775792837142944, "learning_rate": 3.9247310241305294e-05, "loss": 0.2485, "num_input_tokens_seen": 2608064, "step": 28960 }, { "epoch": 7.527286902286902, "grad_norm": 0.7619384527206421, "learning_rate": 3.9242651056209976e-05, "loss": 0.2523, "num_input_tokens_seen": 2608496, "step": 28965 }, { "epoch": 7.528586278586278, "grad_norm": 0.4330582618713379, "learning_rate": 3.923799113860148e-05, "loss": 0.2691, "num_input_tokens_seen": 2608944, "step": 28970 }, { "epoch": 7.529885654885655, "grad_norm": 0.4212552309036255, "learning_rate": 3.923333048871946e-05, "loss": 0.2291, "num_input_tokens_seen": 2609408, "step": 28975 }, { "epoch": 7.531185031185031, "grad_norm": 0.6701382994651794, "learning_rate": 3.922866910680361e-05, "loss": 0.2271, "num_input_tokens_seen": 2609824, "step": 28980 }, { "epoch": 7.532484407484407, "grad_norm": 0.49693137407302856, "learning_rate": 3.9224006993093675e-05, "loss": 0.2443, "num_input_tokens_seen": 2610288, "step": 28985 }, { "epoch": 7.533783783783784, "grad_norm": 0.45399031043052673, "learning_rate": 3.921934414782944e-05, "loss": 0.2839, "num_input_tokens_seen": 2610736, "step": 28990 }, { "epoch": 7.53508316008316, "grad_norm": 0.8465616703033447, "learning_rate": 3.9214680571250696e-05, "loss": 0.149, "num_input_tokens_seen": 2611184, "step": 28995 }, { "epoch": 7.536382536382536, "grad_norm": 0.5798704624176025, "learning_rate": 3.921001626359732e-05, "loss": 0.3214, "num_input_tokens_seen": 2611648, "step": 29000 }, { "epoch": 7.537681912681912, "grad_norm": 0.40888315439224243, "learning_rate": 3.9205351225109185e-05, "loss": 0.1942, "num_input_tokens_seen": 2612096, "step": 29005 }, { "epoch": 7.538981288981289, "grad_norm": 1.1344008445739746, "learning_rate": 3.920068545602624e-05, "loss": 0.3603, "num_input_tokens_seen": 2612576, "step": 29010 }, { "epoch": 7.540280665280665, "grad_norm": 0.5434755682945251, "learning_rate": 3.919601895658843e-05, "loss": 0.1631, "num_input_tokens_seen": 2613024, "step": 29015 }, { "epoch": 7.541580041580041, "grad_norm": 0.6055443286895752, "learning_rate": 3.919135172703575e-05, "loss": 0.2134, "num_input_tokens_seen": 2613472, "step": 29020 }, { "epoch": 7.542879417879417, "grad_norm": 0.36725154519081116, "learning_rate": 3.918668376760827e-05, "loss": 0.1883, "num_input_tokens_seen": 2614000, "step": 29025 }, { "epoch": 7.544178794178794, "grad_norm": 0.6381920576095581, "learning_rate": 3.918201507854604e-05, "loss": 0.2393, "num_input_tokens_seen": 2614432, "step": 29030 }, { "epoch": 7.54547817047817, "grad_norm": 0.4642923176288605, "learning_rate": 3.9177345660089196e-05, "loss": 0.2865, "num_input_tokens_seen": 2614880, "step": 29035 }, { "epoch": 7.546777546777546, "grad_norm": 0.4580107033252716, "learning_rate": 3.917267551247787e-05, "loss": 0.1998, "num_input_tokens_seen": 2615312, "step": 29040 }, { "epoch": 7.548076923076923, "grad_norm": 0.47080254554748535, "learning_rate": 3.916800463595227e-05, "loss": 0.324, "num_input_tokens_seen": 2615776, "step": 29045 }, { "epoch": 7.549376299376299, "grad_norm": 0.373937726020813, "learning_rate": 3.916333303075261e-05, "loss": 0.1428, "num_input_tokens_seen": 2616224, "step": 29050 }, { "epoch": 7.550675675675675, "grad_norm": 0.7242900133132935, "learning_rate": 3.9158660697119167e-05, "loss": 0.1972, "num_input_tokens_seen": 2616704, "step": 29055 }, { "epoch": 7.551975051975052, "grad_norm": 0.7262824177742004, "learning_rate": 3.915398763529223e-05, "loss": 0.3036, "num_input_tokens_seen": 2617168, "step": 29060 }, { "epoch": 7.553274428274428, "grad_norm": 0.4144487679004669, "learning_rate": 3.914931384551216e-05, "loss": 0.2623, "num_input_tokens_seen": 2617600, "step": 29065 }, { "epoch": 7.5545738045738045, "grad_norm": 0.4129636883735657, "learning_rate": 3.9144639328019314e-05, "loss": 0.1806, "num_input_tokens_seen": 2618064, "step": 29070 }, { "epoch": 7.5558731808731805, "grad_norm": 0.6485305428504944, "learning_rate": 3.913996408305412e-05, "loss": 0.2231, "num_input_tokens_seen": 2618496, "step": 29075 }, { "epoch": 7.557172557172557, "grad_norm": 0.34080028533935547, "learning_rate": 3.9135288110857014e-05, "loss": 0.1997, "num_input_tokens_seen": 2618944, "step": 29080 }, { "epoch": 7.5584719334719335, "grad_norm": 0.9000988006591797, "learning_rate": 3.9130611411668507e-05, "loss": 0.2606, "num_input_tokens_seen": 2619408, "step": 29085 }, { "epoch": 7.5597713097713095, "grad_norm": 0.4644002914428711, "learning_rate": 3.912593398572911e-05, "loss": 0.2547, "num_input_tokens_seen": 2619872, "step": 29090 }, { "epoch": 7.561070686070686, "grad_norm": 0.3823189437389374, "learning_rate": 3.912125583327939e-05, "loss": 0.2181, "num_input_tokens_seen": 2620352, "step": 29095 }, { "epoch": 7.5623700623700625, "grad_norm": 0.3211715817451477, "learning_rate": 3.911657695455996e-05, "loss": 0.2309, "num_input_tokens_seen": 2620752, "step": 29100 }, { "epoch": 7.5636694386694385, "grad_norm": 1.0376006364822388, "learning_rate": 3.9111897349811454e-05, "loss": 0.237, "num_input_tokens_seen": 2621184, "step": 29105 }, { "epoch": 7.564968814968815, "grad_norm": 0.9032500982284546, "learning_rate": 3.910721701927454e-05, "loss": 0.1889, "num_input_tokens_seen": 2621648, "step": 29110 }, { "epoch": 7.5662681912681915, "grad_norm": 0.31070902943611145, "learning_rate": 3.910253596318994e-05, "loss": 0.1525, "num_input_tokens_seen": 2622096, "step": 29115 }, { "epoch": 7.5675675675675675, "grad_norm": 0.47327685356140137, "learning_rate": 3.90978541817984e-05, "loss": 0.2846, "num_input_tokens_seen": 2622496, "step": 29120 }, { "epoch": 7.568866943866944, "grad_norm": 0.49640271067619324, "learning_rate": 3.909317167534071e-05, "loss": 0.2917, "num_input_tokens_seen": 2622944, "step": 29125 }, { "epoch": 7.5701663201663205, "grad_norm": 0.6703773736953735, "learning_rate": 3.9088488444057704e-05, "loss": 0.312, "num_input_tokens_seen": 2623376, "step": 29130 }, { "epoch": 7.571465696465697, "grad_norm": 0.5866013765335083, "learning_rate": 3.9083804488190235e-05, "loss": 0.3056, "num_input_tokens_seen": 2623824, "step": 29135 }, { "epoch": 7.572765072765073, "grad_norm": 0.5651673078536987, "learning_rate": 3.9079119807979214e-05, "loss": 0.2365, "num_input_tokens_seen": 2624304, "step": 29140 }, { "epoch": 7.5740644490644495, "grad_norm": 0.4210246801376343, "learning_rate": 3.907443440366556e-05, "loss": 0.2324, "num_input_tokens_seen": 2624720, "step": 29145 }, { "epoch": 7.575363825363826, "grad_norm": 0.8889409303665161, "learning_rate": 3.906974827549026e-05, "loss": 0.3138, "num_input_tokens_seen": 2625216, "step": 29150 }, { "epoch": 7.576663201663202, "grad_norm": 1.1214499473571777, "learning_rate": 3.906506142369433e-05, "loss": 0.2704, "num_input_tokens_seen": 2625648, "step": 29155 }, { "epoch": 7.577962577962578, "grad_norm": 0.45995035767555237, "learning_rate": 3.906037384851881e-05, "loss": 0.3021, "num_input_tokens_seen": 2626112, "step": 29160 }, { "epoch": 7.579261954261955, "grad_norm": 0.40966808795928955, "learning_rate": 3.90556855502048e-05, "loss": 0.2599, "num_input_tokens_seen": 2626576, "step": 29165 }, { "epoch": 7.580561330561331, "grad_norm": 0.45006510615348816, "learning_rate": 3.90509965289934e-05, "loss": 0.2535, "num_input_tokens_seen": 2627024, "step": 29170 }, { "epoch": 7.581860706860707, "grad_norm": 0.4773740768432617, "learning_rate": 3.90463067851258e-05, "loss": 0.2599, "num_input_tokens_seen": 2627504, "step": 29175 }, { "epoch": 7.583160083160083, "grad_norm": 0.9238289594650269, "learning_rate": 3.904161631884318e-05, "loss": 0.2216, "num_input_tokens_seen": 2627984, "step": 29180 }, { "epoch": 7.58445945945946, "grad_norm": 0.5036132335662842, "learning_rate": 3.903692513038677e-05, "loss": 0.2948, "num_input_tokens_seen": 2628400, "step": 29185 }, { "epoch": 7.585758835758836, "grad_norm": 0.38140395283699036, "learning_rate": 3.903223321999786e-05, "loss": 0.2457, "num_input_tokens_seen": 2628832, "step": 29190 }, { "epoch": 7.587058212058212, "grad_norm": 0.4671124517917633, "learning_rate": 3.9027540587917744e-05, "loss": 0.3191, "num_input_tokens_seen": 2629296, "step": 29195 }, { "epoch": 7.588357588357589, "grad_norm": 0.884597659111023, "learning_rate": 3.902284723438778e-05, "loss": 0.1904, "num_input_tokens_seen": 2629760, "step": 29200 }, { "epoch": 7.589656964656965, "grad_norm": 0.559109091758728, "learning_rate": 3.901815315964935e-05, "loss": 0.1612, "num_input_tokens_seen": 2630208, "step": 29205 }, { "epoch": 7.590956340956341, "grad_norm": 0.6129180788993835, "learning_rate": 3.901345836394387e-05, "loss": 0.1974, "num_input_tokens_seen": 2630640, "step": 29210 }, { "epoch": 7.592255717255718, "grad_norm": 0.7600153088569641, "learning_rate": 3.900876284751279e-05, "loss": 0.2689, "num_input_tokens_seen": 2631120, "step": 29215 }, { "epoch": 7.593555093555094, "grad_norm": 1.007277011871338, "learning_rate": 3.900406661059762e-05, "loss": 0.268, "num_input_tokens_seen": 2631584, "step": 29220 }, { "epoch": 7.59485446985447, "grad_norm": 0.47314417362213135, "learning_rate": 3.899936965343989e-05, "loss": 0.3147, "num_input_tokens_seen": 2632016, "step": 29225 }, { "epoch": 7.596153846153846, "grad_norm": 0.6238728761672974, "learning_rate": 3.899467197628116e-05, "loss": 0.1798, "num_input_tokens_seen": 2632496, "step": 29230 }, { "epoch": 7.597453222453223, "grad_norm": 0.2766742706298828, "learning_rate": 3.8989973579363045e-05, "loss": 0.1547, "num_input_tokens_seen": 2632896, "step": 29235 }, { "epoch": 7.598752598752599, "grad_norm": 0.6987143754959106, "learning_rate": 3.898527446292718e-05, "loss": 0.2107, "num_input_tokens_seen": 2633328, "step": 29240 }, { "epoch": 7.600051975051975, "grad_norm": 0.2844797968864441, "learning_rate": 3.898057462721525e-05, "loss": 0.1366, "num_input_tokens_seen": 2633760, "step": 29245 }, { "epoch": 7.601351351351351, "grad_norm": 0.4042426645755768, "learning_rate": 3.897587407246898e-05, "loss": 0.0891, "num_input_tokens_seen": 2634176, "step": 29250 }, { "epoch": 7.602650727650728, "grad_norm": 0.2484966218471527, "learning_rate": 3.89711727989301e-05, "loss": 0.1862, "num_input_tokens_seen": 2634656, "step": 29255 }, { "epoch": 7.603950103950104, "grad_norm": 0.8825258016586304, "learning_rate": 3.8966470806840426e-05, "loss": 0.1929, "num_input_tokens_seen": 2635088, "step": 29260 }, { "epoch": 7.60524948024948, "grad_norm": 0.3123357892036438, "learning_rate": 3.896176809644178e-05, "loss": 0.1555, "num_input_tokens_seen": 2635504, "step": 29265 }, { "epoch": 7.606548856548857, "grad_norm": 1.4090458154678345, "learning_rate": 3.895706466797601e-05, "loss": 0.3356, "num_input_tokens_seen": 2635920, "step": 29270 }, { "epoch": 7.607848232848233, "grad_norm": 0.2641846537590027, "learning_rate": 3.895236052168505e-05, "loss": 0.2226, "num_input_tokens_seen": 2636416, "step": 29275 }, { "epoch": 7.609147609147609, "grad_norm": 0.3680464029312134, "learning_rate": 3.8947655657810805e-05, "loss": 0.2398, "num_input_tokens_seen": 2636832, "step": 29280 }, { "epoch": 7.610446985446986, "grad_norm": 1.2234742641448975, "learning_rate": 3.894295007659527e-05, "loss": 0.3526, "num_input_tokens_seen": 2637312, "step": 29285 }, { "epoch": 7.611746361746362, "grad_norm": 0.6551410555839539, "learning_rate": 3.893824377828045e-05, "loss": 0.3459, "num_input_tokens_seen": 2637760, "step": 29290 }, { "epoch": 7.613045738045738, "grad_norm": 1.135499358177185, "learning_rate": 3.89335367631084e-05, "loss": 0.1926, "num_input_tokens_seen": 2638208, "step": 29295 }, { "epoch": 7.614345114345114, "grad_norm": 0.5866337418556213, "learning_rate": 3.892882903132121e-05, "loss": 0.3441, "num_input_tokens_seen": 2638640, "step": 29300 }, { "epoch": 7.615644490644491, "grad_norm": 0.4976266324520111, "learning_rate": 3.8924120583160985e-05, "loss": 0.2186, "num_input_tokens_seen": 2639072, "step": 29305 }, { "epoch": 7.616943866943867, "grad_norm": 0.5496166348457336, "learning_rate": 3.8919411418869904e-05, "loss": 0.1861, "num_input_tokens_seen": 2639472, "step": 29310 }, { "epoch": 7.618243243243243, "grad_norm": 0.8752246499061584, "learning_rate": 3.891470153869016e-05, "loss": 0.2713, "num_input_tokens_seen": 2639936, "step": 29315 }, { "epoch": 7.61954261954262, "grad_norm": 0.35591205954551697, "learning_rate": 3.8909990942863976e-05, "loss": 0.2137, "num_input_tokens_seen": 2640384, "step": 29320 }, { "epoch": 7.620841995841996, "grad_norm": 0.8803020715713501, "learning_rate": 3.890527963163363e-05, "loss": 0.2626, "num_input_tokens_seen": 2640816, "step": 29325 }, { "epoch": 7.622141372141372, "grad_norm": 0.5129987001419067, "learning_rate": 3.890056760524143e-05, "loss": 0.2237, "num_input_tokens_seen": 2641296, "step": 29330 }, { "epoch": 7.623440748440748, "grad_norm": 0.8352836966514587, "learning_rate": 3.8895854863929725e-05, "loss": 0.2651, "num_input_tokens_seen": 2641728, "step": 29335 }, { "epoch": 7.624740124740125, "grad_norm": 0.328952431678772, "learning_rate": 3.889114140794089e-05, "loss": 0.273, "num_input_tokens_seen": 2642176, "step": 29340 }, { "epoch": 7.626039501039501, "grad_norm": 0.6763343811035156, "learning_rate": 3.8886427237517344e-05, "loss": 0.2187, "num_input_tokens_seen": 2642624, "step": 29345 }, { "epoch": 7.627338877338877, "grad_norm": 0.99024498462677, "learning_rate": 3.8881712352901536e-05, "loss": 0.2721, "num_input_tokens_seen": 2643072, "step": 29350 }, { "epoch": 7.628638253638254, "grad_norm": 0.5047992467880249, "learning_rate": 3.887699675433596e-05, "loss": 0.2621, "num_input_tokens_seen": 2643488, "step": 29355 }, { "epoch": 7.62993762993763, "grad_norm": 0.4140578806400299, "learning_rate": 3.887228044206314e-05, "loss": 0.1776, "num_input_tokens_seen": 2643968, "step": 29360 }, { "epoch": 7.631237006237006, "grad_norm": 0.37006014585494995, "learning_rate": 3.8867563416325644e-05, "loss": 0.3351, "num_input_tokens_seen": 2644400, "step": 29365 }, { "epoch": 7.632536382536383, "grad_norm": 0.6014026999473572, "learning_rate": 3.886284567736608e-05, "loss": 0.205, "num_input_tokens_seen": 2644864, "step": 29370 }, { "epoch": 7.633835758835759, "grad_norm": 0.3538932800292969, "learning_rate": 3.8858127225427086e-05, "loss": 0.324, "num_input_tokens_seen": 2645296, "step": 29375 }, { "epoch": 7.635135135135135, "grad_norm": 0.5042012333869934, "learning_rate": 3.8853408060751315e-05, "loss": 0.2395, "num_input_tokens_seen": 2645712, "step": 29380 }, { "epoch": 7.636434511434511, "grad_norm": 0.4391274154186249, "learning_rate": 3.884868818358151e-05, "loss": 0.3249, "num_input_tokens_seen": 2646160, "step": 29385 }, { "epoch": 7.637733887733888, "grad_norm": 0.5392986536026001, "learning_rate": 3.884396759416039e-05, "loss": 0.1661, "num_input_tokens_seen": 2646592, "step": 29390 }, { "epoch": 7.639033264033264, "grad_norm": 0.3876347243785858, "learning_rate": 3.8839246292730756e-05, "loss": 0.1723, "num_input_tokens_seen": 2647056, "step": 29395 }, { "epoch": 7.64033264033264, "grad_norm": 0.7236722111701965, "learning_rate": 3.8834524279535436e-05, "loss": 0.3965, "num_input_tokens_seen": 2647536, "step": 29400 }, { "epoch": 7.641632016632016, "grad_norm": 0.4705955386161804, "learning_rate": 3.882980155481727e-05, "loss": 0.2297, "num_input_tokens_seen": 2648000, "step": 29405 }, { "epoch": 7.642931392931393, "grad_norm": 0.4901101589202881, "learning_rate": 3.8825078118819155e-05, "loss": 0.2184, "num_input_tokens_seen": 2648432, "step": 29410 }, { "epoch": 7.644230769230769, "grad_norm": 0.5060411095619202, "learning_rate": 3.882035397178403e-05, "loss": 0.2869, "num_input_tokens_seen": 2648864, "step": 29415 }, { "epoch": 7.645530145530145, "grad_norm": 0.47359856963157654, "learning_rate": 3.8815629113954847e-05, "loss": 0.2811, "num_input_tokens_seen": 2649280, "step": 29420 }, { "epoch": 7.646829521829522, "grad_norm": 0.5103369951248169, "learning_rate": 3.881090354557463e-05, "loss": 0.2018, "num_input_tokens_seen": 2649760, "step": 29425 }, { "epoch": 7.648128898128898, "grad_norm": 0.5407160520553589, "learning_rate": 3.880617726688641e-05, "loss": 0.291, "num_input_tokens_seen": 2650224, "step": 29430 }, { "epoch": 7.649428274428274, "grad_norm": 0.471935898065567, "learning_rate": 3.8801450278133264e-05, "loss": 0.2703, "num_input_tokens_seen": 2650720, "step": 29435 }, { "epoch": 7.650727650727651, "grad_norm": 0.7870497107505798, "learning_rate": 3.879672257955831e-05, "loss": 0.309, "num_input_tokens_seen": 2651184, "step": 29440 }, { "epoch": 7.652027027027027, "grad_norm": 0.8114237785339355, "learning_rate": 3.8791994171404694e-05, "loss": 0.2931, "num_input_tokens_seen": 2651664, "step": 29445 }, { "epoch": 7.653326403326403, "grad_norm": 0.9087156653404236, "learning_rate": 3.8787265053915603e-05, "loss": 0.2464, "num_input_tokens_seen": 2652096, "step": 29450 }, { "epoch": 7.654625779625779, "grad_norm": 0.693263053894043, "learning_rate": 3.878253522733426e-05, "loss": 0.2635, "num_input_tokens_seen": 2652560, "step": 29455 }, { "epoch": 7.655925155925156, "grad_norm": 0.678378701210022, "learning_rate": 3.8777804691903916e-05, "loss": 0.2551, "num_input_tokens_seen": 2653024, "step": 29460 }, { "epoch": 7.657224532224532, "grad_norm": 0.5470691323280334, "learning_rate": 3.877307344786788e-05, "loss": 0.2312, "num_input_tokens_seen": 2653456, "step": 29465 }, { "epoch": 7.658523908523908, "grad_norm": 0.5847586393356323, "learning_rate": 3.876834149546948e-05, "loss": 0.3239, "num_input_tokens_seen": 2653936, "step": 29470 }, { "epoch": 7.659823284823284, "grad_norm": 0.6291776895523071, "learning_rate": 3.8763608834952094e-05, "loss": 0.2577, "num_input_tokens_seen": 2654448, "step": 29475 }, { "epoch": 7.661122661122661, "grad_norm": 0.3664972484111786, "learning_rate": 3.875887546655911e-05, "loss": 0.2197, "num_input_tokens_seen": 2654912, "step": 29480 }, { "epoch": 7.662422037422037, "grad_norm": 0.5327669382095337, "learning_rate": 3.8754141390533974e-05, "loss": 0.2222, "num_input_tokens_seen": 2655344, "step": 29485 }, { "epoch": 7.663721413721413, "grad_norm": 0.6950947046279907, "learning_rate": 3.874940660712018e-05, "loss": 0.2347, "num_input_tokens_seen": 2655776, "step": 29490 }, { "epoch": 7.66502079002079, "grad_norm": 0.7171635031700134, "learning_rate": 3.8744671116561216e-05, "loss": 0.2601, "num_input_tokens_seen": 2656224, "step": 29495 }, { "epoch": 7.666320166320166, "grad_norm": 0.517142653465271, "learning_rate": 3.873993491910065e-05, "loss": 0.2548, "num_input_tokens_seen": 2656672, "step": 29500 }, { "epoch": 7.667619542619542, "grad_norm": 0.5012622475624084, "learning_rate": 3.8735198014982064e-05, "loss": 0.2938, "num_input_tokens_seen": 2657136, "step": 29505 }, { "epoch": 7.668918918918919, "grad_norm": 0.8150507211685181, "learning_rate": 3.873046040444909e-05, "loss": 0.2888, "num_input_tokens_seen": 2657568, "step": 29510 }, { "epoch": 7.670218295218295, "grad_norm": 1.039941430091858, "learning_rate": 3.8725722087745384e-05, "loss": 0.2741, "num_input_tokens_seen": 2658032, "step": 29515 }, { "epoch": 7.671517671517671, "grad_norm": 0.4859367609024048, "learning_rate": 3.872098306511463e-05, "loss": 0.2934, "num_input_tokens_seen": 2658496, "step": 29520 }, { "epoch": 7.672817047817047, "grad_norm": 0.8271624445915222, "learning_rate": 3.871624333680057e-05, "loss": 0.2747, "num_input_tokens_seen": 2658960, "step": 29525 }, { "epoch": 7.674116424116424, "grad_norm": 0.5949661731719971, "learning_rate": 3.871150290304698e-05, "loss": 0.3088, "num_input_tokens_seen": 2659376, "step": 29530 }, { "epoch": 7.6754158004158, "grad_norm": 0.7753222584724426, "learning_rate": 3.870676176409767e-05, "loss": 0.2433, "num_input_tokens_seen": 2659824, "step": 29535 }, { "epoch": 7.6767151767151764, "grad_norm": 0.7510029673576355, "learning_rate": 3.870201992019645e-05, "loss": 0.271, "num_input_tokens_seen": 2660304, "step": 29540 }, { "epoch": 7.678014553014553, "grad_norm": 0.983244001865387, "learning_rate": 3.869727737158722e-05, "loss": 0.2718, "num_input_tokens_seen": 2660784, "step": 29545 }, { "epoch": 7.679313929313929, "grad_norm": 0.7097533941268921, "learning_rate": 3.86925341185139e-05, "loss": 0.2542, "num_input_tokens_seen": 2661200, "step": 29550 }, { "epoch": 7.6806133056133055, "grad_norm": 0.4842154383659363, "learning_rate": 3.8687790161220414e-05, "loss": 0.239, "num_input_tokens_seen": 2661632, "step": 29555 }, { "epoch": 7.6819126819126815, "grad_norm": 0.869590699672699, "learning_rate": 3.8683045499950774e-05, "loss": 0.2782, "num_input_tokens_seen": 2662112, "step": 29560 }, { "epoch": 7.683212058212058, "grad_norm": 0.8053648471832275, "learning_rate": 3.8678300134948996e-05, "loss": 0.252, "num_input_tokens_seen": 2662576, "step": 29565 }, { "epoch": 7.6845114345114345, "grad_norm": 0.7704540491104126, "learning_rate": 3.8673554066459136e-05, "loss": 0.1938, "num_input_tokens_seen": 2663008, "step": 29570 }, { "epoch": 7.6858108108108105, "grad_norm": 0.38226911425590515, "learning_rate": 3.866880729472529e-05, "loss": 0.2033, "num_input_tokens_seen": 2663488, "step": 29575 }, { "epoch": 7.6871101871101875, "grad_norm": 0.3852075934410095, "learning_rate": 3.8664059819991586e-05, "loss": 0.1557, "num_input_tokens_seen": 2663984, "step": 29580 }, { "epoch": 7.6884095634095635, "grad_norm": 0.9037538766860962, "learning_rate": 3.865931164250219e-05, "loss": 0.2696, "num_input_tokens_seen": 2664432, "step": 29585 }, { "epoch": 7.6897089397089395, "grad_norm": 0.32659560441970825, "learning_rate": 3.865456276250131e-05, "loss": 0.154, "num_input_tokens_seen": 2664880, "step": 29590 }, { "epoch": 7.6910083160083165, "grad_norm": 0.23632366955280304, "learning_rate": 3.864981318023319e-05, "loss": 0.252, "num_input_tokens_seen": 2665328, "step": 29595 }, { "epoch": 7.6923076923076925, "grad_norm": 1.3955968618392944, "learning_rate": 3.8645062895942096e-05, "loss": 0.3642, "num_input_tokens_seen": 2665808, "step": 29600 }, { "epoch": 7.6936070686070686, "grad_norm": 0.6311962604522705, "learning_rate": 3.864031190987234e-05, "loss": 0.2763, "num_input_tokens_seen": 2666240, "step": 29605 }, { "epoch": 7.694906444906445, "grad_norm": 0.7775958776473999, "learning_rate": 3.863556022226827e-05, "loss": 0.3265, "num_input_tokens_seen": 2666704, "step": 29610 }, { "epoch": 7.6962058212058215, "grad_norm": 0.5304272174835205, "learning_rate": 3.8630807833374286e-05, "loss": 0.2298, "num_input_tokens_seen": 2667152, "step": 29615 }, { "epoch": 7.697505197505198, "grad_norm": 1.069427490234375, "learning_rate": 3.862605474343478e-05, "loss": 0.2479, "num_input_tokens_seen": 2667584, "step": 29620 }, { "epoch": 7.698804573804574, "grad_norm": 0.5383809804916382, "learning_rate": 3.862130095269423e-05, "loss": 0.1863, "num_input_tokens_seen": 2668048, "step": 29625 }, { "epoch": 7.70010395010395, "grad_norm": 0.3702625632286072, "learning_rate": 3.861654646139712e-05, "loss": 0.1619, "num_input_tokens_seen": 2668528, "step": 29630 }, { "epoch": 7.701403326403327, "grad_norm": 1.0236552953720093, "learning_rate": 3.861179126978798e-05, "loss": 0.2736, "num_input_tokens_seen": 2669008, "step": 29635 }, { "epoch": 7.702702702702703, "grad_norm": 0.6450647115707397, "learning_rate": 3.8607035378111375e-05, "loss": 0.2128, "num_input_tokens_seen": 2669488, "step": 29640 }, { "epoch": 7.704002079002079, "grad_norm": 0.26450875401496887, "learning_rate": 3.860227878661189e-05, "loss": 0.1183, "num_input_tokens_seen": 2669952, "step": 29645 }, { "epoch": 7.705301455301456, "grad_norm": 0.9919171333312988, "learning_rate": 3.859752149553419e-05, "loss": 0.3794, "num_input_tokens_seen": 2670400, "step": 29650 }, { "epoch": 7.706600831600832, "grad_norm": 0.47347089648246765, "learning_rate": 3.859276350512292e-05, "loss": 0.1656, "num_input_tokens_seen": 2670864, "step": 29655 }, { "epoch": 7.707900207900208, "grad_norm": 0.4597070813179016, "learning_rate": 3.8588004815622805e-05, "loss": 0.3193, "num_input_tokens_seen": 2671312, "step": 29660 }, { "epoch": 7.709199584199585, "grad_norm": 0.7281845211982727, "learning_rate": 3.8583245427278584e-05, "loss": 0.226, "num_input_tokens_seen": 2671776, "step": 29665 }, { "epoch": 7.710498960498961, "grad_norm": 0.6725016832351685, "learning_rate": 3.8578485340335035e-05, "loss": 0.2546, "num_input_tokens_seen": 2672208, "step": 29670 }, { "epoch": 7.711798336798337, "grad_norm": 0.3193703293800354, "learning_rate": 3.857372455503697e-05, "loss": 0.2781, "num_input_tokens_seen": 2672656, "step": 29675 }, { "epoch": 7.713097713097713, "grad_norm": 0.6523266434669495, "learning_rate": 3.856896307162925e-05, "loss": 0.1948, "num_input_tokens_seen": 2673072, "step": 29680 }, { "epoch": 7.71439708939709, "grad_norm": 1.0772041082382202, "learning_rate": 3.856420089035676e-05, "loss": 0.3171, "num_input_tokens_seen": 2673504, "step": 29685 }, { "epoch": 7.715696465696466, "grad_norm": 0.43961411714553833, "learning_rate": 3.855943801146441e-05, "loss": 0.2143, "num_input_tokens_seen": 2673968, "step": 29690 }, { "epoch": 7.716995841995842, "grad_norm": 1.1459957361221313, "learning_rate": 3.855467443519718e-05, "loss": 0.2907, "num_input_tokens_seen": 2674384, "step": 29695 }, { "epoch": 7.718295218295218, "grad_norm": 0.3939387798309326, "learning_rate": 3.854991016180005e-05, "loss": 0.268, "num_input_tokens_seen": 2674832, "step": 29700 }, { "epoch": 7.719594594594595, "grad_norm": 1.0712969303131104, "learning_rate": 3.854514519151807e-05, "loss": 0.1984, "num_input_tokens_seen": 2675296, "step": 29705 }, { "epoch": 7.720893970893971, "grad_norm": 0.6850691437721252, "learning_rate": 3.854037952459628e-05, "loss": 0.2075, "num_input_tokens_seen": 2675728, "step": 29710 }, { "epoch": 7.722193347193347, "grad_norm": 1.1506702899932861, "learning_rate": 3.853561316127981e-05, "loss": 0.2335, "num_input_tokens_seen": 2676192, "step": 29715 }, { "epoch": 7.723492723492724, "grad_norm": 1.056848168373108, "learning_rate": 3.853084610181378e-05, "loss": 0.2429, "num_input_tokens_seen": 2676592, "step": 29720 }, { "epoch": 7.7247920997921, "grad_norm": 0.9741544723510742, "learning_rate": 3.852607834644337e-05, "loss": 0.2507, "num_input_tokens_seen": 2677024, "step": 29725 }, { "epoch": 7.726091476091476, "grad_norm": 0.6662579774856567, "learning_rate": 3.852130989541379e-05, "loss": 0.1949, "num_input_tokens_seen": 2677472, "step": 29730 }, { "epoch": 7.727390852390853, "grad_norm": 0.7783995866775513, "learning_rate": 3.851654074897029e-05, "loss": 0.2853, "num_input_tokens_seen": 2677904, "step": 29735 }, { "epoch": 7.728690228690229, "grad_norm": 0.9503052830696106, "learning_rate": 3.851177090735815e-05, "loss": 0.2172, "num_input_tokens_seen": 2678352, "step": 29740 }, { "epoch": 7.729989604989605, "grad_norm": 1.3725584745407104, "learning_rate": 3.850700037082268e-05, "loss": 0.2688, "num_input_tokens_seen": 2678816, "step": 29745 }, { "epoch": 7.731288981288982, "grad_norm": 0.9271870255470276, "learning_rate": 3.8502229139609236e-05, "loss": 0.2971, "num_input_tokens_seen": 2679280, "step": 29750 }, { "epoch": 7.732588357588358, "grad_norm": 0.6701781749725342, "learning_rate": 3.849745721396322e-05, "loss": 0.2702, "num_input_tokens_seen": 2679680, "step": 29755 }, { "epoch": 7.733887733887734, "grad_norm": 0.8581846952438354, "learning_rate": 3.849268459413003e-05, "loss": 0.2016, "num_input_tokens_seen": 2680128, "step": 29760 }, { "epoch": 7.73518711018711, "grad_norm": 1.0271817445755005, "learning_rate": 3.8487911280355164e-05, "loss": 0.3257, "num_input_tokens_seen": 2680576, "step": 29765 }, { "epoch": 7.736486486486487, "grad_norm": 0.6077153086662292, "learning_rate": 3.848313727288409e-05, "loss": 0.2714, "num_input_tokens_seen": 2681040, "step": 29770 }, { "epoch": 7.737785862785863, "grad_norm": 0.4594694674015045, "learning_rate": 3.847836257196235e-05, "loss": 0.255, "num_input_tokens_seen": 2681472, "step": 29775 }, { "epoch": 7.739085239085239, "grad_norm": 1.517944097518921, "learning_rate": 3.8473587177835504e-05, "loss": 0.2495, "num_input_tokens_seen": 2681888, "step": 29780 }, { "epoch": 7.740384615384615, "grad_norm": 0.8407453298568726, "learning_rate": 3.8468811090749155e-05, "loss": 0.1897, "num_input_tokens_seen": 2682352, "step": 29785 }, { "epoch": 7.741683991683992, "grad_norm": 0.3808477520942688, "learning_rate": 3.846403431094895e-05, "loss": 0.2949, "num_input_tokens_seen": 2682832, "step": 29790 }, { "epoch": 7.742983367983368, "grad_norm": 0.5084884762763977, "learning_rate": 3.845925683868057e-05, "loss": 0.2576, "num_input_tokens_seen": 2683248, "step": 29795 }, { "epoch": 7.744282744282744, "grad_norm": 0.610636293888092, "learning_rate": 3.84544786741897e-05, "loss": 0.3126, "num_input_tokens_seen": 2683696, "step": 29800 }, { "epoch": 7.745582120582121, "grad_norm": 0.5506364107131958, "learning_rate": 3.844969981772211e-05, "loss": 0.2427, "num_input_tokens_seen": 2684176, "step": 29805 }, { "epoch": 7.746881496881497, "grad_norm": 0.45346808433532715, "learning_rate": 3.844492026952356e-05, "loss": 0.2179, "num_input_tokens_seen": 2684640, "step": 29810 }, { "epoch": 7.748180873180873, "grad_norm": 0.9237051606178284, "learning_rate": 3.844014002983989e-05, "loss": 0.2364, "num_input_tokens_seen": 2685072, "step": 29815 }, { "epoch": 7.74948024948025, "grad_norm": 0.510221004486084, "learning_rate": 3.843535909891694e-05, "loss": 0.1908, "num_input_tokens_seen": 2685520, "step": 29820 }, { "epoch": 7.750779625779626, "grad_norm": 0.6452023386955261, "learning_rate": 3.8430577477000595e-05, "loss": 0.2898, "num_input_tokens_seen": 2685968, "step": 29825 }, { "epoch": 7.752079002079002, "grad_norm": 0.3812721371650696, "learning_rate": 3.842579516433677e-05, "loss": 0.2329, "num_input_tokens_seen": 2686400, "step": 29830 }, { "epoch": 7.753378378378378, "grad_norm": 1.102591633796692, "learning_rate": 3.8421012161171454e-05, "loss": 0.2487, "num_input_tokens_seen": 2686832, "step": 29835 }, { "epoch": 7.754677754677755, "grad_norm": 1.2412525415420532, "learning_rate": 3.841622846775062e-05, "loss": 0.2192, "num_input_tokens_seen": 2687264, "step": 29840 }, { "epoch": 7.755977130977131, "grad_norm": 0.6425469517707825, "learning_rate": 3.84114440843203e-05, "loss": 0.2485, "num_input_tokens_seen": 2687712, "step": 29845 }, { "epoch": 7.757276507276507, "grad_norm": 0.43632006645202637, "learning_rate": 3.840665901112656e-05, "loss": 0.3013, "num_input_tokens_seen": 2688160, "step": 29850 }, { "epoch": 7.758575883575883, "grad_norm": 0.9383424520492554, "learning_rate": 3.8401873248415494e-05, "loss": 0.2606, "num_input_tokens_seen": 2688608, "step": 29855 }, { "epoch": 7.75987525987526, "grad_norm": 1.0888638496398926, "learning_rate": 3.839708679643326e-05, "loss": 0.2517, "num_input_tokens_seen": 2689040, "step": 29860 }, { "epoch": 7.761174636174636, "grad_norm": 0.6657912731170654, "learning_rate": 3.8392299655426e-05, "loss": 0.2268, "num_input_tokens_seen": 2689472, "step": 29865 }, { "epoch": 7.762474012474012, "grad_norm": 1.0772144794464111, "learning_rate": 3.8387511825639946e-05, "loss": 0.356, "num_input_tokens_seen": 2689952, "step": 29870 }, { "epoch": 7.763773388773389, "grad_norm": 1.0721333026885986, "learning_rate": 3.8382723307321325e-05, "loss": 0.2275, "num_input_tokens_seen": 2690384, "step": 29875 }, { "epoch": 7.765072765072765, "grad_norm": 0.5151247978210449, "learning_rate": 3.837793410071643e-05, "loss": 0.2031, "num_input_tokens_seen": 2690816, "step": 29880 }, { "epoch": 7.766372141372141, "grad_norm": 1.2267954349517822, "learning_rate": 3.8373144206071556e-05, "loss": 0.216, "num_input_tokens_seen": 2691264, "step": 29885 }, { "epoch": 7.767671517671518, "grad_norm": 0.2928399443626404, "learning_rate": 3.836835362363307e-05, "loss": 0.1798, "num_input_tokens_seen": 2691712, "step": 29890 }, { "epoch": 7.768970893970894, "grad_norm": 0.31981801986694336, "learning_rate": 3.836356235364735e-05, "loss": 0.236, "num_input_tokens_seen": 2692144, "step": 29895 }, { "epoch": 7.77027027027027, "grad_norm": 0.9643441438674927, "learning_rate": 3.835877039636081e-05, "loss": 0.3529, "num_input_tokens_seen": 2692624, "step": 29900 }, { "epoch": 7.771569646569646, "grad_norm": 0.2454742193222046, "learning_rate": 3.835397775201991e-05, "loss": 0.1285, "num_input_tokens_seen": 2693088, "step": 29905 }, { "epoch": 7.772869022869023, "grad_norm": 0.6121850609779358, "learning_rate": 3.8349184420871134e-05, "loss": 0.4278, "num_input_tokens_seen": 2693504, "step": 29910 }, { "epoch": 7.774168399168399, "grad_norm": 1.5002309083938599, "learning_rate": 3.8344390403161016e-05, "loss": 0.3728, "num_input_tokens_seen": 2693952, "step": 29915 }, { "epoch": 7.775467775467775, "grad_norm": 0.5651830434799194, "learning_rate": 3.833959569913611e-05, "loss": 0.1805, "num_input_tokens_seen": 2694400, "step": 29920 }, { "epoch": 7.776767151767151, "grad_norm": 0.5831028819084167, "learning_rate": 3.833480030904303e-05, "loss": 0.237, "num_input_tokens_seen": 2694848, "step": 29925 }, { "epoch": 7.778066528066528, "grad_norm": 0.5844065546989441, "learning_rate": 3.833000423312838e-05, "loss": 0.2398, "num_input_tokens_seen": 2695360, "step": 29930 }, { "epoch": 7.779365904365904, "grad_norm": 0.8335083723068237, "learning_rate": 3.8325207471638844e-05, "loss": 0.2337, "num_input_tokens_seen": 2695776, "step": 29935 }, { "epoch": 7.78066528066528, "grad_norm": 0.42377611994743347, "learning_rate": 3.8320410024821115e-05, "loss": 0.2241, "num_input_tokens_seen": 2696208, "step": 29940 }, { "epoch": 7.781964656964657, "grad_norm": 1.2521411180496216, "learning_rate": 3.8315611892921946e-05, "loss": 0.2435, "num_input_tokens_seen": 2696688, "step": 29945 }, { "epoch": 7.783264033264033, "grad_norm": 0.358009397983551, "learning_rate": 3.831081307618809e-05, "loss": 0.2503, "num_input_tokens_seen": 2697136, "step": 29950 }, { "epoch": 7.784563409563409, "grad_norm": 1.449013113975525, "learning_rate": 3.8306013574866365e-05, "loss": 0.3634, "num_input_tokens_seen": 2697616, "step": 29955 }, { "epoch": 7.785862785862786, "grad_norm": 0.7924980521202087, "learning_rate": 3.830121338920362e-05, "loss": 0.2465, "num_input_tokens_seen": 2698064, "step": 29960 }, { "epoch": 7.787162162162162, "grad_norm": 0.8353302478790283, "learning_rate": 3.829641251944672e-05, "loss": 0.3404, "num_input_tokens_seen": 2698560, "step": 29965 }, { "epoch": 7.788461538461538, "grad_norm": 0.8898561596870422, "learning_rate": 3.8291610965842584e-05, "loss": 0.2633, "num_input_tokens_seen": 2698992, "step": 29970 }, { "epoch": 7.789760914760915, "grad_norm": 0.9057690501213074, "learning_rate": 3.8286808728638154e-05, "loss": 0.2559, "num_input_tokens_seen": 2699440, "step": 29975 }, { "epoch": 7.791060291060291, "grad_norm": 0.48872220516204834, "learning_rate": 3.828200580808043e-05, "loss": 0.2864, "num_input_tokens_seen": 2699904, "step": 29980 }, { "epoch": 7.792359667359667, "grad_norm": 0.531065821647644, "learning_rate": 3.827720220441642e-05, "loss": 0.2201, "num_input_tokens_seen": 2700368, "step": 29985 }, { "epoch": 7.793659043659043, "grad_norm": 0.7151813507080078, "learning_rate": 3.827239791789318e-05, "loss": 0.2804, "num_input_tokens_seen": 2700784, "step": 29990 }, { "epoch": 7.79495841995842, "grad_norm": 0.7275384068489075, "learning_rate": 3.826759294875779e-05, "loss": 0.265, "num_input_tokens_seen": 2701232, "step": 29995 }, { "epoch": 7.796257796257796, "grad_norm": 0.5862311720848083, "learning_rate": 3.826278729725739e-05, "loss": 0.2857, "num_input_tokens_seen": 2701664, "step": 30000 }, { "epoch": 7.797557172557172, "grad_norm": 0.5861730575561523, "learning_rate": 3.8257980963639125e-05, "loss": 0.2967, "num_input_tokens_seen": 2702096, "step": 30005 }, { "epoch": 7.798856548856548, "grad_norm": 0.9436389207839966, "learning_rate": 3.82531739481502e-05, "loss": 0.2693, "num_input_tokens_seen": 2702560, "step": 30010 }, { "epoch": 7.800155925155925, "grad_norm": 0.8029035329818726, "learning_rate": 3.824836625103784e-05, "loss": 0.2549, "num_input_tokens_seen": 2703024, "step": 30015 }, { "epoch": 7.801455301455301, "grad_norm": 0.8870340585708618, "learning_rate": 3.824355787254931e-05, "loss": 0.2592, "num_input_tokens_seen": 2703504, "step": 30020 }, { "epoch": 7.8027546777546775, "grad_norm": 0.7243672013282776, "learning_rate": 3.8238748812931904e-05, "loss": 0.2718, "num_input_tokens_seen": 2703952, "step": 30025 }, { "epoch": 7.804054054054054, "grad_norm": 0.7855131030082703, "learning_rate": 3.823393907243297e-05, "loss": 0.2789, "num_input_tokens_seen": 2704384, "step": 30030 }, { "epoch": 7.80535343035343, "grad_norm": 0.46674638986587524, "learning_rate": 3.8229128651299875e-05, "loss": 0.2832, "num_input_tokens_seen": 2704816, "step": 30035 }, { "epoch": 7.8066528066528065, "grad_norm": 0.5491660833358765, "learning_rate": 3.822431754978e-05, "loss": 0.2922, "num_input_tokens_seen": 2705264, "step": 30040 }, { "epoch": 7.807952182952183, "grad_norm": 0.4362480938434601, "learning_rate": 3.821950576812081e-05, "loss": 0.3603, "num_input_tokens_seen": 2705696, "step": 30045 }, { "epoch": 7.8092515592515594, "grad_norm": 0.8409289717674255, "learning_rate": 3.821469330656978e-05, "loss": 0.2532, "num_input_tokens_seen": 2706144, "step": 30050 }, { "epoch": 7.8105509355509355, "grad_norm": 0.6418017745018005, "learning_rate": 3.8209880165374415e-05, "loss": 0.2509, "num_input_tokens_seen": 2706576, "step": 30055 }, { "epoch": 7.8118503118503115, "grad_norm": 0.6761335730552673, "learning_rate": 3.820506634478224e-05, "loss": 0.2019, "num_input_tokens_seen": 2706992, "step": 30060 }, { "epoch": 7.8131496881496885, "grad_norm": 0.8255136609077454, "learning_rate": 3.8200251845040855e-05, "loss": 0.2557, "num_input_tokens_seen": 2707440, "step": 30065 }, { "epoch": 7.8144490644490645, "grad_norm": 0.3665785491466522, "learning_rate": 3.8195436666397874e-05, "loss": 0.1884, "num_input_tokens_seen": 2707856, "step": 30070 }, { "epoch": 7.8157484407484406, "grad_norm": 0.5115717649459839, "learning_rate": 3.8190620809100936e-05, "loss": 0.1266, "num_input_tokens_seen": 2708288, "step": 30075 }, { "epoch": 7.817047817047817, "grad_norm": 1.0180166959762573, "learning_rate": 3.8185804273397726e-05, "loss": 0.247, "num_input_tokens_seen": 2708736, "step": 30080 }, { "epoch": 7.8183471933471935, "grad_norm": 0.26228460669517517, "learning_rate": 3.8180987059535984e-05, "loss": 0.2345, "num_input_tokens_seen": 2709248, "step": 30085 }, { "epoch": 7.81964656964657, "grad_norm": 0.23022349178791046, "learning_rate": 3.8176169167763444e-05, "loss": 0.0792, "num_input_tokens_seen": 2709712, "step": 30090 }, { "epoch": 7.820945945945946, "grad_norm": 0.33841490745544434, "learning_rate": 3.8171350598327894e-05, "loss": 0.2637, "num_input_tokens_seen": 2710208, "step": 30095 }, { "epoch": 7.8222453222453225, "grad_norm": 0.2155514657497406, "learning_rate": 3.816653135147716e-05, "loss": 0.2029, "num_input_tokens_seen": 2710656, "step": 30100 }, { "epoch": 7.823544698544699, "grad_norm": 1.2758686542510986, "learning_rate": 3.816171142745911e-05, "loss": 0.4429, "num_input_tokens_seen": 2711104, "step": 30105 }, { "epoch": 7.824844074844075, "grad_norm": 0.6377951502799988, "learning_rate": 3.815689082652163e-05, "loss": 0.2218, "num_input_tokens_seen": 2711568, "step": 30110 }, { "epoch": 7.826143451143452, "grad_norm": 0.47879448533058167, "learning_rate": 3.8152069548912646e-05, "loss": 0.3026, "num_input_tokens_seen": 2711984, "step": 30115 }, { "epoch": 7.827442827442828, "grad_norm": 0.7094780802726746, "learning_rate": 3.814724759488012e-05, "loss": 0.2329, "num_input_tokens_seen": 2712432, "step": 30120 }, { "epoch": 7.828742203742204, "grad_norm": 0.4818746745586395, "learning_rate": 3.814242496467206e-05, "loss": 0.2394, "num_input_tokens_seen": 2712864, "step": 30125 }, { "epoch": 7.83004158004158, "grad_norm": 1.0861660242080688, "learning_rate": 3.813760165853649e-05, "loss": 0.243, "num_input_tokens_seen": 2713328, "step": 30130 }, { "epoch": 7.831340956340957, "grad_norm": 0.6697964072227478, "learning_rate": 3.8132777676721475e-05, "loss": 0.2265, "num_input_tokens_seen": 2713808, "step": 30135 }, { "epoch": 7.832640332640333, "grad_norm": 1.1574362516403198, "learning_rate": 3.812795301947512e-05, "loss": 0.3038, "num_input_tokens_seen": 2714304, "step": 30140 }, { "epoch": 7.833939708939709, "grad_norm": 0.6302375197410583, "learning_rate": 3.812312768704557e-05, "loss": 0.1852, "num_input_tokens_seen": 2714784, "step": 30145 }, { "epoch": 7.835239085239085, "grad_norm": 0.5820710062980652, "learning_rate": 3.811830167968098e-05, "loss": 0.2606, "num_input_tokens_seen": 2715248, "step": 30150 }, { "epoch": 7.836538461538462, "grad_norm": 0.596377968788147, "learning_rate": 3.8113474997629573e-05, "loss": 0.2577, "num_input_tokens_seen": 2715696, "step": 30155 }, { "epoch": 7.837837837837838, "grad_norm": 0.8492439389228821, "learning_rate": 3.810864764113957e-05, "loss": 0.3392, "num_input_tokens_seen": 2716160, "step": 30160 }, { "epoch": 7.839137214137214, "grad_norm": 0.732224702835083, "learning_rate": 3.810381961045927e-05, "loss": 0.2582, "num_input_tokens_seen": 2716592, "step": 30165 }, { "epoch": 7.840436590436591, "grad_norm": 0.8242688775062561, "learning_rate": 3.809899090583697e-05, "loss": 0.2766, "num_input_tokens_seen": 2717056, "step": 30170 }, { "epoch": 7.841735966735967, "grad_norm": 1.0767916440963745, "learning_rate": 3.809416152752102e-05, "loss": 0.297, "num_input_tokens_seen": 2717488, "step": 30175 }, { "epoch": 7.843035343035343, "grad_norm": 0.854902446269989, "learning_rate": 3.808933147575979e-05, "loss": 0.2459, "num_input_tokens_seen": 2717952, "step": 30180 }, { "epoch": 7.84433471933472, "grad_norm": 1.1658849716186523, "learning_rate": 3.808450075080171e-05, "loss": 0.2786, "num_input_tokens_seen": 2718384, "step": 30185 }, { "epoch": 7.845634095634096, "grad_norm": 2.0153653621673584, "learning_rate": 3.8079669352895206e-05, "loss": 0.3115, "num_input_tokens_seen": 2718816, "step": 30190 }, { "epoch": 7.846933471933472, "grad_norm": 0.8754251599311829, "learning_rate": 3.807483728228879e-05, "loss": 0.2898, "num_input_tokens_seen": 2719264, "step": 30195 }, { "epoch": 7.848232848232849, "grad_norm": 0.9526715874671936, "learning_rate": 3.807000453923095e-05, "loss": 0.3057, "num_input_tokens_seen": 2719728, "step": 30200 }, { "epoch": 7.849532224532225, "grad_norm": 0.5319003462791443, "learning_rate": 3.806517112397026e-05, "loss": 0.2995, "num_input_tokens_seen": 2720176, "step": 30205 }, { "epoch": 7.850831600831601, "grad_norm": 0.5705223679542542, "learning_rate": 3.80603370367553e-05, "loss": 0.2001, "num_input_tokens_seen": 2720640, "step": 30210 }, { "epoch": 7.852130977130977, "grad_norm": 1.1572734117507935, "learning_rate": 3.8055502277834695e-05, "loss": 0.3026, "num_input_tokens_seen": 2721104, "step": 30215 }, { "epoch": 7.853430353430354, "grad_norm": 0.8227284550666809, "learning_rate": 3.80506668474571e-05, "loss": 0.2749, "num_input_tokens_seen": 2721552, "step": 30220 }, { "epoch": 7.85472972972973, "grad_norm": 0.8400813937187195, "learning_rate": 3.8045830745871195e-05, "loss": 0.297, "num_input_tokens_seen": 2722016, "step": 30225 }, { "epoch": 7.856029106029106, "grad_norm": 0.4854797422885895, "learning_rate": 3.804099397332572e-05, "loss": 0.2522, "num_input_tokens_seen": 2722432, "step": 30230 }, { "epoch": 7.857328482328482, "grad_norm": 1.1294258832931519, "learning_rate": 3.803615653006943e-05, "loss": 0.2918, "num_input_tokens_seen": 2722864, "step": 30235 }, { "epoch": 7.858627858627859, "grad_norm": 0.6777042150497437, "learning_rate": 3.803131841635112e-05, "loss": 0.1876, "num_input_tokens_seen": 2723312, "step": 30240 }, { "epoch": 7.859927234927235, "grad_norm": 0.40478163957595825, "learning_rate": 3.802647963241962e-05, "loss": 0.2122, "num_input_tokens_seen": 2723776, "step": 30245 }, { "epoch": 7.861226611226611, "grad_norm": 1.3618000745773315, "learning_rate": 3.8021640178523786e-05, "loss": 0.2116, "num_input_tokens_seen": 2724208, "step": 30250 }, { "epoch": 7.862525987525988, "grad_norm": 0.5186648964881897, "learning_rate": 3.801680005491254e-05, "loss": 0.1786, "num_input_tokens_seen": 2724720, "step": 30255 }, { "epoch": 7.863825363825364, "grad_norm": 1.3702853918075562, "learning_rate": 3.801195926183477e-05, "loss": 0.2152, "num_input_tokens_seen": 2725200, "step": 30260 }, { "epoch": 7.86512474012474, "grad_norm": 1.3124918937683105, "learning_rate": 3.8007117799539475e-05, "loss": 0.2894, "num_input_tokens_seen": 2725648, "step": 30265 }, { "epoch": 7.866424116424117, "grad_norm": 0.3603619635105133, "learning_rate": 3.8002275668275655e-05, "loss": 0.2828, "num_input_tokens_seen": 2726128, "step": 30270 }, { "epoch": 7.867723492723493, "grad_norm": 0.4023003578186035, "learning_rate": 3.7997432868292326e-05, "loss": 0.3436, "num_input_tokens_seen": 2726576, "step": 30275 }, { "epoch": 7.869022869022869, "grad_norm": 1.2876018285751343, "learning_rate": 3.7992589399838585e-05, "loss": 0.3035, "num_input_tokens_seen": 2727040, "step": 30280 }, { "epoch": 7.870322245322245, "grad_norm": 0.730274498462677, "learning_rate": 3.798774526316351e-05, "loss": 0.2178, "num_input_tokens_seen": 2727488, "step": 30285 }, { "epoch": 7.871621621621622, "grad_norm": 0.5067550539970398, "learning_rate": 3.7982900458516267e-05, "loss": 0.2033, "num_input_tokens_seen": 2727920, "step": 30290 }, { "epoch": 7.872920997920998, "grad_norm": 0.3327484130859375, "learning_rate": 3.7978054986146e-05, "loss": 0.1988, "num_input_tokens_seen": 2728384, "step": 30295 }, { "epoch": 7.874220374220374, "grad_norm": 0.42195457220077515, "learning_rate": 3.797320884630194e-05, "loss": 0.1717, "num_input_tokens_seen": 2728832, "step": 30300 }, { "epoch": 7.87551975051975, "grad_norm": 0.3342796564102173, "learning_rate": 3.7968362039233316e-05, "loss": 0.1334, "num_input_tokens_seen": 2729264, "step": 30305 }, { "epoch": 7.876819126819127, "grad_norm": 0.44020456075668335, "learning_rate": 3.7963514565189404e-05, "loss": 0.3604, "num_input_tokens_seen": 2729696, "step": 30310 }, { "epoch": 7.878118503118503, "grad_norm": 1.2664421796798706, "learning_rate": 3.7958666424419506e-05, "loss": 0.2277, "num_input_tokens_seen": 2730112, "step": 30315 }, { "epoch": 7.879417879417879, "grad_norm": 0.7626039385795593, "learning_rate": 3.7953817617173e-05, "loss": 0.3896, "num_input_tokens_seen": 2730576, "step": 30320 }, { "epoch": 7.880717255717256, "grad_norm": 0.6335678100585938, "learning_rate": 3.794896814369924e-05, "loss": 0.2656, "num_input_tokens_seen": 2731024, "step": 30325 }, { "epoch": 7.882016632016632, "grad_norm": 0.5186589360237122, "learning_rate": 3.794411800424762e-05, "loss": 0.3215, "num_input_tokens_seen": 2731488, "step": 30330 }, { "epoch": 7.883316008316008, "grad_norm": 0.8593418598175049, "learning_rate": 3.7939267199067624e-05, "loss": 0.2015, "num_input_tokens_seen": 2731936, "step": 30335 }, { "epoch": 7.884615384615385, "grad_norm": 0.3488559424877167, "learning_rate": 3.7934415728408715e-05, "loss": 0.1666, "num_input_tokens_seen": 2732384, "step": 30340 }, { "epoch": 7.885914760914761, "grad_norm": 0.600048840045929, "learning_rate": 3.792956359252041e-05, "loss": 0.2314, "num_input_tokens_seen": 2732848, "step": 30345 }, { "epoch": 7.887214137214137, "grad_norm": 0.31185218691825867, "learning_rate": 3.792471079165226e-05, "loss": 0.1033, "num_input_tokens_seen": 2733312, "step": 30350 }, { "epoch": 7.888513513513513, "grad_norm": 0.25627994537353516, "learning_rate": 3.791985732605386e-05, "loss": 0.2557, "num_input_tokens_seen": 2733792, "step": 30355 }, { "epoch": 7.88981288981289, "grad_norm": 1.1036640405654907, "learning_rate": 3.7915003195974815e-05, "loss": 0.3421, "num_input_tokens_seen": 2734304, "step": 30360 }, { "epoch": 7.891112266112266, "grad_norm": 0.4789753258228302, "learning_rate": 3.791014840166478e-05, "loss": 0.2843, "num_input_tokens_seen": 2734736, "step": 30365 }, { "epoch": 7.892411642411642, "grad_norm": 0.3801281154155731, "learning_rate": 3.7905292943373434e-05, "loss": 0.2268, "num_input_tokens_seen": 2735184, "step": 30370 }, { "epoch": 7.893711018711018, "grad_norm": 0.3500073552131653, "learning_rate": 3.7900436821350514e-05, "loss": 0.2209, "num_input_tokens_seen": 2735632, "step": 30375 }, { "epoch": 7.895010395010395, "grad_norm": 0.5504429936408997, "learning_rate": 3.7895580035845766e-05, "loss": 0.2981, "num_input_tokens_seen": 2736064, "step": 30380 }, { "epoch": 7.896309771309771, "grad_norm": 1.0138864517211914, "learning_rate": 3.789072258710898e-05, "loss": 0.3146, "num_input_tokens_seen": 2736496, "step": 30385 }, { "epoch": 7.897609147609147, "grad_norm": 0.6924748420715332, "learning_rate": 3.788586447538999e-05, "loss": 0.2537, "num_input_tokens_seen": 2736912, "step": 30390 }, { "epoch": 7.898908523908524, "grad_norm": 0.5919308662414551, "learning_rate": 3.7881005700938635e-05, "loss": 0.2462, "num_input_tokens_seen": 2737344, "step": 30395 }, { "epoch": 7.9002079002079, "grad_norm": 0.5979207754135132, "learning_rate": 3.787614626400481e-05, "loss": 0.2275, "num_input_tokens_seen": 2737760, "step": 30400 }, { "epoch": 7.901507276507276, "grad_norm": 0.45706838369369507, "learning_rate": 3.7871286164838445e-05, "loss": 0.2209, "num_input_tokens_seen": 2738224, "step": 30405 }, { "epoch": 7.902806652806653, "grad_norm": 0.4731247127056122, "learning_rate": 3.786642540368951e-05, "loss": 0.2092, "num_input_tokens_seen": 2738640, "step": 30410 }, { "epoch": 7.904106029106029, "grad_norm": 0.6719282269477844, "learning_rate": 3.7861563980807994e-05, "loss": 0.2131, "num_input_tokens_seen": 2739104, "step": 30415 }, { "epoch": 7.905405405405405, "grad_norm": 0.34041914343833923, "learning_rate": 3.7856701896443905e-05, "loss": 0.3068, "num_input_tokens_seen": 2739536, "step": 30420 }, { "epoch": 7.906704781704782, "grad_norm": 0.5446985363960266, "learning_rate": 3.785183915084732e-05, "loss": 0.3243, "num_input_tokens_seen": 2740016, "step": 30425 }, { "epoch": 7.908004158004158, "grad_norm": 0.6975852847099304, "learning_rate": 3.784697574426833e-05, "loss": 0.3156, "num_input_tokens_seen": 2740464, "step": 30430 }, { "epoch": 7.909303534303534, "grad_norm": 0.5664700865745544, "learning_rate": 3.7842111676957085e-05, "loss": 0.2341, "num_input_tokens_seen": 2740880, "step": 30435 }, { "epoch": 7.91060291060291, "grad_norm": 0.6205486059188843, "learning_rate": 3.783724694916371e-05, "loss": 0.2272, "num_input_tokens_seen": 2741312, "step": 30440 }, { "epoch": 7.911902286902287, "grad_norm": 0.9819144606590271, "learning_rate": 3.783238156113844e-05, "loss": 0.218, "num_input_tokens_seen": 2741792, "step": 30445 }, { "epoch": 7.913201663201663, "grad_norm": 0.9923608899116516, "learning_rate": 3.7827515513131486e-05, "loss": 0.2505, "num_input_tokens_seen": 2742304, "step": 30450 }, { "epoch": 7.914501039501039, "grad_norm": 0.9918773174285889, "learning_rate": 3.7822648805393117e-05, "loss": 0.2531, "num_input_tokens_seen": 2742800, "step": 30455 }, { "epoch": 7.915800415800415, "grad_norm": 1.9151756763458252, "learning_rate": 3.781778143817364e-05, "loss": 0.2418, "num_input_tokens_seen": 2743264, "step": 30460 }, { "epoch": 7.917099792099792, "grad_norm": 0.6993116736412048, "learning_rate": 3.781291341172338e-05, "loss": 0.2623, "num_input_tokens_seen": 2743712, "step": 30465 }, { "epoch": 7.918399168399168, "grad_norm": 1.0285218954086304, "learning_rate": 3.7808044726292705e-05, "loss": 0.2044, "num_input_tokens_seen": 2744160, "step": 30470 }, { "epoch": 7.919698544698544, "grad_norm": 0.5909109711647034, "learning_rate": 3.780317538213201e-05, "loss": 0.2615, "num_input_tokens_seen": 2744560, "step": 30475 }, { "epoch": 7.920997920997921, "grad_norm": 1.5903490781784058, "learning_rate": 3.779830537949175e-05, "loss": 0.2415, "num_input_tokens_seen": 2744992, "step": 30480 }, { "epoch": 7.922297297297297, "grad_norm": 1.5224432945251465, "learning_rate": 3.7793434718622376e-05, "loss": 0.3709, "num_input_tokens_seen": 2745504, "step": 30485 }, { "epoch": 7.923596673596673, "grad_norm": 0.7493104338645935, "learning_rate": 3.778856339977439e-05, "loss": 0.2289, "num_input_tokens_seen": 2745968, "step": 30490 }, { "epoch": 7.92489604989605, "grad_norm": 0.43728992342948914, "learning_rate": 3.778369142319834e-05, "loss": 0.2953, "num_input_tokens_seen": 2746432, "step": 30495 }, { "epoch": 7.926195426195426, "grad_norm": 0.5076588988304138, "learning_rate": 3.77788187891448e-05, "loss": 0.2636, "num_input_tokens_seen": 2746864, "step": 30500 }, { "epoch": 7.927494802494802, "grad_norm": 0.8159943222999573, "learning_rate": 3.7773945497864354e-05, "loss": 0.2312, "num_input_tokens_seen": 2747296, "step": 30505 }, { "epoch": 7.9287941787941785, "grad_norm": 1.2212172746658325, "learning_rate": 3.776907154960765e-05, "loss": 0.2284, "num_input_tokens_seen": 2747744, "step": 30510 }, { "epoch": 7.930093555093555, "grad_norm": 0.8890619874000549, "learning_rate": 3.7764196944625375e-05, "loss": 0.2651, "num_input_tokens_seen": 2748160, "step": 30515 }, { "epoch": 7.9313929313929314, "grad_norm": 1.0213983058929443, "learning_rate": 3.775932168316822e-05, "loss": 0.2796, "num_input_tokens_seen": 2748592, "step": 30520 }, { "epoch": 7.9326923076923075, "grad_norm": 1.0528570413589478, "learning_rate": 3.775444576548691e-05, "loss": 0.2091, "num_input_tokens_seen": 2749008, "step": 30525 }, { "epoch": 7.9339916839916835, "grad_norm": 0.9233798384666443, "learning_rate": 3.7749569191832245e-05, "loss": 0.3425, "num_input_tokens_seen": 2749424, "step": 30530 }, { "epoch": 7.9352910602910605, "grad_norm": 1.1654576063156128, "learning_rate": 3.774469196245502e-05, "loss": 0.1743, "num_input_tokens_seen": 2749888, "step": 30535 }, { "epoch": 7.9365904365904365, "grad_norm": 1.0693248510360718, "learning_rate": 3.773981407760607e-05, "loss": 0.2192, "num_input_tokens_seen": 2750320, "step": 30540 }, { "epoch": 7.9378898128898125, "grad_norm": 1.0019199848175049, "learning_rate": 3.7734935537536276e-05, "loss": 0.3326, "num_input_tokens_seen": 2750768, "step": 30545 }, { "epoch": 7.9391891891891895, "grad_norm": 0.7637479901313782, "learning_rate": 3.773005634249656e-05, "loss": 0.1897, "num_input_tokens_seen": 2751232, "step": 30550 }, { "epoch": 7.9404885654885655, "grad_norm": 0.31419047713279724, "learning_rate": 3.772517649273783e-05, "loss": 0.1944, "num_input_tokens_seen": 2751664, "step": 30555 }, { "epoch": 7.941787941787942, "grad_norm": 0.976052463054657, "learning_rate": 3.772029598851109e-05, "loss": 0.204, "num_input_tokens_seen": 2752112, "step": 30560 }, { "epoch": 7.9430873180873185, "grad_norm": 1.8154025077819824, "learning_rate": 3.771541483006733e-05, "loss": 0.3427, "num_input_tokens_seen": 2752544, "step": 30565 }, { "epoch": 7.9443866943866945, "grad_norm": 0.752923309803009, "learning_rate": 3.771053301765761e-05, "loss": 0.3334, "num_input_tokens_seen": 2752976, "step": 30570 }, { "epoch": 7.945686070686071, "grad_norm": 1.048643708229065, "learning_rate": 3.7705650551533e-05, "loss": 0.3062, "num_input_tokens_seen": 2753424, "step": 30575 }, { "epoch": 7.946985446985447, "grad_norm": 0.5522472262382507, "learning_rate": 3.770076743194461e-05, "loss": 0.3035, "num_input_tokens_seen": 2753824, "step": 30580 }, { "epoch": 7.9482848232848236, "grad_norm": 0.6901319026947021, "learning_rate": 3.769588365914358e-05, "loss": 0.3002, "num_input_tokens_seen": 2754256, "step": 30585 }, { "epoch": 7.9495841995842, "grad_norm": 0.9735859036445618, "learning_rate": 3.769099923338108e-05, "loss": 0.2626, "num_input_tokens_seen": 2754720, "step": 30590 }, { "epoch": 7.950883575883576, "grad_norm": 0.6373990178108215, "learning_rate": 3.7686114154908336e-05, "loss": 0.3098, "num_input_tokens_seen": 2755136, "step": 30595 }, { "epoch": 7.952182952182953, "grad_norm": 0.4708815813064575, "learning_rate": 3.7681228423976586e-05, "loss": 0.263, "num_input_tokens_seen": 2755584, "step": 30600 }, { "epoch": 7.953482328482329, "grad_norm": 0.5430421829223633, "learning_rate": 3.767634204083711e-05, "loss": 0.1906, "num_input_tokens_seen": 2756032, "step": 30605 }, { "epoch": 7.954781704781705, "grad_norm": 0.9873490929603577, "learning_rate": 3.7671455005741207e-05, "loss": 0.2388, "num_input_tokens_seen": 2756480, "step": 30610 }, { "epoch": 7.956081081081081, "grad_norm": 0.37561091780662537, "learning_rate": 3.766656731894024e-05, "loss": 0.25, "num_input_tokens_seen": 2756944, "step": 30615 }, { "epoch": 7.957380457380458, "grad_norm": 1.0867379903793335, "learning_rate": 3.766167898068558e-05, "loss": 0.282, "num_input_tokens_seen": 2757344, "step": 30620 }, { "epoch": 7.958679833679834, "grad_norm": 1.4880493879318237, "learning_rate": 3.7656789991228636e-05, "loss": 0.235, "num_input_tokens_seen": 2757824, "step": 30625 }, { "epoch": 7.95997920997921, "grad_norm": 0.8909096717834473, "learning_rate": 3.765190035082086e-05, "loss": 0.196, "num_input_tokens_seen": 2758288, "step": 30630 }, { "epoch": 7.961278586278587, "grad_norm": 1.6176916360855103, "learning_rate": 3.764701005971372e-05, "loss": 0.2981, "num_input_tokens_seen": 2758736, "step": 30635 }, { "epoch": 7.962577962577963, "grad_norm": 0.37450888752937317, "learning_rate": 3.764211911815873e-05, "loss": 0.1332, "num_input_tokens_seen": 2759168, "step": 30640 }, { "epoch": 7.963877338877339, "grad_norm": 0.5920262932777405, "learning_rate": 3.7637227526407456e-05, "loss": 0.233, "num_input_tokens_seen": 2759680, "step": 30645 }, { "epoch": 7.965176715176716, "grad_norm": 1.1584049463272095, "learning_rate": 3.763233528471145e-05, "loss": 0.2956, "num_input_tokens_seen": 2760128, "step": 30650 }, { "epoch": 7.966476091476092, "grad_norm": 1.2327755689620972, "learning_rate": 3.7627442393322344e-05, "loss": 0.3919, "num_input_tokens_seen": 2760592, "step": 30655 }, { "epoch": 7.967775467775468, "grad_norm": 0.9649475812911987, "learning_rate": 3.762254885249177e-05, "loss": 0.2927, "num_input_tokens_seen": 2760992, "step": 30660 }, { "epoch": 7.969074844074844, "grad_norm": 0.7954826354980469, "learning_rate": 3.761765466247142e-05, "loss": 0.2548, "num_input_tokens_seen": 2761440, "step": 30665 }, { "epoch": 7.970374220374221, "grad_norm": 1.2545521259307861, "learning_rate": 3.761275982351299e-05, "loss": 0.2372, "num_input_tokens_seen": 2761872, "step": 30670 }, { "epoch": 7.971673596673597, "grad_norm": 1.780153751373291, "learning_rate": 3.7607864335868246e-05, "loss": 0.2996, "num_input_tokens_seen": 2762320, "step": 30675 }, { "epoch": 7.972972972972973, "grad_norm": 0.8430473804473877, "learning_rate": 3.760296819978896e-05, "loss": 0.3118, "num_input_tokens_seen": 2762768, "step": 30680 }, { "epoch": 7.974272349272349, "grad_norm": 0.7619122862815857, "learning_rate": 3.759807141552695e-05, "loss": 0.3026, "num_input_tokens_seen": 2763168, "step": 30685 }, { "epoch": 7.975571725571726, "grad_norm": 1.2578577995300293, "learning_rate": 3.759317398333404e-05, "loss": 0.2332, "num_input_tokens_seen": 2763632, "step": 30690 }, { "epoch": 7.976871101871102, "grad_norm": 0.5985924601554871, "learning_rate": 3.758827590346213e-05, "loss": 0.272, "num_input_tokens_seen": 2764064, "step": 30695 }, { "epoch": 7.978170478170478, "grad_norm": 0.654753565788269, "learning_rate": 3.7583377176163135e-05, "loss": 0.2153, "num_input_tokens_seen": 2764496, "step": 30700 }, { "epoch": 7.979469854469855, "grad_norm": 1.7070597410202026, "learning_rate": 3.7578477801689e-05, "loss": 0.2924, "num_input_tokens_seen": 2764912, "step": 30705 }, { "epoch": 7.980769230769231, "grad_norm": 1.5569230318069458, "learning_rate": 3.757357778029169e-05, "loss": 0.2962, "num_input_tokens_seen": 2765408, "step": 30710 }, { "epoch": 7.982068607068607, "grad_norm": 0.5971019268035889, "learning_rate": 3.7568677112223235e-05, "loss": 0.2793, "num_input_tokens_seen": 2765920, "step": 30715 }, { "epoch": 7.983367983367984, "grad_norm": 0.7679886817932129, "learning_rate": 3.756377579773567e-05, "loss": 0.2841, "num_input_tokens_seen": 2766400, "step": 30720 }, { "epoch": 7.98466735966736, "grad_norm": 1.09817373752594, "learning_rate": 3.755887383708107e-05, "loss": 0.2382, "num_input_tokens_seen": 2766864, "step": 30725 }, { "epoch": 7.985966735966736, "grad_norm": 1.2388701438903809, "learning_rate": 3.755397123051157e-05, "loss": 0.2784, "num_input_tokens_seen": 2767328, "step": 30730 }, { "epoch": 7.987266112266112, "grad_norm": 0.5399410128593445, "learning_rate": 3.7549067978279285e-05, "loss": 0.2533, "num_input_tokens_seen": 2767792, "step": 30735 }, { "epoch": 7.988565488565489, "grad_norm": 0.684619128704071, "learning_rate": 3.754416408063642e-05, "loss": 0.2535, "num_input_tokens_seen": 2768272, "step": 30740 }, { "epoch": 7.989864864864865, "grad_norm": 1.0637974739074707, "learning_rate": 3.753925953783517e-05, "loss": 0.2921, "num_input_tokens_seen": 2768704, "step": 30745 }, { "epoch": 7.991164241164241, "grad_norm": 0.6429445147514343, "learning_rate": 3.7534354350127796e-05, "loss": 0.1564, "num_input_tokens_seen": 2769136, "step": 30750 }, { "epoch": 7.992463617463617, "grad_norm": 0.9927061796188354, "learning_rate": 3.7529448517766566e-05, "loss": 0.2547, "num_input_tokens_seen": 2769600, "step": 30755 }, { "epoch": 7.993762993762994, "grad_norm": 1.1585510969161987, "learning_rate": 3.7524542041003785e-05, "loss": 0.3862, "num_input_tokens_seen": 2770096, "step": 30760 }, { "epoch": 7.99506237006237, "grad_norm": 0.5106329917907715, "learning_rate": 3.7519634920091814e-05, "loss": 0.2258, "num_input_tokens_seen": 2770544, "step": 30765 }, { "epoch": 7.996361746361746, "grad_norm": 1.1573693752288818, "learning_rate": 3.751472715528302e-05, "loss": 0.2407, "num_input_tokens_seen": 2770976, "step": 30770 }, { "epoch": 7.997661122661123, "grad_norm": 1.2229576110839844, "learning_rate": 3.750981874682982e-05, "loss": 0.2296, "num_input_tokens_seen": 2771408, "step": 30775 }, { "epoch": 7.998960498960499, "grad_norm": 0.5235351324081421, "learning_rate": 3.750490969498466e-05, "loss": 0.1858, "num_input_tokens_seen": 2771840, "step": 30780 }, { "epoch": 8.0, "eval_loss": 0.23655234277248383, "eval_runtime": 13.1651, "eval_samples_per_second": 65.02, "eval_steps_per_second": 32.51, "num_input_tokens_seen": 2772144, "step": 30784 }, { "epoch": 8.000259875259875, "grad_norm": 0.7297502160072327, "learning_rate": 3.7500000000000003e-05, "loss": 0.4073, "num_input_tokens_seen": 2772224, "step": 30785 }, { "epoch": 8.001559251559252, "grad_norm": 0.5600941181182861, "learning_rate": 3.7495089662128375e-05, "loss": 0.2191, "num_input_tokens_seen": 2772688, "step": 30790 }, { "epoch": 8.002858627858627, "grad_norm": 1.1790621280670166, "learning_rate": 3.749017868162231e-05, "loss": 0.2937, "num_input_tokens_seen": 2773168, "step": 30795 }, { "epoch": 8.004158004158004, "grad_norm": 0.6277881264686584, "learning_rate": 3.748526705873439e-05, "loss": 0.2985, "num_input_tokens_seen": 2773600, "step": 30800 }, { "epoch": 8.005457380457381, "grad_norm": 0.5769849419593811, "learning_rate": 3.748035479371722e-05, "loss": 0.174, "num_input_tokens_seen": 2774048, "step": 30805 }, { "epoch": 8.006756756756756, "grad_norm": 1.0787156820297241, "learning_rate": 3.7475441886823454e-05, "loss": 0.3373, "num_input_tokens_seen": 2774512, "step": 30810 }, { "epoch": 8.008056133056133, "grad_norm": 0.6718502044677734, "learning_rate": 3.747052833830574e-05, "loss": 0.3074, "num_input_tokens_seen": 2774960, "step": 30815 }, { "epoch": 8.00935550935551, "grad_norm": 0.5757381319999695, "learning_rate": 3.746561414841682e-05, "loss": 0.189, "num_input_tokens_seen": 2775376, "step": 30820 }, { "epoch": 8.010654885654885, "grad_norm": 0.513646125793457, "learning_rate": 3.746069931740942e-05, "loss": 0.2131, "num_input_tokens_seen": 2775840, "step": 30825 }, { "epoch": 8.011954261954262, "grad_norm": 0.3782300651073456, "learning_rate": 3.7455783845536316e-05, "loss": 0.3261, "num_input_tokens_seen": 2776256, "step": 30830 }, { "epoch": 8.013253638253639, "grad_norm": 0.5634860992431641, "learning_rate": 3.7450867733050306e-05, "loss": 0.1953, "num_input_tokens_seen": 2776720, "step": 30835 }, { "epoch": 8.014553014553014, "grad_norm": 0.913105309009552, "learning_rate": 3.744595098020425e-05, "loss": 0.293, "num_input_tokens_seen": 2777184, "step": 30840 }, { "epoch": 8.015852390852391, "grad_norm": 0.6217769384384155, "learning_rate": 3.7441033587251e-05, "loss": 0.2662, "num_input_tokens_seen": 2777632, "step": 30845 }, { "epoch": 8.017151767151768, "grad_norm": 0.7457616925239563, "learning_rate": 3.7436115554443474e-05, "loss": 0.2015, "num_input_tokens_seen": 2778064, "step": 30850 }, { "epoch": 8.018451143451143, "grad_norm": 0.5207383036613464, "learning_rate": 3.743119688203461e-05, "loss": 0.2129, "num_input_tokens_seen": 2778496, "step": 30855 }, { "epoch": 8.01975051975052, "grad_norm": 0.7601924538612366, "learning_rate": 3.742627757027738e-05, "loss": 0.1965, "num_input_tokens_seen": 2778928, "step": 30860 }, { "epoch": 8.021049896049895, "grad_norm": 0.8117995858192444, "learning_rate": 3.742135761942479e-05, "loss": 0.2801, "num_input_tokens_seen": 2779408, "step": 30865 }, { "epoch": 8.022349272349272, "grad_norm": 1.523534893989563, "learning_rate": 3.7416437029729865e-05, "loss": 0.2602, "num_input_tokens_seen": 2779872, "step": 30870 }, { "epoch": 8.02364864864865, "grad_norm": 1.2825431823730469, "learning_rate": 3.7411515801445695e-05, "loss": 0.3048, "num_input_tokens_seen": 2780336, "step": 30875 }, { "epoch": 8.024948024948024, "grad_norm": 0.46031397581100464, "learning_rate": 3.740659393482538e-05, "loss": 0.1806, "num_input_tokens_seen": 2780768, "step": 30880 }, { "epoch": 8.026247401247401, "grad_norm": 1.1716593503952026, "learning_rate": 3.740167143012204e-05, "loss": 0.266, "num_input_tokens_seen": 2781184, "step": 30885 }, { "epoch": 8.027546777546778, "grad_norm": 1.1285511255264282, "learning_rate": 3.739674828758887e-05, "loss": 0.2127, "num_input_tokens_seen": 2781632, "step": 30890 }, { "epoch": 8.028846153846153, "grad_norm": 0.5398102402687073, "learning_rate": 3.739182450747905e-05, "loss": 0.2126, "num_input_tokens_seen": 2782048, "step": 30895 }, { "epoch": 8.03014553014553, "grad_norm": 1.442922592163086, "learning_rate": 3.738690009004582e-05, "loss": 0.2346, "num_input_tokens_seen": 2782480, "step": 30900 }, { "epoch": 8.031444906444907, "grad_norm": 0.8036077618598938, "learning_rate": 3.738197503554245e-05, "loss": 0.153, "num_input_tokens_seen": 2782928, "step": 30905 }, { "epoch": 8.032744282744282, "grad_norm": 1.1951597929000854, "learning_rate": 3.737704934422225e-05, "loss": 0.2775, "num_input_tokens_seen": 2783376, "step": 30910 }, { "epoch": 8.03404365904366, "grad_norm": 1.1226261854171753, "learning_rate": 3.737212301633853e-05, "loss": 0.2327, "num_input_tokens_seen": 2783840, "step": 30915 }, { "epoch": 8.035343035343036, "grad_norm": 0.69877690076828, "learning_rate": 3.7367196052144676e-05, "loss": 0.2293, "num_input_tokens_seen": 2784272, "step": 30920 }, { "epoch": 8.036642411642411, "grad_norm": 1.6976343393325806, "learning_rate": 3.7362268451894075e-05, "loss": 0.2942, "num_input_tokens_seen": 2784752, "step": 30925 }, { "epoch": 8.037941787941788, "grad_norm": 1.1544880867004395, "learning_rate": 3.7357340215840175e-05, "loss": 0.404, "num_input_tokens_seen": 2785184, "step": 30930 }, { "epoch": 8.039241164241163, "grad_norm": 1.223552942276001, "learning_rate": 3.7352411344236414e-05, "loss": 0.2551, "num_input_tokens_seen": 2785600, "step": 30935 }, { "epoch": 8.04054054054054, "grad_norm": 0.9335450530052185, "learning_rate": 3.7347481837336306e-05, "loss": 0.2238, "num_input_tokens_seen": 2786032, "step": 30940 }, { "epoch": 8.041839916839917, "grad_norm": 0.7334284782409668, "learning_rate": 3.734255169539337e-05, "loss": 0.2274, "num_input_tokens_seen": 2786480, "step": 30945 }, { "epoch": 8.043139293139292, "grad_norm": 1.0289695262908936, "learning_rate": 3.7337620918661185e-05, "loss": 0.2543, "num_input_tokens_seen": 2786976, "step": 30950 }, { "epoch": 8.04443866943867, "grad_norm": 1.0071563720703125, "learning_rate": 3.7332689507393334e-05, "loss": 0.2945, "num_input_tokens_seen": 2787424, "step": 30955 }, { "epoch": 8.045738045738046, "grad_norm": 1.139727234840393, "learning_rate": 3.7327757461843435e-05, "loss": 0.221, "num_input_tokens_seen": 2787888, "step": 30960 }, { "epoch": 8.047037422037421, "grad_norm": 1.1666291952133179, "learning_rate": 3.732282478226516e-05, "loss": 0.2576, "num_input_tokens_seen": 2788384, "step": 30965 }, { "epoch": 8.048336798336798, "grad_norm": 0.7107606530189514, "learning_rate": 3.7317891468912214e-05, "loss": 0.2521, "num_input_tokens_seen": 2788848, "step": 30970 }, { "epoch": 8.049636174636175, "grad_norm": 1.463889479637146, "learning_rate": 3.7312957522038293e-05, "loss": 0.2887, "num_input_tokens_seen": 2789280, "step": 30975 }, { "epoch": 8.05093555093555, "grad_norm": 1.4309965372085571, "learning_rate": 3.730802294189718e-05, "loss": 0.2188, "num_input_tokens_seen": 2789744, "step": 30980 }, { "epoch": 8.052234927234927, "grad_norm": 1.4455840587615967, "learning_rate": 3.7303087728742644e-05, "loss": 0.2954, "num_input_tokens_seen": 2790208, "step": 30985 }, { "epoch": 8.053534303534304, "grad_norm": 1.5715326070785522, "learning_rate": 3.7298151882828526e-05, "loss": 0.2381, "num_input_tokens_seen": 2790656, "step": 30990 }, { "epoch": 8.05483367983368, "grad_norm": 0.580683708190918, "learning_rate": 3.729321540440866e-05, "loss": 0.1785, "num_input_tokens_seen": 2791120, "step": 30995 }, { "epoch": 8.056133056133056, "grad_norm": 0.474183052778244, "learning_rate": 3.728827829373696e-05, "loss": 0.0911, "num_input_tokens_seen": 2791568, "step": 31000 }, { "epoch": 8.057432432432432, "grad_norm": 1.7255618572235107, "learning_rate": 3.728334055106732e-05, "loss": 0.4338, "num_input_tokens_seen": 2791984, "step": 31005 }, { "epoch": 8.058731808731808, "grad_norm": 0.33944568037986755, "learning_rate": 3.727840217665371e-05, "loss": 0.3222, "num_input_tokens_seen": 2792400, "step": 31010 }, { "epoch": 8.060031185031185, "grad_norm": 0.41213417053222656, "learning_rate": 3.727346317075012e-05, "loss": 0.1452, "num_input_tokens_seen": 2792848, "step": 31015 }, { "epoch": 8.06133056133056, "grad_norm": 1.3531615734100342, "learning_rate": 3.7268523533610564e-05, "loss": 0.3931, "num_input_tokens_seen": 2793296, "step": 31020 }, { "epoch": 8.062629937629938, "grad_norm": 1.6163910627365112, "learning_rate": 3.7263583265489074e-05, "loss": 0.3496, "num_input_tokens_seen": 2793728, "step": 31025 }, { "epoch": 8.063929313929314, "grad_norm": 0.3255499601364136, "learning_rate": 3.7258642366639754e-05, "loss": 0.1798, "num_input_tokens_seen": 2794176, "step": 31030 }, { "epoch": 8.06522869022869, "grad_norm": 0.5711843371391296, "learning_rate": 3.72537008373167e-05, "loss": 0.3691, "num_input_tokens_seen": 2794608, "step": 31035 }, { "epoch": 8.066528066528067, "grad_norm": 0.8537285923957825, "learning_rate": 3.724875867777409e-05, "loss": 0.2424, "num_input_tokens_seen": 2795024, "step": 31040 }, { "epoch": 8.067827442827443, "grad_norm": 0.5271860957145691, "learning_rate": 3.724381588826607e-05, "loss": 0.2975, "num_input_tokens_seen": 2795488, "step": 31045 }, { "epoch": 8.069126819126819, "grad_norm": 0.8775919079780579, "learning_rate": 3.723887246904687e-05, "loss": 0.2831, "num_input_tokens_seen": 2795952, "step": 31050 }, { "epoch": 8.070426195426196, "grad_norm": 0.5628589987754822, "learning_rate": 3.7233928420370735e-05, "loss": 0.2673, "num_input_tokens_seen": 2796384, "step": 31055 }, { "epoch": 8.071725571725572, "grad_norm": 0.6132852435112, "learning_rate": 3.722898374249194e-05, "loss": 0.2438, "num_input_tokens_seen": 2796800, "step": 31060 }, { "epoch": 8.073024948024948, "grad_norm": 0.7953711152076721, "learning_rate": 3.722403843566479e-05, "loss": 0.1827, "num_input_tokens_seen": 2797232, "step": 31065 }, { "epoch": 8.074324324324325, "grad_norm": 0.476730078458786, "learning_rate": 3.721909250014363e-05, "loss": 0.1862, "num_input_tokens_seen": 2797648, "step": 31070 }, { "epoch": 8.075623700623701, "grad_norm": 0.6491103768348694, "learning_rate": 3.721414593618284e-05, "loss": 0.2705, "num_input_tokens_seen": 2798096, "step": 31075 }, { "epoch": 8.076923076923077, "grad_norm": 0.8999497294425964, "learning_rate": 3.720919874403682e-05, "loss": 0.2467, "num_input_tokens_seen": 2798528, "step": 31080 }, { "epoch": 8.078222453222454, "grad_norm": 0.9812321066856384, "learning_rate": 3.720425092396001e-05, "loss": 0.3253, "num_input_tokens_seen": 2798976, "step": 31085 }, { "epoch": 8.079521829521829, "grad_norm": 0.5183930993080139, "learning_rate": 3.7199302476206883e-05, "loss": 0.1835, "num_input_tokens_seen": 2799408, "step": 31090 }, { "epoch": 8.080821205821206, "grad_norm": 1.0836021900177002, "learning_rate": 3.719435340103193e-05, "loss": 0.3684, "num_input_tokens_seen": 2799840, "step": 31095 }, { "epoch": 8.082120582120583, "grad_norm": 0.8333508372306824, "learning_rate": 3.718940369868972e-05, "loss": 0.2984, "num_input_tokens_seen": 2800288, "step": 31100 }, { "epoch": 8.083419958419958, "grad_norm": 0.7895557880401611, "learning_rate": 3.718445336943478e-05, "loss": 0.2923, "num_input_tokens_seen": 2800736, "step": 31105 }, { "epoch": 8.084719334719335, "grad_norm": 0.7856354713439941, "learning_rate": 3.717950241352173e-05, "loss": 0.2694, "num_input_tokens_seen": 2801200, "step": 31110 }, { "epoch": 8.086018711018712, "grad_norm": 0.7654414772987366, "learning_rate": 3.717455083120521e-05, "loss": 0.2586, "num_input_tokens_seen": 2801664, "step": 31115 }, { "epoch": 8.087318087318087, "grad_norm": 0.716525673866272, "learning_rate": 3.716959862273987e-05, "loss": 0.1841, "num_input_tokens_seen": 2802096, "step": 31120 }, { "epoch": 8.088617463617464, "grad_norm": 0.8304663300514221, "learning_rate": 3.716464578838041e-05, "loss": 0.2713, "num_input_tokens_seen": 2802544, "step": 31125 }, { "epoch": 8.08991683991684, "grad_norm": 1.5926724672317505, "learning_rate": 3.715969232838157e-05, "loss": 0.2851, "num_input_tokens_seen": 2802976, "step": 31130 }, { "epoch": 8.091216216216216, "grad_norm": 0.7654193043708801, "learning_rate": 3.715473824299809e-05, "loss": 0.1999, "num_input_tokens_seen": 2803440, "step": 31135 }, { "epoch": 8.092515592515593, "grad_norm": 1.1670998334884644, "learning_rate": 3.7149783532484784e-05, "loss": 0.246, "num_input_tokens_seen": 2803920, "step": 31140 }, { "epoch": 8.09381496881497, "grad_norm": 1.2043228149414062, "learning_rate": 3.714482819709647e-05, "loss": 0.267, "num_input_tokens_seen": 2804368, "step": 31145 }, { "epoch": 8.095114345114345, "grad_norm": 0.38859182596206665, "learning_rate": 3.7139872237088e-05, "loss": 0.2483, "num_input_tokens_seen": 2804832, "step": 31150 }, { "epoch": 8.096413721413722, "grad_norm": 1.298851490020752, "learning_rate": 3.713491565271427e-05, "loss": 0.2681, "num_input_tokens_seen": 2805328, "step": 31155 }, { "epoch": 8.097713097713097, "grad_norm": 0.5908510088920593, "learning_rate": 3.712995844423021e-05, "loss": 0.2034, "num_input_tokens_seen": 2805760, "step": 31160 }, { "epoch": 8.099012474012474, "grad_norm": 0.5329923629760742, "learning_rate": 3.7125000611890754e-05, "loss": 0.1528, "num_input_tokens_seen": 2806208, "step": 31165 }, { "epoch": 8.10031185031185, "grad_norm": 0.8241252899169922, "learning_rate": 3.712004215595091e-05, "loss": 0.2664, "num_input_tokens_seen": 2806656, "step": 31170 }, { "epoch": 8.101611226611226, "grad_norm": 1.2593952417373657, "learning_rate": 3.711508307666568e-05, "loss": 0.3121, "num_input_tokens_seen": 2807088, "step": 31175 }, { "epoch": 8.102910602910603, "grad_norm": 1.627236247062683, "learning_rate": 3.711012337429011e-05, "loss": 0.261, "num_input_tokens_seen": 2807536, "step": 31180 }, { "epoch": 8.10420997920998, "grad_norm": 0.822765052318573, "learning_rate": 3.710516304907931e-05, "loss": 0.1503, "num_input_tokens_seen": 2807968, "step": 31185 }, { "epoch": 8.105509355509355, "grad_norm": 1.0602638721466064, "learning_rate": 3.7100202101288355e-05, "loss": 0.1569, "num_input_tokens_seen": 2808432, "step": 31190 }, { "epoch": 8.106808731808732, "grad_norm": 1.6895984411239624, "learning_rate": 3.709524053117242e-05, "loss": 0.3294, "num_input_tokens_seen": 2808848, "step": 31195 }, { "epoch": 8.108108108108109, "grad_norm": 0.9791503548622131, "learning_rate": 3.7090278338986685e-05, "loss": 0.216, "num_input_tokens_seen": 2809296, "step": 31200 }, { "epoch": 8.109407484407484, "grad_norm": 0.9888114333152771, "learning_rate": 3.708531552498634e-05, "loss": 0.2257, "num_input_tokens_seen": 2809744, "step": 31205 }, { "epoch": 8.11070686070686, "grad_norm": 1.3444759845733643, "learning_rate": 3.708035208942664e-05, "loss": 0.3703, "num_input_tokens_seen": 2810224, "step": 31210 }, { "epoch": 8.112006237006238, "grad_norm": 7.585683822631836, "learning_rate": 3.7075388032562855e-05, "loss": 0.1997, "num_input_tokens_seen": 2810688, "step": 31215 }, { "epoch": 8.113305613305613, "grad_norm": 13.856188774108887, "learning_rate": 3.7070423354650296e-05, "loss": 0.4748, "num_input_tokens_seen": 2811152, "step": 31220 }, { "epoch": 8.11460498960499, "grad_norm": 15.314982414245605, "learning_rate": 3.70654580559443e-05, "loss": 0.813, "num_input_tokens_seen": 2811616, "step": 31225 }, { "epoch": 8.115904365904365, "grad_norm": 3.5752735137939453, "learning_rate": 3.706049213670023e-05, "loss": 0.2291, "num_input_tokens_seen": 2812080, "step": 31230 }, { "epoch": 8.117203742203742, "grad_norm": 4.1743574142456055, "learning_rate": 3.705552559717351e-05, "loss": 0.718, "num_input_tokens_seen": 2812576, "step": 31235 }, { "epoch": 8.118503118503119, "grad_norm": 0.8077378273010254, "learning_rate": 3.7050558437619544e-05, "loss": 0.2385, "num_input_tokens_seen": 2813040, "step": 31240 }, { "epoch": 8.119802494802494, "grad_norm": 1.1448960304260254, "learning_rate": 3.704559065829381e-05, "loss": 0.2039, "num_input_tokens_seen": 2813488, "step": 31245 }, { "epoch": 8.121101871101871, "grad_norm": 0.45476120710372925, "learning_rate": 3.7040622259451815e-05, "loss": 0.2993, "num_input_tokens_seen": 2813936, "step": 31250 }, { "epoch": 8.122401247401248, "grad_norm": 1.1605373620986938, "learning_rate": 3.7035653241349075e-05, "loss": 0.2922, "num_input_tokens_seen": 2814368, "step": 31255 }, { "epoch": 8.123700623700623, "grad_norm": 1.0762308835983276, "learning_rate": 3.703068360424115e-05, "loss": 0.2792, "num_input_tokens_seen": 2814800, "step": 31260 }, { "epoch": 8.125, "grad_norm": 1.0942726135253906, "learning_rate": 3.702571334838365e-05, "loss": 0.1997, "num_input_tokens_seen": 2815248, "step": 31265 }, { "epoch": 8.126299376299377, "grad_norm": 0.9332022666931152, "learning_rate": 3.702074247403219e-05, "loss": 0.2514, "num_input_tokens_seen": 2815696, "step": 31270 }, { "epoch": 8.127598752598752, "grad_norm": 0.5998968482017517, "learning_rate": 3.701577098144242e-05, "loss": 0.2991, "num_input_tokens_seen": 2816144, "step": 31275 }, { "epoch": 8.128898128898129, "grad_norm": 0.5287196040153503, "learning_rate": 3.7010798870870036e-05, "loss": 0.1448, "num_input_tokens_seen": 2816592, "step": 31280 }, { "epoch": 8.130197505197506, "grad_norm": 0.3703957200050354, "learning_rate": 3.7005826142570765e-05, "loss": 0.3055, "num_input_tokens_seen": 2817088, "step": 31285 }, { "epoch": 8.131496881496881, "grad_norm": 1.7124298810958862, "learning_rate": 3.700085279680035e-05, "loss": 0.464, "num_input_tokens_seen": 2817536, "step": 31290 }, { "epoch": 8.132796257796258, "grad_norm": 1.0857915878295898, "learning_rate": 3.699587883381458e-05, "loss": 0.2194, "num_input_tokens_seen": 2817984, "step": 31295 }, { "epoch": 8.134095634095635, "grad_norm": 0.7123256325721741, "learning_rate": 3.699090425386926e-05, "loss": 0.2016, "num_input_tokens_seen": 2818432, "step": 31300 }, { "epoch": 8.13539501039501, "grad_norm": 0.5390133261680603, "learning_rate": 3.698592905722025e-05, "loss": 0.1227, "num_input_tokens_seen": 2818880, "step": 31305 }, { "epoch": 8.136694386694387, "grad_norm": 0.4035891592502594, "learning_rate": 3.698095324412342e-05, "loss": 0.2246, "num_input_tokens_seen": 2819360, "step": 31310 }, { "epoch": 8.137993762993762, "grad_norm": 1.663476586341858, "learning_rate": 3.6975976814834685e-05, "loss": 0.4023, "num_input_tokens_seen": 2819792, "step": 31315 }, { "epoch": 8.13929313929314, "grad_norm": 0.8787336349487305, "learning_rate": 3.6970999769609995e-05, "loss": 0.2599, "num_input_tokens_seen": 2820256, "step": 31320 }, { "epoch": 8.140592515592516, "grad_norm": 0.5966840386390686, "learning_rate": 3.696602210870531e-05, "loss": 0.3165, "num_input_tokens_seen": 2820720, "step": 31325 }, { "epoch": 8.141891891891891, "grad_norm": 0.9441940784454346, "learning_rate": 3.6961043832376646e-05, "loss": 0.2448, "num_input_tokens_seen": 2821184, "step": 31330 }, { "epoch": 8.143191268191268, "grad_norm": 0.8052143454551697, "learning_rate": 3.695606494088003e-05, "loss": 0.2703, "num_input_tokens_seen": 2821616, "step": 31335 }, { "epoch": 8.144490644490645, "grad_norm": 0.5228008031845093, "learning_rate": 3.695108543447154e-05, "loss": 0.3021, "num_input_tokens_seen": 2822112, "step": 31340 }, { "epoch": 8.14579002079002, "grad_norm": 0.698159396648407, "learning_rate": 3.694610531340729e-05, "loss": 0.238, "num_input_tokens_seen": 2822576, "step": 31345 }, { "epoch": 8.147089397089397, "grad_norm": 0.9391912221908569, "learning_rate": 3.6941124577943384e-05, "loss": 0.3003, "num_input_tokens_seen": 2823024, "step": 31350 }, { "epoch": 8.148388773388774, "grad_norm": 0.6941974759101868, "learning_rate": 3.6936143228336e-05, "loss": 0.2514, "num_input_tokens_seen": 2823520, "step": 31355 }, { "epoch": 8.14968814968815, "grad_norm": 1.0199142694473267, "learning_rate": 3.6931161264841334e-05, "loss": 0.2359, "num_input_tokens_seen": 2823984, "step": 31360 }, { "epoch": 8.150987525987526, "grad_norm": 0.6091079711914062, "learning_rate": 3.692617868771562e-05, "loss": 0.3142, "num_input_tokens_seen": 2824464, "step": 31365 }, { "epoch": 8.152286902286903, "grad_norm": 0.43908950686454773, "learning_rate": 3.69211954972151e-05, "loss": 0.2677, "num_input_tokens_seen": 2824912, "step": 31370 }, { "epoch": 8.153586278586278, "grad_norm": 0.5107591152191162, "learning_rate": 3.6916211693596074e-05, "loss": 0.2569, "num_input_tokens_seen": 2825344, "step": 31375 }, { "epoch": 8.154885654885655, "grad_norm": 0.4987008571624756, "learning_rate": 3.6911227277114866e-05, "loss": 0.2412, "num_input_tokens_seen": 2825760, "step": 31380 }, { "epoch": 8.15618503118503, "grad_norm": 0.6023520231246948, "learning_rate": 3.6906242248027826e-05, "loss": 0.1889, "num_input_tokens_seen": 2826288, "step": 31385 }, { "epoch": 8.157484407484407, "grad_norm": 0.5940874218940735, "learning_rate": 3.690125660659134e-05, "loss": 0.2321, "num_input_tokens_seen": 2826752, "step": 31390 }, { "epoch": 8.158783783783784, "grad_norm": 0.6228432655334473, "learning_rate": 3.6896270353061824e-05, "loss": 0.3044, "num_input_tokens_seen": 2827184, "step": 31395 }, { "epoch": 8.16008316008316, "grad_norm": 0.7151340842247009, "learning_rate": 3.6891283487695723e-05, "loss": 0.2507, "num_input_tokens_seen": 2827616, "step": 31400 }, { "epoch": 8.161382536382536, "grad_norm": 1.1323654651641846, "learning_rate": 3.688629601074951e-05, "loss": 0.2707, "num_input_tokens_seen": 2828048, "step": 31405 }, { "epoch": 8.162681912681913, "grad_norm": 0.8723901510238647, "learning_rate": 3.688130792247971e-05, "loss": 0.2686, "num_input_tokens_seen": 2828544, "step": 31410 }, { "epoch": 8.163981288981288, "grad_norm": 1.0510879755020142, "learning_rate": 3.687631922314287e-05, "loss": 0.299, "num_input_tokens_seen": 2828976, "step": 31415 }, { "epoch": 8.165280665280665, "grad_norm": 1.113869309425354, "learning_rate": 3.687132991299554e-05, "loss": 0.2318, "num_input_tokens_seen": 2829424, "step": 31420 }, { "epoch": 8.166580041580042, "grad_norm": 0.9990830421447754, "learning_rate": 3.6866339992294344e-05, "loss": 0.2926, "num_input_tokens_seen": 2829904, "step": 31425 }, { "epoch": 8.167879417879417, "grad_norm": 1.6928255558013916, "learning_rate": 3.686134946129592e-05, "loss": 0.2835, "num_input_tokens_seen": 2830336, "step": 31430 }, { "epoch": 8.169178794178794, "grad_norm": 1.137877345085144, "learning_rate": 3.685635832025692e-05, "loss": 0.2291, "num_input_tokens_seen": 2830800, "step": 31435 }, { "epoch": 8.170478170478171, "grad_norm": 0.8286589980125427, "learning_rate": 3.685136656943405e-05, "loss": 0.2417, "num_input_tokens_seen": 2831264, "step": 31440 }, { "epoch": 8.171777546777546, "grad_norm": 0.8423067927360535, "learning_rate": 3.684637420908405e-05, "loss": 0.2197, "num_input_tokens_seen": 2831712, "step": 31445 }, { "epoch": 8.173076923076923, "grad_norm": 0.6933526992797852, "learning_rate": 3.6841381239463665e-05, "loss": 0.214, "num_input_tokens_seen": 2832176, "step": 31450 }, { "epoch": 8.174376299376299, "grad_norm": 1.427011489868164, "learning_rate": 3.68363876608297e-05, "loss": 0.4397, "num_input_tokens_seen": 2832624, "step": 31455 }, { "epoch": 8.175675675675675, "grad_norm": 0.7521753311157227, "learning_rate": 3.683139347343897e-05, "loss": 0.2206, "num_input_tokens_seen": 2833040, "step": 31460 }, { "epoch": 8.176975051975052, "grad_norm": 1.7803269624710083, "learning_rate": 3.682639867754834e-05, "loss": 0.2254, "num_input_tokens_seen": 2833488, "step": 31465 }, { "epoch": 8.178274428274428, "grad_norm": 0.44054585695266724, "learning_rate": 3.68214032734147e-05, "loss": 0.2196, "num_input_tokens_seen": 2833936, "step": 31470 }, { "epoch": 8.179573804573804, "grad_norm": 1.8802889585494995, "learning_rate": 3.681640726129496e-05, "loss": 0.1918, "num_input_tokens_seen": 2834400, "step": 31475 }, { "epoch": 8.180873180873181, "grad_norm": 0.647053599357605, "learning_rate": 3.681141064144607e-05, "loss": 0.239, "num_input_tokens_seen": 2834864, "step": 31480 }, { "epoch": 8.182172557172557, "grad_norm": 0.7945212721824646, "learning_rate": 3.6806413414125016e-05, "loss": 0.1539, "num_input_tokens_seen": 2835280, "step": 31485 }, { "epoch": 8.183471933471933, "grad_norm": 2.412903308868408, "learning_rate": 3.680141557958881e-05, "loss": 0.4372, "num_input_tokens_seen": 2835760, "step": 31490 }, { "epoch": 8.18477130977131, "grad_norm": 1.3907482624053955, "learning_rate": 3.67964171380945e-05, "loss": 0.2209, "num_input_tokens_seen": 2836208, "step": 31495 }, { "epoch": 8.186070686070686, "grad_norm": 2.849496364593506, "learning_rate": 3.679141808989914e-05, "loss": 0.2741, "num_input_tokens_seen": 2836672, "step": 31500 }, { "epoch": 8.187370062370062, "grad_norm": 0.3586200177669525, "learning_rate": 3.6786418435259854e-05, "loss": 0.1873, "num_input_tokens_seen": 2837136, "step": 31505 }, { "epoch": 8.18866943866944, "grad_norm": 1.4037244319915771, "learning_rate": 3.678141817443377e-05, "loss": 0.2673, "num_input_tokens_seen": 2837616, "step": 31510 }, { "epoch": 8.189968814968815, "grad_norm": 1.861360788345337, "learning_rate": 3.677641730767807e-05, "loss": 0.2358, "num_input_tokens_seen": 2838096, "step": 31515 }, { "epoch": 8.191268191268192, "grad_norm": 1.1447491645812988, "learning_rate": 3.6771415835249946e-05, "loss": 0.2045, "num_input_tokens_seen": 2838544, "step": 31520 }, { "epoch": 8.192567567567568, "grad_norm": 1.3428466320037842, "learning_rate": 3.676641375740662e-05, "loss": 0.2844, "num_input_tokens_seen": 2838976, "step": 31525 }, { "epoch": 8.193866943866944, "grad_norm": 0.48005518317222595, "learning_rate": 3.676141107440536e-05, "loss": 0.2399, "num_input_tokens_seen": 2839424, "step": 31530 }, { "epoch": 8.19516632016632, "grad_norm": 1.6265908479690552, "learning_rate": 3.675640778650346e-05, "loss": 0.3093, "num_input_tokens_seen": 2839888, "step": 31535 }, { "epoch": 8.196465696465696, "grad_norm": 0.9848093390464783, "learning_rate": 3.6751403893958244e-05, "loss": 0.3333, "num_input_tokens_seen": 2840368, "step": 31540 }, { "epoch": 8.197765072765073, "grad_norm": 0.6474142670631409, "learning_rate": 3.674639939702707e-05, "loss": 0.1932, "num_input_tokens_seen": 2840800, "step": 31545 }, { "epoch": 8.19906444906445, "grad_norm": 1.283359408378601, "learning_rate": 3.6741394295967315e-05, "loss": 0.3253, "num_input_tokens_seen": 2841200, "step": 31550 }, { "epoch": 8.200363825363825, "grad_norm": 0.9599580764770508, "learning_rate": 3.6736388591036406e-05, "loss": 0.2708, "num_input_tokens_seen": 2841648, "step": 31555 }, { "epoch": 8.201663201663202, "grad_norm": 0.7099705338478088, "learning_rate": 3.6731382282491786e-05, "loss": 0.2167, "num_input_tokens_seen": 2842064, "step": 31560 }, { "epoch": 8.202962577962579, "grad_norm": 0.8221606016159058, "learning_rate": 3.672637537059093e-05, "loss": 0.3524, "num_input_tokens_seen": 2842496, "step": 31565 }, { "epoch": 8.204261954261954, "grad_norm": 0.8764787912368774, "learning_rate": 3.6721367855591355e-05, "loss": 0.2168, "num_input_tokens_seen": 2842928, "step": 31570 }, { "epoch": 8.20556133056133, "grad_norm": 0.7371549010276794, "learning_rate": 3.67163597377506e-05, "loss": 0.2771, "num_input_tokens_seen": 2843392, "step": 31575 }, { "epoch": 8.206860706860708, "grad_norm": 0.7789387106895447, "learning_rate": 3.671135101732624e-05, "loss": 0.2057, "num_input_tokens_seen": 2843840, "step": 31580 }, { "epoch": 8.208160083160083, "grad_norm": 1.1693415641784668, "learning_rate": 3.670634169457587e-05, "loss": 0.1983, "num_input_tokens_seen": 2844304, "step": 31585 }, { "epoch": 8.20945945945946, "grad_norm": 0.42379167675971985, "learning_rate": 3.6701331769757134e-05, "loss": 0.0758, "num_input_tokens_seen": 2844768, "step": 31590 }, { "epoch": 8.210758835758837, "grad_norm": 0.5209147930145264, "learning_rate": 3.6696321243127696e-05, "loss": 0.272, "num_input_tokens_seen": 2845200, "step": 31595 }, { "epoch": 8.212058212058212, "grad_norm": 2.25661301612854, "learning_rate": 3.6691310114945244e-05, "loss": 0.339, "num_input_tokens_seen": 2845632, "step": 31600 }, { "epoch": 8.213357588357589, "grad_norm": 2.376323699951172, "learning_rate": 3.668629838546751e-05, "loss": 0.1742, "num_input_tokens_seen": 2846096, "step": 31605 }, { "epoch": 8.214656964656964, "grad_norm": 0.5143817663192749, "learning_rate": 3.668128605495226e-05, "loss": 0.3459, "num_input_tokens_seen": 2846544, "step": 31610 }, { "epoch": 8.21595634095634, "grad_norm": 1.7318620681762695, "learning_rate": 3.6676273123657275e-05, "loss": 0.378, "num_input_tokens_seen": 2846976, "step": 31615 }, { "epoch": 8.217255717255718, "grad_norm": 1.6494723558425903, "learning_rate": 3.667125959184037e-05, "loss": 0.238, "num_input_tokens_seen": 2847424, "step": 31620 }, { "epoch": 8.218555093555093, "grad_norm": 0.41248753666877747, "learning_rate": 3.666624545975941e-05, "loss": 0.1678, "num_input_tokens_seen": 2847856, "step": 31625 }, { "epoch": 8.21985446985447, "grad_norm": 1.2351261377334595, "learning_rate": 3.666123072767226e-05, "loss": 0.4297, "num_input_tokens_seen": 2848336, "step": 31630 }, { "epoch": 8.221153846153847, "grad_norm": 0.7741250991821289, "learning_rate": 3.665621539583684e-05, "loss": 0.1623, "num_input_tokens_seen": 2848784, "step": 31635 }, { "epoch": 8.222453222453222, "grad_norm": 0.7424018383026123, "learning_rate": 3.6651199464511096e-05, "loss": 0.223, "num_input_tokens_seen": 2849200, "step": 31640 }, { "epoch": 8.223752598752599, "grad_norm": 0.3944171369075775, "learning_rate": 3.6646182933953e-05, "loss": 0.2031, "num_input_tokens_seen": 2849632, "step": 31645 }, { "epoch": 8.225051975051976, "grad_norm": 0.5768181681632996, "learning_rate": 3.6641165804420565e-05, "loss": 0.1613, "num_input_tokens_seen": 2850144, "step": 31650 }, { "epoch": 8.22635135135135, "grad_norm": 0.5969049334526062, "learning_rate": 3.66361480761718e-05, "loss": 0.1919, "num_input_tokens_seen": 2850624, "step": 31655 }, { "epoch": 8.227650727650728, "grad_norm": 0.3073466122150421, "learning_rate": 3.66311297494648e-05, "loss": 0.1984, "num_input_tokens_seen": 2851104, "step": 31660 }, { "epoch": 8.228950103950105, "grad_norm": 0.525782585144043, "learning_rate": 3.662611082455766e-05, "loss": 0.221, "num_input_tokens_seen": 2851552, "step": 31665 }, { "epoch": 8.23024948024948, "grad_norm": 0.8867175579071045, "learning_rate": 3.662109130170849e-05, "loss": 0.23, "num_input_tokens_seen": 2852000, "step": 31670 }, { "epoch": 8.231548856548857, "grad_norm": 0.5989955067634583, "learning_rate": 3.6616071181175466e-05, "loss": 0.1514, "num_input_tokens_seen": 2852432, "step": 31675 }, { "epoch": 8.232848232848234, "grad_norm": 1.4756510257720947, "learning_rate": 3.661105046321677e-05, "loss": 0.2656, "num_input_tokens_seen": 2852864, "step": 31680 }, { "epoch": 8.234147609147609, "grad_norm": 1.4036167860031128, "learning_rate": 3.660602914809062e-05, "loss": 0.1965, "num_input_tokens_seen": 2853296, "step": 31685 }, { "epoch": 8.235446985446986, "grad_norm": 0.5978082418441772, "learning_rate": 3.660100723605527e-05, "loss": 0.125, "num_input_tokens_seen": 2853744, "step": 31690 }, { "epoch": 8.236746361746361, "grad_norm": 0.9105921387672424, "learning_rate": 3.659598472736901e-05, "loss": 0.2188, "num_input_tokens_seen": 2854176, "step": 31695 }, { "epoch": 8.238045738045738, "grad_norm": 0.8001453876495361, "learning_rate": 3.6590961622290145e-05, "loss": 0.3954, "num_input_tokens_seen": 2854624, "step": 31700 }, { "epoch": 8.239345114345115, "grad_norm": 1.0312529802322388, "learning_rate": 3.658593792107701e-05, "loss": 0.2672, "num_input_tokens_seen": 2855088, "step": 31705 }, { "epoch": 8.24064449064449, "grad_norm": 0.530490517616272, "learning_rate": 3.6580913623987986e-05, "loss": 0.1997, "num_input_tokens_seen": 2855536, "step": 31710 }, { "epoch": 8.241943866943867, "grad_norm": 0.9387537240982056, "learning_rate": 3.657588873128148e-05, "loss": 0.3016, "num_input_tokens_seen": 2856000, "step": 31715 }, { "epoch": 8.243243243243244, "grad_norm": 0.8332202434539795, "learning_rate": 3.657086324321593e-05, "loss": 0.2415, "num_input_tokens_seen": 2856448, "step": 31720 }, { "epoch": 8.244542619542619, "grad_norm": 0.8668851256370544, "learning_rate": 3.65658371600498e-05, "loss": 0.2775, "num_input_tokens_seen": 2856896, "step": 31725 }, { "epoch": 8.245841995841996, "grad_norm": 0.5886247754096985, "learning_rate": 3.656081048204157e-05, "loss": 0.2018, "num_input_tokens_seen": 2857344, "step": 31730 }, { "epoch": 8.247141372141373, "grad_norm": 0.7610467672348022, "learning_rate": 3.655578320944979e-05, "loss": 0.2542, "num_input_tokens_seen": 2857776, "step": 31735 }, { "epoch": 8.248440748440748, "grad_norm": 1.2847460508346558, "learning_rate": 3.6550755342533e-05, "loss": 0.2062, "num_input_tokens_seen": 2858208, "step": 31740 }, { "epoch": 8.249740124740125, "grad_norm": 0.7691554427146912, "learning_rate": 3.654572688154979e-05, "loss": 0.1935, "num_input_tokens_seen": 2858640, "step": 31745 }, { "epoch": 8.2510395010395, "grad_norm": 0.9165221452713013, "learning_rate": 3.65406978267588e-05, "loss": 0.2438, "num_input_tokens_seen": 2859088, "step": 31750 }, { "epoch": 8.252338877338877, "grad_norm": 0.606766939163208, "learning_rate": 3.6535668178418656e-05, "loss": 0.2649, "num_input_tokens_seen": 2859552, "step": 31755 }, { "epoch": 8.253638253638254, "grad_norm": 1.2731614112854004, "learning_rate": 3.653063793678804e-05, "loss": 0.2378, "num_input_tokens_seen": 2859984, "step": 31760 }, { "epoch": 8.25493762993763, "grad_norm": 1.2499662637710571, "learning_rate": 3.652560710212567e-05, "loss": 0.2228, "num_input_tokens_seen": 2860416, "step": 31765 }, { "epoch": 8.256237006237006, "grad_norm": 1.1008217334747314, "learning_rate": 3.652057567469029e-05, "loss": 0.2431, "num_input_tokens_seen": 2860880, "step": 31770 }, { "epoch": 8.257536382536383, "grad_norm": 0.36598193645477295, "learning_rate": 3.651554365474066e-05, "loss": 0.1579, "num_input_tokens_seen": 2861312, "step": 31775 }, { "epoch": 8.258835758835758, "grad_norm": 1.66969895362854, "learning_rate": 3.651051104253558e-05, "loss": 0.1935, "num_input_tokens_seen": 2861728, "step": 31780 }, { "epoch": 8.260135135135135, "grad_norm": 0.5086959004402161, "learning_rate": 3.65054778383339e-05, "loss": 0.2496, "num_input_tokens_seen": 2862176, "step": 31785 }, { "epoch": 8.261434511434512, "grad_norm": 0.5020013451576233, "learning_rate": 3.650044404239447e-05, "loss": 0.2246, "num_input_tokens_seen": 2862592, "step": 31790 }, { "epoch": 8.262733887733887, "grad_norm": 1.1406575441360474, "learning_rate": 3.6495409654976176e-05, "loss": 0.1442, "num_input_tokens_seen": 2863008, "step": 31795 }, { "epoch": 8.264033264033264, "grad_norm": 0.39123719930648804, "learning_rate": 3.649037467633795e-05, "loss": 0.2138, "num_input_tokens_seen": 2863472, "step": 31800 }, { "epoch": 8.265332640332641, "grad_norm": 1.867932677268982, "learning_rate": 3.648533910673874e-05, "loss": 0.4613, "num_input_tokens_seen": 2863920, "step": 31805 }, { "epoch": 8.266632016632016, "grad_norm": 0.4006393551826477, "learning_rate": 3.6480302946437545e-05, "loss": 0.2662, "num_input_tokens_seen": 2864368, "step": 31810 }, { "epoch": 8.267931392931393, "grad_norm": 1.3539063930511475, "learning_rate": 3.647526619569336e-05, "loss": 0.3665, "num_input_tokens_seen": 2864848, "step": 31815 }, { "epoch": 8.26923076923077, "grad_norm": 1.446711778640747, "learning_rate": 3.6470228854765245e-05, "loss": 0.3262, "num_input_tokens_seen": 2865312, "step": 31820 }, { "epoch": 8.270530145530145, "grad_norm": 0.38054540753364563, "learning_rate": 3.646519092391227e-05, "loss": 0.272, "num_input_tokens_seen": 2865728, "step": 31825 }, { "epoch": 8.271829521829522, "grad_norm": 1.2409636974334717, "learning_rate": 3.646015240339355e-05, "loss": 0.2755, "num_input_tokens_seen": 2866176, "step": 31830 }, { "epoch": 8.273128898128897, "grad_norm": 1.6394891738891602, "learning_rate": 3.6455113293468197e-05, "loss": 0.3076, "num_input_tokens_seen": 2866592, "step": 31835 }, { "epoch": 8.274428274428274, "grad_norm": 2.809539794921875, "learning_rate": 3.6450073594395394e-05, "loss": 0.2815, "num_input_tokens_seen": 2867024, "step": 31840 }, { "epoch": 8.275727650727651, "grad_norm": 1.4196364879608154, "learning_rate": 3.644503330643434e-05, "loss": 0.2479, "num_input_tokens_seen": 2867488, "step": 31845 }, { "epoch": 8.277027027027026, "grad_norm": 2.6747915744781494, "learning_rate": 3.643999242984426e-05, "loss": 0.3007, "num_input_tokens_seen": 2867920, "step": 31850 }, { "epoch": 8.278326403326403, "grad_norm": 0.5175474882125854, "learning_rate": 3.64349509648844e-05, "loss": 1.0154, "num_input_tokens_seen": 2868336, "step": 31855 }, { "epoch": 8.27962577962578, "grad_norm": 1.1107004880905151, "learning_rate": 3.642990891181405e-05, "loss": 0.2425, "num_input_tokens_seen": 2868784, "step": 31860 }, { "epoch": 8.280925155925155, "grad_norm": 1.449825406074524, "learning_rate": 3.6424866270892546e-05, "loss": 0.2744, "num_input_tokens_seen": 2869200, "step": 31865 }, { "epoch": 8.282224532224532, "grad_norm": 0.9067140817642212, "learning_rate": 3.641982304237921e-05, "loss": 0.1977, "num_input_tokens_seen": 2869648, "step": 31870 }, { "epoch": 8.28352390852391, "grad_norm": 0.5408750176429749, "learning_rate": 3.6414779226533444e-05, "loss": 0.1718, "num_input_tokens_seen": 2870080, "step": 31875 }, { "epoch": 8.284823284823284, "grad_norm": 1.6768046617507935, "learning_rate": 3.640973482361464e-05, "loss": 0.3307, "num_input_tokens_seen": 2870528, "step": 31880 }, { "epoch": 8.286122661122661, "grad_norm": 0.49436765909194946, "learning_rate": 3.640468983388224e-05, "loss": 0.1224, "num_input_tokens_seen": 2870976, "step": 31885 }, { "epoch": 8.287422037422038, "grad_norm": 0.6735735535621643, "learning_rate": 3.6399644257595726e-05, "loss": 0.1116, "num_input_tokens_seen": 2871440, "step": 31890 }, { "epoch": 8.288721413721413, "grad_norm": 1.0288081169128418, "learning_rate": 3.639459809501458e-05, "loss": 0.2891, "num_input_tokens_seen": 2871904, "step": 31895 }, { "epoch": 8.29002079002079, "grad_norm": 0.3896328806877136, "learning_rate": 3.638955134639833e-05, "loss": 0.2081, "num_input_tokens_seen": 2872368, "step": 31900 }, { "epoch": 8.291320166320165, "grad_norm": 0.5881620645523071, "learning_rate": 3.6384504012006544e-05, "loss": 0.2693, "num_input_tokens_seen": 2872816, "step": 31905 }, { "epoch": 8.292619542619542, "grad_norm": 0.5103244185447693, "learning_rate": 3.6379456092098806e-05, "loss": 0.2746, "num_input_tokens_seen": 2873248, "step": 31910 }, { "epoch": 8.29391891891892, "grad_norm": 0.34453436732292175, "learning_rate": 3.6374407586934744e-05, "loss": 0.1638, "num_input_tokens_seen": 2873744, "step": 31915 }, { "epoch": 8.295218295218294, "grad_norm": 1.4331003427505493, "learning_rate": 3.6369358496774e-05, "loss": 0.313, "num_input_tokens_seen": 2874192, "step": 31920 }, { "epoch": 8.296517671517671, "grad_norm": 0.7481818199157715, "learning_rate": 3.636430882187625e-05, "loss": 0.1966, "num_input_tokens_seen": 2874608, "step": 31925 }, { "epoch": 8.297817047817048, "grad_norm": 2.361675262451172, "learning_rate": 3.635925856250121e-05, "loss": 0.2669, "num_input_tokens_seen": 2875024, "step": 31930 }, { "epoch": 8.299116424116423, "grad_norm": 1.2451642751693726, "learning_rate": 3.635420771890862e-05, "loss": 0.2286, "num_input_tokens_seen": 2875424, "step": 31935 }, { "epoch": 8.3004158004158, "grad_norm": 1.2572435140609741, "learning_rate": 3.6349156291358253e-05, "loss": 0.2681, "num_input_tokens_seen": 2875856, "step": 31940 }, { "epoch": 8.301715176715177, "grad_norm": 1.0498288869857788, "learning_rate": 3.63441042801099e-05, "loss": 0.1624, "num_input_tokens_seen": 2876320, "step": 31945 }, { "epoch": 8.303014553014552, "grad_norm": 1.9024649858474731, "learning_rate": 3.633905168542339e-05, "loss": 0.2429, "num_input_tokens_seen": 2876784, "step": 31950 }, { "epoch": 8.30431392931393, "grad_norm": 0.41197454929351807, "learning_rate": 3.63339985075586e-05, "loss": 0.1463, "num_input_tokens_seen": 2877216, "step": 31955 }, { "epoch": 8.305613305613306, "grad_norm": 0.36334702372550964, "learning_rate": 3.632894474677539e-05, "loss": 0.1517, "num_input_tokens_seen": 2877680, "step": 31960 }, { "epoch": 8.306912681912682, "grad_norm": 0.9388946294784546, "learning_rate": 3.6323890403333704e-05, "loss": 0.3477, "num_input_tokens_seen": 2878096, "step": 31965 }, { "epoch": 8.308212058212058, "grad_norm": 0.38427409529685974, "learning_rate": 3.631883547749348e-05, "loss": 0.1906, "num_input_tokens_seen": 2878544, "step": 31970 }, { "epoch": 8.309511434511435, "grad_norm": 2.2837252616882324, "learning_rate": 3.631377996951472e-05, "loss": 0.3236, "num_input_tokens_seen": 2878992, "step": 31975 }, { "epoch": 8.31081081081081, "grad_norm": 1.062089204788208, "learning_rate": 3.63087238796574e-05, "loss": 0.3782, "num_input_tokens_seen": 2879424, "step": 31980 }, { "epoch": 8.312110187110187, "grad_norm": 0.414436399936676, "learning_rate": 3.6303667208181575e-05, "loss": 0.2427, "num_input_tokens_seen": 2879904, "step": 31985 }, { "epoch": 8.313409563409563, "grad_norm": 0.895263135433197, "learning_rate": 3.6298609955347316e-05, "loss": 0.2405, "num_input_tokens_seen": 2880336, "step": 31990 }, { "epoch": 8.31470893970894, "grad_norm": 3.614243268966675, "learning_rate": 3.629355212141472e-05, "loss": 0.2892, "num_input_tokens_seen": 2880816, "step": 31995 }, { "epoch": 8.316008316008316, "grad_norm": 1.2002825736999512, "learning_rate": 3.628849370664392e-05, "loss": 0.216, "num_input_tokens_seen": 2881344, "step": 32000 }, { "epoch": 8.317307692307692, "grad_norm": 3.81948184967041, "learning_rate": 3.6283434711295076e-05, "loss": 0.2652, "num_input_tokens_seen": 2881824, "step": 32005 }, { "epoch": 8.318607068607069, "grad_norm": 0.7597483396530151, "learning_rate": 3.627837513562837e-05, "loss": 0.1481, "num_input_tokens_seen": 2882256, "step": 32010 }, { "epoch": 8.319906444906445, "grad_norm": 0.5800196528434753, "learning_rate": 3.6273314979904026e-05, "loss": 0.1888, "num_input_tokens_seen": 2882736, "step": 32015 }, { "epoch": 8.32120582120582, "grad_norm": 1.3563826084136963, "learning_rate": 3.6268254244382294e-05, "loss": 0.4016, "num_input_tokens_seen": 2883168, "step": 32020 }, { "epoch": 8.322505197505198, "grad_norm": 0.7883554697036743, "learning_rate": 3.6263192929323445e-05, "loss": 0.1287, "num_input_tokens_seen": 2883632, "step": 32025 }, { "epoch": 8.323804573804575, "grad_norm": 1.439443588256836, "learning_rate": 3.625813103498779e-05, "loss": 0.491, "num_input_tokens_seen": 2884080, "step": 32030 }, { "epoch": 8.32510395010395, "grad_norm": 1.2285518646240234, "learning_rate": 3.625306856163567e-05, "loss": 0.2089, "num_input_tokens_seen": 2884528, "step": 32035 }, { "epoch": 8.326403326403327, "grad_norm": 2.477919816970825, "learning_rate": 3.624800550952746e-05, "loss": 0.2596, "num_input_tokens_seen": 2884976, "step": 32040 }, { "epoch": 8.327702702702704, "grad_norm": 1.0465189218521118, "learning_rate": 3.6242941878923545e-05, "loss": 0.2757, "num_input_tokens_seen": 2885440, "step": 32045 }, { "epoch": 8.329002079002079, "grad_norm": 1.5956003665924072, "learning_rate": 3.623787767008435e-05, "loss": 0.3415, "num_input_tokens_seen": 2885840, "step": 32050 }, { "epoch": 8.330301455301456, "grad_norm": 1.963087797164917, "learning_rate": 3.623281288327034e-05, "loss": 0.2825, "num_input_tokens_seen": 2886272, "step": 32055 }, { "epoch": 8.33160083160083, "grad_norm": 0.9153960943222046, "learning_rate": 3.6227747518742005e-05, "loss": 0.2237, "num_input_tokens_seen": 2886688, "step": 32060 }, { "epoch": 8.332900207900208, "grad_norm": 0.40510544180870056, "learning_rate": 3.622268157675986e-05, "loss": 0.1244, "num_input_tokens_seen": 2887136, "step": 32065 }, { "epoch": 8.334199584199585, "grad_norm": 1.633100152015686, "learning_rate": 3.6217615057584435e-05, "loss": 0.4242, "num_input_tokens_seen": 2887584, "step": 32070 }, { "epoch": 8.33549896049896, "grad_norm": 0.6305146813392639, "learning_rate": 3.6212547961476336e-05, "loss": 0.3004, "num_input_tokens_seen": 2888048, "step": 32075 }, { "epoch": 8.336798336798337, "grad_norm": 0.9027585983276367, "learning_rate": 3.6207480288696144e-05, "loss": 0.208, "num_input_tokens_seen": 2888512, "step": 32080 }, { "epoch": 8.338097713097714, "grad_norm": 0.9110301733016968, "learning_rate": 3.62024120395045e-05, "loss": 0.2, "num_input_tokens_seen": 2888976, "step": 32085 }, { "epoch": 8.339397089397089, "grad_norm": 1.2505476474761963, "learning_rate": 3.619734321416208e-05, "loss": 0.219, "num_input_tokens_seen": 2889392, "step": 32090 }, { "epoch": 8.340696465696466, "grad_norm": 1.3337751626968384, "learning_rate": 3.619227381292956e-05, "loss": 0.2007, "num_input_tokens_seen": 2889808, "step": 32095 }, { "epoch": 8.341995841995843, "grad_norm": 1.3161985874176025, "learning_rate": 3.618720383606768e-05, "loss": 0.1694, "num_input_tokens_seen": 2890272, "step": 32100 }, { "epoch": 8.343295218295218, "grad_norm": 1.362457275390625, "learning_rate": 3.618213328383718e-05, "loss": 0.4272, "num_input_tokens_seen": 2890720, "step": 32105 }, { "epoch": 8.344594594594595, "grad_norm": 0.4140280485153198, "learning_rate": 3.617706215649886e-05, "loss": 0.1941, "num_input_tokens_seen": 2891120, "step": 32110 }, { "epoch": 8.345893970893972, "grad_norm": 2.4848194122314453, "learning_rate": 3.617199045431352e-05, "loss": 0.2607, "num_input_tokens_seen": 2891584, "step": 32115 }, { "epoch": 8.347193347193347, "grad_norm": 1.8423792123794556, "learning_rate": 3.6166918177542006e-05, "loss": 0.4167, "num_input_tokens_seen": 2892016, "step": 32120 }, { "epoch": 8.348492723492724, "grad_norm": 1.7970430850982666, "learning_rate": 3.616184532644519e-05, "loss": 0.3372, "num_input_tokens_seen": 2892448, "step": 32125 }, { "epoch": 8.3497920997921, "grad_norm": 1.7862658500671387, "learning_rate": 3.615677190128398e-05, "loss": 0.2393, "num_input_tokens_seen": 2892912, "step": 32130 }, { "epoch": 8.351091476091476, "grad_norm": 2.468101739883423, "learning_rate": 3.6151697902319296e-05, "loss": 0.3679, "num_input_tokens_seen": 2893360, "step": 32135 }, { "epoch": 8.352390852390853, "grad_norm": 1.1402812004089355, "learning_rate": 3.61466233298121e-05, "loss": 0.1925, "num_input_tokens_seen": 2893824, "step": 32140 }, { "epoch": 8.353690228690228, "grad_norm": 3.70955753326416, "learning_rate": 3.614154818402339e-05, "loss": 0.2963, "num_input_tokens_seen": 2894256, "step": 32145 }, { "epoch": 8.354989604989605, "grad_norm": 0.6582499742507935, "learning_rate": 3.613647246521419e-05, "loss": 0.2443, "num_input_tokens_seen": 2894704, "step": 32150 }, { "epoch": 8.356288981288982, "grad_norm": 1.7814563512802124, "learning_rate": 3.613139617364553e-05, "loss": 0.263, "num_input_tokens_seen": 2895152, "step": 32155 }, { "epoch": 8.357588357588357, "grad_norm": 1.3362303972244263, "learning_rate": 3.61263193095785e-05, "loss": 0.261, "num_input_tokens_seen": 2895600, "step": 32160 }, { "epoch": 8.358887733887734, "grad_norm": 1.0401599407196045, "learning_rate": 3.6121241873274205e-05, "loss": 0.2956, "num_input_tokens_seen": 2896016, "step": 32165 }, { "epoch": 8.36018711018711, "grad_norm": 1.2590967416763306, "learning_rate": 3.611616386499379e-05, "loss": 0.2454, "num_input_tokens_seen": 2896480, "step": 32170 }, { "epoch": 8.361486486486486, "grad_norm": 2.920532464981079, "learning_rate": 3.611108528499841e-05, "loss": 0.2239, "num_input_tokens_seen": 2896912, "step": 32175 }, { "epoch": 8.362785862785863, "grad_norm": 0.546238124370575, "learning_rate": 3.610600613354927e-05, "loss": 0.2882, "num_input_tokens_seen": 2897344, "step": 32180 }, { "epoch": 8.36408523908524, "grad_norm": 0.5675384402275085, "learning_rate": 3.6100926410907605e-05, "loss": 0.2493, "num_input_tokens_seen": 2897792, "step": 32185 }, { "epoch": 8.365384615384615, "grad_norm": 0.7779598236083984, "learning_rate": 3.6095846117334656e-05, "loss": 0.2733, "num_input_tokens_seen": 2898224, "step": 32190 }, { "epoch": 8.366683991683992, "grad_norm": 0.6382104754447937, "learning_rate": 3.60907652530917e-05, "loss": 0.1856, "num_input_tokens_seen": 2898704, "step": 32195 }, { "epoch": 8.367983367983369, "grad_norm": 0.7914802432060242, "learning_rate": 3.6085683818440055e-05, "loss": 0.2137, "num_input_tokens_seen": 2899136, "step": 32200 }, { "epoch": 8.369282744282744, "grad_norm": 0.7421646118164062, "learning_rate": 3.6080601813641076e-05, "loss": 0.3366, "num_input_tokens_seen": 2899568, "step": 32205 }, { "epoch": 8.370582120582121, "grad_norm": 1.951170802116394, "learning_rate": 3.6075519238956135e-05, "loss": 0.2295, "num_input_tokens_seen": 2900000, "step": 32210 }, { "epoch": 8.371881496881496, "grad_norm": 1.920065999031067, "learning_rate": 3.6070436094646626e-05, "loss": 0.3033, "num_input_tokens_seen": 2900432, "step": 32215 }, { "epoch": 8.373180873180873, "grad_norm": 0.4269039034843445, "learning_rate": 3.6065352380973984e-05, "loss": 0.0683, "num_input_tokens_seen": 2900912, "step": 32220 }, { "epoch": 8.37448024948025, "grad_norm": 1.5111689567565918, "learning_rate": 3.606026809819966e-05, "loss": 0.4462, "num_input_tokens_seen": 2901376, "step": 32225 }, { "epoch": 8.375779625779625, "grad_norm": 2.460935115814209, "learning_rate": 3.605518324658514e-05, "loss": 0.3098, "num_input_tokens_seen": 2901808, "step": 32230 }, { "epoch": 8.377079002079002, "grad_norm": 1.3834837675094604, "learning_rate": 3.605009782639197e-05, "loss": 0.1601, "num_input_tokens_seen": 2902272, "step": 32235 }, { "epoch": 8.378378378378379, "grad_norm": 1.1751000881195068, "learning_rate": 3.604501183788168e-05, "loss": 0.2732, "num_input_tokens_seen": 2902720, "step": 32240 }, { "epoch": 8.379677754677754, "grad_norm": 1.162832498550415, "learning_rate": 3.603992528131584e-05, "loss": 0.3126, "num_input_tokens_seen": 2903184, "step": 32245 }, { "epoch": 8.380977130977131, "grad_norm": 1.0210940837860107, "learning_rate": 3.6034838156956066e-05, "loss": 0.3305, "num_input_tokens_seen": 2903664, "step": 32250 }, { "epoch": 8.382276507276508, "grad_norm": 0.8683295249938965, "learning_rate": 3.6029750465064e-05, "loss": 0.2514, "num_input_tokens_seen": 2904096, "step": 32255 }, { "epoch": 8.383575883575883, "grad_norm": 0.5679810047149658, "learning_rate": 3.602466220590129e-05, "loss": 0.2067, "num_input_tokens_seen": 2904544, "step": 32260 }, { "epoch": 8.38487525987526, "grad_norm": 0.9820987582206726, "learning_rate": 3.6019573379729643e-05, "loss": 0.1976, "num_input_tokens_seen": 2904960, "step": 32265 }, { "epoch": 8.386174636174637, "grad_norm": 1.1624196767807007, "learning_rate": 3.6014483986810784e-05, "loss": 0.247, "num_input_tokens_seen": 2905456, "step": 32270 }, { "epoch": 8.387474012474012, "grad_norm": 1.071312665939331, "learning_rate": 3.6009394027406454e-05, "loss": 0.3075, "num_input_tokens_seen": 2905872, "step": 32275 }, { "epoch": 8.388773388773389, "grad_norm": 0.4088590145111084, "learning_rate": 3.600430350177845e-05, "loss": 0.2124, "num_input_tokens_seen": 2906320, "step": 32280 }, { "epoch": 8.390072765072764, "grad_norm": 1.160956859588623, "learning_rate": 3.599921241018856e-05, "loss": 0.2933, "num_input_tokens_seen": 2906784, "step": 32285 }, { "epoch": 8.391372141372141, "grad_norm": 1.3444596529006958, "learning_rate": 3.599412075289864e-05, "loss": 0.3141, "num_input_tokens_seen": 2907248, "step": 32290 }, { "epoch": 8.392671517671518, "grad_norm": 1.369410753250122, "learning_rate": 3.598902853017057e-05, "loss": 0.2955, "num_input_tokens_seen": 2907712, "step": 32295 }, { "epoch": 8.393970893970893, "grad_norm": 0.8464891910552979, "learning_rate": 3.598393574226621e-05, "loss": 0.1886, "num_input_tokens_seen": 2908160, "step": 32300 }, { "epoch": 8.39527027027027, "grad_norm": 1.5685373544692993, "learning_rate": 3.597884238944752e-05, "loss": 0.3305, "num_input_tokens_seen": 2908576, "step": 32305 }, { "epoch": 8.396569646569647, "grad_norm": 1.2901923656463623, "learning_rate": 3.597374847197646e-05, "loss": 0.189, "num_input_tokens_seen": 2909008, "step": 32310 }, { "epoch": 8.397869022869022, "grad_norm": 0.8336505889892578, "learning_rate": 3.5968653990114984e-05, "loss": 0.2898, "num_input_tokens_seen": 2909440, "step": 32315 }, { "epoch": 8.3991683991684, "grad_norm": 0.7803589105606079, "learning_rate": 3.596355894412512e-05, "loss": 0.1832, "num_input_tokens_seen": 2909888, "step": 32320 }, { "epoch": 8.400467775467776, "grad_norm": 0.9453974962234497, "learning_rate": 3.5958463334268925e-05, "loss": 0.1977, "num_input_tokens_seen": 2910336, "step": 32325 }, { "epoch": 8.401767151767151, "grad_norm": 0.8630952835083008, "learning_rate": 3.595336716080846e-05, "loss": 0.3642, "num_input_tokens_seen": 2910752, "step": 32330 }, { "epoch": 8.403066528066528, "grad_norm": 0.7875455617904663, "learning_rate": 3.594827042400583e-05, "loss": 0.2849, "num_input_tokens_seen": 2911184, "step": 32335 }, { "epoch": 8.404365904365905, "grad_norm": 0.4031147360801697, "learning_rate": 3.5943173124123156e-05, "loss": 0.2681, "num_input_tokens_seen": 2911616, "step": 32340 }, { "epoch": 8.40566528066528, "grad_norm": 0.9369676113128662, "learning_rate": 3.593807526142261e-05, "loss": 0.2443, "num_input_tokens_seen": 2912032, "step": 32345 }, { "epoch": 8.406964656964657, "grad_norm": 1.413833498954773, "learning_rate": 3.5932976836166366e-05, "loss": 0.2774, "num_input_tokens_seen": 2912480, "step": 32350 }, { "epoch": 8.408264033264032, "grad_norm": 1.4390206336975098, "learning_rate": 3.592787784861665e-05, "loss": 0.307, "num_input_tokens_seen": 2912912, "step": 32355 }, { "epoch": 8.40956340956341, "grad_norm": 0.7949983477592468, "learning_rate": 3.5922778299035706e-05, "loss": 0.2043, "num_input_tokens_seen": 2913360, "step": 32360 }, { "epoch": 8.410862785862786, "grad_norm": 1.8533381223678589, "learning_rate": 3.5917678187685814e-05, "loss": 0.2183, "num_input_tokens_seen": 2913824, "step": 32365 }, { "epoch": 8.412162162162161, "grad_norm": 0.7344167828559875, "learning_rate": 3.5912577514829264e-05, "loss": 0.2207, "num_input_tokens_seen": 2914272, "step": 32370 }, { "epoch": 8.413461538461538, "grad_norm": 0.5216479897499084, "learning_rate": 3.5907476280728405e-05, "loss": 0.1971, "num_input_tokens_seen": 2914736, "step": 32375 }, { "epoch": 8.414760914760915, "grad_norm": 0.6902527809143066, "learning_rate": 3.590237448564558e-05, "loss": 0.225, "num_input_tokens_seen": 2915200, "step": 32380 }, { "epoch": 8.41606029106029, "grad_norm": 0.5570885539054871, "learning_rate": 3.5897272129843194e-05, "loss": 0.1667, "num_input_tokens_seen": 2915664, "step": 32385 }, { "epoch": 8.417359667359667, "grad_norm": 1.860923171043396, "learning_rate": 3.589216921358366e-05, "loss": 0.2851, "num_input_tokens_seen": 2916128, "step": 32390 }, { "epoch": 8.418659043659044, "grad_norm": 0.5311372876167297, "learning_rate": 3.588706573712942e-05, "loss": 0.3149, "num_input_tokens_seen": 2916560, "step": 32395 }, { "epoch": 8.41995841995842, "grad_norm": 0.7692933678627014, "learning_rate": 3.588196170074297e-05, "loss": 0.195, "num_input_tokens_seen": 2917024, "step": 32400 }, { "epoch": 8.421257796257796, "grad_norm": 1.4936250448226929, "learning_rate": 3.58768571046868e-05, "loss": 0.3219, "num_input_tokens_seen": 2917488, "step": 32405 }, { "epoch": 8.422557172557173, "grad_norm": 0.6077067852020264, "learning_rate": 3.5871751949223444e-05, "loss": 0.2439, "num_input_tokens_seen": 2917968, "step": 32410 }, { "epoch": 8.423856548856548, "grad_norm": 0.3727877736091614, "learning_rate": 3.5866646234615474e-05, "loss": 0.2796, "num_input_tokens_seen": 2918416, "step": 32415 }, { "epoch": 8.425155925155925, "grad_norm": 0.4930076599121094, "learning_rate": 3.5861539961125475e-05, "loss": 0.1685, "num_input_tokens_seen": 2918896, "step": 32420 }, { "epoch": 8.426455301455302, "grad_norm": 0.6107742190361023, "learning_rate": 3.585643312901606e-05, "loss": 0.263, "num_input_tokens_seen": 2919376, "step": 32425 }, { "epoch": 8.427754677754677, "grad_norm": 1.5886470079421997, "learning_rate": 3.585132573854989e-05, "loss": 0.208, "num_input_tokens_seen": 2919872, "step": 32430 }, { "epoch": 8.429054054054054, "grad_norm": 1.6592979431152344, "learning_rate": 3.5846217789989644e-05, "loss": 0.3197, "num_input_tokens_seen": 2920336, "step": 32435 }, { "epoch": 8.43035343035343, "grad_norm": 0.4261212944984436, "learning_rate": 3.584110928359803e-05, "loss": 0.0991, "num_input_tokens_seen": 2920768, "step": 32440 }, { "epoch": 8.431652806652806, "grad_norm": 1.5403931140899658, "learning_rate": 3.5836000219637765e-05, "loss": 0.1445, "num_input_tokens_seen": 2921248, "step": 32445 }, { "epoch": 8.432952182952183, "grad_norm": 1.8495874404907227, "learning_rate": 3.5830890598371635e-05, "loss": 0.3298, "num_input_tokens_seen": 2921712, "step": 32450 }, { "epoch": 8.434251559251559, "grad_norm": 0.14951293170452118, "learning_rate": 3.582578042006242e-05, "loss": 0.3311, "num_input_tokens_seen": 2922160, "step": 32455 }, { "epoch": 8.435550935550935, "grad_norm": 0.4510830044746399, "learning_rate": 3.5820669684972955e-05, "loss": 0.2522, "num_input_tokens_seen": 2922592, "step": 32460 }, { "epoch": 8.436850311850312, "grad_norm": 0.36900073289871216, "learning_rate": 3.581555839336606e-05, "loss": 0.1873, "num_input_tokens_seen": 2923024, "step": 32465 }, { "epoch": 8.438149688149688, "grad_norm": 0.7087225914001465, "learning_rate": 3.581044654550465e-05, "loss": 0.2537, "num_input_tokens_seen": 2923504, "step": 32470 }, { "epoch": 8.439449064449065, "grad_norm": 0.4934324324131012, "learning_rate": 3.580533414165162e-05, "loss": 0.1746, "num_input_tokens_seen": 2924000, "step": 32475 }, { "epoch": 8.440748440748441, "grad_norm": 1.0542287826538086, "learning_rate": 3.580022118206989e-05, "loss": 0.2246, "num_input_tokens_seen": 2924448, "step": 32480 }, { "epoch": 8.442047817047817, "grad_norm": 1.4244168996810913, "learning_rate": 3.579510766702244e-05, "loss": 0.3321, "num_input_tokens_seen": 2924864, "step": 32485 }, { "epoch": 8.443347193347194, "grad_norm": 1.6053340435028076, "learning_rate": 3.578999359677226e-05, "loss": 0.281, "num_input_tokens_seen": 2925344, "step": 32490 }, { "epoch": 8.44464656964657, "grad_norm": 0.7286517024040222, "learning_rate": 3.578487897158236e-05, "loss": 0.2212, "num_input_tokens_seen": 2925808, "step": 32495 }, { "epoch": 8.445945945945946, "grad_norm": 0.3492123484611511, "learning_rate": 3.577976379171581e-05, "loss": 0.2228, "num_input_tokens_seen": 2926240, "step": 32500 }, { "epoch": 8.447245322245323, "grad_norm": 0.5111204385757446, "learning_rate": 3.577464805743569e-05, "loss": 0.1674, "num_input_tokens_seen": 2926704, "step": 32505 }, { "epoch": 8.448544698544698, "grad_norm": 0.4847742021083832, "learning_rate": 3.576953176900509e-05, "loss": 0.1445, "num_input_tokens_seen": 2927152, "step": 32510 }, { "epoch": 8.449844074844075, "grad_norm": 0.40403005480766296, "learning_rate": 3.5764414926687144e-05, "loss": 0.2095, "num_input_tokens_seen": 2927584, "step": 32515 }, { "epoch": 8.451143451143452, "grad_norm": 0.4254247546195984, "learning_rate": 3.575929753074503e-05, "loss": 0.2283, "num_input_tokens_seen": 2928048, "step": 32520 }, { "epoch": 8.452442827442827, "grad_norm": 0.2538084089756012, "learning_rate": 3.575417958144194e-05, "loss": 0.1309, "num_input_tokens_seen": 2928480, "step": 32525 }, { "epoch": 8.453742203742204, "grad_norm": 0.3276709318161011, "learning_rate": 3.574906107904108e-05, "loss": 0.2603, "num_input_tokens_seen": 2928944, "step": 32530 }, { "epoch": 8.45504158004158, "grad_norm": 1.6601537466049194, "learning_rate": 3.5743942023805715e-05, "loss": 0.3835, "num_input_tokens_seen": 2929408, "step": 32535 }, { "epoch": 8.456340956340956, "grad_norm": 0.32502931356430054, "learning_rate": 3.573882241599912e-05, "loss": 0.3444, "num_input_tokens_seen": 2929840, "step": 32540 }, { "epoch": 8.457640332640333, "grad_norm": 0.38987764716148376, "learning_rate": 3.57337022558846e-05, "loss": 0.2371, "num_input_tokens_seen": 2930320, "step": 32545 }, { "epoch": 8.45893970893971, "grad_norm": 1.071996808052063, "learning_rate": 3.572858154372548e-05, "loss": 0.1964, "num_input_tokens_seen": 2930736, "step": 32550 }, { "epoch": 8.460239085239085, "grad_norm": 0.6704517602920532, "learning_rate": 3.5723460279785135e-05, "loss": 0.2842, "num_input_tokens_seen": 2931152, "step": 32555 }, { "epoch": 8.461538461538462, "grad_norm": 0.7240414619445801, "learning_rate": 3.571833846432696e-05, "loss": 0.2431, "num_input_tokens_seen": 2931568, "step": 32560 }, { "epoch": 8.462837837837839, "grad_norm": 0.8985859155654907, "learning_rate": 3.571321609761435e-05, "loss": 0.2107, "num_input_tokens_seen": 2932032, "step": 32565 }, { "epoch": 8.464137214137214, "grad_norm": 1.1994751691818237, "learning_rate": 3.5708093179910786e-05, "loss": 0.3211, "num_input_tokens_seen": 2932480, "step": 32570 }, { "epoch": 8.46543659043659, "grad_norm": 0.9425951838493347, "learning_rate": 3.5702969711479726e-05, "loss": 0.1944, "num_input_tokens_seen": 2932928, "step": 32575 }, { "epoch": 8.466735966735968, "grad_norm": 1.260801076889038, "learning_rate": 3.569784569258469e-05, "loss": 0.2449, "num_input_tokens_seen": 2933408, "step": 32580 }, { "epoch": 8.468035343035343, "grad_norm": 1.3474088907241821, "learning_rate": 3.569272112348918e-05, "loss": 0.3029, "num_input_tokens_seen": 2933872, "step": 32585 }, { "epoch": 8.46933471933472, "grad_norm": 2.7751340866088867, "learning_rate": 3.5687596004456785e-05, "loss": 0.3456, "num_input_tokens_seen": 2934352, "step": 32590 }, { "epoch": 8.470634095634095, "grad_norm": 0.42293035984039307, "learning_rate": 3.568247033575109e-05, "loss": 0.2634, "num_input_tokens_seen": 2934800, "step": 32595 }, { "epoch": 8.471933471933472, "grad_norm": 1.6815898418426514, "learning_rate": 3.567734411763571e-05, "loss": 0.1746, "num_input_tokens_seen": 2935248, "step": 32600 }, { "epoch": 8.473232848232849, "grad_norm": 1.050193190574646, "learning_rate": 3.5672217350374284e-05, "loss": 0.2499, "num_input_tokens_seen": 2935712, "step": 32605 }, { "epoch": 8.474532224532224, "grad_norm": 0.3688344657421112, "learning_rate": 3.566709003423051e-05, "loss": 0.2708, "num_input_tokens_seen": 2936160, "step": 32610 }, { "epoch": 8.4758316008316, "grad_norm": 0.3624579608440399, "learning_rate": 3.5661962169468065e-05, "loss": 0.1017, "num_input_tokens_seen": 2936608, "step": 32615 }, { "epoch": 8.477130977130978, "grad_norm": 0.741247296333313, "learning_rate": 3.565683375635068e-05, "loss": 0.1773, "num_input_tokens_seen": 2937024, "step": 32620 }, { "epoch": 8.478430353430353, "grad_norm": 0.5107184052467346, "learning_rate": 3.565170479514214e-05, "loss": 0.1742, "num_input_tokens_seen": 2937456, "step": 32625 }, { "epoch": 8.47972972972973, "grad_norm": 0.39877399802207947, "learning_rate": 3.564657528610621e-05, "loss": 0.2354, "num_input_tokens_seen": 2937904, "step": 32630 }, { "epoch": 8.481029106029107, "grad_norm": 1.6645106077194214, "learning_rate": 3.564144522950671e-05, "loss": 0.4172, "num_input_tokens_seen": 2938352, "step": 32635 }, { "epoch": 8.482328482328482, "grad_norm": 1.8636291027069092, "learning_rate": 3.563631462560749e-05, "loss": 0.391, "num_input_tokens_seen": 2938784, "step": 32640 }, { "epoch": 8.483627858627859, "grad_norm": 0.4096364378929138, "learning_rate": 3.563118347467241e-05, "loss": 0.2023, "num_input_tokens_seen": 2939232, "step": 32645 }, { "epoch": 8.484927234927236, "grad_norm": 0.5016515851020813, "learning_rate": 3.562605177696539e-05, "loss": 0.2019, "num_input_tokens_seen": 2939664, "step": 32650 }, { "epoch": 8.486226611226611, "grad_norm": 1.2809839248657227, "learning_rate": 3.562091953275034e-05, "loss": 0.3767, "num_input_tokens_seen": 2940128, "step": 32655 }, { "epoch": 8.487525987525988, "grad_norm": 0.7392911314964294, "learning_rate": 3.561578674229122e-05, "loss": 0.2211, "num_input_tokens_seen": 2940592, "step": 32660 }, { "epoch": 8.488825363825363, "grad_norm": 0.42997097969055176, "learning_rate": 3.5610653405852014e-05, "loss": 0.1841, "num_input_tokens_seen": 2941040, "step": 32665 }, { "epoch": 8.49012474012474, "grad_norm": 0.8284299969673157, "learning_rate": 3.560551952369674e-05, "loss": 0.1933, "num_input_tokens_seen": 2941472, "step": 32670 }, { "epoch": 8.491424116424117, "grad_norm": 1.040833592414856, "learning_rate": 3.560038509608944e-05, "loss": 0.2298, "num_input_tokens_seen": 2941904, "step": 32675 }, { "epoch": 8.492723492723492, "grad_norm": 0.5243371725082397, "learning_rate": 3.5595250123294175e-05, "loss": 0.2311, "num_input_tokens_seen": 2942368, "step": 32680 }, { "epoch": 8.494022869022869, "grad_norm": 0.7436420917510986, "learning_rate": 3.559011460557504e-05, "loss": 0.2691, "num_input_tokens_seen": 2942848, "step": 32685 }, { "epoch": 8.495322245322246, "grad_norm": 0.6471184492111206, "learning_rate": 3.558497854319617e-05, "loss": 0.3877, "num_input_tokens_seen": 2943280, "step": 32690 }, { "epoch": 8.496621621621621, "grad_norm": 0.8110925555229187, "learning_rate": 3.5579841936421696e-05, "loss": 0.2746, "num_input_tokens_seen": 2943712, "step": 32695 }, { "epoch": 8.497920997920998, "grad_norm": 0.8185574412345886, "learning_rate": 3.557470478551583e-05, "loss": 0.2175, "num_input_tokens_seen": 2944176, "step": 32700 }, { "epoch": 8.499220374220375, "grad_norm": 1.1559951305389404, "learning_rate": 3.5569567090742764e-05, "loss": 0.2041, "num_input_tokens_seen": 2944624, "step": 32705 }, { "epoch": 8.50051975051975, "grad_norm": 0.6487485766410828, "learning_rate": 3.5564428852366725e-05, "loss": 0.2476, "num_input_tokens_seen": 2945056, "step": 32710 }, { "epoch": 8.501819126819127, "grad_norm": 0.9543159008026123, "learning_rate": 3.5559290070652e-05, "loss": 0.2907, "num_input_tokens_seen": 2945520, "step": 32715 }, { "epoch": 8.503118503118504, "grad_norm": 1.3900682926177979, "learning_rate": 3.555415074586286e-05, "loss": 0.2997, "num_input_tokens_seen": 2945936, "step": 32720 }, { "epoch": 8.504417879417879, "grad_norm": 0.7064160108566284, "learning_rate": 3.554901087826364e-05, "loss": 0.3031, "num_input_tokens_seen": 2946416, "step": 32725 }, { "epoch": 8.505717255717256, "grad_norm": 0.9166541695594788, "learning_rate": 3.5543870468118676e-05, "loss": 0.3091, "num_input_tokens_seen": 2946832, "step": 32730 }, { "epoch": 8.507016632016633, "grad_norm": 1.3187055587768555, "learning_rate": 3.553872951569236e-05, "loss": 0.2117, "num_input_tokens_seen": 2947296, "step": 32735 }, { "epoch": 8.508316008316008, "grad_norm": 0.843633770942688, "learning_rate": 3.5533588021249084e-05, "loss": 0.292, "num_input_tokens_seen": 2947776, "step": 32740 }, { "epoch": 8.509615384615385, "grad_norm": 0.6059906482696533, "learning_rate": 3.552844598505328e-05, "loss": 0.3174, "num_input_tokens_seen": 2948208, "step": 32745 }, { "epoch": 8.51091476091476, "grad_norm": 1.570090651512146, "learning_rate": 3.552330340736942e-05, "loss": 0.3275, "num_input_tokens_seen": 2948688, "step": 32750 }, { "epoch": 8.512214137214137, "grad_norm": 1.5977721214294434, "learning_rate": 3.5518160288461975e-05, "loss": 0.2771, "num_input_tokens_seen": 2949136, "step": 32755 }, { "epoch": 8.513513513513514, "grad_norm": 1.161639928817749, "learning_rate": 3.551301662859548e-05, "loss": 0.2649, "num_input_tokens_seen": 2949568, "step": 32760 }, { "epoch": 8.51481288981289, "grad_norm": 1.0055031776428223, "learning_rate": 3.550787242803444e-05, "loss": 0.209, "num_input_tokens_seen": 2950016, "step": 32765 }, { "epoch": 8.516112266112266, "grad_norm": 0.5868513584136963, "learning_rate": 3.5502727687043476e-05, "loss": 0.2242, "num_input_tokens_seen": 2950464, "step": 32770 }, { "epoch": 8.517411642411643, "grad_norm": 1.234824299812317, "learning_rate": 3.549758240588716e-05, "loss": 0.2847, "num_input_tokens_seen": 2950896, "step": 32775 }, { "epoch": 8.518711018711018, "grad_norm": 1.9087167978286743, "learning_rate": 3.549243658483012e-05, "loss": 0.4007, "num_input_tokens_seen": 2951376, "step": 32780 }, { "epoch": 8.520010395010395, "grad_norm": 0.614454448223114, "learning_rate": 3.548729022413701e-05, "loss": 0.2221, "num_input_tokens_seen": 2951808, "step": 32785 }, { "epoch": 8.521309771309772, "grad_norm": 0.840697705745697, "learning_rate": 3.5482143324072517e-05, "loss": 0.27, "num_input_tokens_seen": 2952304, "step": 32790 }, { "epoch": 8.522609147609147, "grad_norm": 0.777106761932373, "learning_rate": 3.5476995884901357e-05, "loss": 0.2443, "num_input_tokens_seen": 2952784, "step": 32795 }, { "epoch": 8.523908523908524, "grad_norm": 1.3523765802383423, "learning_rate": 3.547184790688825e-05, "loss": 0.2538, "num_input_tokens_seen": 2953216, "step": 32800 }, { "epoch": 8.5252079002079, "grad_norm": 0.9351479411125183, "learning_rate": 3.546669939029798e-05, "loss": 0.2239, "num_input_tokens_seen": 2953648, "step": 32805 }, { "epoch": 8.526507276507276, "grad_norm": 0.7386221885681152, "learning_rate": 3.546155033539533e-05, "loss": 0.2432, "num_input_tokens_seen": 2954112, "step": 32810 }, { "epoch": 8.527806652806653, "grad_norm": 0.694995641708374, "learning_rate": 3.5456400742445115e-05, "loss": 0.2063, "num_input_tokens_seen": 2954576, "step": 32815 }, { "epoch": 8.529106029106028, "grad_norm": 0.9974072575569153, "learning_rate": 3.545125061171219e-05, "loss": 0.2027, "num_input_tokens_seen": 2955008, "step": 32820 }, { "epoch": 8.530405405405405, "grad_norm": 0.4360382556915283, "learning_rate": 3.5446099943461445e-05, "loss": 0.1319, "num_input_tokens_seen": 2955440, "step": 32825 }, { "epoch": 8.531704781704782, "grad_norm": 0.40233176946640015, "learning_rate": 3.5440948737957756e-05, "loss": 0.2558, "num_input_tokens_seen": 2955920, "step": 32830 }, { "epoch": 8.533004158004157, "grad_norm": 0.4668116569519043, "learning_rate": 3.543579699546607e-05, "loss": 0.1402, "num_input_tokens_seen": 2956384, "step": 32835 }, { "epoch": 8.534303534303534, "grad_norm": 0.6647194027900696, "learning_rate": 3.543064471625136e-05, "loss": 0.334, "num_input_tokens_seen": 2956880, "step": 32840 }, { "epoch": 8.535602910602911, "grad_norm": 1.9611222743988037, "learning_rate": 3.5425491900578586e-05, "loss": 0.299, "num_input_tokens_seen": 2957344, "step": 32845 }, { "epoch": 8.536902286902286, "grad_norm": 1.150789499282837, "learning_rate": 3.542033854871278e-05, "loss": 0.2207, "num_input_tokens_seen": 2957808, "step": 32850 }, { "epoch": 8.538201663201663, "grad_norm": 1.7083462476730347, "learning_rate": 3.5415184660918974e-05, "loss": 0.2366, "num_input_tokens_seen": 2958288, "step": 32855 }, { "epoch": 8.53950103950104, "grad_norm": 1.4700897932052612, "learning_rate": 3.541003023746225e-05, "loss": 0.2289, "num_input_tokens_seen": 2958736, "step": 32860 }, { "epoch": 8.540800415800415, "grad_norm": 0.8139554262161255, "learning_rate": 3.540487527860769e-05, "loss": 0.3541, "num_input_tokens_seen": 2959184, "step": 32865 }, { "epoch": 8.542099792099792, "grad_norm": 1.499388337135315, "learning_rate": 3.539971978462043e-05, "loss": 0.3567, "num_input_tokens_seen": 2959664, "step": 32870 }, { "epoch": 8.54339916839917, "grad_norm": 0.685308039188385, "learning_rate": 3.5394563755765615e-05, "loss": 0.1929, "num_input_tokens_seen": 2960112, "step": 32875 }, { "epoch": 8.544698544698544, "grad_norm": 1.2543525695800781, "learning_rate": 3.538940719230842e-05, "loss": 0.2084, "num_input_tokens_seen": 2960544, "step": 32880 }, { "epoch": 8.545997920997921, "grad_norm": 0.586144208908081, "learning_rate": 3.5384250094514073e-05, "loss": 0.2758, "num_input_tokens_seen": 2961024, "step": 32885 }, { "epoch": 8.547297297297296, "grad_norm": 1.0819332599639893, "learning_rate": 3.5379092462647776e-05, "loss": 0.1438, "num_input_tokens_seen": 2961456, "step": 32890 }, { "epoch": 8.548596673596673, "grad_norm": 4.355114459991455, "learning_rate": 3.5373934296974816e-05, "loss": 0.2783, "num_input_tokens_seen": 2961920, "step": 32895 }, { "epoch": 8.54989604989605, "grad_norm": 1.1292152404785156, "learning_rate": 3.536877559776048e-05, "loss": 0.1792, "num_input_tokens_seen": 2962368, "step": 32900 }, { "epoch": 8.551195426195425, "grad_norm": 1.8300811052322388, "learning_rate": 3.5363616365270075e-05, "loss": 0.3963, "num_input_tokens_seen": 2962816, "step": 32905 }, { "epoch": 8.552494802494802, "grad_norm": 1.7564799785614014, "learning_rate": 3.535845659976895e-05, "loss": 0.1941, "num_input_tokens_seen": 2963296, "step": 32910 }, { "epoch": 8.55379417879418, "grad_norm": 2.4897348880767822, "learning_rate": 3.5353296301522474e-05, "loss": 0.171, "num_input_tokens_seen": 2963728, "step": 32915 }, { "epoch": 8.555093555093555, "grad_norm": 2.2495100498199463, "learning_rate": 3.5348135470796054e-05, "loss": 0.3484, "num_input_tokens_seen": 2964192, "step": 32920 }, { "epoch": 8.556392931392931, "grad_norm": 1.2992236614227295, "learning_rate": 3.534297410785512e-05, "loss": 0.1588, "num_input_tokens_seen": 2964624, "step": 32925 }, { "epoch": 8.557692307692308, "grad_norm": 1.0203250646591187, "learning_rate": 3.53378122129651e-05, "loss": 0.1695, "num_input_tokens_seen": 2965072, "step": 32930 }, { "epoch": 8.558991683991684, "grad_norm": 0.35133954882621765, "learning_rate": 3.533264978639151e-05, "loss": 0.251, "num_input_tokens_seen": 2965504, "step": 32935 }, { "epoch": 8.56029106029106, "grad_norm": 2.7313241958618164, "learning_rate": 3.5327486828399834e-05, "loss": 0.2903, "num_input_tokens_seen": 2965968, "step": 32940 }, { "epoch": 8.561590436590437, "grad_norm": 1.1472249031066895, "learning_rate": 3.53223233392556e-05, "loss": 0.2861, "num_input_tokens_seen": 2966368, "step": 32945 }, { "epoch": 8.562889812889813, "grad_norm": 1.6830193996429443, "learning_rate": 3.5317159319224406e-05, "loss": 0.1914, "num_input_tokens_seen": 2966800, "step": 32950 }, { "epoch": 8.56418918918919, "grad_norm": 0.7004402279853821, "learning_rate": 3.531199476857182e-05, "loss": 0.2015, "num_input_tokens_seen": 2967248, "step": 32955 }, { "epoch": 8.565488565488565, "grad_norm": 2.0289809703826904, "learning_rate": 3.5306829687563455e-05, "loss": 0.3231, "num_input_tokens_seen": 2967696, "step": 32960 }, { "epoch": 8.566787941787942, "grad_norm": 0.44173750281333923, "learning_rate": 3.530166407646497e-05, "loss": 0.0675, "num_input_tokens_seen": 2968128, "step": 32965 }, { "epoch": 8.568087318087318, "grad_norm": 2.0461604595184326, "learning_rate": 3.529649793554203e-05, "loss": 0.4535, "num_input_tokens_seen": 2968592, "step": 32970 }, { "epoch": 8.569386694386694, "grad_norm": 2.4324753284454346, "learning_rate": 3.5291331265060336e-05, "loss": 0.3292, "num_input_tokens_seen": 2969040, "step": 32975 }, { "epoch": 8.57068607068607, "grad_norm": 0.6557482481002808, "learning_rate": 3.528616406528561e-05, "loss": 0.1968, "num_input_tokens_seen": 2969488, "step": 32980 }, { "epoch": 8.571985446985448, "grad_norm": 1.3229018449783325, "learning_rate": 3.5280996336483614e-05, "loss": 0.1601, "num_input_tokens_seen": 2969888, "step": 32985 }, { "epoch": 8.573284823284823, "grad_norm": 1.5210387706756592, "learning_rate": 3.527582807892013e-05, "loss": 0.2289, "num_input_tokens_seen": 2970320, "step": 32990 }, { "epoch": 8.5745841995842, "grad_norm": 0.8185821175575256, "learning_rate": 3.527065929286095e-05, "loss": 0.1475, "num_input_tokens_seen": 2970736, "step": 32995 }, { "epoch": 8.575883575883577, "grad_norm": 0.6321808695793152, "learning_rate": 3.526548997857193e-05, "loss": 0.1391, "num_input_tokens_seen": 2971168, "step": 33000 }, { "epoch": 8.577182952182952, "grad_norm": 0.6605991721153259, "learning_rate": 3.526032013631893e-05, "loss": 0.2448, "num_input_tokens_seen": 2971600, "step": 33005 }, { "epoch": 8.578482328482329, "grad_norm": 2.080444574356079, "learning_rate": 3.5255149766367826e-05, "loss": 0.3825, "num_input_tokens_seen": 2972080, "step": 33010 }, { "epoch": 8.579781704781706, "grad_norm": 1.2406573295593262, "learning_rate": 3.524997886898454e-05, "loss": 0.1906, "num_input_tokens_seen": 2972512, "step": 33015 }, { "epoch": 8.58108108108108, "grad_norm": 0.5904759168624878, "learning_rate": 3.524480744443503e-05, "loss": 0.3335, "num_input_tokens_seen": 2972976, "step": 33020 }, { "epoch": 8.582380457380458, "grad_norm": 0.8520180583000183, "learning_rate": 3.523963549298525e-05, "loss": 0.2594, "num_input_tokens_seen": 2973440, "step": 33025 }, { "epoch": 8.583679833679835, "grad_norm": 2.9931132793426514, "learning_rate": 3.52344630149012e-05, "loss": 0.3066, "num_input_tokens_seen": 2973920, "step": 33030 }, { "epoch": 8.58497920997921, "grad_norm": 2.2759311199188232, "learning_rate": 3.5229290010448915e-05, "loss": 0.3126, "num_input_tokens_seen": 2974432, "step": 33035 }, { "epoch": 8.586278586278587, "grad_norm": 0.556185781955719, "learning_rate": 3.5224116479894456e-05, "loss": 0.1539, "num_input_tokens_seen": 2974848, "step": 33040 }, { "epoch": 8.587577962577962, "grad_norm": 0.8618103861808777, "learning_rate": 3.5218942423503874e-05, "loss": 0.1887, "num_input_tokens_seen": 2975296, "step": 33045 }, { "epoch": 8.588877338877339, "grad_norm": 1.8739547729492188, "learning_rate": 3.521376784154331e-05, "loss": 0.3525, "num_input_tokens_seen": 2975744, "step": 33050 }, { "epoch": 8.590176715176716, "grad_norm": 1.999815821647644, "learning_rate": 3.5208592734278854e-05, "loss": 0.3172, "num_input_tokens_seen": 2976160, "step": 33055 }, { "epoch": 8.59147609147609, "grad_norm": 3.122692108154297, "learning_rate": 3.520341710197671e-05, "loss": 0.3732, "num_input_tokens_seen": 2976608, "step": 33060 }, { "epoch": 8.592775467775468, "grad_norm": 1.3774129152297974, "learning_rate": 3.519824094490305e-05, "loss": 0.3121, "num_input_tokens_seen": 2977088, "step": 33065 }, { "epoch": 8.594074844074845, "grad_norm": 1.7361156940460205, "learning_rate": 3.519306426332408e-05, "loss": 0.2463, "num_input_tokens_seen": 2977536, "step": 33070 }, { "epoch": 8.59537422037422, "grad_norm": 0.6849130988121033, "learning_rate": 3.518788705750605e-05, "loss": 0.1347, "num_input_tokens_seen": 2978032, "step": 33075 }, { "epoch": 8.596673596673597, "grad_norm": 2.069999933242798, "learning_rate": 3.518270932771523e-05, "loss": 0.2441, "num_input_tokens_seen": 2978480, "step": 33080 }, { "epoch": 8.597972972972974, "grad_norm": 1.0998097658157349, "learning_rate": 3.5177531074217906e-05, "loss": 0.1463, "num_input_tokens_seen": 2978928, "step": 33085 }, { "epoch": 8.599272349272349, "grad_norm": 0.3745591938495636, "learning_rate": 3.517235229728041e-05, "loss": 0.1373, "num_input_tokens_seen": 2979376, "step": 33090 }, { "epoch": 8.600571725571726, "grad_norm": 0.1846238076686859, "learning_rate": 3.516717299716909e-05, "loss": 0.2081, "num_input_tokens_seen": 2979808, "step": 33095 }, { "epoch": 8.601871101871101, "grad_norm": 2.2163615226745605, "learning_rate": 3.516199317415032e-05, "loss": 0.415, "num_input_tokens_seen": 2980368, "step": 33100 }, { "epoch": 8.603170478170478, "grad_norm": 2.637763261795044, "learning_rate": 3.51568128284905e-05, "loss": 0.2776, "num_input_tokens_seen": 2980816, "step": 33105 }, { "epoch": 8.604469854469855, "grad_norm": 0.2569689154624939, "learning_rate": 3.515163196045607e-05, "loss": 0.311, "num_input_tokens_seen": 2981296, "step": 33110 }, { "epoch": 8.60576923076923, "grad_norm": 0.5420318245887756, "learning_rate": 3.514645057031348e-05, "loss": 0.2188, "num_input_tokens_seen": 2981776, "step": 33115 }, { "epoch": 8.607068607068607, "grad_norm": 1.0730289220809937, "learning_rate": 3.514126865832922e-05, "loss": 0.1473, "num_input_tokens_seen": 2982208, "step": 33120 }, { "epoch": 8.608367983367984, "grad_norm": 0.5845275521278381, "learning_rate": 3.513608622476979e-05, "loss": 0.2037, "num_input_tokens_seen": 2982624, "step": 33125 }, { "epoch": 8.609667359667359, "grad_norm": 1.7508999109268188, "learning_rate": 3.513090326990174e-05, "loss": 0.3504, "num_input_tokens_seen": 2983104, "step": 33130 }, { "epoch": 8.610966735966736, "grad_norm": 0.3999031186103821, "learning_rate": 3.512571979399162e-05, "loss": 0.1554, "num_input_tokens_seen": 2983552, "step": 33135 }, { "epoch": 8.612266112266113, "grad_norm": 0.6823370456695557, "learning_rate": 3.5120535797306034e-05, "loss": 0.2025, "num_input_tokens_seen": 2983952, "step": 33140 }, { "epoch": 8.613565488565488, "grad_norm": 0.6647040843963623, "learning_rate": 3.511535128011159e-05, "loss": 0.2741, "num_input_tokens_seen": 2984400, "step": 33145 }, { "epoch": 8.614864864864865, "grad_norm": 1.1027530431747437, "learning_rate": 3.511016624267495e-05, "loss": 0.1546, "num_input_tokens_seen": 2984848, "step": 33150 }, { "epoch": 8.616164241164242, "grad_norm": 1.4148246049880981, "learning_rate": 3.510498068526276e-05, "loss": 0.2867, "num_input_tokens_seen": 2985328, "step": 33155 }, { "epoch": 8.617463617463617, "grad_norm": 1.5198004245758057, "learning_rate": 3.5099794608141734e-05, "loss": 0.37, "num_input_tokens_seen": 2985760, "step": 33160 }, { "epoch": 8.618762993762994, "grad_norm": 1.3812909126281738, "learning_rate": 3.50946080115786e-05, "loss": 0.2175, "num_input_tokens_seen": 2986192, "step": 33165 }, { "epoch": 8.62006237006237, "grad_norm": 0.6180137395858765, "learning_rate": 3.5089420895840095e-05, "loss": 0.3713, "num_input_tokens_seen": 2986656, "step": 33170 }, { "epoch": 8.621361746361746, "grad_norm": 0.9166000485420227, "learning_rate": 3.508423326119301e-05, "loss": 0.2159, "num_input_tokens_seen": 2987104, "step": 33175 }, { "epoch": 8.622661122661123, "grad_norm": 0.495542049407959, "learning_rate": 3.507904510790414e-05, "loss": 0.1579, "num_input_tokens_seen": 2987568, "step": 33180 }, { "epoch": 8.6239604989605, "grad_norm": 0.6941995024681091, "learning_rate": 3.5073856436240334e-05, "loss": 0.2475, "num_input_tokens_seen": 2988000, "step": 33185 }, { "epoch": 8.625259875259875, "grad_norm": 0.9990458488464355, "learning_rate": 3.5068667246468436e-05, "loss": 0.1781, "num_input_tokens_seen": 2988432, "step": 33190 }, { "epoch": 8.626559251559252, "grad_norm": 0.33910199999809265, "learning_rate": 3.506347753885533e-05, "loss": 0.1973, "num_input_tokens_seen": 2988928, "step": 33195 }, { "epoch": 8.627858627858627, "grad_norm": 0.8594053387641907, "learning_rate": 3.5058287313667936e-05, "loss": 0.2451, "num_input_tokens_seen": 2989360, "step": 33200 }, { "epoch": 8.629158004158004, "grad_norm": 0.531284749507904, "learning_rate": 3.505309657117319e-05, "loss": 0.201, "num_input_tokens_seen": 2989808, "step": 33205 }, { "epoch": 8.630457380457381, "grad_norm": 1.4672998189926147, "learning_rate": 3.5047905311638045e-05, "loss": 0.4358, "num_input_tokens_seen": 2990224, "step": 33210 }, { "epoch": 8.631756756756756, "grad_norm": 0.4821852147579193, "learning_rate": 3.504271353532951e-05, "loss": 0.2059, "num_input_tokens_seen": 2990688, "step": 33215 }, { "epoch": 8.633056133056133, "grad_norm": 0.7296642065048218, "learning_rate": 3.5037521242514595e-05, "loss": 0.1105, "num_input_tokens_seen": 2991136, "step": 33220 }, { "epoch": 8.63435550935551, "grad_norm": 1.9276353120803833, "learning_rate": 3.5032328433460346e-05, "loss": 0.3632, "num_input_tokens_seen": 2991584, "step": 33225 }, { "epoch": 8.635654885654885, "grad_norm": 0.7571041584014893, "learning_rate": 3.502713510843383e-05, "loss": 0.1821, "num_input_tokens_seen": 2992064, "step": 33230 }, { "epoch": 8.636954261954262, "grad_norm": 0.9198720455169678, "learning_rate": 3.5021941267702144e-05, "loss": 0.2978, "num_input_tokens_seen": 2992512, "step": 33235 }, { "epoch": 8.638253638253639, "grad_norm": 0.6401759386062622, "learning_rate": 3.5016746911532425e-05, "loss": 0.2888, "num_input_tokens_seen": 2992928, "step": 33240 }, { "epoch": 8.639553014553014, "grad_norm": 0.5603438019752502, "learning_rate": 3.5011552040191806e-05, "loss": 0.2261, "num_input_tokens_seen": 2993344, "step": 33245 }, { "epoch": 8.640852390852391, "grad_norm": 1.903157353401184, "learning_rate": 3.500635665394748e-05, "loss": 0.225, "num_input_tokens_seen": 2993808, "step": 33250 }, { "epoch": 8.642151767151766, "grad_norm": 0.42151468992233276, "learning_rate": 3.500116075306664e-05, "loss": 0.2024, "num_input_tokens_seen": 2994272, "step": 33255 }, { "epoch": 8.643451143451143, "grad_norm": 0.5758864879608154, "learning_rate": 3.499596433781653e-05, "loss": 0.2396, "num_input_tokens_seen": 2994720, "step": 33260 }, { "epoch": 8.64475051975052, "grad_norm": 1.594009280204773, "learning_rate": 3.499076740846438e-05, "loss": 0.3091, "num_input_tokens_seen": 2995136, "step": 33265 }, { "epoch": 8.646049896049895, "grad_norm": 1.1274558305740356, "learning_rate": 3.49855699652775e-05, "loss": 0.2088, "num_input_tokens_seen": 2995552, "step": 33270 }, { "epoch": 8.647349272349272, "grad_norm": 0.7014943957328796, "learning_rate": 3.498037200852319e-05, "loss": 0.2058, "num_input_tokens_seen": 2995984, "step": 33275 }, { "epoch": 8.64864864864865, "grad_norm": 2.652972459793091, "learning_rate": 3.497517353846878e-05, "loss": 0.2684, "num_input_tokens_seen": 2996448, "step": 33280 }, { "epoch": 8.649948024948024, "grad_norm": 1.0566489696502686, "learning_rate": 3.4969974555381636e-05, "loss": 0.2535, "num_input_tokens_seen": 2996880, "step": 33285 }, { "epoch": 8.651247401247401, "grad_norm": 0.4431261122226715, "learning_rate": 3.496477505952915e-05, "loss": 0.2114, "num_input_tokens_seen": 2997344, "step": 33290 }, { "epoch": 8.652546777546778, "grad_norm": 1.822780966758728, "learning_rate": 3.4959575051178735e-05, "loss": 0.2633, "num_input_tokens_seen": 2997776, "step": 33295 }, { "epoch": 8.653846153846153, "grad_norm": 0.5438888072967529, "learning_rate": 3.495437453059783e-05, "loss": 0.2021, "num_input_tokens_seen": 2998256, "step": 33300 }, { "epoch": 8.65514553014553, "grad_norm": 0.9373254776000977, "learning_rate": 3.49491734980539e-05, "loss": 0.1729, "num_input_tokens_seen": 2998752, "step": 33305 }, { "epoch": 8.656444906444907, "grad_norm": 0.8032802939414978, "learning_rate": 3.494397195381446e-05, "loss": 0.2679, "num_input_tokens_seen": 2999168, "step": 33310 }, { "epoch": 8.657744282744282, "grad_norm": 0.8264546394348145, "learning_rate": 3.493876989814701e-05, "loss": 0.2252, "num_input_tokens_seen": 2999664, "step": 33315 }, { "epoch": 8.65904365904366, "grad_norm": 2.0005853176116943, "learning_rate": 3.493356733131909e-05, "loss": 0.2536, "num_input_tokens_seen": 3000096, "step": 33320 }, { "epoch": 8.660343035343036, "grad_norm": 0.9624836444854736, "learning_rate": 3.49283642535983e-05, "loss": 0.2133, "num_input_tokens_seen": 3000528, "step": 33325 }, { "epoch": 8.661642411642411, "grad_norm": 1.1006519794464111, "learning_rate": 3.492316066525221e-05, "loss": 0.3526, "num_input_tokens_seen": 3000976, "step": 33330 }, { "epoch": 8.662941787941788, "grad_norm": 2.0073740482330322, "learning_rate": 3.491795656654846e-05, "loss": 0.4226, "num_input_tokens_seen": 3001424, "step": 33335 }, { "epoch": 8.664241164241163, "grad_norm": 1.1539020538330078, "learning_rate": 3.491275195775471e-05, "loss": 0.4316, "num_input_tokens_seen": 3001888, "step": 33340 }, { "epoch": 8.66554054054054, "grad_norm": 1.5057017803192139, "learning_rate": 3.490754683913863e-05, "loss": 0.2726, "num_input_tokens_seen": 3002336, "step": 33345 }, { "epoch": 8.666839916839917, "grad_norm": 0.8031529784202576, "learning_rate": 3.490234121096791e-05, "loss": 0.1849, "num_input_tokens_seen": 3002768, "step": 33350 }, { "epoch": 8.668139293139292, "grad_norm": 1.8015896081924438, "learning_rate": 3.48971350735103e-05, "loss": 0.2026, "num_input_tokens_seen": 3003232, "step": 33355 }, { "epoch": 8.66943866943867, "grad_norm": 0.7997921109199524, "learning_rate": 3.489192842703355e-05, "loss": 0.1597, "num_input_tokens_seen": 3003696, "step": 33360 }, { "epoch": 8.670738045738046, "grad_norm": 1.4165912866592407, "learning_rate": 3.488672127180544e-05, "loss": 0.2506, "num_input_tokens_seen": 3004176, "step": 33365 }, { "epoch": 8.672037422037421, "grad_norm": 0.9675780534744263, "learning_rate": 3.48815136080938e-05, "loss": 0.2994, "num_input_tokens_seen": 3004608, "step": 33370 }, { "epoch": 8.673336798336798, "grad_norm": 1.255529522895813, "learning_rate": 3.487630543616642e-05, "loss": 0.4168, "num_input_tokens_seen": 3005056, "step": 33375 }, { "epoch": 8.674636174636175, "grad_norm": 0.46004101634025574, "learning_rate": 3.4871096756291203e-05, "loss": 0.2267, "num_input_tokens_seen": 3005504, "step": 33380 }, { "epoch": 8.67593555093555, "grad_norm": 0.3675522804260254, "learning_rate": 3.486588756873602e-05, "loss": 0.1837, "num_input_tokens_seen": 3005920, "step": 33385 }, { "epoch": 8.677234927234927, "grad_norm": 1.3857145309448242, "learning_rate": 3.486067787376879e-05, "loss": 0.2002, "num_input_tokens_seen": 3006368, "step": 33390 }, { "epoch": 8.678534303534304, "grad_norm": 1.0653071403503418, "learning_rate": 3.485546767165745e-05, "loss": 0.2035, "num_input_tokens_seen": 3006816, "step": 33395 }, { "epoch": 8.67983367983368, "grad_norm": 0.6495673060417175, "learning_rate": 3.485025696266996e-05, "loss": 0.3396, "num_input_tokens_seen": 3007232, "step": 33400 }, { "epoch": 8.681133056133056, "grad_norm": 1.451939582824707, "learning_rate": 3.484504574707431e-05, "loss": 0.2455, "num_input_tokens_seen": 3007696, "step": 33405 }, { "epoch": 8.682432432432432, "grad_norm": 0.8746720552444458, "learning_rate": 3.4839834025138526e-05, "loss": 0.3672, "num_input_tokens_seen": 3008112, "step": 33410 }, { "epoch": 8.683731808731808, "grad_norm": 0.6325510144233704, "learning_rate": 3.483462179713066e-05, "loss": 0.1494, "num_input_tokens_seen": 3008592, "step": 33415 }, { "epoch": 8.685031185031185, "grad_norm": 0.44060713052749634, "learning_rate": 3.482940906331877e-05, "loss": 0.1828, "num_input_tokens_seen": 3009040, "step": 33420 }, { "epoch": 8.68633056133056, "grad_norm": 0.6927796006202698, "learning_rate": 3.482419582397095e-05, "loss": 0.1932, "num_input_tokens_seen": 3009504, "step": 33425 }, { "epoch": 8.687629937629938, "grad_norm": 0.6170414090156555, "learning_rate": 3.481898207935532e-05, "loss": 0.2567, "num_input_tokens_seen": 3009936, "step": 33430 }, { "epoch": 8.688929313929314, "grad_norm": 1.03257417678833, "learning_rate": 3.481376782974004e-05, "loss": 0.4462, "num_input_tokens_seen": 3010384, "step": 33435 }, { "epoch": 8.69022869022869, "grad_norm": 1.2725938558578491, "learning_rate": 3.480855307539328e-05, "loss": 0.2232, "num_input_tokens_seen": 3010848, "step": 33440 }, { "epoch": 8.691528066528067, "grad_norm": 0.79937344789505, "learning_rate": 3.4803337816583225e-05, "loss": 0.3312, "num_input_tokens_seen": 3011280, "step": 33445 }, { "epoch": 8.692827442827443, "grad_norm": 0.8040634989738464, "learning_rate": 3.479812205357813e-05, "loss": 0.292, "num_input_tokens_seen": 3011712, "step": 33450 }, { "epoch": 8.694126819126819, "grad_norm": 1.9432562589645386, "learning_rate": 3.479290578664622e-05, "loss": 0.2665, "num_input_tokens_seen": 3012144, "step": 33455 }, { "epoch": 8.695426195426196, "grad_norm": 1.163881540298462, "learning_rate": 3.478768901605578e-05, "loss": 0.2412, "num_input_tokens_seen": 3012608, "step": 33460 }, { "epoch": 8.696725571725572, "grad_norm": 0.7295109629631042, "learning_rate": 3.478247174207513e-05, "loss": 0.2253, "num_input_tokens_seen": 3013088, "step": 33465 }, { "epoch": 8.698024948024948, "grad_norm": 0.9600982666015625, "learning_rate": 3.477725396497257e-05, "loss": 0.1825, "num_input_tokens_seen": 3013536, "step": 33470 }, { "epoch": 8.699324324324325, "grad_norm": 1.4676151275634766, "learning_rate": 3.477203568501648e-05, "loss": 0.169, "num_input_tokens_seen": 3013984, "step": 33475 }, { "epoch": 8.700623700623701, "grad_norm": 0.8027260899543762, "learning_rate": 3.476681690247522e-05, "loss": 0.2311, "num_input_tokens_seen": 3014496, "step": 33480 }, { "epoch": 8.701923076923077, "grad_norm": 0.5779853463172913, "learning_rate": 3.476159761761722e-05, "loss": 0.2474, "num_input_tokens_seen": 3014960, "step": 33485 }, { "epoch": 8.703222453222454, "grad_norm": 1.9674403667449951, "learning_rate": 3.4756377830710895e-05, "loss": 0.4243, "num_input_tokens_seen": 3015424, "step": 33490 }, { "epoch": 8.704521829521829, "grad_norm": 0.7014150619506836, "learning_rate": 3.4751157542024714e-05, "loss": 0.3634, "num_input_tokens_seen": 3015840, "step": 33495 }, { "epoch": 8.705821205821206, "grad_norm": 0.4959148168563843, "learning_rate": 3.474593675182715e-05, "loss": 0.1328, "num_input_tokens_seen": 3016320, "step": 33500 }, { "epoch": 8.707120582120583, "grad_norm": 1.2515339851379395, "learning_rate": 3.474071546038673e-05, "loss": 0.291, "num_input_tokens_seen": 3016784, "step": 33505 }, { "epoch": 8.708419958419958, "grad_norm": 1.3243759870529175, "learning_rate": 3.473549366797197e-05, "loss": 0.3183, "num_input_tokens_seen": 3017216, "step": 33510 }, { "epoch": 8.709719334719335, "grad_norm": 1.8018218278884888, "learning_rate": 3.473027137485146e-05, "loss": 0.2793, "num_input_tokens_seen": 3017648, "step": 33515 }, { "epoch": 8.711018711018712, "grad_norm": 1.0601787567138672, "learning_rate": 3.472504858129375e-05, "loss": 0.2173, "num_input_tokens_seen": 3018096, "step": 33520 }, { "epoch": 8.712318087318087, "grad_norm": 0.7111133933067322, "learning_rate": 3.471982528756749e-05, "loss": 0.27, "num_input_tokens_seen": 3018544, "step": 33525 }, { "epoch": 8.713617463617464, "grad_norm": 0.8696151375770569, "learning_rate": 3.4714601493941304e-05, "loss": 0.153, "num_input_tokens_seen": 3018944, "step": 33530 }, { "epoch": 8.71491683991684, "grad_norm": 1.6821002960205078, "learning_rate": 3.470937720068384e-05, "loss": 0.3602, "num_input_tokens_seen": 3019376, "step": 33535 }, { "epoch": 8.716216216216216, "grad_norm": 1.5999590158462524, "learning_rate": 3.470415240806381e-05, "loss": 0.2855, "num_input_tokens_seen": 3019840, "step": 33540 }, { "epoch": 8.717515592515593, "grad_norm": 0.795951783657074, "learning_rate": 3.4698927116349924e-05, "loss": 0.1746, "num_input_tokens_seen": 3020272, "step": 33545 }, { "epoch": 8.71881496881497, "grad_norm": 0.4500851035118103, "learning_rate": 3.4693701325810924e-05, "loss": 0.1622, "num_input_tokens_seen": 3020704, "step": 33550 }, { "epoch": 8.720114345114345, "grad_norm": 0.4662036597728729, "learning_rate": 3.4688475036715575e-05, "loss": 0.3737, "num_input_tokens_seen": 3021120, "step": 33555 }, { "epoch": 8.721413721413722, "grad_norm": 2.5059621334075928, "learning_rate": 3.4683248249332664e-05, "loss": 0.3818, "num_input_tokens_seen": 3021552, "step": 33560 }, { "epoch": 8.722713097713097, "grad_norm": 1.499546766281128, "learning_rate": 3.467802096393103e-05, "loss": 0.2963, "num_input_tokens_seen": 3022000, "step": 33565 }, { "epoch": 8.724012474012474, "grad_norm": 0.727769136428833, "learning_rate": 3.46727931807795e-05, "loss": 0.2321, "num_input_tokens_seen": 3022464, "step": 33570 }, { "epoch": 8.72531185031185, "grad_norm": 0.5940296649932861, "learning_rate": 3.4667564900146956e-05, "loss": 0.2488, "num_input_tokens_seen": 3022928, "step": 33575 }, { "epoch": 8.726611226611226, "grad_norm": 0.5983131527900696, "learning_rate": 3.4662336122302274e-05, "loss": 0.205, "num_input_tokens_seen": 3023344, "step": 33580 }, { "epoch": 8.727910602910603, "grad_norm": 0.6257869005203247, "learning_rate": 3.46571068475144e-05, "loss": 0.2259, "num_input_tokens_seen": 3023792, "step": 33585 }, { "epoch": 8.72920997920998, "grad_norm": 1.0143864154815674, "learning_rate": 3.465187707605226e-05, "loss": 0.1998, "num_input_tokens_seen": 3024240, "step": 33590 }, { "epoch": 8.730509355509355, "grad_norm": 0.5693260431289673, "learning_rate": 3.464664680818483e-05, "loss": 0.2245, "num_input_tokens_seen": 3024736, "step": 33595 }, { "epoch": 8.731808731808732, "grad_norm": 0.758600652217865, "learning_rate": 3.464141604418112e-05, "loss": 0.1148, "num_input_tokens_seen": 3025152, "step": 33600 }, { "epoch": 8.733108108108109, "grad_norm": 1.860456109046936, "learning_rate": 3.463618478431014e-05, "loss": 0.2338, "num_input_tokens_seen": 3025584, "step": 33605 }, { "epoch": 8.734407484407484, "grad_norm": 0.33603429794311523, "learning_rate": 3.463095302884094e-05, "loss": 0.1387, "num_input_tokens_seen": 3026000, "step": 33610 }, { "epoch": 8.73570686070686, "grad_norm": 1.8482933044433594, "learning_rate": 3.4625720778042606e-05, "loss": 0.2663, "num_input_tokens_seen": 3026448, "step": 33615 }, { "epoch": 8.737006237006238, "grad_norm": 1.186694622039795, "learning_rate": 3.462048803218423e-05, "loss": 0.3308, "num_input_tokens_seen": 3026912, "step": 33620 }, { "epoch": 8.738305613305613, "grad_norm": 0.9994029998779297, "learning_rate": 3.461525479153493e-05, "loss": 0.3827, "num_input_tokens_seen": 3027408, "step": 33625 }, { "epoch": 8.73960498960499, "grad_norm": 1.3436962366104126, "learning_rate": 3.461002105636387e-05, "loss": 0.2918, "num_input_tokens_seen": 3027840, "step": 33630 }, { "epoch": 8.740904365904367, "grad_norm": 1.0624988079071045, "learning_rate": 3.4604786826940214e-05, "loss": 0.1991, "num_input_tokens_seen": 3028304, "step": 33635 }, { "epoch": 8.742203742203742, "grad_norm": 0.4140929579734802, "learning_rate": 3.4599552103533164e-05, "loss": 0.1593, "num_input_tokens_seen": 3028720, "step": 33640 }, { "epoch": 8.743503118503119, "grad_norm": 0.9366307854652405, "learning_rate": 3.459431688641196e-05, "loss": 0.3003, "num_input_tokens_seen": 3029184, "step": 33645 }, { "epoch": 8.744802494802494, "grad_norm": 0.949916660785675, "learning_rate": 3.458908117584584e-05, "loss": 0.1583, "num_input_tokens_seen": 3029648, "step": 33650 }, { "epoch": 8.746101871101871, "grad_norm": 1.4216657876968384, "learning_rate": 3.45838449721041e-05, "loss": 0.299, "num_input_tokens_seen": 3030112, "step": 33655 }, { "epoch": 8.747401247401248, "grad_norm": 1.7985293865203857, "learning_rate": 3.457860827545601e-05, "loss": 0.3283, "num_input_tokens_seen": 3030592, "step": 33660 }, { "epoch": 8.748700623700623, "grad_norm": 0.9156537055969238, "learning_rate": 3.4573371086170936e-05, "loss": 0.2203, "num_input_tokens_seen": 3031088, "step": 33665 }, { "epoch": 8.75, "grad_norm": 0.5607843995094299, "learning_rate": 3.456813340451821e-05, "loss": 0.3237, "num_input_tokens_seen": 3031504, "step": 33670 }, { "epoch": 8.751299376299377, "grad_norm": 0.8728926181793213, "learning_rate": 3.456289523076721e-05, "loss": 0.2265, "num_input_tokens_seen": 3031984, "step": 33675 }, { "epoch": 8.752598752598752, "grad_norm": 0.8613230586051941, "learning_rate": 3.4557656565187344e-05, "loss": 0.2409, "num_input_tokens_seen": 3032416, "step": 33680 }, { "epoch": 8.753898128898129, "grad_norm": 0.5309209823608398, "learning_rate": 3.455241740804805e-05, "loss": 0.2309, "num_input_tokens_seen": 3032864, "step": 33685 }, { "epoch": 8.755197505197506, "grad_norm": 0.8260422945022583, "learning_rate": 3.454717775961878e-05, "loss": 0.2447, "num_input_tokens_seen": 3033264, "step": 33690 }, { "epoch": 8.756496881496881, "grad_norm": 0.9154503345489502, "learning_rate": 3.4541937620169e-05, "loss": 0.2089, "num_input_tokens_seen": 3033712, "step": 33695 }, { "epoch": 8.757796257796258, "grad_norm": 0.4708462655544281, "learning_rate": 3.4536696989968226e-05, "loss": 0.2445, "num_input_tokens_seen": 3034192, "step": 33700 }, { "epoch": 8.759095634095633, "grad_norm": 0.5869988799095154, "learning_rate": 3.453145586928599e-05, "loss": 0.2819, "num_input_tokens_seen": 3034624, "step": 33705 }, { "epoch": 8.76039501039501, "grad_norm": 0.4194783568382263, "learning_rate": 3.4526214258391846e-05, "loss": 0.2454, "num_input_tokens_seen": 3035056, "step": 33710 }, { "epoch": 8.761694386694387, "grad_norm": 1.3954075574874878, "learning_rate": 3.452097215755537e-05, "loss": 0.228, "num_input_tokens_seen": 3035520, "step": 33715 }, { "epoch": 8.762993762993762, "grad_norm": 1.9581575393676758, "learning_rate": 3.451572956704619e-05, "loss": 0.2622, "num_input_tokens_seen": 3035968, "step": 33720 }, { "epoch": 8.76429313929314, "grad_norm": 1.6039282083511353, "learning_rate": 3.4510486487133916e-05, "loss": 0.3223, "num_input_tokens_seen": 3036464, "step": 33725 }, { "epoch": 8.765592515592516, "grad_norm": 0.6527566313743591, "learning_rate": 3.45052429180882e-05, "loss": 0.2832, "num_input_tokens_seen": 3036944, "step": 33730 }, { "epoch": 8.766891891891891, "grad_norm": 1.1404149532318115, "learning_rate": 3.4499998860178736e-05, "loss": 0.3494, "num_input_tokens_seen": 3037440, "step": 33735 }, { "epoch": 8.768191268191268, "grad_norm": 0.4505937993526459, "learning_rate": 3.4494754313675235e-05, "loss": 0.2598, "num_input_tokens_seen": 3037888, "step": 33740 }, { "epoch": 8.769490644490645, "grad_norm": 1.1529767513275146, "learning_rate": 3.4489509278847414e-05, "loss": 0.2954, "num_input_tokens_seen": 3038304, "step": 33745 }, { "epoch": 8.77079002079002, "grad_norm": 0.8006730675697327, "learning_rate": 3.448426375596504e-05, "loss": 0.2138, "num_input_tokens_seen": 3038752, "step": 33750 }, { "epoch": 8.772089397089397, "grad_norm": 0.8305683135986328, "learning_rate": 3.447901774529789e-05, "loss": 0.3135, "num_input_tokens_seen": 3039184, "step": 33755 }, { "epoch": 8.773388773388774, "grad_norm": 0.45098695158958435, "learning_rate": 3.447377124711578e-05, "loss": 0.2304, "num_input_tokens_seen": 3039632, "step": 33760 }, { "epoch": 8.77468814968815, "grad_norm": 1.5158874988555908, "learning_rate": 3.446852426168854e-05, "loss": 0.3288, "num_input_tokens_seen": 3040064, "step": 33765 }, { "epoch": 8.775987525987526, "grad_norm": 0.6251343488693237, "learning_rate": 3.446327678928602e-05, "loss": 0.2504, "num_input_tokens_seen": 3040496, "step": 33770 }, { "epoch": 8.777286902286903, "grad_norm": 0.4051831364631653, "learning_rate": 3.4458028830178114e-05, "loss": 0.248, "num_input_tokens_seen": 3040912, "step": 33775 }, { "epoch": 8.778586278586278, "grad_norm": 1.0086185932159424, "learning_rate": 3.4452780384634716e-05, "loss": 0.2463, "num_input_tokens_seen": 3041376, "step": 33780 }, { "epoch": 8.779885654885655, "grad_norm": 0.46398308873176575, "learning_rate": 3.4447531452925766e-05, "loss": 0.2554, "num_input_tokens_seen": 3041776, "step": 33785 }, { "epoch": 8.78118503118503, "grad_norm": 0.9634677767753601, "learning_rate": 3.4442282035321224e-05, "loss": 0.227, "num_input_tokens_seen": 3042240, "step": 33790 }, { "epoch": 8.782484407484407, "grad_norm": 0.2966494858264923, "learning_rate": 3.443703213209107e-05, "loss": 0.2311, "num_input_tokens_seen": 3042704, "step": 33795 }, { "epoch": 8.783783783783784, "grad_norm": 0.31498298048973083, "learning_rate": 3.4431781743505314e-05, "loss": 0.3009, "num_input_tokens_seen": 3043216, "step": 33800 }, { "epoch": 8.78508316008316, "grad_norm": 0.4440320134162903, "learning_rate": 3.442653086983398e-05, "loss": 0.2873, "num_input_tokens_seen": 3043648, "step": 33805 }, { "epoch": 8.786382536382536, "grad_norm": 0.4393715262413025, "learning_rate": 3.442127951134714e-05, "loss": 0.3175, "num_input_tokens_seen": 3044144, "step": 33810 }, { "epoch": 8.787681912681913, "grad_norm": 0.7131330966949463, "learning_rate": 3.4416027668314854e-05, "loss": 0.2812, "num_input_tokens_seen": 3044592, "step": 33815 }, { "epoch": 8.788981288981288, "grad_norm": 0.7889105081558228, "learning_rate": 3.441077534100725e-05, "loss": 0.2374, "num_input_tokens_seen": 3045040, "step": 33820 }, { "epoch": 8.790280665280665, "grad_norm": 0.3037320375442505, "learning_rate": 3.4405522529694454e-05, "loss": 0.2253, "num_input_tokens_seen": 3045472, "step": 33825 }, { "epoch": 8.791580041580042, "grad_norm": 0.4910479187965393, "learning_rate": 3.440026923464662e-05, "loss": 0.3245, "num_input_tokens_seen": 3045936, "step": 33830 }, { "epoch": 8.792879417879417, "grad_norm": 0.5210514664649963, "learning_rate": 3.4395015456133937e-05, "loss": 0.2927, "num_input_tokens_seen": 3046384, "step": 33835 }, { "epoch": 8.794178794178794, "grad_norm": 0.3938443958759308, "learning_rate": 3.43897611944266e-05, "loss": 0.3357, "num_input_tokens_seen": 3046816, "step": 33840 }, { "epoch": 8.795478170478171, "grad_norm": 0.6527367830276489, "learning_rate": 3.4384506449794857e-05, "loss": 0.2725, "num_input_tokens_seen": 3047280, "step": 33845 }, { "epoch": 8.796777546777546, "grad_norm": 0.8066277503967285, "learning_rate": 3.437925122250896e-05, "loss": 0.2786, "num_input_tokens_seen": 3047744, "step": 33850 }, { "epoch": 8.798076923076923, "grad_norm": 0.8051039576530457, "learning_rate": 3.437399551283917e-05, "loss": 0.2652, "num_input_tokens_seen": 3048240, "step": 33855 }, { "epoch": 8.799376299376299, "grad_norm": 0.8250488042831421, "learning_rate": 3.436873932105581e-05, "loss": 0.3142, "num_input_tokens_seen": 3048720, "step": 33860 }, { "epoch": 8.800675675675675, "grad_norm": 0.6180732250213623, "learning_rate": 3.436348264742922e-05, "loss": 0.2436, "num_input_tokens_seen": 3049184, "step": 33865 }, { "epoch": 8.801975051975052, "grad_norm": 0.46981173753738403, "learning_rate": 3.4358225492229746e-05, "loss": 0.303, "num_input_tokens_seen": 3049648, "step": 33870 }, { "epoch": 8.803274428274428, "grad_norm": 0.5027632713317871, "learning_rate": 3.435296785572776e-05, "loss": 0.2664, "num_input_tokens_seen": 3050128, "step": 33875 }, { "epoch": 8.804573804573804, "grad_norm": 0.576556921005249, "learning_rate": 3.434770973819368e-05, "loss": 0.237, "num_input_tokens_seen": 3050608, "step": 33880 }, { "epoch": 8.805873180873181, "grad_norm": 0.8578866124153137, "learning_rate": 3.434245113989793e-05, "loss": 0.2293, "num_input_tokens_seen": 3051088, "step": 33885 }, { "epoch": 8.807172557172557, "grad_norm": 0.8126557469367981, "learning_rate": 3.4337192061110966e-05, "loss": 0.2466, "num_input_tokens_seen": 3051536, "step": 33890 }, { "epoch": 8.808471933471933, "grad_norm": 0.32913923263549805, "learning_rate": 3.433193250210327e-05, "loss": 0.2682, "num_input_tokens_seen": 3051984, "step": 33895 }, { "epoch": 8.80977130977131, "grad_norm": 0.48974087834358215, "learning_rate": 3.4326672463145345e-05, "loss": 0.3506, "num_input_tokens_seen": 3052432, "step": 33900 }, { "epoch": 8.811070686070686, "grad_norm": 0.35374584794044495, "learning_rate": 3.432141194450772e-05, "loss": 0.2344, "num_input_tokens_seen": 3052832, "step": 33905 }, { "epoch": 8.812370062370062, "grad_norm": 0.3719191253185272, "learning_rate": 3.4316150946460946e-05, "loss": 0.1343, "num_input_tokens_seen": 3053264, "step": 33910 }, { "epoch": 8.81366943866944, "grad_norm": 0.490753173828125, "learning_rate": 3.43108894692756e-05, "loss": 0.3319, "num_input_tokens_seen": 3053728, "step": 33915 }, { "epoch": 8.814968814968815, "grad_norm": 0.3559890389442444, "learning_rate": 3.430562751322229e-05, "loss": 0.2104, "num_input_tokens_seen": 3054192, "step": 33920 }, { "epoch": 8.816268191268192, "grad_norm": 0.35218021273612976, "learning_rate": 3.430036507857164e-05, "loss": 0.2003, "num_input_tokens_seen": 3054688, "step": 33925 }, { "epoch": 8.817567567567568, "grad_norm": 0.4572393298149109, "learning_rate": 3.429510216559429e-05, "loss": 0.2271, "num_input_tokens_seen": 3055152, "step": 33930 }, { "epoch": 8.818866943866944, "grad_norm": 0.3266587257385254, "learning_rate": 3.428983877456095e-05, "loss": 0.2581, "num_input_tokens_seen": 3055584, "step": 33935 }, { "epoch": 8.82016632016632, "grad_norm": 1.1988105773925781, "learning_rate": 3.4284574905742294e-05, "loss": 0.2079, "num_input_tokens_seen": 3056048, "step": 33940 }, { "epoch": 8.821465696465696, "grad_norm": 1.6588366031646729, "learning_rate": 3.427931055940905e-05, "loss": 0.3189, "num_input_tokens_seen": 3056512, "step": 33945 }, { "epoch": 8.822765072765073, "grad_norm": 0.775895357131958, "learning_rate": 3.427404573583197e-05, "loss": 0.4208, "num_input_tokens_seen": 3056960, "step": 33950 }, { "epoch": 8.82406444906445, "grad_norm": 0.35740870237350464, "learning_rate": 3.426878043528185e-05, "loss": 0.1929, "num_input_tokens_seen": 3057392, "step": 33955 }, { "epoch": 8.825363825363825, "grad_norm": 0.6254670023918152, "learning_rate": 3.426351465802945e-05, "loss": 0.208, "num_input_tokens_seen": 3057856, "step": 33960 }, { "epoch": 8.826663201663202, "grad_norm": 0.8880094885826111, "learning_rate": 3.425824840434562e-05, "loss": 0.2923, "num_input_tokens_seen": 3058304, "step": 33965 }, { "epoch": 8.827962577962579, "grad_norm": 0.5347532033920288, "learning_rate": 3.425298167450121e-05, "loss": 0.1987, "num_input_tokens_seen": 3058784, "step": 33970 }, { "epoch": 8.829261954261954, "grad_norm": 0.6825275421142578, "learning_rate": 3.424771446876709e-05, "loss": 0.1985, "num_input_tokens_seen": 3059232, "step": 33975 }, { "epoch": 8.83056133056133, "grad_norm": 0.5985206365585327, "learning_rate": 3.424244678741414e-05, "loss": 0.1837, "num_input_tokens_seen": 3059696, "step": 33980 }, { "epoch": 8.831860706860708, "grad_norm": 0.4899046719074249, "learning_rate": 3.4237178630713314e-05, "loss": 0.3177, "num_input_tokens_seen": 3060160, "step": 33985 }, { "epoch": 8.833160083160083, "grad_norm": 0.6640735268592834, "learning_rate": 3.423190999893553e-05, "loss": 0.2768, "num_input_tokens_seen": 3060624, "step": 33990 }, { "epoch": 8.83445945945946, "grad_norm": 0.6886430978775024, "learning_rate": 3.4226640892351776e-05, "loss": 0.209, "num_input_tokens_seen": 3061072, "step": 33995 }, { "epoch": 8.835758835758837, "grad_norm": 1.227290391921997, "learning_rate": 3.422137131123303e-05, "loss": 0.3137, "num_input_tokens_seen": 3061552, "step": 34000 }, { "epoch": 8.837058212058212, "grad_norm": 0.9719465374946594, "learning_rate": 3.421610125585032e-05, "loss": 0.2284, "num_input_tokens_seen": 3062000, "step": 34005 }, { "epoch": 8.838357588357589, "grad_norm": 1.273679256439209, "learning_rate": 3.421083072647471e-05, "loss": 0.2182, "num_input_tokens_seen": 3062464, "step": 34010 }, { "epoch": 8.839656964656964, "grad_norm": 1.0625239610671997, "learning_rate": 3.4205559723377234e-05, "loss": 0.2076, "num_input_tokens_seen": 3062896, "step": 34015 }, { "epoch": 8.84095634095634, "grad_norm": 1.1862449645996094, "learning_rate": 3.4200288246829005e-05, "loss": 0.332, "num_input_tokens_seen": 3063328, "step": 34020 }, { "epoch": 8.842255717255718, "grad_norm": 1.424985408782959, "learning_rate": 3.4195016297101134e-05, "loss": 0.19, "num_input_tokens_seen": 3063760, "step": 34025 }, { "epoch": 8.843555093555093, "grad_norm": 0.8933931589126587, "learning_rate": 3.4189743874464766e-05, "loss": 0.2349, "num_input_tokens_seen": 3064192, "step": 34030 }, { "epoch": 8.84485446985447, "grad_norm": 0.5837796330451965, "learning_rate": 3.4184470979191076e-05, "loss": 0.2533, "num_input_tokens_seen": 3064640, "step": 34035 }, { "epoch": 8.846153846153847, "grad_norm": 0.6901532411575317, "learning_rate": 3.4179197611551227e-05, "loss": 0.3097, "num_input_tokens_seen": 3065072, "step": 34040 }, { "epoch": 8.847453222453222, "grad_norm": 0.615963339805603, "learning_rate": 3.417392377181646e-05, "loss": 0.2361, "num_input_tokens_seen": 3065472, "step": 34045 }, { "epoch": 8.848752598752599, "grad_norm": 1.5450644493103027, "learning_rate": 3.416864946025801e-05, "loss": 0.2379, "num_input_tokens_seen": 3065888, "step": 34050 }, { "epoch": 8.850051975051976, "grad_norm": 0.4090406596660614, "learning_rate": 3.4163374677147114e-05, "loss": 0.2687, "num_input_tokens_seen": 3066336, "step": 34055 }, { "epoch": 8.85135135135135, "grad_norm": 1.4059631824493408, "learning_rate": 3.415809942275509e-05, "loss": 0.2803, "num_input_tokens_seen": 3066784, "step": 34060 }, { "epoch": 8.852650727650728, "grad_norm": 0.8820080161094666, "learning_rate": 3.415282369735324e-05, "loss": 0.2721, "num_input_tokens_seen": 3067248, "step": 34065 }, { "epoch": 8.853950103950105, "grad_norm": 0.8423874378204346, "learning_rate": 3.414754750121289e-05, "loss": 0.2295, "num_input_tokens_seen": 3067712, "step": 34070 }, { "epoch": 8.85524948024948, "grad_norm": 0.9431300163269043, "learning_rate": 3.414227083460541e-05, "loss": 0.2612, "num_input_tokens_seen": 3068160, "step": 34075 }, { "epoch": 8.856548856548857, "grad_norm": 1.2903916835784912, "learning_rate": 3.4136993697802184e-05, "loss": 0.3209, "num_input_tokens_seen": 3068608, "step": 34080 }, { "epoch": 8.857848232848234, "grad_norm": 1.049262285232544, "learning_rate": 3.4131716091074617e-05, "loss": 0.2452, "num_input_tokens_seen": 3069040, "step": 34085 }, { "epoch": 8.859147609147609, "grad_norm": 0.8882285952568054, "learning_rate": 3.4126438014694134e-05, "loss": 0.2514, "num_input_tokens_seen": 3069536, "step": 34090 }, { "epoch": 8.860446985446986, "grad_norm": 1.0756900310516357, "learning_rate": 3.412115946893221e-05, "loss": 0.2495, "num_input_tokens_seen": 3069968, "step": 34095 }, { "epoch": 8.861746361746361, "grad_norm": 0.6631041169166565, "learning_rate": 3.4115880454060314e-05, "loss": 0.238, "num_input_tokens_seen": 3070400, "step": 34100 }, { "epoch": 8.863045738045738, "grad_norm": 1.3157306909561157, "learning_rate": 3.411060097034995e-05, "loss": 0.2417, "num_input_tokens_seen": 3070960, "step": 34105 }, { "epoch": 8.864345114345115, "grad_norm": 0.43598875403404236, "learning_rate": 3.4105321018072645e-05, "loss": 0.196, "num_input_tokens_seen": 3071424, "step": 34110 }, { "epoch": 8.86564449064449, "grad_norm": 0.6060724854469299, "learning_rate": 3.410004059749996e-05, "loss": 0.1411, "num_input_tokens_seen": 3071888, "step": 34115 }, { "epoch": 8.866943866943867, "grad_norm": 0.7717562913894653, "learning_rate": 3.409475970890347e-05, "loss": 0.2146, "num_input_tokens_seen": 3072304, "step": 34120 }, { "epoch": 8.868243243243244, "grad_norm": 1.512584924697876, "learning_rate": 3.408947835255476e-05, "loss": 0.2956, "num_input_tokens_seen": 3072752, "step": 34125 }, { "epoch": 8.869542619542619, "grad_norm": 1.8012880086898804, "learning_rate": 3.4084196528725484e-05, "loss": 0.3847, "num_input_tokens_seen": 3073232, "step": 34130 }, { "epoch": 8.870841995841996, "grad_norm": 1.0628764629364014, "learning_rate": 3.407891423768727e-05, "loss": 0.2325, "num_input_tokens_seen": 3073680, "step": 34135 }, { "epoch": 8.872141372141373, "grad_norm": 1.0193806886672974, "learning_rate": 3.407363147971181e-05, "loss": 0.256, "num_input_tokens_seen": 3074128, "step": 34140 }, { "epoch": 8.873440748440748, "grad_norm": 1.0388221740722656, "learning_rate": 3.4068348255070763e-05, "loss": 0.3326, "num_input_tokens_seen": 3074544, "step": 34145 }, { "epoch": 8.874740124740125, "grad_norm": 0.46514150500297546, "learning_rate": 3.4063064564035896e-05, "loss": 0.2235, "num_input_tokens_seen": 3074976, "step": 34150 }, { "epoch": 8.8760395010395, "grad_norm": 1.9034878015518188, "learning_rate": 3.4057780406878934e-05, "loss": 0.241, "num_input_tokens_seen": 3075472, "step": 34155 }, { "epoch": 8.877338877338877, "grad_norm": 0.4163361191749573, "learning_rate": 3.405249578387164e-05, "loss": 0.2757, "num_input_tokens_seen": 3075920, "step": 34160 }, { "epoch": 8.878638253638254, "grad_norm": 0.4230906367301941, "learning_rate": 3.404721069528581e-05, "loss": 0.1408, "num_input_tokens_seen": 3076352, "step": 34165 }, { "epoch": 8.87993762993763, "grad_norm": 0.36085084080696106, "learning_rate": 3.4041925141393284e-05, "loss": 0.1663, "num_input_tokens_seen": 3076816, "step": 34170 }, { "epoch": 8.881237006237006, "grad_norm": 0.44931504130363464, "learning_rate": 3.403663912246587e-05, "loss": 0.301, "num_input_tokens_seen": 3077264, "step": 34175 }, { "epoch": 8.882536382536383, "grad_norm": 1.0612808465957642, "learning_rate": 3.403135263877545e-05, "loss": 0.2572, "num_input_tokens_seen": 3077744, "step": 34180 }, { "epoch": 8.883835758835758, "grad_norm": 0.9332041144371033, "learning_rate": 3.402606569059392e-05, "loss": 0.2733, "num_input_tokens_seen": 3078208, "step": 34185 }, { "epoch": 8.885135135135135, "grad_norm": 0.2966673672199249, "learning_rate": 3.402077827819317e-05, "loss": 0.1797, "num_input_tokens_seen": 3078672, "step": 34190 }, { "epoch": 8.886434511434512, "grad_norm": 1.0950790643692017, "learning_rate": 3.401549040184515e-05, "loss": 0.285, "num_input_tokens_seen": 3079152, "step": 34195 }, { "epoch": 8.887733887733887, "grad_norm": 0.38679391145706177, "learning_rate": 3.4010202061821825e-05, "loss": 0.1599, "num_input_tokens_seen": 3079600, "step": 34200 }, { "epoch": 8.889033264033264, "grad_norm": 0.4896799325942993, "learning_rate": 3.400491325839518e-05, "loss": 0.279, "num_input_tokens_seen": 3080032, "step": 34205 }, { "epoch": 8.890332640332641, "grad_norm": 1.1178828477859497, "learning_rate": 3.399962399183721e-05, "loss": 0.2802, "num_input_tokens_seen": 3080512, "step": 34210 }, { "epoch": 8.891632016632016, "grad_norm": 0.9986152052879333, "learning_rate": 3.3994334262419955e-05, "loss": 0.2297, "num_input_tokens_seen": 3080976, "step": 34215 }, { "epoch": 8.892931392931393, "grad_norm": 1.1414923667907715, "learning_rate": 3.398904407041548e-05, "loss": 0.2362, "num_input_tokens_seen": 3081424, "step": 34220 }, { "epoch": 8.89423076923077, "grad_norm": 1.1626932621002197, "learning_rate": 3.3983753416095845e-05, "loss": 0.3216, "num_input_tokens_seen": 3081840, "step": 34225 }, { "epoch": 8.895530145530145, "grad_norm": 0.6952298879623413, "learning_rate": 3.397846229973317e-05, "loss": 0.2703, "num_input_tokens_seen": 3082288, "step": 34230 }, { "epoch": 8.896829521829522, "grad_norm": 0.6898910403251648, "learning_rate": 3.3973170721599565e-05, "loss": 0.1291, "num_input_tokens_seen": 3082752, "step": 34235 }, { "epoch": 8.898128898128899, "grad_norm": 0.813791811466217, "learning_rate": 3.3967878681967216e-05, "loss": 0.2346, "num_input_tokens_seen": 3083200, "step": 34240 }, { "epoch": 8.899428274428274, "grad_norm": 0.37581461668014526, "learning_rate": 3.3962586181108256e-05, "loss": 0.235, "num_input_tokens_seen": 3083616, "step": 34245 }, { "epoch": 8.900727650727651, "grad_norm": 0.33614230155944824, "learning_rate": 3.39572932192949e-05, "loss": 0.1407, "num_input_tokens_seen": 3084080, "step": 34250 }, { "epoch": 8.902027027027026, "grad_norm": 0.9777373671531677, "learning_rate": 3.395199979679938e-05, "loss": 0.2926, "num_input_tokens_seen": 3084512, "step": 34255 }, { "epoch": 8.903326403326403, "grad_norm": 0.6678815484046936, "learning_rate": 3.3946705913893925e-05, "loss": 0.2778, "num_input_tokens_seen": 3084960, "step": 34260 }, { "epoch": 8.90462577962578, "grad_norm": 0.5546672940254211, "learning_rate": 3.394141157085082e-05, "loss": 0.1359, "num_input_tokens_seen": 3085440, "step": 34265 }, { "epoch": 8.905925155925155, "grad_norm": 1.405874252319336, "learning_rate": 3.3936116767942336e-05, "loss": 0.1873, "num_input_tokens_seen": 3085888, "step": 34270 }, { "epoch": 8.907224532224532, "grad_norm": 1.5237836837768555, "learning_rate": 3.3930821505440824e-05, "loss": 0.5049, "num_input_tokens_seen": 3086352, "step": 34275 }, { "epoch": 8.90852390852391, "grad_norm": 1.5903769731521606, "learning_rate": 3.392552578361859e-05, "loss": 0.2228, "num_input_tokens_seen": 3086784, "step": 34280 }, { "epoch": 8.909823284823284, "grad_norm": 1.656412124633789, "learning_rate": 3.392022960274802e-05, "loss": 0.368, "num_input_tokens_seen": 3087248, "step": 34285 }, { "epoch": 8.911122661122661, "grad_norm": 0.8976522088050842, "learning_rate": 3.391493296310149e-05, "loss": 0.2525, "num_input_tokens_seen": 3087728, "step": 34290 }, { "epoch": 8.912422037422038, "grad_norm": 0.9509913921356201, "learning_rate": 3.390963586495142e-05, "loss": 0.2742, "num_input_tokens_seen": 3088160, "step": 34295 }, { "epoch": 8.913721413721413, "grad_norm": 1.8004498481750488, "learning_rate": 3.3904338308570244e-05, "loss": 0.2623, "num_input_tokens_seen": 3088608, "step": 34300 }, { "epoch": 8.91502079002079, "grad_norm": 0.639066755771637, "learning_rate": 3.389904029423041e-05, "loss": 0.238, "num_input_tokens_seen": 3089056, "step": 34305 }, { "epoch": 8.916320166320165, "grad_norm": 1.3537284135818481, "learning_rate": 3.3893741822204415e-05, "loss": 0.2828, "num_input_tokens_seen": 3089520, "step": 34310 }, { "epoch": 8.917619542619542, "grad_norm": 0.6106868982315063, "learning_rate": 3.388844289276475e-05, "loss": 0.1916, "num_input_tokens_seen": 3089920, "step": 34315 }, { "epoch": 8.91891891891892, "grad_norm": 0.4863857328891754, "learning_rate": 3.3883143506183954e-05, "loss": 0.2245, "num_input_tokens_seen": 3090368, "step": 34320 }, { "epoch": 8.920218295218294, "grad_norm": 0.8370432257652283, "learning_rate": 3.387784366273458e-05, "loss": 0.2263, "num_input_tokens_seen": 3090880, "step": 34325 }, { "epoch": 8.921517671517671, "grad_norm": 1.2835131883621216, "learning_rate": 3.387254336268919e-05, "loss": 0.209, "num_input_tokens_seen": 3091344, "step": 34330 }, { "epoch": 8.922817047817048, "grad_norm": 0.3728290796279907, "learning_rate": 3.38672426063204e-05, "loss": 0.1897, "num_input_tokens_seen": 3091824, "step": 34335 }, { "epoch": 8.924116424116423, "grad_norm": 0.8783730864524841, "learning_rate": 3.386194139390082e-05, "loss": 0.3096, "num_input_tokens_seen": 3092240, "step": 34340 }, { "epoch": 8.9254158004158, "grad_norm": 0.43038979172706604, "learning_rate": 3.385663972570311e-05, "loss": 0.2906, "num_input_tokens_seen": 3092672, "step": 34345 }, { "epoch": 8.926715176715177, "grad_norm": 0.3685171604156494, "learning_rate": 3.3851337601999936e-05, "loss": 0.1862, "num_input_tokens_seen": 3093136, "step": 34350 }, { "epoch": 8.928014553014552, "grad_norm": 0.2908042371273041, "learning_rate": 3.384603502306398e-05, "loss": 0.2498, "num_input_tokens_seen": 3093600, "step": 34355 }, { "epoch": 8.92931392931393, "grad_norm": 0.824826717376709, "learning_rate": 3.3840731989167963e-05, "loss": 0.334, "num_input_tokens_seen": 3094016, "step": 34360 }, { "epoch": 8.930613305613306, "grad_norm": 0.9860292077064514, "learning_rate": 3.3835428500584635e-05, "loss": 0.2514, "num_input_tokens_seen": 3094464, "step": 34365 }, { "epoch": 8.931912681912682, "grad_norm": 0.3721078038215637, "learning_rate": 3.383012455758676e-05, "loss": 0.287, "num_input_tokens_seen": 3094912, "step": 34370 }, { "epoch": 8.933212058212058, "grad_norm": 0.8145484924316406, "learning_rate": 3.382482016044711e-05, "loss": 0.2526, "num_input_tokens_seen": 3095376, "step": 34375 }, { "epoch": 8.934511434511435, "grad_norm": 0.5062919855117798, "learning_rate": 3.381951530943851e-05, "loss": 0.2514, "num_input_tokens_seen": 3095792, "step": 34380 }, { "epoch": 8.93581081081081, "grad_norm": 0.8684676885604858, "learning_rate": 3.381421000483378e-05, "loss": 0.2374, "num_input_tokens_seen": 3096224, "step": 34385 }, { "epoch": 8.937110187110187, "grad_norm": 0.7562459707260132, "learning_rate": 3.380890424690579e-05, "loss": 0.294, "num_input_tokens_seen": 3096704, "step": 34390 }, { "epoch": 8.938409563409563, "grad_norm": 0.6479294896125793, "learning_rate": 3.380359803592741e-05, "loss": 0.2324, "num_input_tokens_seen": 3097168, "step": 34395 }, { "epoch": 8.93970893970894, "grad_norm": 1.8769570589065552, "learning_rate": 3.3798291372171545e-05, "loss": 0.2684, "num_input_tokens_seen": 3097664, "step": 34400 }, { "epoch": 8.941008316008316, "grad_norm": 0.9171499609947205, "learning_rate": 3.379298425591113e-05, "loss": 0.2741, "num_input_tokens_seen": 3098160, "step": 34405 }, { "epoch": 8.942307692307692, "grad_norm": 0.9048478603363037, "learning_rate": 3.378767668741911e-05, "loss": 0.2157, "num_input_tokens_seen": 3098640, "step": 34410 }, { "epoch": 8.943607068607069, "grad_norm": 0.7739713788032532, "learning_rate": 3.378236866696846e-05, "loss": 0.2105, "num_input_tokens_seen": 3099104, "step": 34415 }, { "epoch": 8.944906444906445, "grad_norm": 0.5017459988594055, "learning_rate": 3.377706019483216e-05, "loss": 0.3027, "num_input_tokens_seen": 3099536, "step": 34420 }, { "epoch": 8.94620582120582, "grad_norm": 0.5482378005981445, "learning_rate": 3.377175127128327e-05, "loss": 0.194, "num_input_tokens_seen": 3100016, "step": 34425 }, { "epoch": 8.947505197505198, "grad_norm": 0.45634952187538147, "learning_rate": 3.3766441896594784e-05, "loss": 0.14, "num_input_tokens_seen": 3100416, "step": 34430 }, { "epoch": 8.948804573804575, "grad_norm": 1.1491889953613281, "learning_rate": 3.3761132071039805e-05, "loss": 0.1997, "num_input_tokens_seen": 3100848, "step": 34435 }, { "epoch": 8.95010395010395, "grad_norm": 1.435808539390564, "learning_rate": 3.3755821794891405e-05, "loss": 0.2649, "num_input_tokens_seen": 3101312, "step": 34440 }, { "epoch": 8.951403326403327, "grad_norm": 0.6379401683807373, "learning_rate": 3.37505110684227e-05, "loss": 0.3192, "num_input_tokens_seen": 3101808, "step": 34445 }, { "epoch": 8.952702702702704, "grad_norm": 1.4277594089508057, "learning_rate": 3.374519989190683e-05, "loss": 0.3036, "num_input_tokens_seen": 3102240, "step": 34450 }, { "epoch": 8.954002079002079, "grad_norm": 0.5805612802505493, "learning_rate": 3.373988826561695e-05, "loss": 0.1876, "num_input_tokens_seen": 3102672, "step": 34455 }, { "epoch": 8.955301455301456, "grad_norm": 0.4512163996696472, "learning_rate": 3.373457618982624e-05, "loss": 0.2207, "num_input_tokens_seen": 3103088, "step": 34460 }, { "epoch": 8.95660083160083, "grad_norm": 0.5661661624908447, "learning_rate": 3.37292636648079e-05, "loss": 0.19, "num_input_tokens_seen": 3103536, "step": 34465 }, { "epoch": 8.957900207900208, "grad_norm": 1.6512854099273682, "learning_rate": 3.3723950690835173e-05, "loss": 0.3318, "num_input_tokens_seen": 3103968, "step": 34470 }, { "epoch": 8.959199584199585, "grad_norm": 1.3343104124069214, "learning_rate": 3.371863726818131e-05, "loss": 0.2874, "num_input_tokens_seen": 3104416, "step": 34475 }, { "epoch": 8.96049896049896, "grad_norm": 1.317911148071289, "learning_rate": 3.371332339711957e-05, "loss": 0.2369, "num_input_tokens_seen": 3104896, "step": 34480 }, { "epoch": 8.961798336798337, "grad_norm": 1.7040287256240845, "learning_rate": 3.370800907792325e-05, "loss": 0.3035, "num_input_tokens_seen": 3105344, "step": 34485 }, { "epoch": 8.963097713097714, "grad_norm": 0.6052233576774597, "learning_rate": 3.3702694310865695e-05, "loss": 0.2694, "num_input_tokens_seen": 3105776, "step": 34490 }, { "epoch": 8.964397089397089, "grad_norm": 0.7293937802314758, "learning_rate": 3.369737909622023e-05, "loss": 0.2271, "num_input_tokens_seen": 3106192, "step": 34495 }, { "epoch": 8.965696465696466, "grad_norm": 1.078357458114624, "learning_rate": 3.369206343426023e-05, "loss": 0.2638, "num_input_tokens_seen": 3106640, "step": 34500 }, { "epoch": 8.966995841995843, "grad_norm": 0.9201123118400574, "learning_rate": 3.3686747325259065e-05, "loss": 0.2567, "num_input_tokens_seen": 3107072, "step": 34505 }, { "epoch": 8.968295218295218, "grad_norm": 1.4957631826400757, "learning_rate": 3.368143076949017e-05, "loss": 0.225, "num_input_tokens_seen": 3107520, "step": 34510 }, { "epoch": 8.969594594594595, "grad_norm": 1.5520814657211304, "learning_rate": 3.367611376722698e-05, "loss": 0.2755, "num_input_tokens_seen": 3107936, "step": 34515 }, { "epoch": 8.970893970893972, "grad_norm": 1.3048251867294312, "learning_rate": 3.367079631874293e-05, "loss": 0.2713, "num_input_tokens_seen": 3108384, "step": 34520 }, { "epoch": 8.972193347193347, "grad_norm": 0.9482016563415527, "learning_rate": 3.3665478424311524e-05, "loss": 0.2001, "num_input_tokens_seen": 3108880, "step": 34525 }, { "epoch": 8.973492723492724, "grad_norm": 0.9974784255027771, "learning_rate": 3.366016008420626e-05, "loss": 0.2674, "num_input_tokens_seen": 3109312, "step": 34530 }, { "epoch": 8.9747920997921, "grad_norm": 1.3965407609939575, "learning_rate": 3.365484129870067e-05, "loss": 0.2842, "num_input_tokens_seen": 3109744, "step": 34535 }, { "epoch": 8.976091476091476, "grad_norm": 1.081735610961914, "learning_rate": 3.364952206806828e-05, "loss": 0.1781, "num_input_tokens_seen": 3110192, "step": 34540 }, { "epoch": 8.977390852390853, "grad_norm": 0.5268439054489136, "learning_rate": 3.36442023925827e-05, "loss": 0.2689, "num_input_tokens_seen": 3110640, "step": 34545 }, { "epoch": 8.978690228690228, "grad_norm": 1.5187009572982788, "learning_rate": 3.3638882272517514e-05, "loss": 0.2427, "num_input_tokens_seen": 3111104, "step": 34550 }, { "epoch": 8.979989604989605, "grad_norm": 2.190603017807007, "learning_rate": 3.363356170814632e-05, "loss": 0.2398, "num_input_tokens_seen": 3111536, "step": 34555 }, { "epoch": 8.981288981288982, "grad_norm": 2.0475151538848877, "learning_rate": 3.362824069974279e-05, "loss": 0.2827, "num_input_tokens_seen": 3112000, "step": 34560 }, { "epoch": 8.982588357588357, "grad_norm": 2.0717766284942627, "learning_rate": 3.362291924758056e-05, "loss": 0.363, "num_input_tokens_seen": 3112480, "step": 34565 }, { "epoch": 8.983887733887734, "grad_norm": 1.0109361410140991, "learning_rate": 3.361759735193334e-05, "loss": 0.1685, "num_input_tokens_seen": 3112944, "step": 34570 }, { "epoch": 8.98518711018711, "grad_norm": 0.9501523375511169, "learning_rate": 3.361227501307483e-05, "loss": 0.2839, "num_input_tokens_seen": 3113440, "step": 34575 }, { "epoch": 8.986486486486486, "grad_norm": 0.7088027596473694, "learning_rate": 3.360695223127876e-05, "loss": 0.2651, "num_input_tokens_seen": 3113840, "step": 34580 }, { "epoch": 8.987785862785863, "grad_norm": 0.7166020274162292, "learning_rate": 3.360162900681889e-05, "loss": 0.204, "num_input_tokens_seen": 3114256, "step": 34585 }, { "epoch": 8.98908523908524, "grad_norm": 4.362947940826416, "learning_rate": 3.3596305339968995e-05, "loss": 0.2103, "num_input_tokens_seen": 3114720, "step": 34590 }, { "epoch": 8.990384615384615, "grad_norm": 0.34957098960876465, "learning_rate": 3.359098123100289e-05, "loss": 0.3394, "num_input_tokens_seen": 3115168, "step": 34595 }, { "epoch": 8.991683991683992, "grad_norm": 2.2814714908599854, "learning_rate": 3.358565668019439e-05, "loss": 0.3014, "num_input_tokens_seen": 3115632, "step": 34600 }, { "epoch": 8.992983367983367, "grad_norm": 0.4677963852882385, "learning_rate": 3.3580331687817336e-05, "loss": 0.2181, "num_input_tokens_seen": 3116096, "step": 34605 }, { "epoch": 8.994282744282744, "grad_norm": 0.3130343556404114, "learning_rate": 3.3575006254145594e-05, "loss": 0.1196, "num_input_tokens_seen": 3116544, "step": 34610 }, { "epoch": 8.995582120582121, "grad_norm": 0.7652916312217712, "learning_rate": 3.356968037945307e-05, "loss": 0.3679, "num_input_tokens_seen": 3116992, "step": 34615 }, { "epoch": 8.996881496881496, "grad_norm": 0.47465309500694275, "learning_rate": 3.3564354064013676e-05, "loss": 0.1787, "num_input_tokens_seen": 3117440, "step": 34620 }, { "epoch": 8.998180873180873, "grad_norm": 2.125601053237915, "learning_rate": 3.3559027308101345e-05, "loss": 0.2627, "num_input_tokens_seen": 3117904, "step": 34625 }, { "epoch": 8.99948024948025, "grad_norm": 1.5617916584014893, "learning_rate": 3.355370011199003e-05, "loss": 0.2726, "num_input_tokens_seen": 3118336, "step": 34630 }, { "epoch": 9.0, "eval_loss": 0.23929104208946228, "eval_runtime": 13.1539, "eval_samples_per_second": 65.076, "eval_steps_per_second": 32.538, "num_input_tokens_seen": 3118472, "step": 34632 }, { "epoch": 9.000779625779625, "grad_norm": 0.5083194375038147, "learning_rate": 3.354837247595374e-05, "loss": 0.1063, "num_input_tokens_seen": 3118728, "step": 34635 }, { "epoch": 9.002079002079002, "grad_norm": 0.379071980714798, "learning_rate": 3.354304440026646e-05, "loss": 0.2619, "num_input_tokens_seen": 3119176, "step": 34640 }, { "epoch": 9.003378378378379, "grad_norm": 0.3431968688964844, "learning_rate": 3.35377158852022e-05, "loss": 0.1186, "num_input_tokens_seen": 3119608, "step": 34645 }, { "epoch": 9.004677754677754, "grad_norm": 1.3972833156585693, "learning_rate": 3.3532386931035054e-05, "loss": 0.2756, "num_input_tokens_seen": 3120024, "step": 34650 }, { "epoch": 9.005977130977131, "grad_norm": 0.5020720958709717, "learning_rate": 3.3527057538039064e-05, "loss": 0.3529, "num_input_tokens_seen": 3120504, "step": 34655 }, { "epoch": 9.007276507276508, "grad_norm": 1.499262809753418, "learning_rate": 3.3521727706488336e-05, "loss": 0.2532, "num_input_tokens_seen": 3120952, "step": 34660 }, { "epoch": 9.008575883575883, "grad_norm": 1.5000962018966675, "learning_rate": 3.3516397436656985e-05, "loss": 0.2729, "num_input_tokens_seen": 3121432, "step": 34665 }, { "epoch": 9.00987525987526, "grad_norm": 1.034482479095459, "learning_rate": 3.351106672881915e-05, "loss": 0.3095, "num_input_tokens_seen": 3121848, "step": 34670 }, { "epoch": 9.011174636174637, "grad_norm": 1.0078312158584595, "learning_rate": 3.350573558324901e-05, "loss": 0.2072, "num_input_tokens_seen": 3122328, "step": 34675 }, { "epoch": 9.012474012474012, "grad_norm": 0.7082769274711609, "learning_rate": 3.350040400022072e-05, "loss": 0.202, "num_input_tokens_seen": 3122824, "step": 34680 }, { "epoch": 9.013773388773389, "grad_norm": 0.997972846031189, "learning_rate": 3.349507198000853e-05, "loss": 0.27, "num_input_tokens_seen": 3123240, "step": 34685 }, { "epoch": 9.015072765072764, "grad_norm": 0.6086682081222534, "learning_rate": 3.348973952288664e-05, "loss": 0.1725, "num_input_tokens_seen": 3123688, "step": 34690 }, { "epoch": 9.016372141372141, "grad_norm": 0.7315866947174072, "learning_rate": 3.3484406629129314e-05, "loss": 0.3098, "num_input_tokens_seen": 3124168, "step": 34695 }, { "epoch": 9.017671517671518, "grad_norm": 0.9425363540649414, "learning_rate": 3.347907329901082e-05, "loss": 0.1326, "num_input_tokens_seen": 3124600, "step": 34700 }, { "epoch": 9.018970893970893, "grad_norm": 0.7814302444458008, "learning_rate": 3.3473739532805467e-05, "loss": 0.2719, "num_input_tokens_seen": 3125080, "step": 34705 }, { "epoch": 9.02027027027027, "grad_norm": 0.7134539484977722, "learning_rate": 3.346840533078757e-05, "loss": 0.1898, "num_input_tokens_seen": 3125496, "step": 34710 }, { "epoch": 9.021569646569647, "grad_norm": 0.5109399557113647, "learning_rate": 3.346307069323146e-05, "loss": 0.1818, "num_input_tokens_seen": 3125944, "step": 34715 }, { "epoch": 9.022869022869022, "grad_norm": 1.3756051063537598, "learning_rate": 3.345773562041153e-05, "loss": 0.2621, "num_input_tokens_seen": 3126424, "step": 34720 }, { "epoch": 9.0241683991684, "grad_norm": 0.3268510699272156, "learning_rate": 3.345240011260215e-05, "loss": 0.279, "num_input_tokens_seen": 3126856, "step": 34725 }, { "epoch": 9.025467775467776, "grad_norm": 0.33659929037094116, "learning_rate": 3.344706417007773e-05, "loss": 0.1191, "num_input_tokens_seen": 3127304, "step": 34730 }, { "epoch": 9.026767151767151, "grad_norm": 0.5959120988845825, "learning_rate": 3.344172779311271e-05, "loss": 0.2707, "num_input_tokens_seen": 3127752, "step": 34735 }, { "epoch": 9.028066528066528, "grad_norm": 0.3249906599521637, "learning_rate": 3.3436390981981535e-05, "loss": 0.1693, "num_input_tokens_seen": 3128200, "step": 34740 }, { "epoch": 9.029365904365905, "grad_norm": 2.246964693069458, "learning_rate": 3.3431053736958695e-05, "loss": 0.3109, "num_input_tokens_seen": 3128680, "step": 34745 }, { "epoch": 9.03066528066528, "grad_norm": 0.3623042702674866, "learning_rate": 3.3425716058318684e-05, "loss": 0.2023, "num_input_tokens_seen": 3129128, "step": 34750 }, { "epoch": 9.031964656964657, "grad_norm": 1.0745364427566528, "learning_rate": 3.342037794633602e-05, "loss": 0.1206, "num_input_tokens_seen": 3129608, "step": 34755 }, { "epoch": 9.033264033264032, "grad_norm": 2.2764766216278076, "learning_rate": 3.3415039401285245e-05, "loss": 0.4991, "num_input_tokens_seen": 3130072, "step": 34760 }, { "epoch": 9.03456340956341, "grad_norm": 0.9587730765342712, "learning_rate": 3.3409700423440945e-05, "loss": 0.1476, "num_input_tokens_seen": 3130520, "step": 34765 }, { "epoch": 9.035862785862786, "grad_norm": 0.4098348915576935, "learning_rate": 3.340436101307768e-05, "loss": 0.1538, "num_input_tokens_seen": 3130968, "step": 34770 }, { "epoch": 9.037162162162161, "grad_norm": 0.5394964218139648, "learning_rate": 3.3399021170470084e-05, "loss": 0.2328, "num_input_tokens_seen": 3131416, "step": 34775 }, { "epoch": 9.038461538461538, "grad_norm": 0.7108348608016968, "learning_rate": 3.339368089589278e-05, "loss": 0.299, "num_input_tokens_seen": 3131880, "step": 34780 }, { "epoch": 9.039760914760915, "grad_norm": 4.521622657775879, "learning_rate": 3.3388340189620424e-05, "loss": 0.3727, "num_input_tokens_seen": 3132344, "step": 34785 }, { "epoch": 9.04106029106029, "grad_norm": 2.0515081882476807, "learning_rate": 3.338299905192769e-05, "loss": 0.3141, "num_input_tokens_seen": 3132808, "step": 34790 }, { "epoch": 9.042359667359667, "grad_norm": 1.1302372217178345, "learning_rate": 3.337765748308929e-05, "loss": 0.2026, "num_input_tokens_seen": 3133304, "step": 34795 }, { "epoch": 9.043659043659044, "grad_norm": 1.2497339248657227, "learning_rate": 3.337231548337994e-05, "loss": 0.2535, "num_input_tokens_seen": 3133816, "step": 34800 }, { "epoch": 9.04495841995842, "grad_norm": 1.8869829177856445, "learning_rate": 3.336697305307437e-05, "loss": 0.2787, "num_input_tokens_seen": 3134264, "step": 34805 }, { "epoch": 9.046257796257796, "grad_norm": 1.305241346359253, "learning_rate": 3.336163019244737e-05, "loss": 0.1796, "num_input_tokens_seen": 3134728, "step": 34810 }, { "epoch": 9.047557172557173, "grad_norm": 0.6109958291053772, "learning_rate": 3.335628690177371e-05, "loss": 0.2481, "num_input_tokens_seen": 3135160, "step": 34815 }, { "epoch": 9.048856548856548, "grad_norm": 1.3473248481750488, "learning_rate": 3.335094318132822e-05, "loss": 0.4127, "num_input_tokens_seen": 3135576, "step": 34820 }, { "epoch": 9.050155925155925, "grad_norm": 1.2463635206222534, "learning_rate": 3.3345599031385713e-05, "loss": 0.2175, "num_input_tokens_seen": 3136008, "step": 34825 }, { "epoch": 9.051455301455302, "grad_norm": 1.7413427829742432, "learning_rate": 3.3340254452221056e-05, "loss": 0.2316, "num_input_tokens_seen": 3136440, "step": 34830 }, { "epoch": 9.052754677754677, "grad_norm": 0.44527971744537354, "learning_rate": 3.333490944410912e-05, "loss": 0.1441, "num_input_tokens_seen": 3136904, "step": 34835 }, { "epoch": 9.054054054054054, "grad_norm": 0.7970423698425293, "learning_rate": 3.33295640073248e-05, "loss": 0.3239, "num_input_tokens_seen": 3137336, "step": 34840 }, { "epoch": 9.05535343035343, "grad_norm": 1.7635395526885986, "learning_rate": 3.332421814214302e-05, "loss": 0.3103, "num_input_tokens_seen": 3137816, "step": 34845 }, { "epoch": 9.056652806652806, "grad_norm": 1.3404301404953003, "learning_rate": 3.331887184883873e-05, "loss": 0.1823, "num_input_tokens_seen": 3138248, "step": 34850 }, { "epoch": 9.057952182952183, "grad_norm": 1.7701835632324219, "learning_rate": 3.33135251276869e-05, "loss": 0.1687, "num_input_tokens_seen": 3138712, "step": 34855 }, { "epoch": 9.059251559251559, "grad_norm": 0.4470777213573456, "learning_rate": 3.330817797896249e-05, "loss": 0.1434, "num_input_tokens_seen": 3139224, "step": 34860 }, { "epoch": 9.060550935550935, "grad_norm": 0.5478026270866394, "learning_rate": 3.330283040294053e-05, "loss": 0.1971, "num_input_tokens_seen": 3139656, "step": 34865 }, { "epoch": 9.061850311850312, "grad_norm": 0.4729577898979187, "learning_rate": 3.3297482399896054e-05, "loss": 0.1467, "num_input_tokens_seen": 3140088, "step": 34870 }, { "epoch": 9.063149688149688, "grad_norm": 0.45126351714134216, "learning_rate": 3.32921339701041e-05, "loss": 0.2684, "num_input_tokens_seen": 3140552, "step": 34875 }, { "epoch": 9.064449064449065, "grad_norm": 1.4614224433898926, "learning_rate": 3.328678511383975e-05, "loss": 0.2569, "num_input_tokens_seen": 3140984, "step": 34880 }, { "epoch": 9.065748440748441, "grad_norm": 3.2919631004333496, "learning_rate": 3.328143583137811e-05, "loss": 0.3164, "num_input_tokens_seen": 3141416, "step": 34885 }, { "epoch": 9.067047817047817, "grad_norm": 4.156943321228027, "learning_rate": 3.3276086122994286e-05, "loss": 0.3881, "num_input_tokens_seen": 3141864, "step": 34890 }, { "epoch": 9.068347193347194, "grad_norm": 2.5752789974212646, "learning_rate": 3.327073598896342e-05, "loss": 0.1986, "num_input_tokens_seen": 3142328, "step": 34895 }, { "epoch": 9.06964656964657, "grad_norm": 0.6553269624710083, "learning_rate": 3.326538542956069e-05, "loss": 0.2923, "num_input_tokens_seen": 3142728, "step": 34900 }, { "epoch": 9.070945945945946, "grad_norm": 0.7015355825424194, "learning_rate": 3.326003444506126e-05, "loss": 0.1242, "num_input_tokens_seen": 3143144, "step": 34905 }, { "epoch": 9.072245322245323, "grad_norm": 1.9697273969650269, "learning_rate": 3.3254683035740344e-05, "loss": 0.3322, "num_input_tokens_seen": 3143592, "step": 34910 }, { "epoch": 9.073544698544698, "grad_norm": 3.1690008640289307, "learning_rate": 3.3249331201873164e-05, "loss": 0.4556, "num_input_tokens_seen": 3144040, "step": 34915 }, { "epoch": 9.074844074844075, "grad_norm": 0.24262121319770813, "learning_rate": 3.324397894373498e-05, "loss": 0.2898, "num_input_tokens_seen": 3144472, "step": 34920 }, { "epoch": 9.076143451143452, "grad_norm": 1.1956557035446167, "learning_rate": 3.3238626261601066e-05, "loss": 0.1688, "num_input_tokens_seen": 3144920, "step": 34925 }, { "epoch": 9.077442827442827, "grad_norm": 0.5084936022758484, "learning_rate": 3.32332731557467e-05, "loss": 0.1357, "num_input_tokens_seen": 3145368, "step": 34930 }, { "epoch": 9.078742203742204, "grad_norm": 2.9121577739715576, "learning_rate": 3.322791962644721e-05, "loss": 0.2182, "num_input_tokens_seen": 3145800, "step": 34935 }, { "epoch": 9.08004158004158, "grad_norm": 0.5322800278663635, "learning_rate": 3.3222565673977936e-05, "loss": 0.3742, "num_input_tokens_seen": 3146280, "step": 34940 }, { "epoch": 9.081340956340956, "grad_norm": 2.1108133792877197, "learning_rate": 3.321721129861422e-05, "loss": 0.2008, "num_input_tokens_seen": 3146696, "step": 34945 }, { "epoch": 9.082640332640333, "grad_norm": 2.473393440246582, "learning_rate": 3.3211856500631464e-05, "loss": 0.3613, "num_input_tokens_seen": 3147128, "step": 34950 }, { "epoch": 9.08393970893971, "grad_norm": 0.9203315377235413, "learning_rate": 3.320650128030505e-05, "loss": 0.152, "num_input_tokens_seen": 3147608, "step": 34955 }, { "epoch": 9.085239085239085, "grad_norm": 2.6959571838378906, "learning_rate": 3.3201145637910426e-05, "loss": 0.3899, "num_input_tokens_seen": 3148024, "step": 34960 }, { "epoch": 9.086538461538462, "grad_norm": 0.3139822483062744, "learning_rate": 3.319578957372301e-05, "loss": 0.2391, "num_input_tokens_seen": 3148440, "step": 34965 }, { "epoch": 9.087837837837839, "grad_norm": 1.482998013496399, "learning_rate": 3.3190433088018294e-05, "loss": 0.1492, "num_input_tokens_seen": 3148888, "step": 34970 }, { "epoch": 9.089137214137214, "grad_norm": 0.6670325994491577, "learning_rate": 3.318507618107175e-05, "loss": 0.2092, "num_input_tokens_seen": 3149320, "step": 34975 }, { "epoch": 9.09043659043659, "grad_norm": 0.6986271142959595, "learning_rate": 3.3179718853158906e-05, "loss": 0.321, "num_input_tokens_seen": 3149816, "step": 34980 }, { "epoch": 9.091735966735968, "grad_norm": 0.8480697274208069, "learning_rate": 3.3174361104555266e-05, "loss": 0.191, "num_input_tokens_seen": 3150248, "step": 34985 }, { "epoch": 9.093035343035343, "grad_norm": 3.6184303760528564, "learning_rate": 3.316900293553642e-05, "loss": 0.3779, "num_input_tokens_seen": 3150696, "step": 34990 }, { "epoch": 9.09433471933472, "grad_norm": 1.9479650259017944, "learning_rate": 3.316364434637791e-05, "loss": 0.5418, "num_input_tokens_seen": 3151144, "step": 34995 }, { "epoch": 9.095634095634095, "grad_norm": 0.9351345896720886, "learning_rate": 3.315828533735536e-05, "loss": 0.1739, "num_input_tokens_seen": 3151592, "step": 35000 }, { "epoch": 9.096933471933472, "grad_norm": 1.3135920763015747, "learning_rate": 3.315292590874437e-05, "loss": 0.2573, "num_input_tokens_seen": 3152088, "step": 35005 }, { "epoch": 9.098232848232849, "grad_norm": 1.9596527814865112, "learning_rate": 3.314756606082059e-05, "loss": 0.3507, "num_input_tokens_seen": 3152584, "step": 35010 }, { "epoch": 9.099532224532224, "grad_norm": 2.133443832397461, "learning_rate": 3.3142205793859684e-05, "loss": 0.2811, "num_input_tokens_seen": 3153016, "step": 35015 }, { "epoch": 9.1008316008316, "grad_norm": 1.2692350149154663, "learning_rate": 3.3136845108137336e-05, "loss": 0.2433, "num_input_tokens_seen": 3153448, "step": 35020 }, { "epoch": 9.102130977130978, "grad_norm": 1.9538036584854126, "learning_rate": 3.3131484003929246e-05, "loss": 0.2874, "num_input_tokens_seen": 3153928, "step": 35025 }, { "epoch": 9.103430353430353, "grad_norm": 1.3404818773269653, "learning_rate": 3.312612248151115e-05, "loss": 0.2348, "num_input_tokens_seen": 3154440, "step": 35030 }, { "epoch": 9.10472972972973, "grad_norm": 2.2171363830566406, "learning_rate": 3.312076054115877e-05, "loss": 0.3187, "num_input_tokens_seen": 3154904, "step": 35035 }, { "epoch": 9.106029106029107, "grad_norm": 1.9494975805282593, "learning_rate": 3.31153981831479e-05, "loss": 0.2384, "num_input_tokens_seen": 3155368, "step": 35040 }, { "epoch": 9.107328482328482, "grad_norm": 0.6614241600036621, "learning_rate": 3.311003540775434e-05, "loss": 0.2546, "num_input_tokens_seen": 3155768, "step": 35045 }, { "epoch": 9.108627858627859, "grad_norm": 1.2490805387496948, "learning_rate": 3.310467221525387e-05, "loss": 0.2377, "num_input_tokens_seen": 3156248, "step": 35050 }, { "epoch": 9.109927234927236, "grad_norm": 0.8266968727111816, "learning_rate": 3.3099308605922354e-05, "loss": 0.162, "num_input_tokens_seen": 3156728, "step": 35055 }, { "epoch": 9.111226611226611, "grad_norm": 3.082965612411499, "learning_rate": 3.309394458003563e-05, "loss": 0.1805, "num_input_tokens_seen": 3157160, "step": 35060 }, { "epoch": 9.112525987525988, "grad_norm": 2.1550629138946533, "learning_rate": 3.3088580137869587e-05, "loss": 0.1759, "num_input_tokens_seen": 3157608, "step": 35065 }, { "epoch": 9.113825363825363, "grad_norm": 0.5949647426605225, "learning_rate": 3.3083215279700115e-05, "loss": 0.2374, "num_input_tokens_seen": 3158056, "step": 35070 }, { "epoch": 9.11512474012474, "grad_norm": 5.0334248542785645, "learning_rate": 3.307785000580313e-05, "loss": 0.3479, "num_input_tokens_seen": 3158504, "step": 35075 }, { "epoch": 9.116424116424117, "grad_norm": 3.853750228881836, "learning_rate": 3.307248431645458e-05, "loss": 0.3631, "num_input_tokens_seen": 3158936, "step": 35080 }, { "epoch": 9.117723492723492, "grad_norm": 3.9632253646850586, "learning_rate": 3.306711821193044e-05, "loss": 0.3272, "num_input_tokens_seen": 3159400, "step": 35085 }, { "epoch": 9.119022869022869, "grad_norm": 2.3995919227600098, "learning_rate": 3.306175169250667e-05, "loss": 0.301, "num_input_tokens_seen": 3159864, "step": 35090 }, { "epoch": 9.120322245322246, "grad_norm": 3.0564208030700684, "learning_rate": 3.305638475845929e-05, "loss": 0.1742, "num_input_tokens_seen": 3160312, "step": 35095 }, { "epoch": 9.121621621621621, "grad_norm": 2.6704061031341553, "learning_rate": 3.305101741006432e-05, "loss": 0.2981, "num_input_tokens_seen": 3160760, "step": 35100 }, { "epoch": 9.122920997920998, "grad_norm": 1.1131843328475952, "learning_rate": 3.3045649647597815e-05, "loss": 0.1895, "num_input_tokens_seen": 3161192, "step": 35105 }, { "epoch": 9.124220374220375, "grad_norm": 0.7694859504699707, "learning_rate": 3.304028147133583e-05, "loss": 0.2115, "num_input_tokens_seen": 3161608, "step": 35110 }, { "epoch": 9.12551975051975, "grad_norm": 2.909468412399292, "learning_rate": 3.303491288155448e-05, "loss": 0.2405, "num_input_tokens_seen": 3162040, "step": 35115 }, { "epoch": 9.126819126819127, "grad_norm": 0.48178890347480774, "learning_rate": 3.3029543878529844e-05, "loss": 0.1273, "num_input_tokens_seen": 3162472, "step": 35120 }, { "epoch": 9.128118503118504, "grad_norm": 0.5522757768630981, "learning_rate": 3.3024174462538086e-05, "loss": 0.1713, "num_input_tokens_seen": 3162904, "step": 35125 }, { "epoch": 9.129417879417879, "grad_norm": 1.1614806652069092, "learning_rate": 3.301880463385534e-05, "loss": 0.2984, "num_input_tokens_seen": 3163304, "step": 35130 }, { "epoch": 9.130717255717256, "grad_norm": 2.8279457092285156, "learning_rate": 3.301343439275779e-05, "loss": 0.3292, "num_input_tokens_seen": 3163768, "step": 35135 }, { "epoch": 9.132016632016631, "grad_norm": 1.4113130569458008, "learning_rate": 3.3008063739521636e-05, "loss": 0.2473, "num_input_tokens_seen": 3164200, "step": 35140 }, { "epoch": 9.133316008316008, "grad_norm": 4.453502178192139, "learning_rate": 3.300269267442309e-05, "loss": 0.3123, "num_input_tokens_seen": 3164664, "step": 35145 }, { "epoch": 9.134615384615385, "grad_norm": 3.004148244857788, "learning_rate": 3.2997321197738385e-05, "loss": 0.2167, "num_input_tokens_seen": 3165096, "step": 35150 }, { "epoch": 9.13591476091476, "grad_norm": 4.998094081878662, "learning_rate": 3.299194930974379e-05, "loss": 0.334, "num_input_tokens_seen": 3165560, "step": 35155 }, { "epoch": 9.137214137214137, "grad_norm": 3.088703155517578, "learning_rate": 3.2986577010715594e-05, "loss": 0.2273, "num_input_tokens_seen": 3165960, "step": 35160 }, { "epoch": 9.138513513513514, "grad_norm": 1.7002354860305786, "learning_rate": 3.298120430093008e-05, "loss": 0.1935, "num_input_tokens_seen": 3166392, "step": 35165 }, { "epoch": 9.13981288981289, "grad_norm": 0.9300881028175354, "learning_rate": 3.2975831180663585e-05, "loss": 0.0798, "num_input_tokens_seen": 3166824, "step": 35170 }, { "epoch": 9.141112266112266, "grad_norm": 0.4853835999965668, "learning_rate": 3.2970457650192446e-05, "loss": 0.2241, "num_input_tokens_seen": 3167256, "step": 35175 }, { "epoch": 9.142411642411643, "grad_norm": 0.42946186661720276, "learning_rate": 3.296508370979303e-05, "loss": 0.3341, "num_input_tokens_seen": 3167688, "step": 35180 }, { "epoch": 9.143711018711018, "grad_norm": 3.8808043003082275, "learning_rate": 3.2959709359741744e-05, "loss": 0.3731, "num_input_tokens_seen": 3168136, "step": 35185 }, { "epoch": 9.145010395010395, "grad_norm": 1.512609601020813, "learning_rate": 3.295433460031497e-05, "loss": 0.1114, "num_input_tokens_seen": 3168568, "step": 35190 }, { "epoch": 9.146309771309772, "grad_norm": 2.9174537658691406, "learning_rate": 3.294895943178914e-05, "loss": 0.1697, "num_input_tokens_seen": 3169016, "step": 35195 }, { "epoch": 9.147609147609147, "grad_norm": 3.89768648147583, "learning_rate": 3.294358385444071e-05, "loss": 0.3354, "num_input_tokens_seen": 3169464, "step": 35200 }, { "epoch": 9.148908523908524, "grad_norm": 1.2804269790649414, "learning_rate": 3.293820786854616e-05, "loss": 0.1498, "num_input_tokens_seen": 3169896, "step": 35205 }, { "epoch": 9.1502079002079, "grad_norm": 3.0351474285125732, "learning_rate": 3.293283147438197e-05, "loss": 0.4818, "num_input_tokens_seen": 3170376, "step": 35210 }, { "epoch": 9.151507276507276, "grad_norm": 0.44460567831993103, "learning_rate": 3.292745467222465e-05, "loss": 0.3624, "num_input_tokens_seen": 3170792, "step": 35215 }, { "epoch": 9.152806652806653, "grad_norm": 4.161654472351074, "learning_rate": 3.292207746235075e-05, "loss": 0.2375, "num_input_tokens_seen": 3171208, "step": 35220 }, { "epoch": 9.154106029106028, "grad_norm": 0.2869323790073395, "learning_rate": 3.2916699845036816e-05, "loss": 0.1887, "num_input_tokens_seen": 3171688, "step": 35225 }, { "epoch": 9.155405405405405, "grad_norm": 3.293008327484131, "learning_rate": 3.291132182055942e-05, "loss": 0.1676, "num_input_tokens_seen": 3172168, "step": 35230 }, { "epoch": 9.156704781704782, "grad_norm": 3.687258243560791, "learning_rate": 3.2905943389195156e-05, "loss": 0.5021, "num_input_tokens_seen": 3172568, "step": 35235 }, { "epoch": 9.158004158004157, "grad_norm": 0.5717617273330688, "learning_rate": 3.290056455122066e-05, "loss": 0.0671, "num_input_tokens_seen": 3173000, "step": 35240 }, { "epoch": 9.159303534303534, "grad_norm": 1.8614813089370728, "learning_rate": 3.289518530691255e-05, "loss": 0.2635, "num_input_tokens_seen": 3173464, "step": 35245 }, { "epoch": 9.160602910602911, "grad_norm": 0.3626460134983063, "learning_rate": 3.28898056565475e-05, "loss": 0.4138, "num_input_tokens_seen": 3173912, "step": 35250 }, { "epoch": 9.161902286902286, "grad_norm": 2.239907741546631, "learning_rate": 3.288442560040218e-05, "loss": 0.1787, "num_input_tokens_seen": 3174328, "step": 35255 }, { "epoch": 9.163201663201663, "grad_norm": 0.4039033353328705, "learning_rate": 3.2879045138753305e-05, "loss": 0.2162, "num_input_tokens_seen": 3174744, "step": 35260 }, { "epoch": 9.16450103950104, "grad_norm": 2.9303455352783203, "learning_rate": 3.2873664271877584e-05, "loss": 0.2681, "num_input_tokens_seen": 3175176, "step": 35265 }, { "epoch": 9.165800415800415, "grad_norm": 0.5074008703231812, "learning_rate": 3.286828300005177e-05, "loss": 0.2742, "num_input_tokens_seen": 3175640, "step": 35270 }, { "epoch": 9.167099792099792, "grad_norm": 4.503505706787109, "learning_rate": 3.2862901323552616e-05, "loss": 0.3989, "num_input_tokens_seen": 3176104, "step": 35275 }, { "epoch": 9.16839916839917, "grad_norm": 2.6321208477020264, "learning_rate": 3.2857519242656915e-05, "loss": 0.3374, "num_input_tokens_seen": 3176552, "step": 35280 }, { "epoch": 9.169698544698544, "grad_norm": 3.8721489906311035, "learning_rate": 3.285213675764147e-05, "loss": 0.1753, "num_input_tokens_seen": 3177016, "step": 35285 }, { "epoch": 9.170997920997921, "grad_norm": 3.3682327270507812, "learning_rate": 3.284675386878311e-05, "loss": 0.2626, "num_input_tokens_seen": 3177432, "step": 35290 }, { "epoch": 9.172297297297296, "grad_norm": 3.546922206878662, "learning_rate": 3.284137057635868e-05, "loss": 0.2282, "num_input_tokens_seen": 3177880, "step": 35295 }, { "epoch": 9.173596673596673, "grad_norm": 0.5297568440437317, "learning_rate": 3.283598688064505e-05, "loss": 0.1083, "num_input_tokens_seen": 3178344, "step": 35300 }, { "epoch": 9.17489604989605, "grad_norm": 4.049869060516357, "learning_rate": 3.28306027819191e-05, "loss": 0.5059, "num_input_tokens_seen": 3178744, "step": 35305 }, { "epoch": 9.176195426195425, "grad_norm": 0.9672582745552063, "learning_rate": 3.2825218280457747e-05, "loss": 0.4788, "num_input_tokens_seen": 3179224, "step": 35310 }, { "epoch": 9.177494802494802, "grad_norm": 3.474426746368408, "learning_rate": 3.281983337653793e-05, "loss": 0.3085, "num_input_tokens_seen": 3179672, "step": 35315 }, { "epoch": 9.17879417879418, "grad_norm": 3.2798328399658203, "learning_rate": 3.281444807043658e-05, "loss": 0.3312, "num_input_tokens_seen": 3180120, "step": 35320 }, { "epoch": 9.180093555093555, "grad_norm": 0.7068066000938416, "learning_rate": 3.280906236243067e-05, "loss": 0.2615, "num_input_tokens_seen": 3180552, "step": 35325 }, { "epoch": 9.181392931392931, "grad_norm": 1.235384464263916, "learning_rate": 3.280367625279722e-05, "loss": 0.189, "num_input_tokens_seen": 3181000, "step": 35330 }, { "epoch": 9.182692307692308, "grad_norm": 2.546772003173828, "learning_rate": 3.279828974181322e-05, "loss": 0.3065, "num_input_tokens_seen": 3181432, "step": 35335 }, { "epoch": 9.183991683991684, "grad_norm": 1.6545333862304688, "learning_rate": 3.27929028297557e-05, "loss": 0.222, "num_input_tokens_seen": 3181880, "step": 35340 }, { "epoch": 9.18529106029106, "grad_norm": 0.8024282455444336, "learning_rate": 3.278751551690172e-05, "loss": 0.1852, "num_input_tokens_seen": 3182344, "step": 35345 }, { "epoch": 9.186590436590437, "grad_norm": 0.6700215339660645, "learning_rate": 3.278212780352836e-05, "loss": 0.1799, "num_input_tokens_seen": 3182808, "step": 35350 }, { "epoch": 9.187889812889813, "grad_norm": 4.677123069763184, "learning_rate": 3.2776739689912714e-05, "loss": 0.6107, "num_input_tokens_seen": 3183224, "step": 35355 }, { "epoch": 9.18918918918919, "grad_norm": 0.49032101035118103, "learning_rate": 3.277135117633188e-05, "loss": 0.2627, "num_input_tokens_seen": 3183624, "step": 35360 }, { "epoch": 9.190488565488565, "grad_norm": 1.63612961769104, "learning_rate": 3.2765962263063016e-05, "loss": 0.3498, "num_input_tokens_seen": 3184024, "step": 35365 }, { "epoch": 9.191787941787942, "grad_norm": 0.6380659937858582, "learning_rate": 3.2760572950383275e-05, "loss": 0.2465, "num_input_tokens_seen": 3184488, "step": 35370 }, { "epoch": 9.193087318087318, "grad_norm": 1.9663610458374023, "learning_rate": 3.275518323856983e-05, "loss": 0.3813, "num_input_tokens_seen": 3184952, "step": 35375 }, { "epoch": 9.194386694386694, "grad_norm": 1.9945670366287231, "learning_rate": 3.274979312789988e-05, "loss": 0.3073, "num_input_tokens_seen": 3185432, "step": 35380 }, { "epoch": 9.19568607068607, "grad_norm": 1.1990545988082886, "learning_rate": 3.274440261865064e-05, "loss": 0.2182, "num_input_tokens_seen": 3185880, "step": 35385 }, { "epoch": 9.196985446985448, "grad_norm": 1.8079969882965088, "learning_rate": 3.273901171109936e-05, "loss": 0.2585, "num_input_tokens_seen": 3186328, "step": 35390 }, { "epoch": 9.198284823284823, "grad_norm": 1.2180758714675903, "learning_rate": 3.2733620405523294e-05, "loss": 0.2058, "num_input_tokens_seen": 3186792, "step": 35395 }, { "epoch": 9.1995841995842, "grad_norm": 3.458164691925049, "learning_rate": 3.272822870219971e-05, "loss": 0.3299, "num_input_tokens_seen": 3187240, "step": 35400 }, { "epoch": 9.200883575883577, "grad_norm": 3.967804431915283, "learning_rate": 3.2722836601405925e-05, "loss": 0.2864, "num_input_tokens_seen": 3187656, "step": 35405 }, { "epoch": 9.202182952182952, "grad_norm": 2.175985336303711, "learning_rate": 3.271744410341925e-05, "loss": 0.2294, "num_input_tokens_seen": 3188104, "step": 35410 }, { "epoch": 9.203482328482329, "grad_norm": 1.608516812324524, "learning_rate": 3.2712051208517035e-05, "loss": 0.2707, "num_input_tokens_seen": 3188536, "step": 35415 }, { "epoch": 9.204781704781706, "grad_norm": 3.5277044773101807, "learning_rate": 3.270665791697664e-05, "loss": 0.2239, "num_input_tokens_seen": 3189032, "step": 35420 }, { "epoch": 9.20608108108108, "grad_norm": 2.138390302658081, "learning_rate": 3.270126422907544e-05, "loss": 0.0839, "num_input_tokens_seen": 3189480, "step": 35425 }, { "epoch": 9.207380457380458, "grad_norm": 0.0975484699010849, "learning_rate": 3.269587014509084e-05, "loss": 0.2598, "num_input_tokens_seen": 3189912, "step": 35430 }, { "epoch": 9.208679833679835, "grad_norm": 0.48400941491127014, "learning_rate": 3.2690475665300266e-05, "loss": 0.0246, "num_input_tokens_seen": 3190376, "step": 35435 }, { "epoch": 9.20997920997921, "grad_norm": 6.499654769897461, "learning_rate": 3.268508078998116e-05, "loss": 0.64, "num_input_tokens_seen": 3190792, "step": 35440 }, { "epoch": 9.211278586278587, "grad_norm": 3.282437562942505, "learning_rate": 3.267968551941099e-05, "loss": 0.2281, "num_input_tokens_seen": 3191240, "step": 35445 }, { "epoch": 9.212577962577962, "grad_norm": 0.6325369477272034, "learning_rate": 3.2674289853867226e-05, "loss": 0.1485, "num_input_tokens_seen": 3191688, "step": 35450 }, { "epoch": 9.213877338877339, "grad_norm": 0.644243597984314, "learning_rate": 3.266889379362739e-05, "loss": 0.3901, "num_input_tokens_seen": 3192088, "step": 35455 }, { "epoch": 9.215176715176716, "grad_norm": 2.9076855182647705, "learning_rate": 3.2663497338968994e-05, "loss": 0.2055, "num_input_tokens_seen": 3192552, "step": 35460 }, { "epoch": 9.21647609147609, "grad_norm": 0.7286055088043213, "learning_rate": 3.265810049016959e-05, "loss": 0.2938, "num_input_tokens_seen": 3193032, "step": 35465 }, { "epoch": 9.217775467775468, "grad_norm": 0.7487501502037048, "learning_rate": 3.2652703247506735e-05, "loss": 0.3009, "num_input_tokens_seen": 3193512, "step": 35470 }, { "epoch": 9.219074844074845, "grad_norm": 3.261784553527832, "learning_rate": 3.264730561125802e-05, "loss": 0.3756, "num_input_tokens_seen": 3193992, "step": 35475 }, { "epoch": 9.22037422037422, "grad_norm": 1.0484414100646973, "learning_rate": 3.264190758170106e-05, "loss": 0.4139, "num_input_tokens_seen": 3194424, "step": 35480 }, { "epoch": 9.221673596673597, "grad_norm": 4.037378787994385, "learning_rate": 3.263650915911346e-05, "loss": 0.2821, "num_input_tokens_seen": 3194888, "step": 35485 }, { "epoch": 9.222972972972974, "grad_norm": 0.5206800699234009, "learning_rate": 3.263111034377288e-05, "loss": 0.1707, "num_input_tokens_seen": 3195320, "step": 35490 }, { "epoch": 9.224272349272349, "grad_norm": 2.2391886711120605, "learning_rate": 3.2625711135956986e-05, "loss": 0.2332, "num_input_tokens_seen": 3195768, "step": 35495 }, { "epoch": 9.225571725571726, "grad_norm": 0.5407325029373169, "learning_rate": 3.262031153594347e-05, "loss": 0.304, "num_input_tokens_seen": 3196248, "step": 35500 }, { "epoch": 9.226871101871103, "grad_norm": 2.683788299560547, "learning_rate": 3.261491154401001e-05, "loss": 0.125, "num_input_tokens_seen": 3196728, "step": 35505 }, { "epoch": 9.228170478170478, "grad_norm": 0.3612477481365204, "learning_rate": 3.2609511160434366e-05, "loss": 0.2112, "num_input_tokens_seen": 3197128, "step": 35510 }, { "epoch": 9.229469854469855, "grad_norm": 1.8776462078094482, "learning_rate": 3.260411038549427e-05, "loss": 0.1351, "num_input_tokens_seen": 3197608, "step": 35515 }, { "epoch": 9.23076923076923, "grad_norm": 3.944978952407837, "learning_rate": 3.2598709219467485e-05, "loss": 0.3349, "num_input_tokens_seen": 3198040, "step": 35520 }, { "epoch": 9.232068607068607, "grad_norm": 2.2075891494750977, "learning_rate": 3.259330766263181e-05, "loss": 0.4356, "num_input_tokens_seen": 3198520, "step": 35525 }, { "epoch": 9.233367983367984, "grad_norm": 0.6462467908859253, "learning_rate": 3.2587905715265047e-05, "loss": 0.2473, "num_input_tokens_seen": 3198968, "step": 35530 }, { "epoch": 9.234667359667359, "grad_norm": 1.0942130088806152, "learning_rate": 3.258250337764502e-05, "loss": 0.1712, "num_input_tokens_seen": 3199432, "step": 35535 }, { "epoch": 9.235966735966736, "grad_norm": 1.1691466569900513, "learning_rate": 3.257710065004958e-05, "loss": 0.239, "num_input_tokens_seen": 3199848, "step": 35540 }, { "epoch": 9.237266112266113, "grad_norm": 3.3156158924102783, "learning_rate": 3.2571697532756595e-05, "loss": 0.2571, "num_input_tokens_seen": 3200296, "step": 35545 }, { "epoch": 9.238565488565488, "grad_norm": 1.4498074054718018, "learning_rate": 3.2566294026043956e-05, "loss": 0.2275, "num_input_tokens_seen": 3200776, "step": 35550 }, { "epoch": 9.239864864864865, "grad_norm": 2.135820150375366, "learning_rate": 3.2560890130189555e-05, "loss": 0.3728, "num_input_tokens_seen": 3201272, "step": 35555 }, { "epoch": 9.241164241164242, "grad_norm": 0.8597109913825989, "learning_rate": 3.255548584547133e-05, "loss": 0.2251, "num_input_tokens_seen": 3201704, "step": 35560 }, { "epoch": 9.242463617463617, "grad_norm": 3.725263833999634, "learning_rate": 3.255008117216723e-05, "loss": 0.2701, "num_input_tokens_seen": 3202152, "step": 35565 }, { "epoch": 9.243762993762994, "grad_norm": 2.507822275161743, "learning_rate": 3.2544676110555236e-05, "loss": 0.2543, "num_input_tokens_seen": 3202616, "step": 35570 }, { "epoch": 9.24506237006237, "grad_norm": 1.8906748294830322, "learning_rate": 3.25392706609133e-05, "loss": 0.274, "num_input_tokens_seen": 3203064, "step": 35575 }, { "epoch": 9.246361746361746, "grad_norm": 1.969741702079773, "learning_rate": 3.253386482351946e-05, "loss": 0.3378, "num_input_tokens_seen": 3203528, "step": 35580 }, { "epoch": 9.247661122661123, "grad_norm": 1.2687089443206787, "learning_rate": 3.2528458598651734e-05, "loss": 0.3488, "num_input_tokens_seen": 3203960, "step": 35585 }, { "epoch": 9.248960498960498, "grad_norm": 3.3542327880859375, "learning_rate": 3.252305198658817e-05, "loss": 0.4088, "num_input_tokens_seen": 3204408, "step": 35590 }, { "epoch": 9.250259875259875, "grad_norm": 0.8281612992286682, "learning_rate": 3.251764498760683e-05, "loss": 0.1941, "num_input_tokens_seen": 3204872, "step": 35595 }, { "epoch": 9.251559251559252, "grad_norm": 0.6606445908546448, "learning_rate": 3.2512237601985805e-05, "loss": 0.1526, "num_input_tokens_seen": 3205352, "step": 35600 }, { "epoch": 9.252858627858627, "grad_norm": 1.0403470993041992, "learning_rate": 3.2506829830003205e-05, "loss": 0.0844, "num_input_tokens_seen": 3205800, "step": 35605 }, { "epoch": 9.254158004158004, "grad_norm": 3.21968412399292, "learning_rate": 3.2501421671937154e-05, "loss": 0.348, "num_input_tokens_seen": 3206248, "step": 35610 }, { "epoch": 9.255457380457381, "grad_norm": 1.9342949390411377, "learning_rate": 3.24960131280658e-05, "loss": 0.497, "num_input_tokens_seen": 3206712, "step": 35615 }, { "epoch": 9.256756756756756, "grad_norm": 2.269299030303955, "learning_rate": 3.249060419866731e-05, "loss": 0.4857, "num_input_tokens_seen": 3207160, "step": 35620 }, { "epoch": 9.258056133056133, "grad_norm": 1.5723599195480347, "learning_rate": 3.248519488401986e-05, "loss": 0.3511, "num_input_tokens_seen": 3207608, "step": 35625 }, { "epoch": 9.25935550935551, "grad_norm": 2.184828519821167, "learning_rate": 3.2479785184401674e-05, "loss": 0.2565, "num_input_tokens_seen": 3208072, "step": 35630 }, { "epoch": 9.260654885654885, "grad_norm": 0.6714780926704407, "learning_rate": 3.247437510009096e-05, "loss": 0.1187, "num_input_tokens_seen": 3208552, "step": 35635 }, { "epoch": 9.261954261954262, "grad_norm": 1.4091612100601196, "learning_rate": 3.2468964631365984e-05, "loss": 0.1533, "num_input_tokens_seen": 3209000, "step": 35640 }, { "epoch": 9.263253638253639, "grad_norm": 1.9187297821044922, "learning_rate": 3.2463553778505e-05, "loss": 0.2082, "num_input_tokens_seen": 3209448, "step": 35645 }, { "epoch": 9.264553014553014, "grad_norm": 0.40358883142471313, "learning_rate": 3.245814254178628e-05, "loss": 0.2323, "num_input_tokens_seen": 3209880, "step": 35650 }, { "epoch": 9.265852390852391, "grad_norm": 0.6173139214515686, "learning_rate": 3.245273092148816e-05, "loss": 0.0476, "num_input_tokens_seen": 3210344, "step": 35655 }, { "epoch": 9.267151767151766, "grad_norm": 0.7483504414558411, "learning_rate": 3.244731891788893e-05, "loss": 0.1835, "num_input_tokens_seen": 3210824, "step": 35660 }, { "epoch": 9.268451143451143, "grad_norm": 0.1693347990512848, "learning_rate": 3.244190653126696e-05, "loss": 0.2945, "num_input_tokens_seen": 3211256, "step": 35665 }, { "epoch": 9.26975051975052, "grad_norm": 0.4157183766365051, "learning_rate": 3.2436493761900614e-05, "loss": 0.4035, "num_input_tokens_seen": 3211688, "step": 35670 }, { "epoch": 9.271049896049895, "grad_norm": 3.1510417461395264, "learning_rate": 3.2431080610068264e-05, "loss": 0.3925, "num_input_tokens_seen": 3212152, "step": 35675 }, { "epoch": 9.272349272349272, "grad_norm": 1.8687670230865479, "learning_rate": 3.242566707604832e-05, "loss": 0.2322, "num_input_tokens_seen": 3212600, "step": 35680 }, { "epoch": 9.27364864864865, "grad_norm": 1.1204147338867188, "learning_rate": 3.242025316011921e-05, "loss": 0.1497, "num_input_tokens_seen": 3213048, "step": 35685 }, { "epoch": 9.274948024948024, "grad_norm": 0.8362829089164734, "learning_rate": 3.241483886255936e-05, "loss": 0.1229, "num_input_tokens_seen": 3213480, "step": 35690 }, { "epoch": 9.276247401247401, "grad_norm": 1.7419538497924805, "learning_rate": 3.240942418364724e-05, "loss": 0.2368, "num_input_tokens_seen": 3213928, "step": 35695 }, { "epoch": 9.277546777546778, "grad_norm": 0.5277470946311951, "learning_rate": 3.2404009123661336e-05, "loss": 0.3349, "num_input_tokens_seen": 3214360, "step": 35700 }, { "epoch": 9.278846153846153, "grad_norm": 2.6904797554016113, "learning_rate": 3.239859368288015e-05, "loss": 0.1315, "num_input_tokens_seen": 3214792, "step": 35705 }, { "epoch": 9.28014553014553, "grad_norm": 2.843531608581543, "learning_rate": 3.2393177861582206e-05, "loss": 0.4049, "num_input_tokens_seen": 3215256, "step": 35710 }, { "epoch": 9.281444906444907, "grad_norm": 1.9148491621017456, "learning_rate": 3.238776166004604e-05, "loss": 0.3886, "num_input_tokens_seen": 3215704, "step": 35715 }, { "epoch": 9.282744282744282, "grad_norm": 0.7636803388595581, "learning_rate": 3.238234507855021e-05, "loss": 0.238, "num_input_tokens_seen": 3216168, "step": 35720 }, { "epoch": 9.28404365904366, "grad_norm": 0.8916128873825073, "learning_rate": 3.23769281173733e-05, "loss": 0.2246, "num_input_tokens_seen": 3216632, "step": 35725 }, { "epoch": 9.285343035343036, "grad_norm": 2.2237493991851807, "learning_rate": 3.237151077679391e-05, "loss": 0.2066, "num_input_tokens_seen": 3217064, "step": 35730 }, { "epoch": 9.286642411642411, "grad_norm": 1.41213059425354, "learning_rate": 3.236609305709066e-05, "loss": 0.4044, "num_input_tokens_seen": 3217512, "step": 35735 }, { "epoch": 9.287941787941788, "grad_norm": 2.40761661529541, "learning_rate": 3.2360674958542184e-05, "loss": 0.2474, "num_input_tokens_seen": 3217944, "step": 35740 }, { "epoch": 9.289241164241163, "grad_norm": 1.3660446405410767, "learning_rate": 3.2355256481427145e-05, "loss": 0.1718, "num_input_tokens_seen": 3218360, "step": 35745 }, { "epoch": 9.29054054054054, "grad_norm": 3.3561432361602783, "learning_rate": 3.234983762602422e-05, "loss": 0.2947, "num_input_tokens_seen": 3218792, "step": 35750 }, { "epoch": 9.291839916839917, "grad_norm": 1.601004958152771, "learning_rate": 3.234441839261209e-05, "loss": 0.1855, "num_input_tokens_seen": 3219288, "step": 35755 }, { "epoch": 9.293139293139292, "grad_norm": 1.558727741241455, "learning_rate": 3.2338998781469485e-05, "loss": 0.2916, "num_input_tokens_seen": 3219720, "step": 35760 }, { "epoch": 9.29443866943867, "grad_norm": 2.3147921562194824, "learning_rate": 3.233357879287515e-05, "loss": 0.3353, "num_input_tokens_seen": 3220136, "step": 35765 }, { "epoch": 9.295738045738046, "grad_norm": 1.5901888608932495, "learning_rate": 3.232815842710784e-05, "loss": 0.2623, "num_input_tokens_seen": 3220600, "step": 35770 }, { "epoch": 9.297037422037421, "grad_norm": 1.7241672277450562, "learning_rate": 3.2322737684446304e-05, "loss": 0.3206, "num_input_tokens_seen": 3221096, "step": 35775 }, { "epoch": 9.298336798336798, "grad_norm": 1.981205701828003, "learning_rate": 3.231731656516936e-05, "loss": 0.2772, "num_input_tokens_seen": 3221528, "step": 35780 }, { "epoch": 9.299636174636175, "grad_norm": 1.3969101905822754, "learning_rate": 3.231189506955581e-05, "loss": 0.2243, "num_input_tokens_seen": 3221992, "step": 35785 }, { "epoch": 9.30093555093555, "grad_norm": 1.7817991971969604, "learning_rate": 3.2306473197884494e-05, "loss": 0.2537, "num_input_tokens_seen": 3222440, "step": 35790 }, { "epoch": 9.302234927234927, "grad_norm": 1.5259087085723877, "learning_rate": 3.2301050950434255e-05, "loss": 0.2586, "num_input_tokens_seen": 3222888, "step": 35795 }, { "epoch": 9.303534303534304, "grad_norm": 1.969385027885437, "learning_rate": 3.229562832748398e-05, "loss": 0.2961, "num_input_tokens_seen": 3223384, "step": 35800 }, { "epoch": 9.30483367983368, "grad_norm": 1.4186676740646362, "learning_rate": 3.2290205329312544e-05, "loss": 0.4133, "num_input_tokens_seen": 3223832, "step": 35805 }, { "epoch": 9.306133056133056, "grad_norm": 1.882434368133545, "learning_rate": 3.2284781956198866e-05, "loss": 0.2805, "num_input_tokens_seen": 3224280, "step": 35810 }, { "epoch": 9.307432432432432, "grad_norm": 2.094613790512085, "learning_rate": 3.2279358208421875e-05, "loss": 0.2649, "num_input_tokens_seen": 3224760, "step": 35815 }, { "epoch": 9.308731808731808, "grad_norm": 0.8601298332214355, "learning_rate": 3.227393408626051e-05, "loss": 0.2326, "num_input_tokens_seen": 3225224, "step": 35820 }, { "epoch": 9.310031185031185, "grad_norm": 0.5593995451927185, "learning_rate": 3.226850958999375e-05, "loss": 0.2409, "num_input_tokens_seen": 3225672, "step": 35825 }, { "epoch": 9.31133056133056, "grad_norm": 2.5110397338867188, "learning_rate": 3.226308471990057e-05, "loss": 0.2121, "num_input_tokens_seen": 3226072, "step": 35830 }, { "epoch": 9.312629937629938, "grad_norm": 0.7181068658828735, "learning_rate": 3.225765947625999e-05, "loss": 0.2121, "num_input_tokens_seen": 3226504, "step": 35835 }, { "epoch": 9.313929313929314, "grad_norm": 1.720503568649292, "learning_rate": 3.225223385935102e-05, "loss": 0.3858, "num_input_tokens_seen": 3226936, "step": 35840 }, { "epoch": 9.31522869022869, "grad_norm": 1.8885159492492676, "learning_rate": 3.224680786945272e-05, "loss": 0.3443, "num_input_tokens_seen": 3227352, "step": 35845 }, { "epoch": 9.316528066528067, "grad_norm": 0.7026519179344177, "learning_rate": 3.224138150684415e-05, "loss": 0.2428, "num_input_tokens_seen": 3227800, "step": 35850 }, { "epoch": 9.317827442827443, "grad_norm": 0.7437567710876465, "learning_rate": 3.223595477180439e-05, "loss": 0.2053, "num_input_tokens_seen": 3228232, "step": 35855 }, { "epoch": 9.319126819126819, "grad_norm": 1.2240105867385864, "learning_rate": 3.223052766461254e-05, "loss": 0.2853, "num_input_tokens_seen": 3228680, "step": 35860 }, { "epoch": 9.320426195426196, "grad_norm": 0.785860002040863, "learning_rate": 3.222510018554773e-05, "loss": 0.2462, "num_input_tokens_seen": 3229112, "step": 35865 }, { "epoch": 9.321725571725572, "grad_norm": 1.3930301666259766, "learning_rate": 3.221967233488909e-05, "loss": 0.3442, "num_input_tokens_seen": 3229576, "step": 35870 }, { "epoch": 9.323024948024948, "grad_norm": 2.2393105030059814, "learning_rate": 3.2214244112915795e-05, "loss": 0.2694, "num_input_tokens_seen": 3230040, "step": 35875 }, { "epoch": 9.324324324324325, "grad_norm": 0.560268759727478, "learning_rate": 3.2208815519907e-05, "loss": 0.2007, "num_input_tokens_seen": 3230440, "step": 35880 }, { "epoch": 9.325623700623701, "grad_norm": 3.5567219257354736, "learning_rate": 3.220338655614192e-05, "loss": 0.2847, "num_input_tokens_seen": 3230888, "step": 35885 }, { "epoch": 9.326923076923077, "grad_norm": 2.171013116836548, "learning_rate": 3.2197957221899764e-05, "loss": 0.2736, "num_input_tokens_seen": 3231336, "step": 35890 }, { "epoch": 9.328222453222454, "grad_norm": 1.9263380765914917, "learning_rate": 3.2192527517459774e-05, "loss": 0.3321, "num_input_tokens_seen": 3231752, "step": 35895 }, { "epoch": 9.329521829521829, "grad_norm": 1.5472427606582642, "learning_rate": 3.21870974431012e-05, "loss": 0.2147, "num_input_tokens_seen": 3232216, "step": 35900 }, { "epoch": 9.330821205821206, "grad_norm": 1.8109065294265747, "learning_rate": 3.2181666999103324e-05, "loss": 0.1637, "num_input_tokens_seen": 3232696, "step": 35905 }, { "epoch": 9.332120582120583, "grad_norm": 0.4654960334300995, "learning_rate": 3.217623618574543e-05, "loss": 0.1834, "num_input_tokens_seen": 3233144, "step": 35910 }, { "epoch": 9.333419958419958, "grad_norm": 2.651029348373413, "learning_rate": 3.2170805003306824e-05, "loss": 0.2755, "num_input_tokens_seen": 3233576, "step": 35915 }, { "epoch": 9.334719334719335, "grad_norm": 0.39615514874458313, "learning_rate": 3.216537345206686e-05, "loss": 0.2804, "num_input_tokens_seen": 3234024, "step": 35920 }, { "epoch": 9.336018711018712, "grad_norm": 1.707980751991272, "learning_rate": 3.215994153230487e-05, "loss": 0.2182, "num_input_tokens_seen": 3234456, "step": 35925 }, { "epoch": 9.337318087318087, "grad_norm": 2.114716053009033, "learning_rate": 3.215450924430022e-05, "loss": 0.2402, "num_input_tokens_seen": 3234888, "step": 35930 }, { "epoch": 9.338617463617464, "grad_norm": 1.553875207901001, "learning_rate": 3.214907658833231e-05, "loss": 0.2089, "num_input_tokens_seen": 3235352, "step": 35935 }, { "epoch": 9.33991683991684, "grad_norm": 1.8284245729446411, "learning_rate": 3.214364356468054e-05, "loss": 0.2632, "num_input_tokens_seen": 3235880, "step": 35940 }, { "epoch": 9.341216216216216, "grad_norm": 0.7864598035812378, "learning_rate": 3.213821017362434e-05, "loss": 0.2866, "num_input_tokens_seen": 3236360, "step": 35945 }, { "epoch": 9.342515592515593, "grad_norm": 1.272323727607727, "learning_rate": 3.2132776415443145e-05, "loss": 0.2758, "num_input_tokens_seen": 3236824, "step": 35950 }, { "epoch": 9.34381496881497, "grad_norm": 0.8461772799491882, "learning_rate": 3.212734229041643e-05, "loss": 0.254, "num_input_tokens_seen": 3237288, "step": 35955 }, { "epoch": 9.345114345114345, "grad_norm": 0.8500654697418213, "learning_rate": 3.212190779882367e-05, "loss": 0.2508, "num_input_tokens_seen": 3237768, "step": 35960 }, { "epoch": 9.346413721413722, "grad_norm": 0.6961926221847534, "learning_rate": 3.211647294094437e-05, "loss": 0.2009, "num_input_tokens_seen": 3238200, "step": 35965 }, { "epoch": 9.347713097713097, "grad_norm": 1.015797734260559, "learning_rate": 3.2111037717058045e-05, "loss": 0.2919, "num_input_tokens_seen": 3238648, "step": 35970 }, { "epoch": 9.349012474012474, "grad_norm": 0.6496147513389587, "learning_rate": 3.210560212744424e-05, "loss": 0.2132, "num_input_tokens_seen": 3239080, "step": 35975 }, { "epoch": 9.35031185031185, "grad_norm": 2.051063060760498, "learning_rate": 3.210016617238251e-05, "loss": 0.3762, "num_input_tokens_seen": 3239528, "step": 35980 }, { "epoch": 9.351611226611226, "grad_norm": 1.0038813352584839, "learning_rate": 3.209472985215243e-05, "loss": 0.2607, "num_input_tokens_seen": 3239960, "step": 35985 }, { "epoch": 9.352910602910603, "grad_norm": 1.2755565643310547, "learning_rate": 3.208929316703359e-05, "loss": 0.2625, "num_input_tokens_seen": 3240424, "step": 35990 }, { "epoch": 9.35420997920998, "grad_norm": 1.8940064907073975, "learning_rate": 3.208385611730561e-05, "loss": 0.2355, "num_input_tokens_seen": 3240872, "step": 35995 }, { "epoch": 9.355509355509355, "grad_norm": 1.7599436044692993, "learning_rate": 3.2078418703248126e-05, "loss": 0.2937, "num_input_tokens_seen": 3241320, "step": 36000 }, { "epoch": 9.356808731808732, "grad_norm": 0.7237112522125244, "learning_rate": 3.207298092514079e-05, "loss": 0.2529, "num_input_tokens_seen": 3241784, "step": 36005 }, { "epoch": 9.358108108108109, "grad_norm": 0.8216045498847961, "learning_rate": 3.206754278326326e-05, "loss": 0.2582, "num_input_tokens_seen": 3242200, "step": 36010 }, { "epoch": 9.359407484407484, "grad_norm": 0.5162513256072998, "learning_rate": 3.206210427789524e-05, "loss": 0.2231, "num_input_tokens_seen": 3242648, "step": 36015 }, { "epoch": 9.36070686070686, "grad_norm": 1.773650884628296, "learning_rate": 3.2056665409316426e-05, "loss": 0.241, "num_input_tokens_seen": 3243128, "step": 36020 }, { "epoch": 9.362006237006238, "grad_norm": 1.6128259897232056, "learning_rate": 3.205122617780655e-05, "loss": 0.1512, "num_input_tokens_seen": 3243560, "step": 36025 }, { "epoch": 9.363305613305613, "grad_norm": 2.2139852046966553, "learning_rate": 3.2045786583645354e-05, "loss": 0.3254, "num_input_tokens_seen": 3244040, "step": 36030 }, { "epoch": 9.36460498960499, "grad_norm": 2.072714328765869, "learning_rate": 3.2040346627112604e-05, "loss": 0.2349, "num_input_tokens_seen": 3244488, "step": 36035 }, { "epoch": 9.365904365904367, "grad_norm": 0.538465678691864, "learning_rate": 3.2034906308488075e-05, "loss": 0.2924, "num_input_tokens_seen": 3244952, "step": 36040 }, { "epoch": 9.367203742203742, "grad_norm": 1.0833172798156738, "learning_rate": 3.2029465628051586e-05, "loss": 0.2958, "num_input_tokens_seen": 3245416, "step": 36045 }, { "epoch": 9.368503118503119, "grad_norm": 5.777713298797607, "learning_rate": 3.202402458608294e-05, "loss": 0.3053, "num_input_tokens_seen": 3245864, "step": 36050 }, { "epoch": 9.369802494802494, "grad_norm": 0.8838732242584229, "learning_rate": 3.201858318286198e-05, "loss": 0.2138, "num_input_tokens_seen": 3246328, "step": 36055 }, { "epoch": 9.371101871101871, "grad_norm": 0.9093577265739441, "learning_rate": 3.201314141866856e-05, "loss": 0.2985, "num_input_tokens_seen": 3246792, "step": 36060 }, { "epoch": 9.372401247401248, "grad_norm": 0.7550411820411682, "learning_rate": 3.2007699293782555e-05, "loss": 0.2591, "num_input_tokens_seen": 3247208, "step": 36065 }, { "epoch": 9.373700623700623, "grad_norm": 0.6543846130371094, "learning_rate": 3.2002256808483864e-05, "loss": 0.3324, "num_input_tokens_seen": 3247640, "step": 36070 }, { "epoch": 9.375, "grad_norm": 1.3959205150604248, "learning_rate": 3.19968139630524e-05, "loss": 0.2576, "num_input_tokens_seen": 3248088, "step": 36075 }, { "epoch": 9.376299376299377, "grad_norm": 1.6641110181808472, "learning_rate": 3.199137075776809e-05, "loss": 0.3878, "num_input_tokens_seen": 3248536, "step": 36080 }, { "epoch": 9.377598752598752, "grad_norm": 0.874968945980072, "learning_rate": 3.1985927192910875e-05, "loss": 0.3054, "num_input_tokens_seen": 3249000, "step": 36085 }, { "epoch": 9.378898128898129, "grad_norm": 1.0094139575958252, "learning_rate": 3.198048326876074e-05, "loss": 0.2501, "num_input_tokens_seen": 3249416, "step": 36090 }, { "epoch": 9.380197505197506, "grad_norm": 0.8702488541603088, "learning_rate": 3.197503898559765e-05, "loss": 0.2325, "num_input_tokens_seen": 3249832, "step": 36095 }, { "epoch": 9.381496881496881, "grad_norm": 0.9113061428070068, "learning_rate": 3.1969594343701626e-05, "loss": 0.2182, "num_input_tokens_seen": 3250296, "step": 36100 }, { "epoch": 9.382796257796258, "grad_norm": 0.9605765342712402, "learning_rate": 3.196414934335269e-05, "loss": 0.1887, "num_input_tokens_seen": 3250728, "step": 36105 }, { "epoch": 9.384095634095633, "grad_norm": 1.7675379514694214, "learning_rate": 3.195870398483089e-05, "loss": 0.1439, "num_input_tokens_seen": 3251192, "step": 36110 }, { "epoch": 9.38539501039501, "grad_norm": 0.3789030909538269, "learning_rate": 3.195325826841625e-05, "loss": 0.2245, "num_input_tokens_seen": 3251656, "step": 36115 }, { "epoch": 9.386694386694387, "grad_norm": 0.3074887692928314, "learning_rate": 3.194781219438889e-05, "loss": 0.3066, "num_input_tokens_seen": 3252088, "step": 36120 }, { "epoch": 9.387993762993762, "grad_norm": 0.3622819781303406, "learning_rate": 3.19423657630289e-05, "loss": 0.2847, "num_input_tokens_seen": 3252520, "step": 36125 }, { "epoch": 9.38929313929314, "grad_norm": 2.312406063079834, "learning_rate": 3.193691897461638e-05, "loss": 0.2166, "num_input_tokens_seen": 3253000, "step": 36130 }, { "epoch": 9.390592515592516, "grad_norm": 0.5393637418746948, "learning_rate": 3.193147182943147e-05, "loss": 0.3136, "num_input_tokens_seen": 3253448, "step": 36135 }, { "epoch": 9.391891891891891, "grad_norm": 1.0525521039962769, "learning_rate": 3.192602432775433e-05, "loss": 0.2194, "num_input_tokens_seen": 3253912, "step": 36140 }, { "epoch": 9.393191268191268, "grad_norm": 0.9969369769096375, "learning_rate": 3.1920576469865115e-05, "loss": 0.147, "num_input_tokens_seen": 3254376, "step": 36145 }, { "epoch": 9.394490644490645, "grad_norm": 2.38993501663208, "learning_rate": 3.191512825604402e-05, "loss": 0.3602, "num_input_tokens_seen": 3254808, "step": 36150 }, { "epoch": 9.39579002079002, "grad_norm": 0.6898238062858582, "learning_rate": 3.1909679686571256e-05, "loss": 0.1802, "num_input_tokens_seen": 3255224, "step": 36155 }, { "epoch": 9.397089397089397, "grad_norm": 2.4226796627044678, "learning_rate": 3.190423076172705e-05, "loss": 0.2066, "num_input_tokens_seen": 3255640, "step": 36160 }, { "epoch": 9.398388773388774, "grad_norm": 3.0891072750091553, "learning_rate": 3.1898781481791624e-05, "loss": 0.3515, "num_input_tokens_seen": 3256072, "step": 36165 }, { "epoch": 9.39968814968815, "grad_norm": 0.9534833431243896, "learning_rate": 3.1893331847045266e-05, "loss": 0.2128, "num_input_tokens_seen": 3256520, "step": 36170 }, { "epoch": 9.400987525987526, "grad_norm": 2.156981945037842, "learning_rate": 3.188788185776825e-05, "loss": 0.2126, "num_input_tokens_seen": 3257000, "step": 36175 }, { "epoch": 9.402286902286903, "grad_norm": 0.7369605898857117, "learning_rate": 3.188243151424087e-05, "loss": 0.1577, "num_input_tokens_seen": 3257432, "step": 36180 }, { "epoch": 9.403586278586278, "grad_norm": 1.0573830604553223, "learning_rate": 3.1876980816743434e-05, "loss": 0.2243, "num_input_tokens_seen": 3257880, "step": 36185 }, { "epoch": 9.404885654885655, "grad_norm": 0.6623852849006653, "learning_rate": 3.187152976555629e-05, "loss": 0.1987, "num_input_tokens_seen": 3258360, "step": 36190 }, { "epoch": 9.40618503118503, "grad_norm": 4.309863090515137, "learning_rate": 3.186607836095979e-05, "loss": 0.1366, "num_input_tokens_seen": 3258808, "step": 36195 }, { "epoch": 9.407484407484407, "grad_norm": 2.92529559135437, "learning_rate": 3.18606266032343e-05, "loss": 0.3708, "num_input_tokens_seen": 3259224, "step": 36200 }, { "epoch": 9.408783783783784, "grad_norm": 3.689365863800049, "learning_rate": 3.185517449266021e-05, "loss": 0.2594, "num_input_tokens_seen": 3259720, "step": 36205 }, { "epoch": 9.41008316008316, "grad_norm": 1.3900094032287598, "learning_rate": 3.1849722029517934e-05, "loss": 0.3773, "num_input_tokens_seen": 3260216, "step": 36210 }, { "epoch": 9.411382536382536, "grad_norm": 1.3308358192443848, "learning_rate": 3.184426921408789e-05, "loss": 0.1867, "num_input_tokens_seen": 3260632, "step": 36215 }, { "epoch": 9.412681912681913, "grad_norm": 3.1630501747131348, "learning_rate": 3.1838816046650503e-05, "loss": 0.5733, "num_input_tokens_seen": 3261096, "step": 36220 }, { "epoch": 9.413981288981288, "grad_norm": 0.6624715924263, "learning_rate": 3.183336252748627e-05, "loss": 0.2472, "num_input_tokens_seen": 3261528, "step": 36225 }, { "epoch": 9.415280665280665, "grad_norm": 1.5392487049102783, "learning_rate": 3.182790865687565e-05, "loss": 0.1574, "num_input_tokens_seen": 3261976, "step": 36230 }, { "epoch": 9.416580041580042, "grad_norm": 1.890860676765442, "learning_rate": 3.182245443509915e-05, "loss": 0.1959, "num_input_tokens_seen": 3262408, "step": 36235 }, { "epoch": 9.417879417879417, "grad_norm": 0.5933676958084106, "learning_rate": 3.181699986243728e-05, "loss": 0.1665, "num_input_tokens_seen": 3262888, "step": 36240 }, { "epoch": 9.419178794178794, "grad_norm": 0.8205718398094177, "learning_rate": 3.1811544939170575e-05, "loss": 0.1618, "num_input_tokens_seen": 3263336, "step": 36245 }, { "epoch": 9.420478170478171, "grad_norm": 1.7281776666641235, "learning_rate": 3.180608966557959e-05, "loss": 0.352, "num_input_tokens_seen": 3263784, "step": 36250 }, { "epoch": 9.421777546777546, "grad_norm": 2.3506314754486084, "learning_rate": 3.180063404194489e-05, "loss": 0.3175, "num_input_tokens_seen": 3264248, "step": 36255 }, { "epoch": 9.423076923076923, "grad_norm": 0.5885881185531616, "learning_rate": 3.179517806854705e-05, "loss": 0.2018, "num_input_tokens_seen": 3264744, "step": 36260 }, { "epoch": 9.424376299376299, "grad_norm": 1.965605616569519, "learning_rate": 3.1789721745666714e-05, "loss": 0.1351, "num_input_tokens_seen": 3265208, "step": 36265 }, { "epoch": 9.425675675675675, "grad_norm": 3.291226387023926, "learning_rate": 3.178426507358448e-05, "loss": 0.3245, "num_input_tokens_seen": 3265672, "step": 36270 }, { "epoch": 9.426975051975052, "grad_norm": 2.1715264320373535, "learning_rate": 3.177880805258098e-05, "loss": 0.1446, "num_input_tokens_seen": 3266136, "step": 36275 }, { "epoch": 9.428274428274428, "grad_norm": 4.125585556030273, "learning_rate": 3.1773350682936895e-05, "loss": 0.339, "num_input_tokens_seen": 3266632, "step": 36280 }, { "epoch": 9.429573804573804, "grad_norm": 0.6346539855003357, "learning_rate": 3.17678929649329e-05, "loss": 0.3722, "num_input_tokens_seen": 3267096, "step": 36285 }, { "epoch": 9.430873180873181, "grad_norm": 0.4687715470790863, "learning_rate": 3.176243489884967e-05, "loss": 0.294, "num_input_tokens_seen": 3267512, "step": 36290 }, { "epoch": 9.432172557172557, "grad_norm": 0.9411725401878357, "learning_rate": 3.1756976484967944e-05, "loss": 0.2046, "num_input_tokens_seen": 3267976, "step": 36295 }, { "epoch": 9.433471933471933, "grad_norm": 3.329568862915039, "learning_rate": 3.1751517723568445e-05, "loss": 0.3806, "num_input_tokens_seen": 3268392, "step": 36300 }, { "epoch": 9.43477130977131, "grad_norm": 2.2872536182403564, "learning_rate": 3.1746058614931916e-05, "loss": 0.1291, "num_input_tokens_seen": 3268840, "step": 36305 }, { "epoch": 9.436070686070686, "grad_norm": 3.2025272846221924, "learning_rate": 3.1740599159339125e-05, "loss": 0.3829, "num_input_tokens_seen": 3269272, "step": 36310 }, { "epoch": 9.437370062370062, "grad_norm": 1.2353603839874268, "learning_rate": 3.1735139357070866e-05, "loss": 0.2365, "num_input_tokens_seen": 3269704, "step": 36315 }, { "epoch": 9.43866943866944, "grad_norm": 3.614652633666992, "learning_rate": 3.1729679208407935e-05, "loss": 0.2414, "num_input_tokens_seen": 3270136, "step": 36320 }, { "epoch": 9.439968814968815, "grad_norm": 3.0586142539978027, "learning_rate": 3.172421871363116e-05, "loss": 0.2439, "num_input_tokens_seen": 3270568, "step": 36325 }, { "epoch": 9.441268191268192, "grad_norm": 0.9132594466209412, "learning_rate": 3.171875787302136e-05, "loss": 0.242, "num_input_tokens_seen": 3271000, "step": 36330 }, { "epoch": 9.442567567567568, "grad_norm": 0.5893222689628601, "learning_rate": 3.1713296686859426e-05, "loss": 0.1468, "num_input_tokens_seen": 3271448, "step": 36335 }, { "epoch": 9.443866943866944, "grad_norm": 3.4186437129974365, "learning_rate": 3.17078351554262e-05, "loss": 0.1715, "num_input_tokens_seen": 3271864, "step": 36340 }, { "epoch": 9.44516632016632, "grad_norm": 0.2198123335838318, "learning_rate": 3.170237327900258e-05, "loss": 0.2007, "num_input_tokens_seen": 3272312, "step": 36345 }, { "epoch": 9.446465696465696, "grad_norm": 0.6941258311271667, "learning_rate": 3.169691105786948e-05, "loss": 0.2447, "num_input_tokens_seen": 3272776, "step": 36350 }, { "epoch": 9.447765072765073, "grad_norm": 0.456843763589859, "learning_rate": 3.169144849230783e-05, "loss": 0.3352, "num_input_tokens_seen": 3273192, "step": 36355 }, { "epoch": 9.44906444906445, "grad_norm": 3.7258124351501465, "learning_rate": 3.168598558259858e-05, "loss": 0.4896, "num_input_tokens_seen": 3273624, "step": 36360 }, { "epoch": 9.450363825363825, "grad_norm": 0.6799912452697754, "learning_rate": 3.168052232902268e-05, "loss": 0.2854, "num_input_tokens_seen": 3274088, "step": 36365 }, { "epoch": 9.451663201663202, "grad_norm": 1.6533610820770264, "learning_rate": 3.1675058731861115e-05, "loss": 0.2029, "num_input_tokens_seen": 3274536, "step": 36370 }, { "epoch": 9.452962577962579, "grad_norm": 0.5114681720733643, "learning_rate": 3.1669594791394886e-05, "loss": 0.0658, "num_input_tokens_seen": 3274968, "step": 36375 }, { "epoch": 9.454261954261954, "grad_norm": 3.050457239151001, "learning_rate": 3.1664130507905e-05, "loss": 0.4697, "num_input_tokens_seen": 3275448, "step": 36380 }, { "epoch": 9.45556133056133, "grad_norm": 1.5654127597808838, "learning_rate": 3.16586658816725e-05, "loss": 0.3298, "num_input_tokens_seen": 3275896, "step": 36385 }, { "epoch": 9.456860706860708, "grad_norm": 0.5896149277687073, "learning_rate": 3.165320091297843e-05, "loss": 0.0566, "num_input_tokens_seen": 3276360, "step": 36390 }, { "epoch": 9.458160083160083, "grad_norm": 1.377445936203003, "learning_rate": 3.164773560210387e-05, "loss": 0.3846, "num_input_tokens_seen": 3276840, "step": 36395 }, { "epoch": 9.45945945945946, "grad_norm": 0.5370206236839294, "learning_rate": 3.16422699493299e-05, "loss": 0.2992, "num_input_tokens_seen": 3277288, "step": 36400 }, { "epoch": 9.460758835758837, "grad_norm": 0.5766162276268005, "learning_rate": 3.1636803954937616e-05, "loss": 0.2587, "num_input_tokens_seen": 3277768, "step": 36405 }, { "epoch": 9.462058212058212, "grad_norm": 0.5199432969093323, "learning_rate": 3.163133761920815e-05, "loss": 0.3271, "num_input_tokens_seen": 3278216, "step": 36410 }, { "epoch": 9.463357588357589, "grad_norm": 0.6998236775398254, "learning_rate": 3.162587094242263e-05, "loss": 0.1593, "num_input_tokens_seen": 3278696, "step": 36415 }, { "epoch": 9.464656964656964, "grad_norm": 2.5709211826324463, "learning_rate": 3.162040392486222e-05, "loss": 0.2401, "num_input_tokens_seen": 3279224, "step": 36420 }, { "epoch": 9.46595634095634, "grad_norm": 0.4508076012134552, "learning_rate": 3.1614936566808095e-05, "loss": 0.3008, "num_input_tokens_seen": 3279688, "step": 36425 }, { "epoch": 9.467255717255718, "grad_norm": 1.4507029056549072, "learning_rate": 3.160946886854145e-05, "loss": 0.1755, "num_input_tokens_seen": 3280152, "step": 36430 }, { "epoch": 9.468555093555093, "grad_norm": 3.5118308067321777, "learning_rate": 3.1604000830343475e-05, "loss": 0.2685, "num_input_tokens_seen": 3280600, "step": 36435 }, { "epoch": 9.46985446985447, "grad_norm": 0.5656672716140747, "learning_rate": 3.159853245249542e-05, "loss": 0.2191, "num_input_tokens_seen": 3281032, "step": 36440 }, { "epoch": 9.471153846153847, "grad_norm": 2.1938986778259277, "learning_rate": 3.1593063735278517e-05, "loss": 0.2861, "num_input_tokens_seen": 3281496, "step": 36445 }, { "epoch": 9.472453222453222, "grad_norm": 1.059667706489563, "learning_rate": 3.1587594678974034e-05, "loss": 0.4201, "num_input_tokens_seen": 3281960, "step": 36450 }, { "epoch": 9.473752598752599, "grad_norm": 0.8836149573326111, "learning_rate": 3.158212528386323e-05, "loss": 0.107, "num_input_tokens_seen": 3282440, "step": 36455 }, { "epoch": 9.475051975051976, "grad_norm": 2.0782108306884766, "learning_rate": 3.157665555022742e-05, "loss": 0.3268, "num_input_tokens_seen": 3282888, "step": 36460 }, { "epoch": 9.47635135135135, "grad_norm": 0.537373423576355, "learning_rate": 3.157118547834793e-05, "loss": 0.1803, "num_input_tokens_seen": 3283352, "step": 36465 }, { "epoch": 9.477650727650728, "grad_norm": 0.5713454484939575, "learning_rate": 3.1565715068506056e-05, "loss": 0.1746, "num_input_tokens_seen": 3283832, "step": 36470 }, { "epoch": 9.478950103950105, "grad_norm": 1.5062938928604126, "learning_rate": 3.156024432098317e-05, "loss": 0.2609, "num_input_tokens_seen": 3284312, "step": 36475 }, { "epoch": 9.48024948024948, "grad_norm": 1.3840116262435913, "learning_rate": 3.155477323606064e-05, "loss": 0.3212, "num_input_tokens_seen": 3284792, "step": 36480 }, { "epoch": 9.481548856548857, "grad_norm": 0.991933286190033, "learning_rate": 3.154930181401984e-05, "loss": 0.2747, "num_input_tokens_seen": 3285224, "step": 36485 }, { "epoch": 9.482848232848234, "grad_norm": 0.8376442193984985, "learning_rate": 3.154383005514216e-05, "loss": 0.1858, "num_input_tokens_seen": 3285688, "step": 36490 }, { "epoch": 9.484147609147609, "grad_norm": 2.0359127521514893, "learning_rate": 3.153835795970904e-05, "loss": 0.2886, "num_input_tokens_seen": 3286184, "step": 36495 }, { "epoch": 9.485446985446986, "grad_norm": 1.5815895795822144, "learning_rate": 3.153288552800191e-05, "loss": 0.2239, "num_input_tokens_seen": 3286648, "step": 36500 }, { "epoch": 9.486746361746361, "grad_norm": 0.5479756593704224, "learning_rate": 3.152741276030221e-05, "loss": 0.1884, "num_input_tokens_seen": 3287192, "step": 36505 }, { "epoch": 9.488045738045738, "grad_norm": 0.7591684460639954, "learning_rate": 3.152193965689142e-05, "loss": 0.1701, "num_input_tokens_seen": 3287608, "step": 36510 }, { "epoch": 9.489345114345115, "grad_norm": 0.41228166222572327, "learning_rate": 3.151646621805102e-05, "loss": 0.2048, "num_input_tokens_seen": 3288088, "step": 36515 }, { "epoch": 9.49064449064449, "grad_norm": 0.4801678955554962, "learning_rate": 3.151099244406253e-05, "loss": 0.2002, "num_input_tokens_seen": 3288552, "step": 36520 }, { "epoch": 9.491943866943867, "grad_norm": 0.20783308148384094, "learning_rate": 3.150551833520745e-05, "loss": 0.3024, "num_input_tokens_seen": 3288984, "step": 36525 }, { "epoch": 9.493243243243244, "grad_norm": 3.286921977996826, "learning_rate": 3.1500043891767336e-05, "loss": 0.4403, "num_input_tokens_seen": 3289464, "step": 36530 }, { "epoch": 9.494542619542619, "grad_norm": 2.5351603031158447, "learning_rate": 3.149456911402373e-05, "loss": 0.2507, "num_input_tokens_seen": 3289928, "step": 36535 }, { "epoch": 9.495841995841996, "grad_norm": 0.6716941595077515, "learning_rate": 3.148909400225821e-05, "loss": 0.3218, "num_input_tokens_seen": 3290344, "step": 36540 }, { "epoch": 9.497141372141373, "grad_norm": 1.7597064971923828, "learning_rate": 3.148361855675237e-05, "loss": 0.2607, "num_input_tokens_seen": 3290776, "step": 36545 }, { "epoch": 9.498440748440748, "grad_norm": 0.9367545247077942, "learning_rate": 3.147814277778782e-05, "loss": 0.1686, "num_input_tokens_seen": 3291192, "step": 36550 }, { "epoch": 9.499740124740125, "grad_norm": 0.37886562943458557, "learning_rate": 3.1472666665646176e-05, "loss": 0.1637, "num_input_tokens_seen": 3291640, "step": 36555 }, { "epoch": 9.5010395010395, "grad_norm": 2.3438568115234375, "learning_rate": 3.146719022060908e-05, "loss": 0.3068, "num_input_tokens_seen": 3292072, "step": 36560 }, { "epoch": 9.502338877338877, "grad_norm": 0.5418007969856262, "learning_rate": 3.1461713442958204e-05, "loss": 0.236, "num_input_tokens_seen": 3292536, "step": 36565 }, { "epoch": 9.503638253638254, "grad_norm": 0.5440788269042969, "learning_rate": 3.145623633297521e-05, "loss": 0.1439, "num_input_tokens_seen": 3293000, "step": 36570 }, { "epoch": 9.50493762993763, "grad_norm": 0.4651923179626465, "learning_rate": 3.145075889094179e-05, "loss": 0.2995, "num_input_tokens_seen": 3293480, "step": 36575 }, { "epoch": 9.506237006237006, "grad_norm": 3.8818342685699463, "learning_rate": 3.1445281117139666e-05, "loss": 0.2648, "num_input_tokens_seen": 3293896, "step": 36580 }, { "epoch": 9.507536382536383, "grad_norm": 0.46855273842811584, "learning_rate": 3.143980301185055e-05, "loss": 0.1966, "num_input_tokens_seen": 3294328, "step": 36585 }, { "epoch": 9.508835758835758, "grad_norm": 1.3931448459625244, "learning_rate": 3.1434324575356214e-05, "loss": 0.209, "num_input_tokens_seen": 3294824, "step": 36590 }, { "epoch": 9.510135135135135, "grad_norm": 0.4079519808292389, "learning_rate": 3.142884580793838e-05, "loss": 0.2678, "num_input_tokens_seen": 3295272, "step": 36595 }, { "epoch": 9.511434511434512, "grad_norm": 2.188406467437744, "learning_rate": 3.142336670987886e-05, "loss": 0.1968, "num_input_tokens_seen": 3295768, "step": 36600 }, { "epoch": 9.512733887733887, "grad_norm": 0.6733528971672058, "learning_rate": 3.141788728145943e-05, "loss": 0.3298, "num_input_tokens_seen": 3296200, "step": 36605 }, { "epoch": 9.514033264033264, "grad_norm": 0.837581992149353, "learning_rate": 3.1412407522961906e-05, "loss": 0.1299, "num_input_tokens_seen": 3296648, "step": 36610 }, { "epoch": 9.515332640332641, "grad_norm": 3.1751582622528076, "learning_rate": 3.140692743466812e-05, "loss": 0.4051, "num_input_tokens_seen": 3297112, "step": 36615 }, { "epoch": 9.516632016632016, "grad_norm": 1.325109839439392, "learning_rate": 3.140144701685992e-05, "loss": 0.2751, "num_input_tokens_seen": 3297528, "step": 36620 }, { "epoch": 9.517931392931393, "grad_norm": 1.0879923105239868, "learning_rate": 3.139596626981916e-05, "loss": 0.2357, "num_input_tokens_seen": 3297944, "step": 36625 }, { "epoch": 9.51923076923077, "grad_norm": 1.165305733680725, "learning_rate": 3.139048519382773e-05, "loss": 0.2549, "num_input_tokens_seen": 3298344, "step": 36630 }, { "epoch": 9.520530145530145, "grad_norm": 2.338660955429077, "learning_rate": 3.138500378916752e-05, "loss": 0.3878, "num_input_tokens_seen": 3298792, "step": 36635 }, { "epoch": 9.521829521829522, "grad_norm": 2.0994293689727783, "learning_rate": 3.137952205612045e-05, "loss": 0.1672, "num_input_tokens_seen": 3299208, "step": 36640 }, { "epoch": 9.523128898128899, "grad_norm": 1.2288336753845215, "learning_rate": 3.137403999496845e-05, "loss": 0.2718, "num_input_tokens_seen": 3299624, "step": 36645 }, { "epoch": 9.524428274428274, "grad_norm": 1.2850847244262695, "learning_rate": 3.1368557605993465e-05, "loss": 0.2741, "num_input_tokens_seen": 3300056, "step": 36650 }, { "epoch": 9.525727650727651, "grad_norm": 0.7416209578514099, "learning_rate": 3.1363074889477463e-05, "loss": 0.1479, "num_input_tokens_seen": 3300536, "step": 36655 }, { "epoch": 9.527027027027026, "grad_norm": 2.5300683975219727, "learning_rate": 3.135759184570242e-05, "loss": 0.2031, "num_input_tokens_seen": 3300968, "step": 36660 }, { "epoch": 9.528326403326403, "grad_norm": 0.8446624875068665, "learning_rate": 3.1352108474950336e-05, "loss": 0.1656, "num_input_tokens_seen": 3301400, "step": 36665 }, { "epoch": 9.52962577962578, "grad_norm": 1.2329670190811157, "learning_rate": 3.134662477750323e-05, "loss": 0.1835, "num_input_tokens_seen": 3301880, "step": 36670 }, { "epoch": 9.530925155925155, "grad_norm": 1.8964406251907349, "learning_rate": 3.1341140753643126e-05, "loss": 0.1322, "num_input_tokens_seen": 3302344, "step": 36675 }, { "epoch": 9.532224532224532, "grad_norm": 1.682795524597168, "learning_rate": 3.133565640365208e-05, "loss": 0.3465, "num_input_tokens_seen": 3302776, "step": 36680 }, { "epoch": 9.53352390852391, "grad_norm": 1.8945790529251099, "learning_rate": 3.133017172781215e-05, "loss": 0.2331, "num_input_tokens_seen": 3303208, "step": 36685 }, { "epoch": 9.534823284823284, "grad_norm": 0.3446574807167053, "learning_rate": 3.132468672640543e-05, "loss": 0.2521, "num_input_tokens_seen": 3303672, "step": 36690 }, { "epoch": 9.536122661122661, "grad_norm": 0.5740050077438354, "learning_rate": 3.131920139971401e-05, "loss": 0.1309, "num_input_tokens_seen": 3304088, "step": 36695 }, { "epoch": 9.537422037422038, "grad_norm": 4.808609962463379, "learning_rate": 3.131371574802e-05, "loss": 0.3014, "num_input_tokens_seen": 3304568, "step": 36700 }, { "epoch": 9.538721413721413, "grad_norm": 3.9888172149658203, "learning_rate": 3.130822977160554e-05, "loss": 0.4003, "num_input_tokens_seen": 3305032, "step": 36705 }, { "epoch": 9.54002079002079, "grad_norm": 0.7766801118850708, "learning_rate": 3.130274347075279e-05, "loss": 0.2481, "num_input_tokens_seen": 3305464, "step": 36710 }, { "epoch": 9.541320166320165, "grad_norm": 1.9983958005905151, "learning_rate": 3.12972568457439e-05, "loss": 0.3236, "num_input_tokens_seen": 3305944, "step": 36715 }, { "epoch": 9.542619542619542, "grad_norm": 2.9575085639953613, "learning_rate": 3.1291769896861056e-05, "loss": 0.4657, "num_input_tokens_seen": 3306376, "step": 36720 }, { "epoch": 9.54391891891892, "grad_norm": 2.1063153743743896, "learning_rate": 3.128628262438645e-05, "loss": 0.3627, "num_input_tokens_seen": 3306872, "step": 36725 }, { "epoch": 9.545218295218294, "grad_norm": 0.7304079532623291, "learning_rate": 3.128079502860232e-05, "loss": 0.1804, "num_input_tokens_seen": 3307336, "step": 36730 }, { "epoch": 9.546517671517671, "grad_norm": 2.125293731689453, "learning_rate": 3.1275307109790873e-05, "loss": 0.2224, "num_input_tokens_seen": 3307816, "step": 36735 }, { "epoch": 9.547817047817048, "grad_norm": 2.529324531555176, "learning_rate": 3.126981886823437e-05, "loss": 0.2493, "num_input_tokens_seen": 3308328, "step": 36740 }, { "epoch": 9.549116424116423, "grad_norm": 2.540320634841919, "learning_rate": 3.1264330304215075e-05, "loss": 0.2886, "num_input_tokens_seen": 3308776, "step": 36745 }, { "epoch": 9.5504158004158, "grad_norm": 2.3725883960723877, "learning_rate": 3.125884141801527e-05, "loss": 0.3341, "num_input_tokens_seen": 3309208, "step": 36750 }, { "epoch": 9.551715176715177, "grad_norm": 1.7301374673843384, "learning_rate": 3.125335220991726e-05, "loss": 0.3148, "num_input_tokens_seen": 3309688, "step": 36755 }, { "epoch": 9.553014553014552, "grad_norm": 1.1059855222702026, "learning_rate": 3.124786268020334e-05, "loss": 0.1841, "num_input_tokens_seen": 3310120, "step": 36760 }, { "epoch": 9.55431392931393, "grad_norm": 1.9248090982437134, "learning_rate": 3.124237282915587e-05, "loss": 0.2758, "num_input_tokens_seen": 3310536, "step": 36765 }, { "epoch": 9.555613305613306, "grad_norm": 1.0098906755447388, "learning_rate": 3.123688265705718e-05, "loss": 0.3817, "num_input_tokens_seen": 3310952, "step": 36770 }, { "epoch": 9.556912681912682, "grad_norm": 1.5374594926834106, "learning_rate": 3.123139216418964e-05, "loss": 0.1712, "num_input_tokens_seen": 3311416, "step": 36775 }, { "epoch": 9.558212058212058, "grad_norm": 2.608944892883301, "learning_rate": 3.122590135083563e-05, "loss": 0.2378, "num_input_tokens_seen": 3311832, "step": 36780 }, { "epoch": 9.559511434511435, "grad_norm": 0.9509196281433105, "learning_rate": 3.122041021727755e-05, "loss": 0.1968, "num_input_tokens_seen": 3312248, "step": 36785 }, { "epoch": 9.56081081081081, "grad_norm": 2.9225919246673584, "learning_rate": 3.1214918763797805e-05, "loss": 0.332, "num_input_tokens_seen": 3312696, "step": 36790 }, { "epoch": 9.562110187110187, "grad_norm": 0.96437668800354, "learning_rate": 3.120942699067884e-05, "loss": 0.2511, "num_input_tokens_seen": 3313112, "step": 36795 }, { "epoch": 9.563409563409563, "grad_norm": 0.5125147104263306, "learning_rate": 3.1203934898203096e-05, "loss": 0.1666, "num_input_tokens_seen": 3313528, "step": 36800 }, { "epoch": 9.56470893970894, "grad_norm": 2.1578657627105713, "learning_rate": 3.119844248665303e-05, "loss": 0.2699, "num_input_tokens_seen": 3313976, "step": 36805 }, { "epoch": 9.566008316008316, "grad_norm": 2.7296996116638184, "learning_rate": 3.119294975631113e-05, "loss": 0.211, "num_input_tokens_seen": 3314408, "step": 36810 }, { "epoch": 9.567307692307692, "grad_norm": 2.654196262359619, "learning_rate": 3.118745670745989e-05, "loss": 0.2555, "num_input_tokens_seen": 3314872, "step": 36815 }, { "epoch": 9.568607068607069, "grad_norm": 2.379157543182373, "learning_rate": 3.118196334038182e-05, "loss": 0.3422, "num_input_tokens_seen": 3315336, "step": 36820 }, { "epoch": 9.569906444906445, "grad_norm": 1.1461646556854248, "learning_rate": 3.1176469655359465e-05, "loss": 0.2176, "num_input_tokens_seen": 3315768, "step": 36825 }, { "epoch": 9.57120582120582, "grad_norm": 1.0111044645309448, "learning_rate": 3.1170975652675344e-05, "loss": 0.2411, "num_input_tokens_seen": 3316168, "step": 36830 }, { "epoch": 9.572505197505198, "grad_norm": 1.3846867084503174, "learning_rate": 3.116548133261204e-05, "loss": 0.1279, "num_input_tokens_seen": 3316648, "step": 36835 }, { "epoch": 9.573804573804575, "grad_norm": 0.32673853635787964, "learning_rate": 3.115998669545212e-05, "loss": 0.2405, "num_input_tokens_seen": 3317064, "step": 36840 }, { "epoch": 9.57510395010395, "grad_norm": 0.6065447926521301, "learning_rate": 3.115449174147818e-05, "loss": 0.1445, "num_input_tokens_seen": 3317496, "step": 36845 }, { "epoch": 9.576403326403327, "grad_norm": 0.4567776918411255, "learning_rate": 3.1148996470972835e-05, "loss": 0.3215, "num_input_tokens_seen": 3317960, "step": 36850 }, { "epoch": 9.577702702702704, "grad_norm": 0.46290504932403564, "learning_rate": 3.1143500884218714e-05, "loss": 0.1755, "num_input_tokens_seen": 3318376, "step": 36855 }, { "epoch": 9.579002079002079, "grad_norm": 0.32831159234046936, "learning_rate": 3.113800498149846e-05, "loss": 0.1731, "num_input_tokens_seen": 3318808, "step": 36860 }, { "epoch": 9.580301455301456, "grad_norm": 0.6657861471176147, "learning_rate": 3.1132508763094715e-05, "loss": 0.2602, "num_input_tokens_seen": 3319256, "step": 36865 }, { "epoch": 9.58160083160083, "grad_norm": 1.9714837074279785, "learning_rate": 3.1127012229290174e-05, "loss": 0.2167, "num_input_tokens_seen": 3319752, "step": 36870 }, { "epoch": 9.582900207900208, "grad_norm": 2.796130895614624, "learning_rate": 3.112151538036753e-05, "loss": 0.1152, "num_input_tokens_seen": 3320200, "step": 36875 }, { "epoch": 9.584199584199585, "grad_norm": 2.5028562545776367, "learning_rate": 3.111601821660948e-05, "loss": 0.1682, "num_input_tokens_seen": 3320680, "step": 36880 }, { "epoch": 9.58549896049896, "grad_norm": 2.684161901473999, "learning_rate": 3.1110520738298746e-05, "loss": 0.3915, "num_input_tokens_seen": 3321096, "step": 36885 }, { "epoch": 9.586798336798337, "grad_norm": 1.9742012023925781, "learning_rate": 3.1105022945718074e-05, "loss": 0.4104, "num_input_tokens_seen": 3321528, "step": 36890 }, { "epoch": 9.588097713097714, "grad_norm": 0.3951505124568939, "learning_rate": 3.109952483915024e-05, "loss": 0.1223, "num_input_tokens_seen": 3322008, "step": 36895 }, { "epoch": 9.589397089397089, "grad_norm": 1.048121452331543, "learning_rate": 3.109402641887798e-05, "loss": 0.2884, "num_input_tokens_seen": 3322472, "step": 36900 }, { "epoch": 9.590696465696466, "grad_norm": 0.7413297295570374, "learning_rate": 3.1088527685184114e-05, "loss": 0.2376, "num_input_tokens_seen": 3322904, "step": 36905 }, { "epoch": 9.591995841995843, "grad_norm": 2.095691442489624, "learning_rate": 3.108302863835143e-05, "loss": 0.3491, "num_input_tokens_seen": 3323400, "step": 36910 }, { "epoch": 9.593295218295218, "grad_norm": 1.377470850944519, "learning_rate": 3.107752927866276e-05, "loss": 0.2787, "num_input_tokens_seen": 3323848, "step": 36915 }, { "epoch": 9.594594594594595, "grad_norm": 0.817559003829956, "learning_rate": 3.107202960640093e-05, "loss": 0.1214, "num_input_tokens_seen": 3324296, "step": 36920 }, { "epoch": 9.595893970893972, "grad_norm": 1.2145445346832275, "learning_rate": 3.106652962184881e-05, "loss": 0.2491, "num_input_tokens_seen": 3324760, "step": 36925 }, { "epoch": 9.597193347193347, "grad_norm": 0.9916574358940125, "learning_rate": 3.106102932528925e-05, "loss": 0.1827, "num_input_tokens_seen": 3325208, "step": 36930 }, { "epoch": 9.598492723492724, "grad_norm": 3.8959946632385254, "learning_rate": 3.105552871700515e-05, "loss": 0.262, "num_input_tokens_seen": 3325672, "step": 36935 }, { "epoch": 9.5997920997921, "grad_norm": 1.3395123481750488, "learning_rate": 3.1050027797279394e-05, "loss": 0.2347, "num_input_tokens_seen": 3326104, "step": 36940 }, { "epoch": 9.601091476091476, "grad_norm": 3.2446162700653076, "learning_rate": 3.104452656639492e-05, "loss": 0.217, "num_input_tokens_seen": 3326568, "step": 36945 }, { "epoch": 9.602390852390853, "grad_norm": 0.37665167450904846, "learning_rate": 3.103902502463465e-05, "loss": 0.1967, "num_input_tokens_seen": 3327032, "step": 36950 }, { "epoch": 9.603690228690228, "grad_norm": 0.313595712184906, "learning_rate": 3.1033523172281544e-05, "loss": 0.1684, "num_input_tokens_seen": 3327512, "step": 36955 }, { "epoch": 9.604989604989605, "grad_norm": 0.30609774589538574, "learning_rate": 3.102802100961856e-05, "loss": 0.369, "num_input_tokens_seen": 3327960, "step": 36960 }, { "epoch": 9.606288981288982, "grad_norm": 0.4826774299144745, "learning_rate": 3.102251853692867e-05, "loss": 0.3809, "num_input_tokens_seen": 3328392, "step": 36965 }, { "epoch": 9.607588357588357, "grad_norm": 1.3799536228179932, "learning_rate": 3.101701575449489e-05, "loss": 0.1779, "num_input_tokens_seen": 3328856, "step": 36970 }, { "epoch": 9.608887733887734, "grad_norm": 1.752015471458435, "learning_rate": 3.1011512662600216e-05, "loss": 0.5735, "num_input_tokens_seen": 3329272, "step": 36975 }, { "epoch": 9.61018711018711, "grad_norm": 0.7700933218002319, "learning_rate": 3.100600926152769e-05, "loss": 0.2105, "num_input_tokens_seen": 3329736, "step": 36980 }, { "epoch": 9.611486486486486, "grad_norm": 3.7919716835021973, "learning_rate": 3.100050555156035e-05, "loss": 0.3217, "num_input_tokens_seen": 3330184, "step": 36985 }, { "epoch": 9.612785862785863, "grad_norm": 1.0339548587799072, "learning_rate": 3.099500153298127e-05, "loss": 0.1906, "num_input_tokens_seen": 3330600, "step": 36990 }, { "epoch": 9.61408523908524, "grad_norm": 0.5955589413642883, "learning_rate": 3.098949720607351e-05, "loss": 0.1746, "num_input_tokens_seen": 3331032, "step": 36995 }, { "epoch": 9.615384615384615, "grad_norm": 3.1690335273742676, "learning_rate": 3.0983992571120176e-05, "loss": 0.3543, "num_input_tokens_seen": 3331480, "step": 37000 }, { "epoch": 9.616683991683992, "grad_norm": 2.467050313949585, "learning_rate": 3.0978487628404365e-05, "loss": 0.3817, "num_input_tokens_seen": 3331944, "step": 37005 }, { "epoch": 9.617983367983367, "grad_norm": 4.04911470413208, "learning_rate": 3.0972982378209204e-05, "loss": 0.2814, "num_input_tokens_seen": 3332392, "step": 37010 }, { "epoch": 9.619282744282744, "grad_norm": 1.7748584747314453, "learning_rate": 3.096747682081784e-05, "loss": 0.2314, "num_input_tokens_seen": 3332824, "step": 37015 }, { "epoch": 9.620582120582121, "grad_norm": 1.3510475158691406, "learning_rate": 3.096197095651342e-05, "loss": 0.2426, "num_input_tokens_seen": 3333256, "step": 37020 }, { "epoch": 9.621881496881496, "grad_norm": 2.7308876514434814, "learning_rate": 3.0956464785579124e-05, "loss": 0.3373, "num_input_tokens_seen": 3333720, "step": 37025 }, { "epoch": 9.623180873180873, "grad_norm": 1.4103847742080688, "learning_rate": 3.095095830829814e-05, "loss": 0.2972, "num_input_tokens_seen": 3334168, "step": 37030 }, { "epoch": 9.62448024948025, "grad_norm": 1.6199145317077637, "learning_rate": 3.0945451524953666e-05, "loss": 0.2793, "num_input_tokens_seen": 3334648, "step": 37035 }, { "epoch": 9.625779625779625, "grad_norm": 1.6115437746047974, "learning_rate": 3.093994443582893e-05, "loss": 0.204, "num_input_tokens_seen": 3335112, "step": 37040 }, { "epoch": 9.627079002079002, "grad_norm": 1.7836179733276367, "learning_rate": 3.093443704120715e-05, "loss": 0.189, "num_input_tokens_seen": 3335624, "step": 37045 }, { "epoch": 9.628378378378379, "grad_norm": 1.173880696296692, "learning_rate": 3.092892934137159e-05, "loss": 0.1574, "num_input_tokens_seen": 3336088, "step": 37050 }, { "epoch": 9.629677754677754, "grad_norm": 0.3866649866104126, "learning_rate": 3.092342133660553e-05, "loss": 0.3259, "num_input_tokens_seen": 3336568, "step": 37055 }, { "epoch": 9.630977130977131, "grad_norm": 0.588679850101471, "learning_rate": 3.091791302719221e-05, "loss": 0.1388, "num_input_tokens_seen": 3337016, "step": 37060 }, { "epoch": 9.632276507276508, "grad_norm": 1.8206391334533691, "learning_rate": 3.0912404413414965e-05, "loss": 0.2245, "num_input_tokens_seen": 3337496, "step": 37065 }, { "epoch": 9.633575883575883, "grad_norm": 1.5381883382797241, "learning_rate": 3.0906895495557094e-05, "loss": 0.3225, "num_input_tokens_seen": 3337944, "step": 37070 }, { "epoch": 9.63487525987526, "grad_norm": 1.751851201057434, "learning_rate": 3.090138627390193e-05, "loss": 0.2784, "num_input_tokens_seen": 3338360, "step": 37075 }, { "epoch": 9.636174636174637, "grad_norm": 0.5471895933151245, "learning_rate": 3.0895876748732806e-05, "loss": 0.2032, "num_input_tokens_seen": 3338808, "step": 37080 }, { "epoch": 9.637474012474012, "grad_norm": 0.3392212986946106, "learning_rate": 3.08903669203331e-05, "loss": 0.2054, "num_input_tokens_seen": 3339240, "step": 37085 }, { "epoch": 9.638773388773389, "grad_norm": 0.9320186972618103, "learning_rate": 3.088485678898618e-05, "loss": 0.4125, "num_input_tokens_seen": 3339704, "step": 37090 }, { "epoch": 9.640072765072766, "grad_norm": 0.5498251914978027, "learning_rate": 3.0879346354975435e-05, "loss": 0.1495, "num_input_tokens_seen": 3340152, "step": 37095 }, { "epoch": 9.641372141372141, "grad_norm": 0.7565819621086121, "learning_rate": 3.087383561858427e-05, "loss": 0.2473, "num_input_tokens_seen": 3340616, "step": 37100 }, { "epoch": 9.642671517671518, "grad_norm": 2.0518221855163574, "learning_rate": 3.0868324580096114e-05, "loss": 0.328, "num_input_tokens_seen": 3341064, "step": 37105 }, { "epoch": 9.643970893970893, "grad_norm": 0.7863402962684631, "learning_rate": 3.0862813239794405e-05, "loss": 0.1113, "num_input_tokens_seen": 3341512, "step": 37110 }, { "epoch": 9.64527027027027, "grad_norm": 0.21946266293525696, "learning_rate": 3.085730159796259e-05, "loss": 0.1217, "num_input_tokens_seen": 3341944, "step": 37115 }, { "epoch": 9.646569646569647, "grad_norm": 2.3846354484558105, "learning_rate": 3.0851789654884145e-05, "loss": 0.432, "num_input_tokens_seen": 3342376, "step": 37120 }, { "epoch": 9.647869022869022, "grad_norm": 1.5628100633621216, "learning_rate": 3.084627741084255e-05, "loss": 0.2573, "num_input_tokens_seen": 3342824, "step": 37125 }, { "epoch": 9.6491683991684, "grad_norm": 1.940301537513733, "learning_rate": 3.084076486612131e-05, "loss": 0.2022, "num_input_tokens_seen": 3343272, "step": 37130 }, { "epoch": 9.650467775467776, "grad_norm": 0.43167808651924133, "learning_rate": 3.0835252021003925e-05, "loss": 0.2279, "num_input_tokens_seen": 3343720, "step": 37135 }, { "epoch": 9.651767151767151, "grad_norm": 1.27106511592865, "learning_rate": 3.0829738875773945e-05, "loss": 0.2028, "num_input_tokens_seen": 3344152, "step": 37140 }, { "epoch": 9.653066528066528, "grad_norm": 0.48711591958999634, "learning_rate": 3.082422543071491e-05, "loss": 0.0736, "num_input_tokens_seen": 3344584, "step": 37145 }, { "epoch": 9.654365904365905, "grad_norm": 3.934354543685913, "learning_rate": 3.081871168611037e-05, "loss": 0.1644, "num_input_tokens_seen": 3345032, "step": 37150 }, { "epoch": 9.65566528066528, "grad_norm": 2.05816650390625, "learning_rate": 3.0813197642243925e-05, "loss": 0.341, "num_input_tokens_seen": 3345448, "step": 37155 }, { "epoch": 9.656964656964657, "grad_norm": 3.059297561645508, "learning_rate": 3.080768329939916e-05, "loss": 0.1664, "num_input_tokens_seen": 3345880, "step": 37160 }, { "epoch": 9.658264033264032, "grad_norm": 1.8609983921051025, "learning_rate": 3.0802168657859665e-05, "loss": 0.3199, "num_input_tokens_seen": 3346328, "step": 37165 }, { "epoch": 9.65956340956341, "grad_norm": 1.9952468872070312, "learning_rate": 3.079665371790908e-05, "loss": 0.2305, "num_input_tokens_seen": 3346776, "step": 37170 }, { "epoch": 9.660862785862786, "grad_norm": 0.2209264636039734, "learning_rate": 3.079113847983104e-05, "loss": 0.032, "num_input_tokens_seen": 3347224, "step": 37175 }, { "epoch": 9.662162162162161, "grad_norm": 2.3400371074676514, "learning_rate": 3.0785622943909195e-05, "loss": 0.5952, "num_input_tokens_seen": 3347704, "step": 37180 }, { "epoch": 9.663461538461538, "grad_norm": 0.5703392624855042, "learning_rate": 3.078010711042723e-05, "loss": 0.1978, "num_input_tokens_seen": 3348136, "step": 37185 }, { "epoch": 9.664760914760915, "grad_norm": 0.24958302080631256, "learning_rate": 3.0774590979668814e-05, "loss": 0.2466, "num_input_tokens_seen": 3348584, "step": 37190 }, { "epoch": 9.66606029106029, "grad_norm": 2.2380168437957764, "learning_rate": 3.076907455191765e-05, "loss": 0.2306, "num_input_tokens_seen": 3349016, "step": 37195 }, { "epoch": 9.667359667359667, "grad_norm": 0.6289910674095154, "learning_rate": 3.0763557827457454e-05, "loss": 0.3745, "num_input_tokens_seen": 3349464, "step": 37200 }, { "epoch": 9.668659043659044, "grad_norm": 0.3589366376399994, "learning_rate": 3.0758040806571954e-05, "loss": 0.0829, "num_input_tokens_seen": 3349880, "step": 37205 }, { "epoch": 9.66995841995842, "grad_norm": 1.9664475917816162, "learning_rate": 3.07525234895449e-05, "loss": 0.3773, "num_input_tokens_seen": 3350344, "step": 37210 }, { "epoch": 9.671257796257796, "grad_norm": 1.9073249101638794, "learning_rate": 3.074700587666005e-05, "loss": 0.3629, "num_input_tokens_seen": 3350824, "step": 37215 }, { "epoch": 9.672557172557173, "grad_norm": 1.7393178939819336, "learning_rate": 3.074148796820119e-05, "loss": 0.2716, "num_input_tokens_seen": 3351304, "step": 37220 }, { "epoch": 9.673856548856548, "grad_norm": 1.4945776462554932, "learning_rate": 3.073596976445209e-05, "loss": 0.2819, "num_input_tokens_seen": 3351784, "step": 37225 }, { "epoch": 9.675155925155925, "grad_norm": 0.47908392548561096, "learning_rate": 3.0730451265696576e-05, "loss": 0.2193, "num_input_tokens_seen": 3352248, "step": 37230 }, { "epoch": 9.676455301455302, "grad_norm": 2.299147129058838, "learning_rate": 3.072493247221846e-05, "loss": 0.3061, "num_input_tokens_seen": 3352744, "step": 37235 }, { "epoch": 9.677754677754677, "grad_norm": 0.5975326895713806, "learning_rate": 3.0719413384301584e-05, "loss": 0.211, "num_input_tokens_seen": 3353176, "step": 37240 }, { "epoch": 9.679054054054054, "grad_norm": 0.8701707124710083, "learning_rate": 3.071389400222979e-05, "loss": 0.3551, "num_input_tokens_seen": 3353608, "step": 37245 }, { "epoch": 9.68035343035343, "grad_norm": 0.9034836888313293, "learning_rate": 3.0708374326286965e-05, "loss": 0.2512, "num_input_tokens_seen": 3354120, "step": 37250 }, { "epoch": 9.681652806652806, "grad_norm": 1.5719462633132935, "learning_rate": 3.070285435675698e-05, "loss": 0.2125, "num_input_tokens_seen": 3354600, "step": 37255 }, { "epoch": 9.682952182952183, "grad_norm": 2.962473154067993, "learning_rate": 3.069733409392371e-05, "loss": 0.2794, "num_input_tokens_seen": 3355080, "step": 37260 }, { "epoch": 9.684251559251559, "grad_norm": 1.6411687135696411, "learning_rate": 3.0691813538071105e-05, "loss": 0.2915, "num_input_tokens_seen": 3355576, "step": 37265 }, { "epoch": 9.685550935550935, "grad_norm": 1.584480881690979, "learning_rate": 3.0686292689483074e-05, "loss": 0.2994, "num_input_tokens_seen": 3356024, "step": 37270 }, { "epoch": 9.686850311850312, "grad_norm": 1.7597622871398926, "learning_rate": 3.068077154844356e-05, "loss": 0.2852, "num_input_tokens_seen": 3356472, "step": 37275 }, { "epoch": 9.688149688149688, "grad_norm": 1.4413310289382935, "learning_rate": 3.0675250115236526e-05, "loss": 0.285, "num_input_tokens_seen": 3356904, "step": 37280 }, { "epoch": 9.689449064449065, "grad_norm": 1.1512908935546875, "learning_rate": 3.066972839014594e-05, "loss": 0.2637, "num_input_tokens_seen": 3357352, "step": 37285 }, { "epoch": 9.690748440748441, "grad_norm": 1.0251671075820923, "learning_rate": 3.066420637345579e-05, "loss": 0.1943, "num_input_tokens_seen": 3357816, "step": 37290 }, { "epoch": 9.692047817047817, "grad_norm": 0.7745269536972046, "learning_rate": 3.0658684065450075e-05, "loss": 0.2175, "num_input_tokens_seen": 3358248, "step": 37295 }, { "epoch": 9.693347193347194, "grad_norm": 2.9765896797180176, "learning_rate": 3.0653161466412824e-05, "loss": 0.2174, "num_input_tokens_seen": 3358728, "step": 37300 }, { "epoch": 9.69464656964657, "grad_norm": 0.6413636803627014, "learning_rate": 3.064763857662806e-05, "loss": 0.1501, "num_input_tokens_seen": 3359208, "step": 37305 }, { "epoch": 9.695945945945946, "grad_norm": 2.6932787895202637, "learning_rate": 3.0642115396379825e-05, "loss": 0.4031, "num_input_tokens_seen": 3359672, "step": 37310 }, { "epoch": 9.697245322245323, "grad_norm": 2.435242176055908, "learning_rate": 3.06365919259522e-05, "loss": 0.2064, "num_input_tokens_seen": 3360120, "step": 37315 }, { "epoch": 9.698544698544698, "grad_norm": 2.44801664352417, "learning_rate": 3.063106816562925e-05, "loss": 0.3701, "num_input_tokens_seen": 3360584, "step": 37320 }, { "epoch": 9.699844074844075, "grad_norm": 1.5214864015579224, "learning_rate": 3.062554411569506e-05, "loss": 0.2239, "num_input_tokens_seen": 3361032, "step": 37325 }, { "epoch": 9.701143451143452, "grad_norm": 1.565622091293335, "learning_rate": 3.062001977643375e-05, "loss": 0.1783, "num_input_tokens_seen": 3361512, "step": 37330 }, { "epoch": 9.702442827442827, "grad_norm": 0.7534353137016296, "learning_rate": 3.0614495148129436e-05, "loss": 0.1476, "num_input_tokens_seen": 3361928, "step": 37335 }, { "epoch": 9.703742203742204, "grad_norm": 1.9952524900436401, "learning_rate": 3.060897023106627e-05, "loss": 0.311, "num_input_tokens_seen": 3362360, "step": 37340 }, { "epoch": 9.70504158004158, "grad_norm": 2.20397686958313, "learning_rate": 3.0603445025528376e-05, "loss": 0.3419, "num_input_tokens_seen": 3362840, "step": 37345 }, { "epoch": 9.706340956340956, "grad_norm": 0.62422776222229, "learning_rate": 3.059791953179993e-05, "loss": 0.2342, "num_input_tokens_seen": 3363272, "step": 37350 }, { "epoch": 9.707640332640333, "grad_norm": 0.6091843843460083, "learning_rate": 3.059239375016513e-05, "loss": 0.219, "num_input_tokens_seen": 3363736, "step": 37355 }, { "epoch": 9.70893970893971, "grad_norm": 2.7417445182800293, "learning_rate": 3.058686768090816e-05, "loss": 0.2899, "num_input_tokens_seen": 3364216, "step": 37360 }, { "epoch": 9.710239085239085, "grad_norm": 1.094735026359558, "learning_rate": 3.058134132431324e-05, "loss": 0.3003, "num_input_tokens_seen": 3364680, "step": 37365 }, { "epoch": 9.711538461538462, "grad_norm": 0.6797084212303162, "learning_rate": 3.0575814680664575e-05, "loss": 0.3201, "num_input_tokens_seen": 3365112, "step": 37370 }, { "epoch": 9.712837837837839, "grad_norm": 0.6859063506126404, "learning_rate": 3.057028775024642e-05, "loss": 0.1254, "num_input_tokens_seen": 3365608, "step": 37375 }, { "epoch": 9.714137214137214, "grad_norm": 1.1844124794006348, "learning_rate": 3.056476053334304e-05, "loss": 0.2815, "num_input_tokens_seen": 3366104, "step": 37380 }, { "epoch": 9.71543659043659, "grad_norm": 1.6515380144119263, "learning_rate": 3.0559233030238684e-05, "loss": 0.2358, "num_input_tokens_seen": 3366536, "step": 37385 }, { "epoch": 9.716735966735968, "grad_norm": 2.5079100131988525, "learning_rate": 3.055370524121765e-05, "loss": 0.4322, "num_input_tokens_seen": 3366968, "step": 37390 }, { "epoch": 9.718035343035343, "grad_norm": 0.874860405921936, "learning_rate": 3.0548177166564235e-05, "loss": 0.2036, "num_input_tokens_seen": 3367416, "step": 37395 }, { "epoch": 9.71933471933472, "grad_norm": 0.5627648830413818, "learning_rate": 3.054264880656275e-05, "loss": 0.186, "num_input_tokens_seen": 3367896, "step": 37400 }, { "epoch": 9.720634095634095, "grad_norm": 0.49967315793037415, "learning_rate": 3.053712016149752e-05, "loss": 0.2842, "num_input_tokens_seen": 3368392, "step": 37405 }, { "epoch": 9.721933471933472, "grad_norm": 1.8091920614242554, "learning_rate": 3.05315912316529e-05, "loss": 0.2105, "num_input_tokens_seen": 3368824, "step": 37410 }, { "epoch": 9.723232848232849, "grad_norm": 1.0017449855804443, "learning_rate": 3.0526062017313254e-05, "loss": 0.221, "num_input_tokens_seen": 3369240, "step": 37415 }, { "epoch": 9.724532224532224, "grad_norm": 0.44273343682289124, "learning_rate": 3.0520532518762924e-05, "loss": 0.2212, "num_input_tokens_seen": 3369736, "step": 37420 }, { "epoch": 9.7258316008316, "grad_norm": 2.2864391803741455, "learning_rate": 3.051500273628633e-05, "loss": 0.4427, "num_input_tokens_seen": 3370152, "step": 37425 }, { "epoch": 9.727130977130978, "grad_norm": 1.5294320583343506, "learning_rate": 3.0509472670167853e-05, "loss": 0.3586, "num_input_tokens_seen": 3370600, "step": 37430 }, { "epoch": 9.728430353430353, "grad_norm": 0.4973093867301941, "learning_rate": 3.0503942320691925e-05, "loss": 0.1805, "num_input_tokens_seen": 3371032, "step": 37435 }, { "epoch": 9.72972972972973, "grad_norm": 0.8169763088226318, "learning_rate": 3.0498411688142962e-05, "loss": 0.289, "num_input_tokens_seen": 3371480, "step": 37440 }, { "epoch": 9.731029106029107, "grad_norm": 0.7549741268157959, "learning_rate": 3.0492880772805433e-05, "loss": 0.2564, "num_input_tokens_seen": 3371928, "step": 37445 }, { "epoch": 9.732328482328482, "grad_norm": 0.851925790309906, "learning_rate": 3.0487349574963775e-05, "loss": 0.1469, "num_input_tokens_seen": 3372376, "step": 37450 }, { "epoch": 9.733627858627859, "grad_norm": 3.3718783855438232, "learning_rate": 3.048181809490246e-05, "loss": 0.3782, "num_input_tokens_seen": 3372824, "step": 37455 }, { "epoch": 9.734927234927234, "grad_norm": 1.6358062028884888, "learning_rate": 3.0476286332906e-05, "loss": 0.2883, "num_input_tokens_seen": 3373272, "step": 37460 }, { "epoch": 9.736226611226611, "grad_norm": 0.637398362159729, "learning_rate": 3.0470754289258886e-05, "loss": 0.1808, "num_input_tokens_seen": 3373752, "step": 37465 }, { "epoch": 9.737525987525988, "grad_norm": 0.6035104990005493, "learning_rate": 3.0465221964245633e-05, "loss": 0.2591, "num_input_tokens_seen": 3374232, "step": 37470 }, { "epoch": 9.738825363825363, "grad_norm": 2.0629465579986572, "learning_rate": 3.0459689358150774e-05, "loss": 0.2322, "num_input_tokens_seen": 3374712, "step": 37475 }, { "epoch": 9.74012474012474, "grad_norm": 1.9885660409927368, "learning_rate": 3.0454156471258866e-05, "loss": 0.3549, "num_input_tokens_seen": 3375176, "step": 37480 }, { "epoch": 9.741424116424117, "grad_norm": 0.6837049722671509, "learning_rate": 3.0448623303854464e-05, "loss": 0.1565, "num_input_tokens_seen": 3375624, "step": 37485 }, { "epoch": 9.742723492723492, "grad_norm": 1.2439137697219849, "learning_rate": 3.0443089856222147e-05, "loss": 0.3211, "num_input_tokens_seen": 3376056, "step": 37490 }, { "epoch": 9.744022869022869, "grad_norm": 2.035555839538574, "learning_rate": 3.0437556128646494e-05, "loss": 0.3382, "num_input_tokens_seen": 3376520, "step": 37495 }, { "epoch": 9.745322245322246, "grad_norm": 2.0062317848205566, "learning_rate": 3.043202212141213e-05, "loss": 0.2204, "num_input_tokens_seen": 3376984, "step": 37500 }, { "epoch": 9.746621621621621, "grad_norm": 0.5603029131889343, "learning_rate": 3.0426487834803657e-05, "loss": 0.2192, "num_input_tokens_seen": 3377432, "step": 37505 }, { "epoch": 9.747920997920998, "grad_norm": 0.7829630374908447, "learning_rate": 3.0420953269105722e-05, "loss": 0.2156, "num_input_tokens_seen": 3377880, "step": 37510 }, { "epoch": 9.749220374220375, "grad_norm": 0.7086576819419861, "learning_rate": 3.0415418424602966e-05, "loss": 0.2015, "num_input_tokens_seen": 3378328, "step": 37515 }, { "epoch": 9.75051975051975, "grad_norm": 1.5693565607070923, "learning_rate": 3.0409883301580045e-05, "loss": 0.323, "num_input_tokens_seen": 3378824, "step": 37520 }, { "epoch": 9.751819126819127, "grad_norm": 2.212613582611084, "learning_rate": 3.0404347900321638e-05, "loss": 0.2411, "num_input_tokens_seen": 3379272, "step": 37525 }, { "epoch": 9.753118503118504, "grad_norm": 0.3618321716785431, "learning_rate": 3.039881222111245e-05, "loss": 0.1529, "num_input_tokens_seen": 3379720, "step": 37530 }, { "epoch": 9.754417879417879, "grad_norm": 0.3598364591598511, "learning_rate": 3.0393276264237176e-05, "loss": 0.2491, "num_input_tokens_seen": 3380152, "step": 37535 }, { "epoch": 9.755717255717256, "grad_norm": 0.7912431359291077, "learning_rate": 3.0387740029980538e-05, "loss": 0.3186, "num_input_tokens_seen": 3380584, "step": 37540 }, { "epoch": 9.757016632016633, "grad_norm": 0.4234021008014679, "learning_rate": 3.0382203518627262e-05, "loss": 0.1451, "num_input_tokens_seen": 3381032, "step": 37545 }, { "epoch": 9.758316008316008, "grad_norm": 0.42751750349998474, "learning_rate": 3.03766667304621e-05, "loss": 0.1672, "num_input_tokens_seen": 3381480, "step": 37550 }, { "epoch": 9.759615384615385, "grad_norm": 0.3680073916912079, "learning_rate": 3.0371129665769825e-05, "loss": 0.1291, "num_input_tokens_seen": 3381912, "step": 37555 }, { "epoch": 9.76091476091476, "grad_norm": 1.6053073406219482, "learning_rate": 3.03655923248352e-05, "loss": 0.1868, "num_input_tokens_seen": 3382328, "step": 37560 }, { "epoch": 9.762214137214137, "grad_norm": 0.2609817385673523, "learning_rate": 3.036005470794302e-05, "loss": 0.2718, "num_input_tokens_seen": 3382808, "step": 37565 }, { "epoch": 9.763513513513514, "grad_norm": 0.44520851969718933, "learning_rate": 3.0354516815378085e-05, "loss": 0.3694, "num_input_tokens_seen": 3383240, "step": 37570 }, { "epoch": 9.76481288981289, "grad_norm": 0.3883351981639862, "learning_rate": 3.0348978647425236e-05, "loss": 0.0994, "num_input_tokens_seen": 3383688, "step": 37575 }, { "epoch": 9.766112266112266, "grad_norm": 0.331860214471817, "learning_rate": 3.0343440204369278e-05, "loss": 0.3298, "num_input_tokens_seen": 3384136, "step": 37580 }, { "epoch": 9.767411642411643, "grad_norm": 0.6873432397842407, "learning_rate": 3.0337901486495073e-05, "loss": 0.2777, "num_input_tokens_seen": 3384568, "step": 37585 }, { "epoch": 9.768711018711018, "grad_norm": 2.106765031814575, "learning_rate": 3.0332362494087485e-05, "loss": 0.2714, "num_input_tokens_seen": 3385016, "step": 37590 }, { "epoch": 9.770010395010395, "grad_norm": 0.5057589411735535, "learning_rate": 3.0326823227431377e-05, "loss": 0.1923, "num_input_tokens_seen": 3385464, "step": 37595 }, { "epoch": 9.771309771309772, "grad_norm": 1.6395949125289917, "learning_rate": 3.0321283686811648e-05, "loss": 0.2087, "num_input_tokens_seen": 3385896, "step": 37600 }, { "epoch": 9.772609147609147, "grad_norm": 0.4955597221851349, "learning_rate": 3.0315743872513202e-05, "loss": 0.2804, "num_input_tokens_seen": 3386328, "step": 37605 }, { "epoch": 9.773908523908524, "grad_norm": 1.6867895126342773, "learning_rate": 3.0310203784820957e-05, "loss": 0.2725, "num_input_tokens_seen": 3386760, "step": 37610 }, { "epoch": 9.7752079002079, "grad_norm": 0.35676148533821106, "learning_rate": 3.0304663424019842e-05, "loss": 0.2569, "num_input_tokens_seen": 3387176, "step": 37615 }, { "epoch": 9.776507276507276, "grad_norm": 2.1120848655700684, "learning_rate": 3.0299122790394794e-05, "loss": 0.2479, "num_input_tokens_seen": 3387672, "step": 37620 }, { "epoch": 9.777806652806653, "grad_norm": 0.5144261717796326, "learning_rate": 3.0293581884230798e-05, "loss": 0.2351, "num_input_tokens_seen": 3388120, "step": 37625 }, { "epoch": 9.779106029106028, "grad_norm": 1.6134898662567139, "learning_rate": 3.028804070581281e-05, "loss": 0.2434, "num_input_tokens_seen": 3388584, "step": 37630 }, { "epoch": 9.780405405405405, "grad_norm": 1.6401212215423584, "learning_rate": 3.028249925542582e-05, "loss": 0.2992, "num_input_tokens_seen": 3389032, "step": 37635 }, { "epoch": 9.781704781704782, "grad_norm": 1.559023380279541, "learning_rate": 3.027695753335483e-05, "loss": 0.2942, "num_input_tokens_seen": 3389464, "step": 37640 }, { "epoch": 9.783004158004157, "grad_norm": 1.0046461820602417, "learning_rate": 3.027141553988487e-05, "loss": 0.2591, "num_input_tokens_seen": 3389896, "step": 37645 }, { "epoch": 9.784303534303534, "grad_norm": 0.931012749671936, "learning_rate": 3.0265873275300945e-05, "loss": 0.283, "num_input_tokens_seen": 3390344, "step": 37650 }, { "epoch": 9.785602910602911, "grad_norm": 1.8387694358825684, "learning_rate": 3.026033073988811e-05, "loss": 0.1955, "num_input_tokens_seen": 3390792, "step": 37655 }, { "epoch": 9.786902286902286, "grad_norm": 0.5766914486885071, "learning_rate": 3.0254787933931434e-05, "loss": 0.1737, "num_input_tokens_seen": 3391256, "step": 37660 }, { "epoch": 9.788201663201663, "grad_norm": 1.9513263702392578, "learning_rate": 3.0249244857715976e-05, "loss": 0.2396, "num_input_tokens_seen": 3391704, "step": 37665 }, { "epoch": 9.78950103950104, "grad_norm": 2.4664790630340576, "learning_rate": 3.024370151152682e-05, "loss": 0.3317, "num_input_tokens_seen": 3392184, "step": 37670 }, { "epoch": 9.790800415800415, "grad_norm": 1.3589316606521606, "learning_rate": 3.0238157895649078e-05, "loss": 0.2657, "num_input_tokens_seen": 3392648, "step": 37675 }, { "epoch": 9.792099792099792, "grad_norm": 2.1060192584991455, "learning_rate": 3.0232614010367854e-05, "loss": 0.336, "num_input_tokens_seen": 3393128, "step": 37680 }, { "epoch": 9.79339916839917, "grad_norm": 1.4632885456085205, "learning_rate": 3.0227069855968283e-05, "loss": 0.2461, "num_input_tokens_seen": 3393576, "step": 37685 }, { "epoch": 9.794698544698544, "grad_norm": 0.31245365738868713, "learning_rate": 3.0221525432735492e-05, "loss": 0.2706, "num_input_tokens_seen": 3394056, "step": 37690 }, { "epoch": 9.795997920997921, "grad_norm": 1.2282631397247314, "learning_rate": 3.0215980740954653e-05, "loss": 0.1992, "num_input_tokens_seen": 3394520, "step": 37695 }, { "epoch": 9.797297297297296, "grad_norm": 1.8733779191970825, "learning_rate": 3.0210435780910923e-05, "loss": 0.2755, "num_input_tokens_seen": 3394984, "step": 37700 }, { "epoch": 9.798596673596673, "grad_norm": 0.32976651191711426, "learning_rate": 3.0204890552889486e-05, "loss": 0.1582, "num_input_tokens_seen": 3395432, "step": 37705 }, { "epoch": 9.79989604989605, "grad_norm": 0.5403436422348022, "learning_rate": 3.0199345057175544e-05, "loss": 0.1954, "num_input_tokens_seen": 3395880, "step": 37710 }, { "epoch": 9.801195426195425, "grad_norm": 1.394225835800171, "learning_rate": 3.0193799294054304e-05, "loss": 0.3951, "num_input_tokens_seen": 3396344, "step": 37715 }, { "epoch": 9.802494802494802, "grad_norm": 2.353436231613159, "learning_rate": 3.0188253263810995e-05, "loss": 0.2062, "num_input_tokens_seen": 3396760, "step": 37720 }, { "epoch": 9.80379417879418, "grad_norm": 1.7748746871948242, "learning_rate": 3.0182706966730834e-05, "loss": 0.2684, "num_input_tokens_seen": 3397240, "step": 37725 }, { "epoch": 9.805093555093555, "grad_norm": 1.5199960470199585, "learning_rate": 3.0177160403099104e-05, "loss": 0.2467, "num_input_tokens_seen": 3397688, "step": 37730 }, { "epoch": 9.806392931392931, "grad_norm": 2.2523837089538574, "learning_rate": 3.0171613573201046e-05, "loss": 0.2642, "num_input_tokens_seen": 3398104, "step": 37735 }, { "epoch": 9.807692307692308, "grad_norm": 1.5716742277145386, "learning_rate": 3.0166066477321947e-05, "loss": 0.3097, "num_input_tokens_seen": 3398568, "step": 37740 }, { "epoch": 9.808991683991684, "grad_norm": 2.438389778137207, "learning_rate": 3.01605191157471e-05, "loss": 0.1936, "num_input_tokens_seen": 3399032, "step": 37745 }, { "epoch": 9.81029106029106, "grad_norm": 1.0025866031646729, "learning_rate": 3.0154971488761808e-05, "loss": 0.2449, "num_input_tokens_seen": 3399448, "step": 37750 }, { "epoch": 9.811590436590437, "grad_norm": 0.5405114889144897, "learning_rate": 3.01494235966514e-05, "loss": 0.1058, "num_input_tokens_seen": 3399880, "step": 37755 }, { "epoch": 9.812889812889813, "grad_norm": 1.4710487127304077, "learning_rate": 3.0143875439701192e-05, "loss": 0.2106, "num_input_tokens_seen": 3400392, "step": 37760 }, { "epoch": 9.81418918918919, "grad_norm": 3.1347904205322266, "learning_rate": 3.013832701819655e-05, "loss": 0.1654, "num_input_tokens_seen": 3400840, "step": 37765 }, { "epoch": 9.815488565488565, "grad_norm": 2.273223638534546, "learning_rate": 3.0132778332422824e-05, "loss": 0.202, "num_input_tokens_seen": 3401272, "step": 37770 }, { "epoch": 9.816787941787942, "grad_norm": 3.476323366165161, "learning_rate": 3.012722938266539e-05, "loss": 0.3976, "num_input_tokens_seen": 3401720, "step": 37775 }, { "epoch": 9.818087318087318, "grad_norm": 1.0566320419311523, "learning_rate": 3.0121680169209636e-05, "loss": 0.1205, "num_input_tokens_seen": 3402184, "step": 37780 }, { "epoch": 9.819386694386694, "grad_norm": 0.508598804473877, "learning_rate": 3.011613069234097e-05, "loss": 0.2166, "num_input_tokens_seen": 3402632, "step": 37785 }, { "epoch": 9.82068607068607, "grad_norm": 1.747554898262024, "learning_rate": 3.0110580952344792e-05, "loss": 0.3163, "num_input_tokens_seen": 3403080, "step": 37790 }, { "epoch": 9.821985446985448, "grad_norm": 5.157079696655273, "learning_rate": 3.0105030949506542e-05, "loss": 0.3527, "num_input_tokens_seen": 3403528, "step": 37795 }, { "epoch": 9.823284823284823, "grad_norm": 0.37094107270240784, "learning_rate": 3.009948068411166e-05, "loss": 0.1796, "num_input_tokens_seen": 3403976, "step": 37800 }, { "epoch": 9.8245841995842, "grad_norm": 0.5122570991516113, "learning_rate": 3.0093930156445595e-05, "loss": 0.2053, "num_input_tokens_seen": 3404424, "step": 37805 }, { "epoch": 9.825883575883577, "grad_norm": 1.7599486112594604, "learning_rate": 3.008837936679383e-05, "loss": 0.1597, "num_input_tokens_seen": 3404840, "step": 37810 }, { "epoch": 9.827182952182952, "grad_norm": 0.778318464756012, "learning_rate": 3.0082828315441825e-05, "loss": 0.1834, "num_input_tokens_seen": 3405256, "step": 37815 }, { "epoch": 9.828482328482329, "grad_norm": 0.8057138323783875, "learning_rate": 3.0077277002675097e-05, "loss": 0.2084, "num_input_tokens_seen": 3405704, "step": 37820 }, { "epoch": 9.829781704781706, "grad_norm": 3.127855062484741, "learning_rate": 3.007172542877915e-05, "loss": 0.2575, "num_input_tokens_seen": 3406120, "step": 37825 }, { "epoch": 9.83108108108108, "grad_norm": 1.6105889081954956, "learning_rate": 3.006617359403951e-05, "loss": 0.1805, "num_input_tokens_seen": 3406616, "step": 37830 }, { "epoch": 9.832380457380458, "grad_norm": 1.1222336292266846, "learning_rate": 3.0060621498741692e-05, "loss": 0.2793, "num_input_tokens_seen": 3407064, "step": 37835 }, { "epoch": 9.833679833679835, "grad_norm": 3.885331392288208, "learning_rate": 3.0055069143171284e-05, "loss": 0.3373, "num_input_tokens_seen": 3407512, "step": 37840 }, { "epoch": 9.83497920997921, "grad_norm": 0.9014942049980164, "learning_rate": 3.0049516527613812e-05, "loss": 0.4188, "num_input_tokens_seen": 3407960, "step": 37845 }, { "epoch": 9.836278586278587, "grad_norm": 2.6852293014526367, "learning_rate": 3.0043963652354863e-05, "loss": 0.3683, "num_input_tokens_seen": 3408424, "step": 37850 }, { "epoch": 9.837577962577962, "grad_norm": 3.731987953186035, "learning_rate": 3.0038410517680036e-05, "loss": 0.3014, "num_input_tokens_seen": 3408904, "step": 37855 }, { "epoch": 9.838877338877339, "grad_norm": 1.0951776504516602, "learning_rate": 3.003285712387493e-05, "loss": 0.2481, "num_input_tokens_seen": 3409352, "step": 37860 }, { "epoch": 9.840176715176716, "grad_norm": 1.0781261920928955, "learning_rate": 3.002730347122516e-05, "loss": 0.2084, "num_input_tokens_seen": 3409816, "step": 37865 }, { "epoch": 9.84147609147609, "grad_norm": 0.17289820313453674, "learning_rate": 3.002174956001635e-05, "loss": 0.235, "num_input_tokens_seen": 3410296, "step": 37870 }, { "epoch": 9.842775467775468, "grad_norm": 0.32090115547180176, "learning_rate": 3.001619539053415e-05, "loss": 0.1737, "num_input_tokens_seen": 3410712, "step": 37875 }, { "epoch": 9.844074844074845, "grad_norm": 3.115612745285034, "learning_rate": 3.001064096306422e-05, "loss": 0.2913, "num_input_tokens_seen": 3411176, "step": 37880 }, { "epoch": 9.84537422037422, "grad_norm": 3.372080087661743, "learning_rate": 3.0005086277892218e-05, "loss": 0.2334, "num_input_tokens_seen": 3411624, "step": 37885 }, { "epoch": 9.846673596673597, "grad_norm": 0.3643353283405304, "learning_rate": 2.9999531335303838e-05, "loss": 0.184, "num_input_tokens_seen": 3412088, "step": 37890 }, { "epoch": 9.847972972972974, "grad_norm": 2.8321993350982666, "learning_rate": 2.9993976135584766e-05, "loss": 0.1877, "num_input_tokens_seen": 3412568, "step": 37895 }, { "epoch": 9.849272349272349, "grad_norm": 2.844273805618286, "learning_rate": 2.9988420679020724e-05, "loss": 0.363, "num_input_tokens_seen": 3413016, "step": 37900 }, { "epoch": 9.850571725571726, "grad_norm": 3.6176347732543945, "learning_rate": 2.998286496589742e-05, "loss": 0.5248, "num_input_tokens_seen": 3413480, "step": 37905 }, { "epoch": 9.851871101871101, "grad_norm": 1.4774798154830933, "learning_rate": 2.99773089965006e-05, "loss": 0.3151, "num_input_tokens_seen": 3413928, "step": 37910 }, { "epoch": 9.853170478170478, "grad_norm": 2.232234477996826, "learning_rate": 2.9971752771116012e-05, "loss": 0.3341, "num_input_tokens_seen": 3414408, "step": 37915 }, { "epoch": 9.854469854469855, "grad_norm": 0.7456824779510498, "learning_rate": 2.9966196290029408e-05, "loss": 0.144, "num_input_tokens_seen": 3414824, "step": 37920 }, { "epoch": 9.85576923076923, "grad_norm": 1.2909634113311768, "learning_rate": 2.9960639553526577e-05, "loss": 0.3059, "num_input_tokens_seen": 3415240, "step": 37925 }, { "epoch": 9.857068607068607, "grad_norm": 3.735511302947998, "learning_rate": 2.99550825618933e-05, "loss": 0.1483, "num_input_tokens_seen": 3415720, "step": 37930 }, { "epoch": 9.858367983367984, "grad_norm": 1.2317018508911133, "learning_rate": 2.9949525315415378e-05, "loss": 0.2197, "num_input_tokens_seen": 3416136, "step": 37935 }, { "epoch": 9.859667359667359, "grad_norm": 0.4726385772228241, "learning_rate": 2.994396781437862e-05, "loss": 0.1062, "num_input_tokens_seen": 3416552, "step": 37940 }, { "epoch": 9.860966735966736, "grad_norm": 3.3866024017333984, "learning_rate": 2.9938410059068868e-05, "loss": 0.3636, "num_input_tokens_seen": 3416984, "step": 37945 }, { "epoch": 9.862266112266113, "grad_norm": 0.4449041485786438, "learning_rate": 2.9932852049771952e-05, "loss": 0.2412, "num_input_tokens_seen": 3417448, "step": 37950 }, { "epoch": 9.863565488565488, "grad_norm": 0.7684001326560974, "learning_rate": 2.992729378677373e-05, "loss": 0.1393, "num_input_tokens_seen": 3417864, "step": 37955 }, { "epoch": 9.864864864864865, "grad_norm": 1.3836700916290283, "learning_rate": 2.9921735270360063e-05, "loss": 0.3535, "num_input_tokens_seen": 3418280, "step": 37960 }, { "epoch": 9.866164241164242, "grad_norm": 0.3531123995780945, "learning_rate": 2.991617650081684e-05, "loss": 0.0752, "num_input_tokens_seen": 3418744, "step": 37965 }, { "epoch": 9.867463617463617, "grad_norm": 2.214674949645996, "learning_rate": 2.9910617478429953e-05, "loss": 0.2995, "num_input_tokens_seen": 3419208, "step": 37970 }, { "epoch": 9.868762993762994, "grad_norm": 0.8821311593055725, "learning_rate": 2.9905058203485298e-05, "loss": 0.235, "num_input_tokens_seen": 3419624, "step": 37975 }, { "epoch": 9.87006237006237, "grad_norm": 2.786118268966675, "learning_rate": 2.98994986762688e-05, "loss": 0.3209, "num_input_tokens_seen": 3420104, "step": 37980 }, { "epoch": 9.871361746361746, "grad_norm": 2.318838357925415, "learning_rate": 2.9893938897066393e-05, "loss": 0.4078, "num_input_tokens_seen": 3420536, "step": 37985 }, { "epoch": 9.872661122661123, "grad_norm": 2.04416561126709, "learning_rate": 2.988837886616402e-05, "loss": 0.2825, "num_input_tokens_seen": 3421016, "step": 37990 }, { "epoch": 9.8739604989605, "grad_norm": 1.9865163564682007, "learning_rate": 2.9882818583847634e-05, "loss": 0.2549, "num_input_tokens_seen": 3421448, "step": 37995 }, { "epoch": 9.875259875259875, "grad_norm": 0.30869758129119873, "learning_rate": 2.9877258050403212e-05, "loss": 0.181, "num_input_tokens_seen": 3421880, "step": 38000 }, { "epoch": 9.876559251559252, "grad_norm": 1.0680164098739624, "learning_rate": 2.987169726611673e-05, "loss": 0.1557, "num_input_tokens_seen": 3422344, "step": 38005 }, { "epoch": 9.877858627858627, "grad_norm": 0.8139044642448425, "learning_rate": 2.9866136231274195e-05, "loss": 0.3406, "num_input_tokens_seen": 3422792, "step": 38010 }, { "epoch": 9.879158004158004, "grad_norm": 3.298973321914673, "learning_rate": 2.9860574946161612e-05, "loss": 0.2787, "num_input_tokens_seen": 3423272, "step": 38015 }, { "epoch": 9.880457380457381, "grad_norm": 4.222345352172852, "learning_rate": 2.9855013411065e-05, "loss": 0.411, "num_input_tokens_seen": 3423720, "step": 38020 }, { "epoch": 9.881756756756756, "grad_norm": 2.0669543743133545, "learning_rate": 2.9849451626270397e-05, "loss": 0.2035, "num_input_tokens_seen": 3424200, "step": 38025 }, { "epoch": 9.883056133056133, "grad_norm": 2.1592910289764404, "learning_rate": 2.984388959206385e-05, "loss": 0.2532, "num_input_tokens_seen": 3424616, "step": 38030 }, { "epoch": 9.88435550935551, "grad_norm": 0.9209421277046204, "learning_rate": 2.983832730873143e-05, "loss": 0.204, "num_input_tokens_seen": 3425080, "step": 38035 }, { "epoch": 9.885654885654885, "grad_norm": 1.5477406978607178, "learning_rate": 2.983276477655919e-05, "loss": 0.2396, "num_input_tokens_seen": 3425528, "step": 38040 }, { "epoch": 9.886954261954262, "grad_norm": 1.0618537664413452, "learning_rate": 2.9827201995833225e-05, "loss": 0.1927, "num_input_tokens_seen": 3426024, "step": 38045 }, { "epoch": 9.888253638253639, "grad_norm": 1.7952899932861328, "learning_rate": 2.982163896683964e-05, "loss": 0.2923, "num_input_tokens_seen": 3426504, "step": 38050 }, { "epoch": 9.889553014553014, "grad_norm": 0.8401381969451904, "learning_rate": 2.9816075689864543e-05, "loss": 0.2642, "num_input_tokens_seen": 3426952, "step": 38055 }, { "epoch": 9.890852390852391, "grad_norm": 1.1911531686782837, "learning_rate": 2.981051216519406e-05, "loss": 0.2213, "num_input_tokens_seen": 3427432, "step": 38060 }, { "epoch": 9.892151767151766, "grad_norm": 0.45628541707992554, "learning_rate": 2.9804948393114324e-05, "loss": 0.1189, "num_input_tokens_seen": 3427880, "step": 38065 }, { "epoch": 9.893451143451143, "grad_norm": 2.83058500289917, "learning_rate": 2.9799384373911488e-05, "loss": 0.2317, "num_input_tokens_seen": 3428360, "step": 38070 }, { "epoch": 9.89475051975052, "grad_norm": 0.7930495142936707, "learning_rate": 2.9793820107871717e-05, "loss": 0.2724, "num_input_tokens_seen": 3428872, "step": 38075 }, { "epoch": 9.896049896049895, "grad_norm": 2.4013679027557373, "learning_rate": 2.978825559528119e-05, "loss": 0.366, "num_input_tokens_seen": 3429336, "step": 38080 }, { "epoch": 9.897349272349272, "grad_norm": 2.971374034881592, "learning_rate": 2.978269083642608e-05, "loss": 0.2364, "num_input_tokens_seen": 3429784, "step": 38085 }, { "epoch": 9.89864864864865, "grad_norm": 2.4905776977539062, "learning_rate": 2.9777125831592602e-05, "loss": 0.1803, "num_input_tokens_seen": 3430248, "step": 38090 }, { "epoch": 9.899948024948024, "grad_norm": 1.740115761756897, "learning_rate": 2.9771560581066972e-05, "loss": 0.3355, "num_input_tokens_seen": 3430696, "step": 38095 }, { "epoch": 9.901247401247401, "grad_norm": 0.5633099675178528, "learning_rate": 2.9765995085135402e-05, "loss": 0.2204, "num_input_tokens_seen": 3431144, "step": 38100 }, { "epoch": 9.902546777546778, "grad_norm": 0.3035154938697815, "learning_rate": 2.976042934408414e-05, "loss": 0.0886, "num_input_tokens_seen": 3431608, "step": 38105 }, { "epoch": 9.903846153846153, "grad_norm": 0.37374594807624817, "learning_rate": 2.9754863358199436e-05, "loss": 0.0756, "num_input_tokens_seen": 3432056, "step": 38110 }, { "epoch": 9.90514553014553, "grad_norm": 0.2716316282749176, "learning_rate": 2.974929712776755e-05, "loss": 0.3342, "num_input_tokens_seen": 3432504, "step": 38115 }, { "epoch": 9.906444906444907, "grad_norm": 2.4545979499816895, "learning_rate": 2.9743730653074766e-05, "loss": 0.299, "num_input_tokens_seen": 3432968, "step": 38120 }, { "epoch": 9.907744282744282, "grad_norm": 1.388870120048523, "learning_rate": 2.9738163934407365e-05, "loss": 0.3878, "num_input_tokens_seen": 3433384, "step": 38125 }, { "epoch": 9.90904365904366, "grad_norm": 1.272159218788147, "learning_rate": 2.9732596972051656e-05, "loss": 0.1679, "num_input_tokens_seen": 3433864, "step": 38130 }, { "epoch": 9.910343035343036, "grad_norm": 1.0444130897521973, "learning_rate": 2.9727029766293944e-05, "loss": 0.135, "num_input_tokens_seen": 3434328, "step": 38135 }, { "epoch": 9.911642411642411, "grad_norm": 0.7155676484107971, "learning_rate": 2.9721462317420572e-05, "loss": 0.1331, "num_input_tokens_seen": 3434760, "step": 38140 }, { "epoch": 9.912941787941788, "grad_norm": 0.42616814374923706, "learning_rate": 2.9715894625717866e-05, "loss": 0.2672, "num_input_tokens_seen": 3435240, "step": 38145 }, { "epoch": 9.914241164241163, "grad_norm": 0.367143452167511, "learning_rate": 2.971032669147218e-05, "loss": 0.2735, "num_input_tokens_seen": 3435704, "step": 38150 }, { "epoch": 9.91554054054054, "grad_norm": 0.4191323518753052, "learning_rate": 2.9704758514969878e-05, "loss": 0.2285, "num_input_tokens_seen": 3436136, "step": 38155 }, { "epoch": 9.916839916839917, "grad_norm": 0.5101361870765686, "learning_rate": 2.9699190096497335e-05, "loss": 0.3618, "num_input_tokens_seen": 3436600, "step": 38160 }, { "epoch": 9.918139293139292, "grad_norm": 2.175581932067871, "learning_rate": 2.9693621436340956e-05, "loss": 0.312, "num_input_tokens_seen": 3437064, "step": 38165 }, { "epoch": 9.91943866943867, "grad_norm": 1.6801010370254517, "learning_rate": 2.9688052534787113e-05, "loss": 0.273, "num_input_tokens_seen": 3437528, "step": 38170 }, { "epoch": 9.920738045738046, "grad_norm": 1.5772556066513062, "learning_rate": 2.9682483392122247e-05, "loss": 0.202, "num_input_tokens_seen": 3437960, "step": 38175 }, { "epoch": 9.922037422037421, "grad_norm": 0.2321825474500656, "learning_rate": 2.9676914008632772e-05, "loss": 0.3273, "num_input_tokens_seen": 3438392, "step": 38180 }, { "epoch": 9.923336798336798, "grad_norm": 1.4750629663467407, "learning_rate": 2.9671344384605127e-05, "loss": 0.1455, "num_input_tokens_seen": 3438872, "step": 38185 }, { "epoch": 9.924636174636175, "grad_norm": 0.4446675181388855, "learning_rate": 2.9665774520325755e-05, "loss": 0.2081, "num_input_tokens_seen": 3439304, "step": 38190 }, { "epoch": 9.92593555093555, "grad_norm": 0.4193187654018402, "learning_rate": 2.966020441608114e-05, "loss": 0.2402, "num_input_tokens_seen": 3439768, "step": 38195 }, { "epoch": 9.927234927234927, "grad_norm": 1.187423825263977, "learning_rate": 2.9654634072157743e-05, "loss": 0.2338, "num_input_tokens_seen": 3440232, "step": 38200 }, { "epoch": 9.928534303534304, "grad_norm": 1.78738272190094, "learning_rate": 2.964906348884206e-05, "loss": 0.2246, "num_input_tokens_seen": 3440696, "step": 38205 }, { "epoch": 9.92983367983368, "grad_norm": 2.1789703369140625, "learning_rate": 2.964349266642058e-05, "loss": 0.2716, "num_input_tokens_seen": 3441128, "step": 38210 }, { "epoch": 9.931133056133056, "grad_norm": 0.6838464140892029, "learning_rate": 2.963792160517983e-05, "loss": 0.2615, "num_input_tokens_seen": 3441560, "step": 38215 }, { "epoch": 9.932432432432432, "grad_norm": 2.776524543762207, "learning_rate": 2.9632350305406326e-05, "loss": 0.3005, "num_input_tokens_seen": 3441992, "step": 38220 }, { "epoch": 9.933731808731808, "grad_norm": 0.801785409450531, "learning_rate": 2.9626778767386604e-05, "loss": 0.2116, "num_input_tokens_seen": 3442424, "step": 38225 }, { "epoch": 9.935031185031185, "grad_norm": 1.846661925315857, "learning_rate": 2.9621206991407223e-05, "loss": 0.2601, "num_input_tokens_seen": 3442888, "step": 38230 }, { "epoch": 9.93633056133056, "grad_norm": 1.0389634370803833, "learning_rate": 2.9615634977754737e-05, "loss": 0.2337, "num_input_tokens_seen": 3443336, "step": 38235 }, { "epoch": 9.937629937629938, "grad_norm": 0.946610152721405, "learning_rate": 2.961006272671572e-05, "loss": 0.2236, "num_input_tokens_seen": 3443800, "step": 38240 }, { "epoch": 9.938929313929314, "grad_norm": 1.9784497022628784, "learning_rate": 2.9604490238576754e-05, "loss": 0.2574, "num_input_tokens_seen": 3444248, "step": 38245 }, { "epoch": 9.94022869022869, "grad_norm": 2.169144630432129, "learning_rate": 2.9598917513624447e-05, "loss": 0.3165, "num_input_tokens_seen": 3444712, "step": 38250 }, { "epoch": 9.941528066528067, "grad_norm": 0.6718755960464478, "learning_rate": 2.9593344552145407e-05, "loss": 0.1821, "num_input_tokens_seen": 3445144, "step": 38255 }, { "epoch": 9.942827442827443, "grad_norm": 0.8806782364845276, "learning_rate": 2.958777135442625e-05, "loss": 0.2182, "num_input_tokens_seen": 3445592, "step": 38260 }, { "epoch": 9.944126819126819, "grad_norm": 0.7729665040969849, "learning_rate": 2.9582197920753628e-05, "loss": 0.2423, "num_input_tokens_seen": 3446024, "step": 38265 }, { "epoch": 9.945426195426196, "grad_norm": 1.94225013256073, "learning_rate": 2.957662425141417e-05, "loss": 0.2653, "num_input_tokens_seen": 3446472, "step": 38270 }, { "epoch": 9.946725571725572, "grad_norm": 0.5232767462730408, "learning_rate": 2.9571050346694545e-05, "loss": 0.1284, "num_input_tokens_seen": 3446936, "step": 38275 }, { "epoch": 9.948024948024948, "grad_norm": 2.0930352210998535, "learning_rate": 2.9565476206881415e-05, "loss": 0.3078, "num_input_tokens_seen": 3447352, "step": 38280 }, { "epoch": 9.949324324324325, "grad_norm": 2.982731819152832, "learning_rate": 2.9559901832261473e-05, "loss": 0.3958, "num_input_tokens_seen": 3447768, "step": 38285 }, { "epoch": 9.950623700623701, "grad_norm": 1.7165098190307617, "learning_rate": 2.9554327223121414e-05, "loss": 0.4009, "num_input_tokens_seen": 3448216, "step": 38290 }, { "epoch": 9.951923076923077, "grad_norm": 0.4116491675376892, "learning_rate": 2.9548752379747936e-05, "loss": 0.2293, "num_input_tokens_seen": 3448680, "step": 38295 }, { "epoch": 9.953222453222454, "grad_norm": 0.8306145071983337, "learning_rate": 2.954317730242777e-05, "loss": 0.2291, "num_input_tokens_seen": 3449144, "step": 38300 }, { "epoch": 9.954521829521829, "grad_norm": 1.8412314653396606, "learning_rate": 2.953760199144764e-05, "loss": 0.2674, "num_input_tokens_seen": 3449592, "step": 38305 }, { "epoch": 9.955821205821206, "grad_norm": 2.6063592433929443, "learning_rate": 2.9532026447094292e-05, "loss": 0.3856, "num_input_tokens_seen": 3450008, "step": 38310 }, { "epoch": 9.957120582120583, "grad_norm": 0.9736190438270569, "learning_rate": 2.9526450669654476e-05, "loss": 0.2377, "num_input_tokens_seen": 3450456, "step": 38315 }, { "epoch": 9.958419958419958, "grad_norm": 0.8068764805793762, "learning_rate": 2.9520874659414973e-05, "loss": 0.2384, "num_input_tokens_seen": 3450936, "step": 38320 }, { "epoch": 9.959719334719335, "grad_norm": 0.8957391381263733, "learning_rate": 2.951529841666255e-05, "loss": 0.2532, "num_input_tokens_seen": 3451400, "step": 38325 }, { "epoch": 9.961018711018712, "grad_norm": 1.4773316383361816, "learning_rate": 2.9509721941684004e-05, "loss": 0.2306, "num_input_tokens_seen": 3451816, "step": 38330 }, { "epoch": 9.962318087318087, "grad_norm": 0.6116657853126526, "learning_rate": 2.9504145234766133e-05, "loss": 0.1648, "num_input_tokens_seen": 3452232, "step": 38335 }, { "epoch": 9.963617463617464, "grad_norm": 1.8018121719360352, "learning_rate": 2.9498568296195768e-05, "loss": 0.2252, "num_input_tokens_seen": 3452696, "step": 38340 }, { "epoch": 9.96491683991684, "grad_norm": 1.3324757814407349, "learning_rate": 2.9492991126259716e-05, "loss": 0.1926, "num_input_tokens_seen": 3453160, "step": 38345 }, { "epoch": 9.966216216216216, "grad_norm": 1.1562834978103638, "learning_rate": 2.948741372524483e-05, "loss": 0.2011, "num_input_tokens_seen": 3453624, "step": 38350 }, { "epoch": 9.967515592515593, "grad_norm": 1.1901214122772217, "learning_rate": 2.9481836093437954e-05, "loss": 0.2649, "num_input_tokens_seen": 3454072, "step": 38355 }, { "epoch": 9.96881496881497, "grad_norm": 0.4921379089355469, "learning_rate": 2.947625823112596e-05, "loss": 0.2576, "num_input_tokens_seen": 3454520, "step": 38360 }, { "epoch": 9.970114345114345, "grad_norm": 3.301778554916382, "learning_rate": 2.947068013859571e-05, "loss": 0.2109, "num_input_tokens_seen": 3454968, "step": 38365 }, { "epoch": 9.971413721413722, "grad_norm": 3.413055658340454, "learning_rate": 2.9465101816134093e-05, "loss": 0.3539, "num_input_tokens_seen": 3455448, "step": 38370 }, { "epoch": 9.972713097713097, "grad_norm": 0.5222281217575073, "learning_rate": 2.9459523264028018e-05, "loss": 0.1477, "num_input_tokens_seen": 3455912, "step": 38375 }, { "epoch": 9.974012474012474, "grad_norm": 1.7934696674346924, "learning_rate": 2.9453944482564387e-05, "loss": 0.2734, "num_input_tokens_seen": 3456344, "step": 38380 }, { "epoch": 9.97531185031185, "grad_norm": 0.7347393035888672, "learning_rate": 2.9448365472030115e-05, "loss": 0.3045, "num_input_tokens_seen": 3456792, "step": 38385 }, { "epoch": 9.976611226611226, "grad_norm": 1.0314273834228516, "learning_rate": 2.9442786232712155e-05, "loss": 0.1828, "num_input_tokens_seen": 3457272, "step": 38390 }, { "epoch": 9.977910602910603, "grad_norm": 2.425014019012451, "learning_rate": 2.943720676489744e-05, "loss": 0.4493, "num_input_tokens_seen": 3457688, "step": 38395 }, { "epoch": 9.97920997920998, "grad_norm": 0.4095212519168854, "learning_rate": 2.9431627068872932e-05, "loss": 0.068, "num_input_tokens_seen": 3458136, "step": 38400 }, { "epoch": 9.980509355509355, "grad_norm": 2.2362704277038574, "learning_rate": 2.942604714492559e-05, "loss": 0.1763, "num_input_tokens_seen": 3458616, "step": 38405 }, { "epoch": 9.981808731808732, "grad_norm": 2.800769090652466, "learning_rate": 2.942046699334241e-05, "loss": 0.3073, "num_input_tokens_seen": 3459048, "step": 38410 }, { "epoch": 9.983108108108109, "grad_norm": 0.4888684153556824, "learning_rate": 2.9414886614410375e-05, "loss": 0.227, "num_input_tokens_seen": 3459496, "step": 38415 }, { "epoch": 9.984407484407484, "grad_norm": 2.205831289291382, "learning_rate": 2.940930600841649e-05, "loss": 0.3067, "num_input_tokens_seen": 3459944, "step": 38420 }, { "epoch": 9.98570686070686, "grad_norm": 1.2710001468658447, "learning_rate": 2.940372517564777e-05, "loss": 0.2731, "num_input_tokens_seen": 3460360, "step": 38425 }, { "epoch": 9.987006237006238, "grad_norm": 2.42337965965271, "learning_rate": 2.9398144116391257e-05, "loss": 0.2099, "num_input_tokens_seen": 3460824, "step": 38430 }, { "epoch": 9.988305613305613, "grad_norm": 0.6557043790817261, "learning_rate": 2.9392562830933968e-05, "loss": 0.2205, "num_input_tokens_seen": 3461240, "step": 38435 }, { "epoch": 9.98960498960499, "grad_norm": 0.3898971974849701, "learning_rate": 2.9386981319562962e-05, "loss": 0.123, "num_input_tokens_seen": 3461704, "step": 38440 }, { "epoch": 9.990904365904367, "grad_norm": 2.165147066116333, "learning_rate": 2.9381399582565304e-05, "loss": 0.2333, "num_input_tokens_seen": 3462168, "step": 38445 }, { "epoch": 9.992203742203742, "grad_norm": 3.1520636081695557, "learning_rate": 2.9375817620228075e-05, "loss": 0.3525, "num_input_tokens_seen": 3462616, "step": 38450 }, { "epoch": 9.993503118503119, "grad_norm": 2.7100989818573, "learning_rate": 2.9370235432838343e-05, "loss": 0.1294, "num_input_tokens_seen": 3463048, "step": 38455 }, { "epoch": 9.994802494802494, "grad_norm": 0.28621694445610046, "learning_rate": 2.9364653020683218e-05, "loss": 0.3018, "num_input_tokens_seen": 3463496, "step": 38460 }, { "epoch": 9.996101871101871, "grad_norm": 2.143275499343872, "learning_rate": 2.935907038404981e-05, "loss": 0.2559, "num_input_tokens_seen": 3463928, "step": 38465 }, { "epoch": 9.997401247401248, "grad_norm": 3.3835630416870117, "learning_rate": 2.9353487523225232e-05, "loss": 0.3323, "num_input_tokens_seen": 3464440, "step": 38470 }, { "epoch": 9.998700623700623, "grad_norm": 0.6840338706970215, "learning_rate": 2.9347904438496622e-05, "loss": 0.3676, "num_input_tokens_seen": 3464888, "step": 38475 }, { "epoch": 10.0, "grad_norm": 5.315817356109619, "learning_rate": 2.9342321130151118e-05, "loss": 0.2241, "num_input_tokens_seen": 3465288, "step": 38480 }, { "epoch": 10.0, "eval_loss": 0.24384894967079163, "eval_runtime": 13.1657, "eval_samples_per_second": 65.017, "eval_steps_per_second": 32.509, "num_input_tokens_seen": 3465288, "step": 38480 }, { "epoch": 10.001299376299377, "grad_norm": 0.9063490629196167, "learning_rate": 2.9336737598475877e-05, "loss": 0.2165, "num_input_tokens_seen": 3465720, "step": 38485 }, { "epoch": 10.002598752598752, "grad_norm": 2.6149473190307617, "learning_rate": 2.933115384375807e-05, "loss": 0.1407, "num_input_tokens_seen": 3466184, "step": 38490 }, { "epoch": 10.003898128898129, "grad_norm": 0.774721622467041, "learning_rate": 2.9325569866284876e-05, "loss": 0.21, "num_input_tokens_seen": 3466584, "step": 38495 }, { "epoch": 10.005197505197506, "grad_norm": 4.040956020355225, "learning_rate": 2.931998566634347e-05, "loss": 0.2344, "num_input_tokens_seen": 3467048, "step": 38500 }, { "epoch": 10.006496881496881, "grad_norm": 1.2821569442749023, "learning_rate": 2.9314401244221064e-05, "loss": 0.1138, "num_input_tokens_seen": 3467512, "step": 38505 }, { "epoch": 10.007796257796258, "grad_norm": 0.5263404250144958, "learning_rate": 2.9308816600204863e-05, "loss": 0.1573, "num_input_tokens_seen": 3467928, "step": 38510 }, { "epoch": 10.009095634095635, "grad_norm": 3.534897565841675, "learning_rate": 2.9303231734582102e-05, "loss": 0.2319, "num_input_tokens_seen": 3468376, "step": 38515 }, { "epoch": 10.01039501039501, "grad_norm": 0.5096873641014099, "learning_rate": 2.929764664764001e-05, "loss": 0.1063, "num_input_tokens_seen": 3468856, "step": 38520 }, { "epoch": 10.011694386694387, "grad_norm": 3.187206268310547, "learning_rate": 2.9292061339665833e-05, "loss": 0.2589, "num_input_tokens_seen": 3469304, "step": 38525 }, { "epoch": 10.012993762993762, "grad_norm": 4.751542091369629, "learning_rate": 2.9286475810946823e-05, "loss": 0.1841, "num_input_tokens_seen": 3469768, "step": 38530 }, { "epoch": 10.01429313929314, "grad_norm": 3.130659580230713, "learning_rate": 2.928089006177026e-05, "loss": 0.2239, "num_input_tokens_seen": 3470200, "step": 38535 }, { "epoch": 10.015592515592516, "grad_norm": 5.16060733795166, "learning_rate": 2.9275304092423422e-05, "loss": 0.757, "num_input_tokens_seen": 3470680, "step": 38540 }, { "epoch": 10.016891891891891, "grad_norm": 2.985231876373291, "learning_rate": 2.92697179031936e-05, "loss": 0.1647, "num_input_tokens_seen": 3471128, "step": 38545 }, { "epoch": 10.018191268191268, "grad_norm": 4.350244522094727, "learning_rate": 2.9264131494368084e-05, "loss": 0.3117, "num_input_tokens_seen": 3471608, "step": 38550 }, { "epoch": 10.019490644490645, "grad_norm": 3.556725263595581, "learning_rate": 2.9258544866234207e-05, "loss": 0.2158, "num_input_tokens_seen": 3472040, "step": 38555 }, { "epoch": 10.02079002079002, "grad_norm": 2.0522680282592773, "learning_rate": 2.9252958019079296e-05, "loss": 0.2544, "num_input_tokens_seen": 3472520, "step": 38560 }, { "epoch": 10.022089397089397, "grad_norm": 0.6076603531837463, "learning_rate": 2.924737095319066e-05, "loss": 0.1834, "num_input_tokens_seen": 3473000, "step": 38565 }, { "epoch": 10.023388773388774, "grad_norm": 1.9142463207244873, "learning_rate": 2.9241783668855683e-05, "loss": 0.1977, "num_input_tokens_seen": 3473464, "step": 38570 }, { "epoch": 10.02468814968815, "grad_norm": 2.778557538986206, "learning_rate": 2.9236196166361694e-05, "loss": 0.2948, "num_input_tokens_seen": 3473880, "step": 38575 }, { "epoch": 10.025987525987526, "grad_norm": 1.2926175594329834, "learning_rate": 2.9230608445996087e-05, "loss": 0.2866, "num_input_tokens_seen": 3474328, "step": 38580 }, { "epoch": 10.027286902286903, "grad_norm": 1.1545871496200562, "learning_rate": 2.9225020508046232e-05, "loss": 0.1842, "num_input_tokens_seen": 3474776, "step": 38585 }, { "epoch": 10.028586278586278, "grad_norm": 3.332233428955078, "learning_rate": 2.921943235279952e-05, "loss": 0.3096, "num_input_tokens_seen": 3475240, "step": 38590 }, { "epoch": 10.029885654885655, "grad_norm": 3.2913002967834473, "learning_rate": 2.9213843980543364e-05, "loss": 0.267, "num_input_tokens_seen": 3475688, "step": 38595 }, { "epoch": 10.03118503118503, "grad_norm": 2.860095977783203, "learning_rate": 2.920825539156517e-05, "loss": 0.1712, "num_input_tokens_seen": 3476168, "step": 38600 }, { "epoch": 10.032484407484407, "grad_norm": 1.53603994846344, "learning_rate": 2.920266658615237e-05, "loss": 0.6621, "num_input_tokens_seen": 3476648, "step": 38605 }, { "epoch": 10.033783783783784, "grad_norm": 2.2472853660583496, "learning_rate": 2.9197077564592402e-05, "loss": 0.1687, "num_input_tokens_seen": 3477112, "step": 38610 }, { "epoch": 10.03508316008316, "grad_norm": 1.713470220565796, "learning_rate": 2.9191488327172717e-05, "loss": 0.394, "num_input_tokens_seen": 3477544, "step": 38615 }, { "epoch": 10.036382536382536, "grad_norm": 1.0276697874069214, "learning_rate": 2.9185898874180762e-05, "loss": 0.2154, "num_input_tokens_seen": 3478040, "step": 38620 }, { "epoch": 10.037681912681913, "grad_norm": 0.9627013802528381, "learning_rate": 2.9180309205904027e-05, "loss": 0.0735, "num_input_tokens_seen": 3478488, "step": 38625 }, { "epoch": 10.038981288981288, "grad_norm": 0.6832330822944641, "learning_rate": 2.9174719322629983e-05, "loss": 0.0539, "num_input_tokens_seen": 3478936, "step": 38630 }, { "epoch": 10.040280665280665, "grad_norm": 1.5181881189346313, "learning_rate": 2.916912922464612e-05, "loss": 0.2715, "num_input_tokens_seen": 3479384, "step": 38635 }, { "epoch": 10.041580041580042, "grad_norm": 0.31572964787483215, "learning_rate": 2.9163538912239946e-05, "loss": 0.3067, "num_input_tokens_seen": 3479848, "step": 38640 }, { "epoch": 10.042879417879417, "grad_norm": 0.13996395468711853, "learning_rate": 2.915794838569898e-05, "loss": 0.0168, "num_input_tokens_seen": 3480280, "step": 38645 }, { "epoch": 10.044178794178794, "grad_norm": 4.146121978759766, "learning_rate": 2.9152357645310748e-05, "loss": 0.3123, "num_input_tokens_seen": 3480728, "step": 38650 }, { "epoch": 10.045478170478171, "grad_norm": 3.357076644897461, "learning_rate": 2.914676669136277e-05, "loss": 0.3997, "num_input_tokens_seen": 3481176, "step": 38655 }, { "epoch": 10.046777546777546, "grad_norm": 10.858115196228027, "learning_rate": 2.9141175524142615e-05, "loss": 0.4297, "num_input_tokens_seen": 3481640, "step": 38660 }, { "epoch": 10.048076923076923, "grad_norm": 1.68363356590271, "learning_rate": 2.913558414393784e-05, "loss": 0.5424, "num_input_tokens_seen": 3482088, "step": 38665 }, { "epoch": 10.049376299376299, "grad_norm": 2.3356268405914307, "learning_rate": 2.9129992551036005e-05, "loss": 0.5064, "num_input_tokens_seen": 3482536, "step": 38670 }, { "epoch": 10.050675675675675, "grad_norm": 2.4204394817352295, "learning_rate": 2.9124400745724695e-05, "loss": 0.1421, "num_input_tokens_seen": 3483000, "step": 38675 }, { "epoch": 10.051975051975052, "grad_norm": 0.4352438151836395, "learning_rate": 2.9118808728291503e-05, "loss": 0.0863, "num_input_tokens_seen": 3483432, "step": 38680 }, { "epoch": 10.053274428274428, "grad_norm": 2.0211620330810547, "learning_rate": 2.9113216499024038e-05, "loss": 0.3174, "num_input_tokens_seen": 3483896, "step": 38685 }, { "epoch": 10.054573804573804, "grad_norm": 1.7813389301300049, "learning_rate": 2.9107624058209905e-05, "loss": 0.2795, "num_input_tokens_seen": 3484312, "step": 38690 }, { "epoch": 10.055873180873181, "grad_norm": 1.5990585088729858, "learning_rate": 2.9102031406136725e-05, "loss": 0.2371, "num_input_tokens_seen": 3484760, "step": 38695 }, { "epoch": 10.057172557172557, "grad_norm": 2.8859426975250244, "learning_rate": 2.9096438543092147e-05, "loss": 0.2246, "num_input_tokens_seen": 3485224, "step": 38700 }, { "epoch": 10.058471933471933, "grad_norm": 2.1147897243499756, "learning_rate": 2.9090845469363805e-05, "loss": 0.2368, "num_input_tokens_seen": 3485688, "step": 38705 }, { "epoch": 10.05977130977131, "grad_norm": 3.6102564334869385, "learning_rate": 2.908525218523936e-05, "loss": 0.3788, "num_input_tokens_seen": 3486120, "step": 38710 }, { "epoch": 10.061070686070686, "grad_norm": 1.5028687715530396, "learning_rate": 2.9079658691006484e-05, "loss": 0.2267, "num_input_tokens_seen": 3486552, "step": 38715 }, { "epoch": 10.062370062370062, "grad_norm": 0.5362077355384827, "learning_rate": 2.9074064986952848e-05, "loss": 0.1398, "num_input_tokens_seen": 3486984, "step": 38720 }, { "epoch": 10.06366943866944, "grad_norm": 1.7574241161346436, "learning_rate": 2.9068471073366154e-05, "loss": 0.428, "num_input_tokens_seen": 3487432, "step": 38725 }, { "epoch": 10.064968814968815, "grad_norm": 1.0559096336364746, "learning_rate": 2.9062876950534085e-05, "loss": 0.2657, "num_input_tokens_seen": 3487864, "step": 38730 }, { "epoch": 10.066268191268192, "grad_norm": 0.9215508103370667, "learning_rate": 2.9057282618744362e-05, "loss": 0.1494, "num_input_tokens_seen": 3488280, "step": 38735 }, { "epoch": 10.067567567567568, "grad_norm": 1.2324479818344116, "learning_rate": 2.9051688078284715e-05, "loss": 0.2277, "num_input_tokens_seen": 3488776, "step": 38740 }, { "epoch": 10.068866943866944, "grad_norm": 1.5032987594604492, "learning_rate": 2.9046093329442857e-05, "loss": 0.2531, "num_input_tokens_seen": 3489256, "step": 38745 }, { "epoch": 10.07016632016632, "grad_norm": 0.97425776720047, "learning_rate": 2.9040498372506552e-05, "loss": 0.2793, "num_input_tokens_seen": 3489688, "step": 38750 }, { "epoch": 10.071465696465696, "grad_norm": 3.7200958728790283, "learning_rate": 2.903490320776355e-05, "loss": 0.2266, "num_input_tokens_seen": 3490120, "step": 38755 }, { "epoch": 10.072765072765073, "grad_norm": 0.8399020433425903, "learning_rate": 2.9029307835501596e-05, "loss": 0.4326, "num_input_tokens_seen": 3490600, "step": 38760 }, { "epoch": 10.07406444906445, "grad_norm": 0.6768960952758789, "learning_rate": 2.9023712256008486e-05, "loss": 0.1091, "num_input_tokens_seen": 3491016, "step": 38765 }, { "epoch": 10.075363825363825, "grad_norm": 1.0302202701568604, "learning_rate": 2.9018116469572e-05, "loss": 0.2965, "num_input_tokens_seen": 3491448, "step": 38770 }, { "epoch": 10.076663201663202, "grad_norm": 0.621532142162323, "learning_rate": 2.901252047647993e-05, "loss": 0.2266, "num_input_tokens_seen": 3491880, "step": 38775 }, { "epoch": 10.077962577962579, "grad_norm": 0.6498260498046875, "learning_rate": 2.9006924277020086e-05, "loss": 0.1183, "num_input_tokens_seen": 3492296, "step": 38780 }, { "epoch": 10.079261954261954, "grad_norm": 2.093235492706299, "learning_rate": 2.9001327871480294e-05, "loss": 0.5898, "num_input_tokens_seen": 3492728, "step": 38785 }, { "epoch": 10.08056133056133, "grad_norm": 0.8897647261619568, "learning_rate": 2.8995731260148374e-05, "loss": 0.2492, "num_input_tokens_seen": 3493192, "step": 38790 }, { "epoch": 10.081860706860708, "grad_norm": 0.4545290470123291, "learning_rate": 2.8990134443312167e-05, "loss": 0.1389, "num_input_tokens_seen": 3493624, "step": 38795 }, { "epoch": 10.083160083160083, "grad_norm": 1.3093953132629395, "learning_rate": 2.898453742125951e-05, "loss": 0.3654, "num_input_tokens_seen": 3494072, "step": 38800 }, { "epoch": 10.08445945945946, "grad_norm": 2.877601385116577, "learning_rate": 2.8978940194278293e-05, "loss": 0.3031, "num_input_tokens_seen": 3494504, "step": 38805 }, { "epoch": 10.085758835758837, "grad_norm": 0.471080482006073, "learning_rate": 2.8973342762656357e-05, "loss": 0.1416, "num_input_tokens_seen": 3494920, "step": 38810 }, { "epoch": 10.087058212058212, "grad_norm": 2.3620517253875732, "learning_rate": 2.8967745126681604e-05, "loss": 0.3022, "num_input_tokens_seen": 3495352, "step": 38815 }, { "epoch": 10.088357588357589, "grad_norm": 6.550198078155518, "learning_rate": 2.8962147286641916e-05, "loss": 0.1893, "num_input_tokens_seen": 3495816, "step": 38820 }, { "epoch": 10.089656964656964, "grad_norm": 2.66908860206604, "learning_rate": 2.8956549242825197e-05, "loss": 0.1924, "num_input_tokens_seen": 3496216, "step": 38825 }, { "epoch": 10.09095634095634, "grad_norm": 0.35851991176605225, "learning_rate": 2.895095099551936e-05, "loss": 0.2172, "num_input_tokens_seen": 3496648, "step": 38830 }, { "epoch": 10.092255717255718, "grad_norm": 4.389855861663818, "learning_rate": 2.8945352545012316e-05, "loss": 0.3009, "num_input_tokens_seen": 3497112, "step": 38835 }, { "epoch": 10.093555093555093, "grad_norm": 0.5258617401123047, "learning_rate": 2.8939753891592025e-05, "loss": 0.1539, "num_input_tokens_seen": 3497592, "step": 38840 }, { "epoch": 10.09485446985447, "grad_norm": 4.524167060852051, "learning_rate": 2.893415503554641e-05, "loss": 0.2389, "num_input_tokens_seen": 3498024, "step": 38845 }, { "epoch": 10.096153846153847, "grad_norm": 0.4162927269935608, "learning_rate": 2.8928555977163435e-05, "loss": 0.0362, "num_input_tokens_seen": 3498472, "step": 38850 }, { "epoch": 10.097453222453222, "grad_norm": 3.499680995941162, "learning_rate": 2.8922956716731054e-05, "loss": 0.2588, "num_input_tokens_seen": 3498920, "step": 38855 }, { "epoch": 10.098752598752599, "grad_norm": 4.5142340660095215, "learning_rate": 2.8917357254537257e-05, "loss": 0.674, "num_input_tokens_seen": 3499400, "step": 38860 }, { "epoch": 10.100051975051976, "grad_norm": 4.714877605438232, "learning_rate": 2.8911757590870027e-05, "loss": 0.7279, "num_input_tokens_seen": 3499880, "step": 38865 }, { "epoch": 10.10135135135135, "grad_norm": 1.2183270454406738, "learning_rate": 2.8906157726017347e-05, "loss": 0.256, "num_input_tokens_seen": 3500328, "step": 38870 }, { "epoch": 10.102650727650728, "grad_norm": 1.784703016281128, "learning_rate": 2.8900557660267236e-05, "loss": 0.419, "num_input_tokens_seen": 3500776, "step": 38875 }, { "epoch": 10.103950103950105, "grad_norm": 2.627307891845703, "learning_rate": 2.8894957393907708e-05, "loss": 0.2422, "num_input_tokens_seen": 3501240, "step": 38880 }, { "epoch": 10.10524948024948, "grad_norm": 1.2495906352996826, "learning_rate": 2.888935692722679e-05, "loss": 0.2793, "num_input_tokens_seen": 3501720, "step": 38885 }, { "epoch": 10.106548856548857, "grad_norm": 1.4635136127471924, "learning_rate": 2.8883756260512517e-05, "loss": 0.2503, "num_input_tokens_seen": 3502152, "step": 38890 }, { "epoch": 10.107848232848232, "grad_norm": 1.4516019821166992, "learning_rate": 2.8878155394052942e-05, "loss": 0.3922, "num_input_tokens_seen": 3502616, "step": 38895 }, { "epoch": 10.109147609147609, "grad_norm": 1.3624461889266968, "learning_rate": 2.887255432813612e-05, "loss": 0.1469, "num_input_tokens_seen": 3503048, "step": 38900 }, { "epoch": 10.110446985446986, "grad_norm": 0.9265660643577576, "learning_rate": 2.8866953063050105e-05, "loss": 0.2343, "num_input_tokens_seen": 3503496, "step": 38905 }, { "epoch": 10.111746361746361, "grad_norm": 1.1345033645629883, "learning_rate": 2.8861351599083002e-05, "loss": 0.2713, "num_input_tokens_seen": 3503912, "step": 38910 }, { "epoch": 10.113045738045738, "grad_norm": 0.9061231017112732, "learning_rate": 2.8855749936522886e-05, "loss": 0.2763, "num_input_tokens_seen": 3504344, "step": 38915 }, { "epoch": 10.114345114345115, "grad_norm": 1.0547007322311401, "learning_rate": 2.885014807565785e-05, "loss": 0.1937, "num_input_tokens_seen": 3504792, "step": 38920 }, { "epoch": 10.11564449064449, "grad_norm": 0.9935551285743713, "learning_rate": 2.8844546016776013e-05, "loss": 0.2236, "num_input_tokens_seen": 3505208, "step": 38925 }, { "epoch": 10.116943866943867, "grad_norm": 2.1863772869110107, "learning_rate": 2.8838943760165487e-05, "loss": 0.2786, "num_input_tokens_seen": 3505624, "step": 38930 }, { "epoch": 10.118243243243244, "grad_norm": 2.166177272796631, "learning_rate": 2.8833341306114413e-05, "loss": 0.2886, "num_input_tokens_seen": 3506072, "step": 38935 }, { "epoch": 10.119542619542619, "grad_norm": 0.4011947810649872, "learning_rate": 2.882773865491092e-05, "loss": 0.1938, "num_input_tokens_seen": 3506520, "step": 38940 }, { "epoch": 10.120841995841996, "grad_norm": 1.276680827140808, "learning_rate": 2.8822135806843154e-05, "loss": 0.1562, "num_input_tokens_seen": 3507016, "step": 38945 }, { "epoch": 10.122141372141373, "grad_norm": 2.082582950592041, "learning_rate": 2.8816532762199293e-05, "loss": 0.1806, "num_input_tokens_seen": 3507448, "step": 38950 }, { "epoch": 10.123440748440748, "grad_norm": 0.54071444272995, "learning_rate": 2.881092952126749e-05, "loss": 0.1629, "num_input_tokens_seen": 3507896, "step": 38955 }, { "epoch": 10.124740124740125, "grad_norm": 3.3794260025024414, "learning_rate": 2.880532608433592e-05, "loss": 0.3098, "num_input_tokens_seen": 3508360, "step": 38960 }, { "epoch": 10.126039501039502, "grad_norm": 0.7916403412818909, "learning_rate": 2.8799722451692796e-05, "loss": 0.1569, "num_input_tokens_seen": 3508760, "step": 38965 }, { "epoch": 10.127338877338877, "grad_norm": 0.7723174095153809, "learning_rate": 2.8794118623626305e-05, "loss": 0.2004, "num_input_tokens_seen": 3509192, "step": 38970 }, { "epoch": 10.128638253638254, "grad_norm": 0.636364758014679, "learning_rate": 2.8788514600424653e-05, "loss": 0.3336, "num_input_tokens_seen": 3509640, "step": 38975 }, { "epoch": 10.12993762993763, "grad_norm": 2.534764528274536, "learning_rate": 2.878291038237606e-05, "loss": 0.2322, "num_input_tokens_seen": 3510104, "step": 38980 }, { "epoch": 10.131237006237006, "grad_norm": 0.6148755550384521, "learning_rate": 2.877730596976877e-05, "loss": 0.3139, "num_input_tokens_seen": 3510536, "step": 38985 }, { "epoch": 10.132536382536383, "grad_norm": 4.226396560668945, "learning_rate": 2.8771701362891012e-05, "loss": 0.2753, "num_input_tokens_seen": 3511000, "step": 38990 }, { "epoch": 10.133835758835758, "grad_norm": 0.7070499062538147, "learning_rate": 2.8766096562031038e-05, "loss": 0.1751, "num_input_tokens_seen": 3511480, "step": 38995 }, { "epoch": 10.135135135135135, "grad_norm": 3.353116512298584, "learning_rate": 2.876049156747711e-05, "loss": 0.2057, "num_input_tokens_seen": 3511960, "step": 39000 }, { "epoch": 10.136434511434512, "grad_norm": 0.2685227394104004, "learning_rate": 2.87548863795175e-05, "loss": 0.2433, "num_input_tokens_seen": 3512408, "step": 39005 }, { "epoch": 10.137733887733887, "grad_norm": 2.6095693111419678, "learning_rate": 2.8749280998440488e-05, "loss": 0.3036, "num_input_tokens_seen": 3512872, "step": 39010 }, { "epoch": 10.139033264033264, "grad_norm": 0.42137470841407776, "learning_rate": 2.8743675424534355e-05, "loss": 0.3333, "num_input_tokens_seen": 3513304, "step": 39015 }, { "epoch": 10.140332640332641, "grad_norm": 0.6126119494438171, "learning_rate": 2.8738069658087412e-05, "loss": 0.2705, "num_input_tokens_seen": 3513784, "step": 39020 }, { "epoch": 10.141632016632016, "grad_norm": 0.7945812940597534, "learning_rate": 2.8732463699387968e-05, "loss": 0.2501, "num_input_tokens_seen": 3514248, "step": 39025 }, { "epoch": 10.142931392931393, "grad_norm": 0.923092782497406, "learning_rate": 2.8726857548724328e-05, "loss": 0.1976, "num_input_tokens_seen": 3514696, "step": 39030 }, { "epoch": 10.14423076923077, "grad_norm": 1.4360908269882202, "learning_rate": 2.872125120638484e-05, "loss": 0.2814, "num_input_tokens_seen": 3515128, "step": 39035 }, { "epoch": 10.145530145530145, "grad_norm": 2.618081569671631, "learning_rate": 2.8715644672657842e-05, "loss": 0.247, "num_input_tokens_seen": 3515576, "step": 39040 }, { "epoch": 10.146829521829522, "grad_norm": 0.7552192807197571, "learning_rate": 2.8710037947831676e-05, "loss": 0.1809, "num_input_tokens_seen": 3516056, "step": 39045 }, { "epoch": 10.148128898128897, "grad_norm": 0.533371090888977, "learning_rate": 2.8704431032194696e-05, "loss": 0.1508, "num_input_tokens_seen": 3516504, "step": 39050 }, { "epoch": 10.149428274428274, "grad_norm": 1.847139596939087, "learning_rate": 2.8698823926035283e-05, "loss": 0.1774, "num_input_tokens_seen": 3516952, "step": 39055 }, { "epoch": 10.150727650727651, "grad_norm": 0.34271472692489624, "learning_rate": 2.8693216629641818e-05, "loss": 0.1349, "num_input_tokens_seen": 3517384, "step": 39060 }, { "epoch": 10.152027027027026, "grad_norm": 1.8607274293899536, "learning_rate": 2.8687609143302678e-05, "loss": 0.4605, "num_input_tokens_seen": 3517816, "step": 39065 }, { "epoch": 10.153326403326403, "grad_norm": 0.7116725444793701, "learning_rate": 2.8682001467306265e-05, "loss": 0.3347, "num_input_tokens_seen": 3518248, "step": 39070 }, { "epoch": 10.15462577962578, "grad_norm": 4.902679920196533, "learning_rate": 2.867639360194099e-05, "loss": 0.367, "num_input_tokens_seen": 3518728, "step": 39075 }, { "epoch": 10.155925155925155, "grad_norm": 1.1714738607406616, "learning_rate": 2.8670785547495272e-05, "loss": 0.2229, "num_input_tokens_seen": 3519176, "step": 39080 }, { "epoch": 10.157224532224532, "grad_norm": 1.3203214406967163, "learning_rate": 2.866517730425753e-05, "loss": 0.3029, "num_input_tokens_seen": 3519640, "step": 39085 }, { "epoch": 10.15852390852391, "grad_norm": 0.8345782160758972, "learning_rate": 2.8659568872516213e-05, "loss": 0.2489, "num_input_tokens_seen": 3520088, "step": 39090 }, { "epoch": 10.159823284823284, "grad_norm": 3.6959121227264404, "learning_rate": 2.8653960252559757e-05, "loss": 0.3241, "num_input_tokens_seen": 3520552, "step": 39095 }, { "epoch": 10.161122661122661, "grad_norm": 1.4181479215621948, "learning_rate": 2.864835144467663e-05, "loss": 0.1969, "num_input_tokens_seen": 3521032, "step": 39100 }, { "epoch": 10.162422037422038, "grad_norm": 1.4465607404708862, "learning_rate": 2.8642742449155284e-05, "loss": 0.2345, "num_input_tokens_seen": 3521464, "step": 39105 }, { "epoch": 10.163721413721413, "grad_norm": 0.9746960997581482, "learning_rate": 2.8637133266284215e-05, "loss": 0.2167, "num_input_tokens_seen": 3521896, "step": 39110 }, { "epoch": 10.16502079002079, "grad_norm": 0.6851769089698792, "learning_rate": 2.8631523896351893e-05, "loss": 0.1625, "num_input_tokens_seen": 3522328, "step": 39115 }, { "epoch": 10.166320166320165, "grad_norm": 0.6103527545928955, "learning_rate": 2.862591433964681e-05, "loss": 0.2997, "num_input_tokens_seen": 3522760, "step": 39120 }, { "epoch": 10.167619542619542, "grad_norm": 1.1952053308486938, "learning_rate": 2.8620304596457486e-05, "loss": 0.2016, "num_input_tokens_seen": 3523160, "step": 39125 }, { "epoch": 10.16891891891892, "grad_norm": 0.9996565580368042, "learning_rate": 2.8614694667072428e-05, "loss": 0.2732, "num_input_tokens_seen": 3523592, "step": 39130 }, { "epoch": 10.170218295218294, "grad_norm": 0.2032269388437271, "learning_rate": 2.860908455178016e-05, "loss": 0.201, "num_input_tokens_seen": 3524088, "step": 39135 }, { "epoch": 10.171517671517671, "grad_norm": 0.6135315299034119, "learning_rate": 2.860347425086921e-05, "loss": 0.2047, "num_input_tokens_seen": 3524536, "step": 39140 }, { "epoch": 10.172817047817048, "grad_norm": 0.2809063196182251, "learning_rate": 2.8597863764628136e-05, "loss": 0.0527, "num_input_tokens_seen": 3525000, "step": 39145 }, { "epoch": 10.174116424116423, "grad_norm": 4.489542484283447, "learning_rate": 2.859225309334548e-05, "loss": 0.418, "num_input_tokens_seen": 3525512, "step": 39150 }, { "epoch": 10.1754158004158, "grad_norm": 1.0985506772994995, "learning_rate": 2.8586642237309792e-05, "loss": 0.095, "num_input_tokens_seen": 3525944, "step": 39155 }, { "epoch": 10.176715176715177, "grad_norm": 0.11173411458730698, "learning_rate": 2.8581031196809665e-05, "loss": 0.4244, "num_input_tokens_seen": 3526376, "step": 39160 }, { "epoch": 10.178014553014552, "grad_norm": 2.586209535598755, "learning_rate": 2.8575419972133675e-05, "loss": 0.239, "num_input_tokens_seen": 3526840, "step": 39165 }, { "epoch": 10.17931392931393, "grad_norm": 5.583185195922852, "learning_rate": 2.856980856357041e-05, "loss": 0.1474, "num_input_tokens_seen": 3527320, "step": 39170 }, { "epoch": 10.180613305613306, "grad_norm": 0.11690295487642288, "learning_rate": 2.8564196971408462e-05, "loss": 0.1772, "num_input_tokens_seen": 3527768, "step": 39175 }, { "epoch": 10.181912681912682, "grad_norm": 0.7728444337844849, "learning_rate": 2.855858519593646e-05, "loss": 0.3278, "num_input_tokens_seen": 3528184, "step": 39180 }, { "epoch": 10.183212058212058, "grad_norm": 3.8721165657043457, "learning_rate": 2.855297323744301e-05, "loss": 0.2968, "num_input_tokens_seen": 3528632, "step": 39185 }, { "epoch": 10.184511434511435, "grad_norm": 1.0897043943405151, "learning_rate": 2.854736109621674e-05, "loss": 0.2263, "num_input_tokens_seen": 3529096, "step": 39190 }, { "epoch": 10.18581081081081, "grad_norm": 4.537153720855713, "learning_rate": 2.8541748772546286e-05, "loss": 0.5098, "num_input_tokens_seen": 3529560, "step": 39195 }, { "epoch": 10.187110187110187, "grad_norm": 5.677163124084473, "learning_rate": 2.85361362667203e-05, "loss": 0.4346, "num_input_tokens_seen": 3530040, "step": 39200 }, { "epoch": 10.188409563409563, "grad_norm": 0.45342493057250977, "learning_rate": 2.853052357902744e-05, "loss": 0.1866, "num_input_tokens_seen": 3530472, "step": 39205 }, { "epoch": 10.18970893970894, "grad_norm": 2.775552272796631, "learning_rate": 2.852491070975637e-05, "loss": 0.2097, "num_input_tokens_seen": 3530904, "step": 39210 }, { "epoch": 10.191008316008316, "grad_norm": 1.9036592245101929, "learning_rate": 2.8519297659195766e-05, "loss": 0.2676, "num_input_tokens_seen": 3531352, "step": 39215 }, { "epoch": 10.192307692307692, "grad_norm": 2.4095728397369385, "learning_rate": 2.851368442763431e-05, "loss": 0.278, "num_input_tokens_seen": 3531784, "step": 39220 }, { "epoch": 10.193607068607069, "grad_norm": 1.5947976112365723, "learning_rate": 2.8508071015360698e-05, "loss": 0.2704, "num_input_tokens_seen": 3532248, "step": 39225 }, { "epoch": 10.194906444906445, "grad_norm": 2.949606418609619, "learning_rate": 2.850245742266363e-05, "loss": 0.2413, "num_input_tokens_seen": 3532680, "step": 39230 }, { "epoch": 10.19620582120582, "grad_norm": 2.441425323486328, "learning_rate": 2.849684364983182e-05, "loss": 0.1997, "num_input_tokens_seen": 3533096, "step": 39235 }, { "epoch": 10.197505197505198, "grad_norm": 1.9756163358688354, "learning_rate": 2.8491229697153993e-05, "loss": 0.1731, "num_input_tokens_seen": 3533560, "step": 39240 }, { "epoch": 10.198804573804575, "grad_norm": 1.806861400604248, "learning_rate": 2.8485615564918878e-05, "loss": 0.3423, "num_input_tokens_seen": 3533992, "step": 39245 }, { "epoch": 10.20010395010395, "grad_norm": 3.017981767654419, "learning_rate": 2.8480001253415213e-05, "loss": 0.2049, "num_input_tokens_seen": 3534440, "step": 39250 }, { "epoch": 10.201403326403327, "grad_norm": 3.1358234882354736, "learning_rate": 2.847438676293175e-05, "loss": 0.2779, "num_input_tokens_seen": 3534904, "step": 39255 }, { "epoch": 10.202702702702704, "grad_norm": 2.581812858581543, "learning_rate": 2.8468772093757244e-05, "loss": 0.3055, "num_input_tokens_seen": 3535384, "step": 39260 }, { "epoch": 10.204002079002079, "grad_norm": 4.358315944671631, "learning_rate": 2.8463157246180468e-05, "loss": 0.2449, "num_input_tokens_seen": 3535832, "step": 39265 }, { "epoch": 10.205301455301456, "grad_norm": 2.5772056579589844, "learning_rate": 2.8457542220490196e-05, "loss": 0.5463, "num_input_tokens_seen": 3536280, "step": 39270 }, { "epoch": 10.20660083160083, "grad_norm": 1.6692135334014893, "learning_rate": 2.8451927016975216e-05, "loss": 0.2014, "num_input_tokens_seen": 3536728, "step": 39275 }, { "epoch": 10.207900207900208, "grad_norm": 2.7021427154541016, "learning_rate": 2.8446311635924326e-05, "loss": 0.3352, "num_input_tokens_seen": 3537160, "step": 39280 }, { "epoch": 10.209199584199585, "grad_norm": 1.9930710792541504, "learning_rate": 2.8440696077626324e-05, "loss": 0.2238, "num_input_tokens_seen": 3537576, "step": 39285 }, { "epoch": 10.21049896049896, "grad_norm": 2.0167596340179443, "learning_rate": 2.8435080342370023e-05, "loss": 0.1897, "num_input_tokens_seen": 3538024, "step": 39290 }, { "epoch": 10.211798336798337, "grad_norm": 0.9129000306129456, "learning_rate": 2.8429464430444252e-05, "loss": 0.1935, "num_input_tokens_seen": 3538440, "step": 39295 }, { "epoch": 10.213097713097714, "grad_norm": 3.5454330444335938, "learning_rate": 2.8423848342137832e-05, "loss": 0.2668, "num_input_tokens_seen": 3538888, "step": 39300 }, { "epoch": 10.214397089397089, "grad_norm": 2.1065738201141357, "learning_rate": 2.841823207773962e-05, "loss": 0.1806, "num_input_tokens_seen": 3539320, "step": 39305 }, { "epoch": 10.215696465696466, "grad_norm": 1.312704086303711, "learning_rate": 2.8412615637538454e-05, "loss": 0.212, "num_input_tokens_seen": 3539736, "step": 39310 }, { "epoch": 10.216995841995843, "grad_norm": 2.0118002891540527, "learning_rate": 2.8406999021823196e-05, "loss": 0.2607, "num_input_tokens_seen": 3540216, "step": 39315 }, { "epoch": 10.218295218295218, "grad_norm": 0.9278863072395325, "learning_rate": 2.8401382230882712e-05, "loss": 0.1371, "num_input_tokens_seen": 3540712, "step": 39320 }, { "epoch": 10.219594594594595, "grad_norm": 2.941347360610962, "learning_rate": 2.839576526500588e-05, "loss": 0.3028, "num_input_tokens_seen": 3541208, "step": 39325 }, { "epoch": 10.220893970893972, "grad_norm": 3.2360849380493164, "learning_rate": 2.839014812448159e-05, "loss": 0.3479, "num_input_tokens_seen": 3541624, "step": 39330 }, { "epoch": 10.222193347193347, "grad_norm": 3.157294511795044, "learning_rate": 2.838453080959873e-05, "loss": 0.2144, "num_input_tokens_seen": 3542040, "step": 39335 }, { "epoch": 10.223492723492724, "grad_norm": 4.206202030181885, "learning_rate": 2.8378913320646212e-05, "loss": 0.1739, "num_input_tokens_seen": 3542472, "step": 39340 }, { "epoch": 10.2247920997921, "grad_norm": 2.7140607833862305, "learning_rate": 2.8373295657912945e-05, "loss": 0.395, "num_input_tokens_seen": 3542888, "step": 39345 }, { "epoch": 10.226091476091476, "grad_norm": 1.6415419578552246, "learning_rate": 2.8367677821687848e-05, "loss": 0.1257, "num_input_tokens_seen": 3543352, "step": 39350 }, { "epoch": 10.227390852390853, "grad_norm": 0.9924843907356262, "learning_rate": 2.836205981225985e-05, "loss": 0.2089, "num_input_tokens_seen": 3543832, "step": 39355 }, { "epoch": 10.228690228690228, "grad_norm": 0.546226978302002, "learning_rate": 2.8356441629917902e-05, "loss": 0.2467, "num_input_tokens_seen": 3544296, "step": 39360 }, { "epoch": 10.229989604989605, "grad_norm": 5.042295455932617, "learning_rate": 2.8350823274950943e-05, "loss": 0.4605, "num_input_tokens_seen": 3544760, "step": 39365 }, { "epoch": 10.231288981288982, "grad_norm": 0.4460085332393646, "learning_rate": 2.8345204747647924e-05, "loss": 0.3573, "num_input_tokens_seen": 3545208, "step": 39370 }, { "epoch": 10.232588357588357, "grad_norm": 2.4384918212890625, "learning_rate": 2.833958604829783e-05, "loss": 0.3286, "num_input_tokens_seen": 3545672, "step": 39375 }, { "epoch": 10.233887733887734, "grad_norm": 6.146620750427246, "learning_rate": 2.833396717718962e-05, "loss": 0.162, "num_input_tokens_seen": 3546088, "step": 39380 }, { "epoch": 10.23518711018711, "grad_norm": 0.5567207932472229, "learning_rate": 2.8328348134612288e-05, "loss": 0.095, "num_input_tokens_seen": 3546536, "step": 39385 }, { "epoch": 10.236486486486486, "grad_norm": 3.851564645767212, "learning_rate": 2.8322728920854812e-05, "loss": 0.3781, "num_input_tokens_seen": 3546936, "step": 39390 }, { "epoch": 10.237785862785863, "grad_norm": 4.5304694175720215, "learning_rate": 2.8317109536206216e-05, "loss": 0.2674, "num_input_tokens_seen": 3547368, "step": 39395 }, { "epoch": 10.23908523908524, "grad_norm": 1.5328681468963623, "learning_rate": 2.831148998095549e-05, "loss": 0.3478, "num_input_tokens_seen": 3547832, "step": 39400 }, { "epoch": 10.240384615384615, "grad_norm": 1.5136692523956299, "learning_rate": 2.830587025539167e-05, "loss": 0.2315, "num_input_tokens_seen": 3548296, "step": 39405 }, { "epoch": 10.241683991683992, "grad_norm": 3.468602418899536, "learning_rate": 2.830025035980377e-05, "loss": 0.2108, "num_input_tokens_seen": 3548760, "step": 39410 }, { "epoch": 10.242983367983369, "grad_norm": 1.752776861190796, "learning_rate": 2.8294630294480834e-05, "loss": 0.1448, "num_input_tokens_seen": 3549224, "step": 39415 }, { "epoch": 10.244282744282744, "grad_norm": 3.950087070465088, "learning_rate": 2.8289010059711906e-05, "loss": 0.2665, "num_input_tokens_seen": 3549704, "step": 39420 }, { "epoch": 10.245582120582121, "grad_norm": 2.96567440032959, "learning_rate": 2.828338965578603e-05, "loss": 0.4418, "num_input_tokens_seen": 3550184, "step": 39425 }, { "epoch": 10.246881496881496, "grad_norm": 3.7998733520507812, "learning_rate": 2.827776908299229e-05, "loss": 0.2807, "num_input_tokens_seen": 3550648, "step": 39430 }, { "epoch": 10.248180873180873, "grad_norm": 1.2735023498535156, "learning_rate": 2.8272148341619742e-05, "loss": 0.2721, "num_input_tokens_seen": 3551080, "step": 39435 }, { "epoch": 10.24948024948025, "grad_norm": 0.4210778772830963, "learning_rate": 2.8266527431957467e-05, "loss": 0.1847, "num_input_tokens_seen": 3551512, "step": 39440 }, { "epoch": 10.250779625779625, "grad_norm": 0.8935508728027344, "learning_rate": 2.826090635429455e-05, "loss": 0.2463, "num_input_tokens_seen": 3551960, "step": 39445 }, { "epoch": 10.252079002079002, "grad_norm": 0.559441089630127, "learning_rate": 2.8255285108920105e-05, "loss": 0.2303, "num_input_tokens_seen": 3552408, "step": 39450 }, { "epoch": 10.253378378378379, "grad_norm": 5.134081840515137, "learning_rate": 2.8249663696123223e-05, "loss": 0.1747, "num_input_tokens_seen": 3552904, "step": 39455 }, { "epoch": 10.254677754677754, "grad_norm": 0.26738494634628296, "learning_rate": 2.8244042116193033e-05, "loss": 0.3975, "num_input_tokens_seen": 3553368, "step": 39460 }, { "epoch": 10.255977130977131, "grad_norm": 4.878360271453857, "learning_rate": 2.8238420369418633e-05, "loss": 0.4026, "num_input_tokens_seen": 3553816, "step": 39465 }, { "epoch": 10.257276507276508, "grad_norm": 0.769090473651886, "learning_rate": 2.8232798456089183e-05, "loss": 0.2417, "num_input_tokens_seen": 3554232, "step": 39470 }, { "epoch": 10.258575883575883, "grad_norm": 3.1600940227508545, "learning_rate": 2.822717637649381e-05, "loss": 0.1253, "num_input_tokens_seen": 3554696, "step": 39475 }, { "epoch": 10.25987525987526, "grad_norm": 3.8914809226989746, "learning_rate": 2.8221554130921656e-05, "loss": 0.4742, "num_input_tokens_seen": 3555112, "step": 39480 }, { "epoch": 10.261174636174637, "grad_norm": 3.267427921295166, "learning_rate": 2.8215931719661893e-05, "loss": 0.2709, "num_input_tokens_seen": 3555608, "step": 39485 }, { "epoch": 10.262474012474012, "grad_norm": 1.476863980293274, "learning_rate": 2.8210309143003676e-05, "loss": 0.2814, "num_input_tokens_seen": 3556024, "step": 39490 }, { "epoch": 10.263773388773389, "grad_norm": 1.3954360485076904, "learning_rate": 2.820468640123618e-05, "loss": 0.1956, "num_input_tokens_seen": 3556488, "step": 39495 }, { "epoch": 10.265072765072764, "grad_norm": 0.9013580083847046, "learning_rate": 2.8199063494648598e-05, "loss": 0.2862, "num_input_tokens_seen": 3556952, "step": 39500 }, { "epoch": 10.266372141372141, "grad_norm": 3.6167612075805664, "learning_rate": 2.8193440423530114e-05, "loss": 0.3389, "num_input_tokens_seen": 3557384, "step": 39505 }, { "epoch": 10.267671517671518, "grad_norm": 3.985771894454956, "learning_rate": 2.8187817188169936e-05, "loss": 0.2847, "num_input_tokens_seen": 3557864, "step": 39510 }, { "epoch": 10.268970893970893, "grad_norm": 3.438688039779663, "learning_rate": 2.8182193788857254e-05, "loss": 0.1952, "num_input_tokens_seen": 3558280, "step": 39515 }, { "epoch": 10.27027027027027, "grad_norm": 2.242213249206543, "learning_rate": 2.81765702258813e-05, "loss": 0.1357, "num_input_tokens_seen": 3558696, "step": 39520 }, { "epoch": 10.271569646569647, "grad_norm": 3.236464738845825, "learning_rate": 2.8170946499531298e-05, "loss": 0.2726, "num_input_tokens_seen": 3559144, "step": 39525 }, { "epoch": 10.272869022869022, "grad_norm": 0.5194839835166931, "learning_rate": 2.8165322610096483e-05, "loss": 0.3162, "num_input_tokens_seen": 3559592, "step": 39530 }, { "epoch": 10.2741683991684, "grad_norm": 1.5661320686340332, "learning_rate": 2.8159698557866082e-05, "loss": 0.2209, "num_input_tokens_seen": 3560024, "step": 39535 }, { "epoch": 10.275467775467776, "grad_norm": 0.5296986103057861, "learning_rate": 2.8154074343129372e-05, "loss": 0.1535, "num_input_tokens_seen": 3560472, "step": 39540 }, { "epoch": 10.276767151767151, "grad_norm": 1.3189259767532349, "learning_rate": 2.814844996617559e-05, "loss": 0.0933, "num_input_tokens_seen": 3560936, "step": 39545 }, { "epoch": 10.278066528066528, "grad_norm": 2.6375861167907715, "learning_rate": 2.8142825427294e-05, "loss": 0.2588, "num_input_tokens_seen": 3561368, "step": 39550 }, { "epoch": 10.279365904365905, "grad_norm": 0.7181041836738586, "learning_rate": 2.81372007267739e-05, "loss": 0.3568, "num_input_tokens_seen": 3561864, "step": 39555 }, { "epoch": 10.28066528066528, "grad_norm": 1.2800356149673462, "learning_rate": 2.813157586490455e-05, "loss": 0.1463, "num_input_tokens_seen": 3562264, "step": 39560 }, { "epoch": 10.281964656964657, "grad_norm": 0.20039592683315277, "learning_rate": 2.8125950841975264e-05, "loss": 0.2231, "num_input_tokens_seen": 3562728, "step": 39565 }, { "epoch": 10.283264033264032, "grad_norm": 2.455148220062256, "learning_rate": 2.8120325658275322e-05, "loss": 0.1259, "num_input_tokens_seen": 3563208, "step": 39570 }, { "epoch": 10.28456340956341, "grad_norm": 3.0400476455688477, "learning_rate": 2.8114700314094044e-05, "loss": 0.2413, "num_input_tokens_seen": 3563640, "step": 39575 }, { "epoch": 10.285862785862786, "grad_norm": 2.337948799133301, "learning_rate": 2.8109074809720747e-05, "loss": 0.2477, "num_input_tokens_seen": 3564088, "step": 39580 }, { "epoch": 10.287162162162161, "grad_norm": 4.4150495529174805, "learning_rate": 2.810344914544475e-05, "loss": 0.5329, "num_input_tokens_seen": 3564616, "step": 39585 }, { "epoch": 10.288461538461538, "grad_norm": 2.179835081100464, "learning_rate": 2.8097823321555388e-05, "loss": 0.1872, "num_input_tokens_seen": 3565032, "step": 39590 }, { "epoch": 10.289760914760915, "grad_norm": 4.3613362312316895, "learning_rate": 2.809219733834201e-05, "loss": 0.4076, "num_input_tokens_seen": 3565464, "step": 39595 }, { "epoch": 10.29106029106029, "grad_norm": 0.5634968876838684, "learning_rate": 2.808657119609396e-05, "loss": 0.1119, "num_input_tokens_seen": 3565880, "step": 39600 }, { "epoch": 10.292359667359667, "grad_norm": 2.841704845428467, "learning_rate": 2.808094489510059e-05, "loss": 0.1403, "num_input_tokens_seen": 3566312, "step": 39605 }, { "epoch": 10.293659043659044, "grad_norm": 0.3502742350101471, "learning_rate": 2.8075318435651282e-05, "loss": 0.1919, "num_input_tokens_seen": 3566760, "step": 39610 }, { "epoch": 10.29495841995842, "grad_norm": 2.3272738456726074, "learning_rate": 2.8069691818035393e-05, "loss": 0.3694, "num_input_tokens_seen": 3567192, "step": 39615 }, { "epoch": 10.296257796257796, "grad_norm": 3.3665854930877686, "learning_rate": 2.806406504254231e-05, "loss": 0.3224, "num_input_tokens_seen": 3567624, "step": 39620 }, { "epoch": 10.297557172557173, "grad_norm": 2.0499768257141113, "learning_rate": 2.8058438109461434e-05, "loss": 0.4509, "num_input_tokens_seen": 3568072, "step": 39625 }, { "epoch": 10.298856548856548, "grad_norm": 3.3871424198150635, "learning_rate": 2.8052811019082155e-05, "loss": 0.4073, "num_input_tokens_seen": 3568520, "step": 39630 }, { "epoch": 10.300155925155925, "grad_norm": 4.091223239898682, "learning_rate": 2.8047183771693874e-05, "loss": 0.248, "num_input_tokens_seen": 3568952, "step": 39635 }, { "epoch": 10.301455301455302, "grad_norm": 1.4344627857208252, "learning_rate": 2.804155636758601e-05, "loss": 0.3275, "num_input_tokens_seen": 3569400, "step": 39640 }, { "epoch": 10.302754677754677, "grad_norm": 1.2311413288116455, "learning_rate": 2.8035928807047995e-05, "loss": 0.244, "num_input_tokens_seen": 3569880, "step": 39645 }, { "epoch": 10.304054054054054, "grad_norm": 1.4206852912902832, "learning_rate": 2.8030301090369248e-05, "loss": 0.1873, "num_input_tokens_seen": 3570328, "step": 39650 }, { "epoch": 10.30535343035343, "grad_norm": 1.2356112003326416, "learning_rate": 2.8024673217839214e-05, "loss": 0.1547, "num_input_tokens_seen": 3570792, "step": 39655 }, { "epoch": 10.306652806652806, "grad_norm": 1.611018419265747, "learning_rate": 2.801904518974734e-05, "loss": 0.1861, "num_input_tokens_seen": 3571288, "step": 39660 }, { "epoch": 10.307952182952183, "grad_norm": 1.955278992652893, "learning_rate": 2.8013417006383076e-05, "loss": 0.1585, "num_input_tokens_seen": 3571704, "step": 39665 }, { "epoch": 10.309251559251559, "grad_norm": 0.9601069688796997, "learning_rate": 2.8007788668035895e-05, "loss": 0.0934, "num_input_tokens_seen": 3572136, "step": 39670 }, { "epoch": 10.310550935550935, "grad_norm": 0.7135927677154541, "learning_rate": 2.800216017499525e-05, "loss": 0.0434, "num_input_tokens_seen": 3572552, "step": 39675 }, { "epoch": 10.311850311850312, "grad_norm": 3.8799309730529785, "learning_rate": 2.799653152755064e-05, "loss": 0.2061, "num_input_tokens_seen": 3573016, "step": 39680 }, { "epoch": 10.313149688149688, "grad_norm": 3.532175302505493, "learning_rate": 2.799090272599154e-05, "loss": 0.3471, "num_input_tokens_seen": 3573448, "step": 39685 }, { "epoch": 10.314449064449065, "grad_norm": 0.4086763560771942, "learning_rate": 2.7985273770607445e-05, "loss": 0.0359, "num_input_tokens_seen": 3573880, "step": 39690 }, { "epoch": 10.315748440748441, "grad_norm": 0.36326202750205994, "learning_rate": 2.797964466168786e-05, "loss": 0.3702, "num_input_tokens_seen": 3574312, "step": 39695 }, { "epoch": 10.317047817047817, "grad_norm": 0.1714942753314972, "learning_rate": 2.7974015399522298e-05, "loss": 0.3257, "num_input_tokens_seen": 3574808, "step": 39700 }, { "epoch": 10.318347193347194, "grad_norm": 0.2415107786655426, "learning_rate": 2.796838598440027e-05, "loss": 0.2037, "num_input_tokens_seen": 3575288, "step": 39705 }, { "epoch": 10.31964656964657, "grad_norm": 0.6168842315673828, "learning_rate": 2.7962756416611314e-05, "loss": 0.495, "num_input_tokens_seen": 3575752, "step": 39710 }, { "epoch": 10.320945945945946, "grad_norm": 3.7585909366607666, "learning_rate": 2.7957126696444948e-05, "loss": 0.2457, "num_input_tokens_seen": 3576184, "step": 39715 }, { "epoch": 10.322245322245323, "grad_norm": 3.9092633724212646, "learning_rate": 2.7951496824190726e-05, "loss": 0.3993, "num_input_tokens_seen": 3576600, "step": 39720 }, { "epoch": 10.323544698544698, "grad_norm": 0.2153501957654953, "learning_rate": 2.7945866800138195e-05, "loss": 0.051, "num_input_tokens_seen": 3577048, "step": 39725 }, { "epoch": 10.324844074844075, "grad_norm": 3.172041177749634, "learning_rate": 2.7940236624576904e-05, "loss": 0.2209, "num_input_tokens_seen": 3577480, "step": 39730 }, { "epoch": 10.326143451143452, "grad_norm": 0.7094486355781555, "learning_rate": 2.793460629779644e-05, "loss": 0.3574, "num_input_tokens_seen": 3577928, "step": 39735 }, { "epoch": 10.327442827442827, "grad_norm": 3.124598979949951, "learning_rate": 2.7928975820086357e-05, "loss": 0.249, "num_input_tokens_seen": 3578408, "step": 39740 }, { "epoch": 10.328742203742204, "grad_norm": 1.4109077453613281, "learning_rate": 2.792334519173624e-05, "loss": 0.3179, "num_input_tokens_seen": 3578872, "step": 39745 }, { "epoch": 10.33004158004158, "grad_norm": 5.183286190032959, "learning_rate": 2.7917714413035678e-05, "loss": 0.2688, "num_input_tokens_seen": 3579288, "step": 39750 }, { "epoch": 10.331340956340956, "grad_norm": 0.5577941536903381, "learning_rate": 2.7912083484274266e-05, "loss": 0.1247, "num_input_tokens_seen": 3579752, "step": 39755 }, { "epoch": 10.332640332640333, "grad_norm": 4.801610946655273, "learning_rate": 2.790645240574162e-05, "loss": 0.2025, "num_input_tokens_seen": 3580200, "step": 39760 }, { "epoch": 10.33393970893971, "grad_norm": 0.46237102150917053, "learning_rate": 2.7900821177727326e-05, "loss": 0.1956, "num_input_tokens_seen": 3580616, "step": 39765 }, { "epoch": 10.335239085239085, "grad_norm": 4.099911212921143, "learning_rate": 2.7895189800521033e-05, "loss": 0.468, "num_input_tokens_seen": 3581080, "step": 39770 }, { "epoch": 10.336538461538462, "grad_norm": 3.8971471786499023, "learning_rate": 2.788955827441235e-05, "loss": 0.5375, "num_input_tokens_seen": 3581576, "step": 39775 }, { "epoch": 10.337837837837839, "grad_norm": 0.8309058547019958, "learning_rate": 2.7883926599690914e-05, "loss": 0.1865, "num_input_tokens_seen": 3582040, "step": 39780 }, { "epoch": 10.339137214137214, "grad_norm": 2.3465116024017334, "learning_rate": 2.787829477664637e-05, "loss": 0.1346, "num_input_tokens_seen": 3582504, "step": 39785 }, { "epoch": 10.34043659043659, "grad_norm": 3.1251800060272217, "learning_rate": 2.787266280556837e-05, "loss": 0.4431, "num_input_tokens_seen": 3582968, "step": 39790 }, { "epoch": 10.341735966735968, "grad_norm": 1.5543206930160522, "learning_rate": 2.786703068674657e-05, "loss": 0.2755, "num_input_tokens_seen": 3583400, "step": 39795 }, { "epoch": 10.343035343035343, "grad_norm": 1.1633085012435913, "learning_rate": 2.7861398420470636e-05, "loss": 0.2536, "num_input_tokens_seen": 3583880, "step": 39800 }, { "epoch": 10.34433471933472, "grad_norm": 2.4007601737976074, "learning_rate": 2.7855766007030237e-05, "loss": 0.234, "num_input_tokens_seen": 3584328, "step": 39805 }, { "epoch": 10.345634095634095, "grad_norm": 2.468686103820801, "learning_rate": 2.785013344671506e-05, "loss": 0.2327, "num_input_tokens_seen": 3584744, "step": 39810 }, { "epoch": 10.346933471933472, "grad_norm": 0.635514497756958, "learning_rate": 2.7844500739814782e-05, "loss": 0.2289, "num_input_tokens_seen": 3585176, "step": 39815 }, { "epoch": 10.348232848232849, "grad_norm": 1.554256796836853, "learning_rate": 2.783886788661911e-05, "loss": 0.2086, "num_input_tokens_seen": 3585640, "step": 39820 }, { "epoch": 10.349532224532224, "grad_norm": 0.7348840236663818, "learning_rate": 2.7833234887417743e-05, "loss": 0.1787, "num_input_tokens_seen": 3586056, "step": 39825 }, { "epoch": 10.3508316008316, "grad_norm": 1.0148319005966187, "learning_rate": 2.7827601742500388e-05, "loss": 0.181, "num_input_tokens_seen": 3586488, "step": 39830 }, { "epoch": 10.352130977130978, "grad_norm": 0.8226093649864197, "learning_rate": 2.7821968452156766e-05, "loss": 0.0938, "num_input_tokens_seen": 3586952, "step": 39835 }, { "epoch": 10.353430353430353, "grad_norm": 1.8261988162994385, "learning_rate": 2.7816335016676604e-05, "loss": 0.3264, "num_input_tokens_seen": 3587416, "step": 39840 }, { "epoch": 10.35472972972973, "grad_norm": 3.1609039306640625, "learning_rate": 2.7810701436349633e-05, "loss": 0.1922, "num_input_tokens_seen": 3587864, "step": 39845 }, { "epoch": 10.356029106029107, "grad_norm": 1.0351414680480957, "learning_rate": 2.7805067711465594e-05, "loss": 0.2619, "num_input_tokens_seen": 3588344, "step": 39850 }, { "epoch": 10.357328482328482, "grad_norm": 2.9897913932800293, "learning_rate": 2.7799433842314232e-05, "loss": 0.2888, "num_input_tokens_seen": 3588792, "step": 39855 }, { "epoch": 10.358627858627859, "grad_norm": 2.513178586959839, "learning_rate": 2.7793799829185315e-05, "loss": 0.3589, "num_input_tokens_seen": 3589208, "step": 39860 }, { "epoch": 10.359927234927236, "grad_norm": 0.6298973560333252, "learning_rate": 2.7788165672368594e-05, "loss": 0.2332, "num_input_tokens_seen": 3589624, "step": 39865 }, { "epoch": 10.361226611226611, "grad_norm": 0.5859886407852173, "learning_rate": 2.778253137215383e-05, "loss": 0.0928, "num_input_tokens_seen": 3590040, "step": 39870 }, { "epoch": 10.362525987525988, "grad_norm": 2.1364612579345703, "learning_rate": 2.777689692883082e-05, "loss": 0.2141, "num_input_tokens_seen": 3590472, "step": 39875 }, { "epoch": 10.363825363825363, "grad_norm": 0.27944180369377136, "learning_rate": 2.7771262342689343e-05, "loss": 0.1483, "num_input_tokens_seen": 3590904, "step": 39880 }, { "epoch": 10.36512474012474, "grad_norm": 0.4945380389690399, "learning_rate": 2.7765627614019185e-05, "loss": 0.0655, "num_input_tokens_seen": 3591336, "step": 39885 }, { "epoch": 10.366424116424117, "grad_norm": 0.3825379014015198, "learning_rate": 2.7759992743110143e-05, "loss": 0.3238, "num_input_tokens_seen": 3591752, "step": 39890 }, { "epoch": 10.367723492723492, "grad_norm": 0.663653552532196, "learning_rate": 2.7754357730252032e-05, "loss": 0.2151, "num_input_tokens_seen": 3592216, "step": 39895 }, { "epoch": 10.369022869022869, "grad_norm": 2.8071742057800293, "learning_rate": 2.7748722575734672e-05, "loss": 0.2267, "num_input_tokens_seen": 3592632, "step": 39900 }, { "epoch": 10.370322245322246, "grad_norm": 1.3171113729476929, "learning_rate": 2.7743087279847868e-05, "loss": 0.4921, "num_input_tokens_seen": 3593096, "step": 39905 }, { "epoch": 10.371621621621621, "grad_norm": 0.7471391558647156, "learning_rate": 2.7737451842881455e-05, "loss": 0.1285, "num_input_tokens_seen": 3593544, "step": 39910 }, { "epoch": 10.372920997920998, "grad_norm": 3.582750082015991, "learning_rate": 2.7731816265125278e-05, "loss": 0.2072, "num_input_tokens_seen": 3594008, "step": 39915 }, { "epoch": 10.374220374220375, "grad_norm": 4.661919116973877, "learning_rate": 2.7726180546869175e-05, "loss": 0.2496, "num_input_tokens_seen": 3594472, "step": 39920 }, { "epoch": 10.37551975051975, "grad_norm": 4.527729034423828, "learning_rate": 2.772054468840299e-05, "loss": 0.3191, "num_input_tokens_seen": 3594936, "step": 39925 }, { "epoch": 10.376819126819127, "grad_norm": 1.0770490169525146, "learning_rate": 2.7714908690016583e-05, "loss": 0.1985, "num_input_tokens_seen": 3595384, "step": 39930 }, { "epoch": 10.378118503118504, "grad_norm": 0.19591793417930603, "learning_rate": 2.7709272551999828e-05, "loss": 0.1316, "num_input_tokens_seen": 3595832, "step": 39935 }, { "epoch": 10.379417879417879, "grad_norm": 5.28710412979126, "learning_rate": 2.770363627464258e-05, "loss": 0.308, "num_input_tokens_seen": 3596296, "step": 39940 }, { "epoch": 10.380717255717256, "grad_norm": 3.345060110092163, "learning_rate": 2.769799985823473e-05, "loss": 0.4267, "num_input_tokens_seen": 3596776, "step": 39945 }, { "epoch": 10.382016632016631, "grad_norm": 1.398305058479309, "learning_rate": 2.7692363303066164e-05, "loss": 0.2882, "num_input_tokens_seen": 3597224, "step": 39950 }, { "epoch": 10.383316008316008, "grad_norm": 1.1586861610412598, "learning_rate": 2.7686726609426777e-05, "loss": 0.3162, "num_input_tokens_seen": 3597672, "step": 39955 }, { "epoch": 10.384615384615385, "grad_norm": 3.6077828407287598, "learning_rate": 2.7681089777606463e-05, "loss": 0.3499, "num_input_tokens_seen": 3598152, "step": 39960 }, { "epoch": 10.38591476091476, "grad_norm": 3.603731632232666, "learning_rate": 2.767545280789513e-05, "loss": 0.2609, "num_input_tokens_seen": 3598632, "step": 39965 }, { "epoch": 10.387214137214137, "grad_norm": 1.1863468885421753, "learning_rate": 2.7669815700582697e-05, "loss": 0.2317, "num_input_tokens_seen": 3599096, "step": 39970 }, { "epoch": 10.388513513513514, "grad_norm": 1.8064422607421875, "learning_rate": 2.7664178455959087e-05, "loss": 0.2412, "num_input_tokens_seen": 3599608, "step": 39975 }, { "epoch": 10.38981288981289, "grad_norm": 4.683784484863281, "learning_rate": 2.765854107431422e-05, "loss": 0.2779, "num_input_tokens_seen": 3600040, "step": 39980 }, { "epoch": 10.391112266112266, "grad_norm": 2.6393847465515137, "learning_rate": 2.765290355593805e-05, "loss": 0.2435, "num_input_tokens_seen": 3600472, "step": 39985 }, { "epoch": 10.392411642411643, "grad_norm": 0.7182754278182983, "learning_rate": 2.76472659011205e-05, "loss": 0.1922, "num_input_tokens_seen": 3600920, "step": 39990 }, { "epoch": 10.393711018711018, "grad_norm": 4.549493789672852, "learning_rate": 2.764162811015153e-05, "loss": 0.3053, "num_input_tokens_seen": 3601352, "step": 39995 }, { "epoch": 10.395010395010395, "grad_norm": 0.6444849371910095, "learning_rate": 2.7635990183321098e-05, "loss": 0.2197, "num_input_tokens_seen": 3601784, "step": 40000 }, { "epoch": 10.396309771309772, "grad_norm": 1.4536960124969482, "learning_rate": 2.7630352120919162e-05, "loss": 0.1517, "num_input_tokens_seen": 3602216, "step": 40005 }, { "epoch": 10.397609147609147, "grad_norm": 0.7068054676055908, "learning_rate": 2.76247139232357e-05, "loss": 0.4249, "num_input_tokens_seen": 3602664, "step": 40010 }, { "epoch": 10.398908523908524, "grad_norm": 5.9907965660095215, "learning_rate": 2.7619075590560678e-05, "loss": 0.1676, "num_input_tokens_seen": 3603128, "step": 40015 }, { "epoch": 10.4002079002079, "grad_norm": 0.6727757453918457, "learning_rate": 2.7613437123184093e-05, "loss": 0.3328, "num_input_tokens_seen": 3603576, "step": 40020 }, { "epoch": 10.401507276507276, "grad_norm": 1.8362122774124146, "learning_rate": 2.7607798521395933e-05, "loss": 0.2133, "num_input_tokens_seen": 3604024, "step": 40025 }, { "epoch": 10.402806652806653, "grad_norm": 1.8395251035690308, "learning_rate": 2.7602159785486198e-05, "loss": 0.349, "num_input_tokens_seen": 3604456, "step": 40030 }, { "epoch": 10.404106029106028, "grad_norm": 2.7494709491729736, "learning_rate": 2.759652091574489e-05, "loss": 0.2853, "num_input_tokens_seen": 3604904, "step": 40035 }, { "epoch": 10.405405405405405, "grad_norm": 0.4781818091869354, "learning_rate": 2.7590881912462026e-05, "loss": 0.4079, "num_input_tokens_seen": 3605384, "step": 40040 }, { "epoch": 10.406704781704782, "grad_norm": 2.3473074436187744, "learning_rate": 2.7585242775927618e-05, "loss": 0.5392, "num_input_tokens_seen": 3605880, "step": 40045 }, { "epoch": 10.408004158004157, "grad_norm": 1.1287004947662354, "learning_rate": 2.75796035064317e-05, "loss": 0.355, "num_input_tokens_seen": 3606344, "step": 40050 }, { "epoch": 10.409303534303534, "grad_norm": 1.0051589012145996, "learning_rate": 2.75739641042643e-05, "loss": 0.2069, "num_input_tokens_seen": 3606792, "step": 40055 }, { "epoch": 10.410602910602911, "grad_norm": 1.2469605207443237, "learning_rate": 2.7568324569715465e-05, "loss": 0.245, "num_input_tokens_seen": 3607224, "step": 40060 }, { "epoch": 10.411902286902286, "grad_norm": 1.5726937055587769, "learning_rate": 2.7562684903075238e-05, "loss": 0.2731, "num_input_tokens_seen": 3607672, "step": 40065 }, { "epoch": 10.413201663201663, "grad_norm": 1.2075188159942627, "learning_rate": 2.7557045104633662e-05, "loss": 0.3246, "num_input_tokens_seen": 3608104, "step": 40070 }, { "epoch": 10.41450103950104, "grad_norm": 2.2311935424804688, "learning_rate": 2.7551405174680812e-05, "loss": 0.3532, "num_input_tokens_seen": 3608552, "step": 40075 }, { "epoch": 10.415800415800415, "grad_norm": 1.5531010627746582, "learning_rate": 2.7545765113506746e-05, "loss": 0.2949, "num_input_tokens_seen": 3609000, "step": 40080 }, { "epoch": 10.417099792099792, "grad_norm": 1.7357200384140015, "learning_rate": 2.7540124921401545e-05, "loss": 0.2684, "num_input_tokens_seen": 3609464, "step": 40085 }, { "epoch": 10.41839916839917, "grad_norm": 1.6234890222549438, "learning_rate": 2.7534484598655275e-05, "loss": 0.3102, "num_input_tokens_seen": 3609912, "step": 40090 }, { "epoch": 10.419698544698544, "grad_norm": 1.4433828592300415, "learning_rate": 2.7528844145558048e-05, "loss": 0.3197, "num_input_tokens_seen": 3610376, "step": 40095 }, { "epoch": 10.420997920997921, "grad_norm": 1.8275223970413208, "learning_rate": 2.7523203562399935e-05, "loss": 0.2525, "num_input_tokens_seen": 3610856, "step": 40100 }, { "epoch": 10.422297297297296, "grad_norm": 1.0885536670684814, "learning_rate": 2.7517562849471045e-05, "loss": 0.1719, "num_input_tokens_seen": 3611336, "step": 40105 }, { "epoch": 10.423596673596673, "grad_norm": 1.2156790494918823, "learning_rate": 2.7511922007061487e-05, "loss": 0.2542, "num_input_tokens_seen": 3611800, "step": 40110 }, { "epoch": 10.42489604989605, "grad_norm": 0.9801134467124939, "learning_rate": 2.750628103546138e-05, "loss": 0.1974, "num_input_tokens_seen": 3612232, "step": 40115 }, { "epoch": 10.426195426195425, "grad_norm": 0.7099313139915466, "learning_rate": 2.750063993496083e-05, "loss": 0.1648, "num_input_tokens_seen": 3612712, "step": 40120 }, { "epoch": 10.427494802494802, "grad_norm": 0.29689672589302063, "learning_rate": 2.7494998705849968e-05, "loss": 0.0634, "num_input_tokens_seen": 3613144, "step": 40125 }, { "epoch": 10.42879417879418, "grad_norm": 0.1309659481048584, "learning_rate": 2.748935734841895e-05, "loss": 0.1855, "num_input_tokens_seen": 3613576, "step": 40130 }, { "epoch": 10.430093555093555, "grad_norm": 0.181536003947258, "learning_rate": 2.7483715862957882e-05, "loss": 0.1173, "num_input_tokens_seen": 3614008, "step": 40135 }, { "epoch": 10.431392931392931, "grad_norm": 0.19927753508090973, "learning_rate": 2.747807424975693e-05, "loss": 0.283, "num_input_tokens_seen": 3614408, "step": 40140 }, { "epoch": 10.432692307692308, "grad_norm": 4.248584747314453, "learning_rate": 2.7472432509106248e-05, "loss": 0.2391, "num_input_tokens_seen": 3614840, "step": 40145 }, { "epoch": 10.433991683991684, "grad_norm": 4.874834060668945, "learning_rate": 2.7466790641295992e-05, "loss": 0.577, "num_input_tokens_seen": 3615320, "step": 40150 }, { "epoch": 10.43529106029106, "grad_norm": 3.8924083709716797, "learning_rate": 2.746114864661633e-05, "loss": 0.2798, "num_input_tokens_seen": 3615816, "step": 40155 }, { "epoch": 10.436590436590437, "grad_norm": 0.4828925132751465, "learning_rate": 2.745550652535743e-05, "loss": 0.2722, "num_input_tokens_seen": 3616216, "step": 40160 }, { "epoch": 10.437889812889813, "grad_norm": 0.3612162172794342, "learning_rate": 2.7449864277809484e-05, "loss": 0.3677, "num_input_tokens_seen": 3616680, "step": 40165 }, { "epoch": 10.43918918918919, "grad_norm": 0.7204742431640625, "learning_rate": 2.744422190426267e-05, "loss": 0.0475, "num_input_tokens_seen": 3617144, "step": 40170 }, { "epoch": 10.440488565488565, "grad_norm": 0.3772214353084564, "learning_rate": 2.7438579405007182e-05, "loss": 0.2816, "num_input_tokens_seen": 3617592, "step": 40175 }, { "epoch": 10.441787941787942, "grad_norm": 0.6228324174880981, "learning_rate": 2.7432936780333214e-05, "loss": 0.3022, "num_input_tokens_seen": 3618024, "step": 40180 }, { "epoch": 10.443087318087318, "grad_norm": 0.1947556734085083, "learning_rate": 2.7427294030530975e-05, "loss": 0.2083, "num_input_tokens_seen": 3618488, "step": 40185 }, { "epoch": 10.444386694386694, "grad_norm": 3.4650793075561523, "learning_rate": 2.7421651155890686e-05, "loss": 0.3186, "num_input_tokens_seen": 3618952, "step": 40190 }, { "epoch": 10.44568607068607, "grad_norm": 1.7114439010620117, "learning_rate": 2.7416008156702554e-05, "loss": 0.2679, "num_input_tokens_seen": 3619432, "step": 40195 }, { "epoch": 10.446985446985448, "grad_norm": 0.6389188170433044, "learning_rate": 2.7410365033256806e-05, "loss": 0.2762, "num_input_tokens_seen": 3619880, "step": 40200 }, { "epoch": 10.448284823284823, "grad_norm": 0.8609286546707153, "learning_rate": 2.740472178584368e-05, "loss": 0.0446, "num_input_tokens_seen": 3620312, "step": 40205 }, { "epoch": 10.4495841995842, "grad_norm": 0.41264182329177856, "learning_rate": 2.7399078414753403e-05, "loss": 0.1833, "num_input_tokens_seen": 3620792, "step": 40210 }, { "epoch": 10.450883575883577, "grad_norm": 0.5230713486671448, "learning_rate": 2.7393434920276222e-05, "loss": 0.434, "num_input_tokens_seen": 3621272, "step": 40215 }, { "epoch": 10.452182952182952, "grad_norm": 0.36756226420402527, "learning_rate": 2.7387791302702397e-05, "loss": 0.1287, "num_input_tokens_seen": 3621704, "step": 40220 }, { "epoch": 10.453482328482329, "grad_norm": 0.3787698447704315, "learning_rate": 2.7382147562322174e-05, "loss": 0.2505, "num_input_tokens_seen": 3622120, "step": 40225 }, { "epoch": 10.454781704781706, "grad_norm": 2.2025084495544434, "learning_rate": 2.7376503699425814e-05, "loss": 0.3978, "num_input_tokens_seen": 3622568, "step": 40230 }, { "epoch": 10.45608108108108, "grad_norm": 2.7715513706207275, "learning_rate": 2.7370859714303603e-05, "loss": 0.3312, "num_input_tokens_seen": 3623016, "step": 40235 }, { "epoch": 10.457380457380458, "grad_norm": 2.2846009731292725, "learning_rate": 2.73652156072458e-05, "loss": 0.2838, "num_input_tokens_seen": 3623432, "step": 40240 }, { "epoch": 10.458679833679835, "grad_norm": 0.8342730402946472, "learning_rate": 2.7359571378542692e-05, "loss": 0.1528, "num_input_tokens_seen": 3623848, "step": 40245 }, { "epoch": 10.45997920997921, "grad_norm": 1.8540124893188477, "learning_rate": 2.735392702848456e-05, "loss": 0.3402, "num_input_tokens_seen": 3624280, "step": 40250 }, { "epoch": 10.461278586278587, "grad_norm": 0.8552624583244324, "learning_rate": 2.7348282557361714e-05, "loss": 0.2808, "num_input_tokens_seen": 3624760, "step": 40255 }, { "epoch": 10.462577962577962, "grad_norm": 1.3372758626937866, "learning_rate": 2.7342637965464453e-05, "loss": 0.2138, "num_input_tokens_seen": 3625208, "step": 40260 }, { "epoch": 10.463877338877339, "grad_norm": 1.274830937385559, "learning_rate": 2.7336993253083064e-05, "loss": 0.3023, "num_input_tokens_seen": 3625656, "step": 40265 }, { "epoch": 10.465176715176716, "grad_norm": 1.6213653087615967, "learning_rate": 2.733134842050788e-05, "loss": 0.1717, "num_input_tokens_seen": 3626136, "step": 40270 }, { "epoch": 10.46647609147609, "grad_norm": 1.9537681341171265, "learning_rate": 2.7325703468029207e-05, "loss": 0.3753, "num_input_tokens_seen": 3626568, "step": 40275 }, { "epoch": 10.467775467775468, "grad_norm": 3.9987454414367676, "learning_rate": 2.732005839593738e-05, "loss": 0.2788, "num_input_tokens_seen": 3627000, "step": 40280 }, { "epoch": 10.469074844074845, "grad_norm": 0.7898331880569458, "learning_rate": 2.7314413204522725e-05, "loss": 0.3395, "num_input_tokens_seen": 3627464, "step": 40285 }, { "epoch": 10.47037422037422, "grad_norm": 0.9779250621795654, "learning_rate": 2.7308767894075583e-05, "loss": 0.2258, "num_input_tokens_seen": 3627896, "step": 40290 }, { "epoch": 10.471673596673597, "grad_norm": 0.6750031113624573, "learning_rate": 2.7303122464886298e-05, "loss": 0.2142, "num_input_tokens_seen": 3628312, "step": 40295 }, { "epoch": 10.472972972972974, "grad_norm": 0.7242704033851624, "learning_rate": 2.7297476917245214e-05, "loss": 0.2094, "num_input_tokens_seen": 3628776, "step": 40300 }, { "epoch": 10.474272349272349, "grad_norm": 1.9660594463348389, "learning_rate": 2.729183125144269e-05, "loss": 0.2018, "num_input_tokens_seen": 3629224, "step": 40305 }, { "epoch": 10.475571725571726, "grad_norm": 0.5330866575241089, "learning_rate": 2.728618546776909e-05, "loss": 0.224, "num_input_tokens_seen": 3629720, "step": 40310 }, { "epoch": 10.476871101871103, "grad_norm": 2.7952866554260254, "learning_rate": 2.7280539566514786e-05, "loss": 0.2602, "num_input_tokens_seen": 3630200, "step": 40315 }, { "epoch": 10.478170478170478, "grad_norm": 4.0482177734375, "learning_rate": 2.7274893547970143e-05, "loss": 0.2072, "num_input_tokens_seen": 3630680, "step": 40320 }, { "epoch": 10.479469854469855, "grad_norm": 0.4124443531036377, "learning_rate": 2.7269247412425548e-05, "loss": 0.082, "num_input_tokens_seen": 3631096, "step": 40325 }, { "epoch": 10.48076923076923, "grad_norm": 3.6520159244537354, "learning_rate": 2.7263601160171376e-05, "loss": 0.1127, "num_input_tokens_seen": 3631512, "step": 40330 }, { "epoch": 10.482068607068607, "grad_norm": 2.7420215606689453, "learning_rate": 2.7257954791498035e-05, "loss": 0.224, "num_input_tokens_seen": 3631960, "step": 40335 }, { "epoch": 10.483367983367984, "grad_norm": 3.1059916019439697, "learning_rate": 2.725230830669591e-05, "loss": 0.2597, "num_input_tokens_seen": 3632408, "step": 40340 }, { "epoch": 10.484667359667359, "grad_norm": 3.174393892288208, "learning_rate": 2.7246661706055414e-05, "loss": 0.123, "num_input_tokens_seen": 3632840, "step": 40345 }, { "epoch": 10.485966735966736, "grad_norm": 3.5630929470062256, "learning_rate": 2.724101498986695e-05, "loss": 0.2615, "num_input_tokens_seen": 3633336, "step": 40350 }, { "epoch": 10.487266112266113, "grad_norm": 0.20264457166194916, "learning_rate": 2.7235368158420944e-05, "loss": 0.1473, "num_input_tokens_seen": 3633752, "step": 40355 }, { "epoch": 10.488565488565488, "grad_norm": 0.9804037809371948, "learning_rate": 2.722972121200781e-05, "loss": 0.2258, "num_input_tokens_seen": 3634264, "step": 40360 }, { "epoch": 10.489864864864865, "grad_norm": 0.4022725522518158, "learning_rate": 2.722407415091798e-05, "loss": 0.2693, "num_input_tokens_seen": 3634696, "step": 40365 }, { "epoch": 10.491164241164242, "grad_norm": 0.3884623348712921, "learning_rate": 2.721842697544188e-05, "loss": 0.1165, "num_input_tokens_seen": 3635128, "step": 40370 }, { "epoch": 10.492463617463617, "grad_norm": 0.32712772488594055, "learning_rate": 2.7212779685869954e-05, "loss": 0.4143, "num_input_tokens_seen": 3635560, "step": 40375 }, { "epoch": 10.493762993762994, "grad_norm": 2.5471718311309814, "learning_rate": 2.7207132282492654e-05, "loss": 0.3102, "num_input_tokens_seen": 3636008, "step": 40380 }, { "epoch": 10.49506237006237, "grad_norm": 2.7577478885650635, "learning_rate": 2.7201484765600426e-05, "loss": 0.259, "num_input_tokens_seen": 3636472, "step": 40385 }, { "epoch": 10.496361746361746, "grad_norm": 0.5899460315704346, "learning_rate": 2.7195837135483726e-05, "loss": 0.3151, "num_input_tokens_seen": 3636904, "step": 40390 }, { "epoch": 10.497661122661123, "grad_norm": 0.7527451515197754, "learning_rate": 2.719018939243302e-05, "loss": 0.1473, "num_input_tokens_seen": 3637368, "step": 40395 }, { "epoch": 10.4989604989605, "grad_norm": 0.6242979764938354, "learning_rate": 2.7184541536738774e-05, "loss": 0.2228, "num_input_tokens_seen": 3637864, "step": 40400 }, { "epoch": 10.500259875259875, "grad_norm": 1.8758575916290283, "learning_rate": 2.717889356869146e-05, "loss": 0.3947, "num_input_tokens_seen": 3638376, "step": 40405 }, { "epoch": 10.501559251559252, "grad_norm": 0.9360637068748474, "learning_rate": 2.7173245488581563e-05, "loss": 0.1808, "num_input_tokens_seen": 3638808, "step": 40410 }, { "epoch": 10.502858627858627, "grad_norm": 1.0101022720336914, "learning_rate": 2.7167597296699564e-05, "loss": 0.3146, "num_input_tokens_seen": 3639224, "step": 40415 }, { "epoch": 10.504158004158004, "grad_norm": 3.3484864234924316, "learning_rate": 2.7161948993335967e-05, "loss": 0.1709, "num_input_tokens_seen": 3639704, "step": 40420 }, { "epoch": 10.505457380457381, "grad_norm": 2.1054365634918213, "learning_rate": 2.715630057878126e-05, "loss": 0.1664, "num_input_tokens_seen": 3640152, "step": 40425 }, { "epoch": 10.506756756756756, "grad_norm": 1.487966537475586, "learning_rate": 2.715065205332594e-05, "loss": 0.1819, "num_input_tokens_seen": 3640632, "step": 40430 }, { "epoch": 10.508056133056133, "grad_norm": 1.455973505973816, "learning_rate": 2.714500341726054e-05, "loss": 0.1764, "num_input_tokens_seen": 3641112, "step": 40435 }, { "epoch": 10.50935550935551, "grad_norm": 2.8411307334899902, "learning_rate": 2.713935467087555e-05, "loss": 0.2399, "num_input_tokens_seen": 3641528, "step": 40440 }, { "epoch": 10.510654885654885, "grad_norm": 0.3894928991794586, "learning_rate": 2.7133705814461503e-05, "loss": 0.3204, "num_input_tokens_seen": 3641976, "step": 40445 }, { "epoch": 10.511954261954262, "grad_norm": 2.5902509689331055, "learning_rate": 2.7128056848308913e-05, "loss": 0.235, "num_input_tokens_seen": 3642456, "step": 40450 }, { "epoch": 10.513253638253639, "grad_norm": 1.484151840209961, "learning_rate": 2.712240777270833e-05, "loss": 0.3597, "num_input_tokens_seen": 3642904, "step": 40455 }, { "epoch": 10.514553014553014, "grad_norm": 2.8001997470855713, "learning_rate": 2.711675858795028e-05, "loss": 0.2797, "num_input_tokens_seen": 3643336, "step": 40460 }, { "epoch": 10.515852390852391, "grad_norm": 0.3706834316253662, "learning_rate": 2.7111109294325297e-05, "loss": 0.2541, "num_input_tokens_seen": 3643768, "step": 40465 }, { "epoch": 10.517151767151766, "grad_norm": 0.3661126494407654, "learning_rate": 2.710545989212395e-05, "loss": 0.075, "num_input_tokens_seen": 3644264, "step": 40470 }, { "epoch": 10.518451143451143, "grad_norm": 0.7834314703941345, "learning_rate": 2.7099810381636788e-05, "loss": 0.338, "num_input_tokens_seen": 3644680, "step": 40475 }, { "epoch": 10.51975051975052, "grad_norm": 0.8170658349990845, "learning_rate": 2.7094160763154354e-05, "loss": 0.3043, "num_input_tokens_seen": 3645160, "step": 40480 }, { "epoch": 10.521049896049895, "grad_norm": 4.196951389312744, "learning_rate": 2.7088511036967235e-05, "loss": 0.2052, "num_input_tokens_seen": 3645592, "step": 40485 }, { "epoch": 10.522349272349272, "grad_norm": 0.6134584546089172, "learning_rate": 2.7082861203365988e-05, "loss": 0.2031, "num_input_tokens_seen": 3646072, "step": 40490 }, { "epoch": 10.52364864864865, "grad_norm": 2.2154760360717773, "learning_rate": 2.7077211262641196e-05, "loss": 0.2252, "num_input_tokens_seen": 3646536, "step": 40495 }, { "epoch": 10.524948024948024, "grad_norm": 0.9495194554328918, "learning_rate": 2.707156121508343e-05, "loss": 0.1557, "num_input_tokens_seen": 3647016, "step": 40500 }, { "epoch": 10.526247401247401, "grad_norm": 4.810044765472412, "learning_rate": 2.7065911060983297e-05, "loss": 0.3606, "num_input_tokens_seen": 3647496, "step": 40505 }, { "epoch": 10.527546777546778, "grad_norm": 2.7526330947875977, "learning_rate": 2.706026080063137e-05, "loss": 0.195, "num_input_tokens_seen": 3647928, "step": 40510 }, { "epoch": 10.528846153846153, "grad_norm": 5.219970226287842, "learning_rate": 2.7054610434318262e-05, "loss": 0.2217, "num_input_tokens_seen": 3648408, "step": 40515 }, { "epoch": 10.53014553014553, "grad_norm": 0.7517987489700317, "learning_rate": 2.7048959962334568e-05, "loss": 0.1743, "num_input_tokens_seen": 3648856, "step": 40520 }, { "epoch": 10.531444906444907, "grad_norm": 4.25741720199585, "learning_rate": 2.7043309384970905e-05, "loss": 0.1204, "num_input_tokens_seen": 3649336, "step": 40525 }, { "epoch": 10.532744282744282, "grad_norm": 3.4923744201660156, "learning_rate": 2.7037658702517883e-05, "loss": 0.237, "num_input_tokens_seen": 3649800, "step": 40530 }, { "epoch": 10.53404365904366, "grad_norm": 1.2606621980667114, "learning_rate": 2.703200791526611e-05, "loss": 0.1738, "num_input_tokens_seen": 3650232, "step": 40535 }, { "epoch": 10.535343035343036, "grad_norm": 5.37444543838501, "learning_rate": 2.7026357023506233e-05, "loss": 0.4854, "num_input_tokens_seen": 3650664, "step": 40540 }, { "epoch": 10.536642411642411, "grad_norm": 0.07534883171319962, "learning_rate": 2.702070602752887e-05, "loss": 0.2885, "num_input_tokens_seen": 3651096, "step": 40545 }, { "epoch": 10.537941787941788, "grad_norm": 0.5078128576278687, "learning_rate": 2.7015054927624662e-05, "loss": 0.149, "num_input_tokens_seen": 3651528, "step": 40550 }, { "epoch": 10.539241164241163, "grad_norm": 2.8180723190307617, "learning_rate": 2.7009403724084235e-05, "loss": 0.1958, "num_input_tokens_seen": 3651992, "step": 40555 }, { "epoch": 10.54054054054054, "grad_norm": 1.0515562295913696, "learning_rate": 2.7003752417198264e-05, "loss": 0.5054, "num_input_tokens_seen": 3652472, "step": 40560 }, { "epoch": 10.541839916839917, "grad_norm": 0.342294842004776, "learning_rate": 2.6998101007257383e-05, "loss": 0.2722, "num_input_tokens_seen": 3652904, "step": 40565 }, { "epoch": 10.543139293139292, "grad_norm": 0.7470903992652893, "learning_rate": 2.699244949455225e-05, "loss": 0.1384, "num_input_tokens_seen": 3653336, "step": 40570 }, { "epoch": 10.54443866943867, "grad_norm": 0.4842716455459595, "learning_rate": 2.698679787937353e-05, "loss": 0.2026, "num_input_tokens_seen": 3653800, "step": 40575 }, { "epoch": 10.545738045738046, "grad_norm": 4.267524719238281, "learning_rate": 2.6981146162011894e-05, "loss": 0.1962, "num_input_tokens_seen": 3654248, "step": 40580 }, { "epoch": 10.547037422037421, "grad_norm": 0.43991297483444214, "learning_rate": 2.6975494342758025e-05, "loss": 0.1799, "num_input_tokens_seen": 3654680, "step": 40585 }, { "epoch": 10.548336798336798, "grad_norm": 0.17145466804504395, "learning_rate": 2.696984242190257e-05, "loss": 0.2132, "num_input_tokens_seen": 3655144, "step": 40590 }, { "epoch": 10.549636174636175, "grad_norm": 0.5839239358901978, "learning_rate": 2.6964190399736238e-05, "loss": 0.3562, "num_input_tokens_seen": 3655608, "step": 40595 }, { "epoch": 10.55093555093555, "grad_norm": 3.1972835063934326, "learning_rate": 2.695853827654971e-05, "loss": 0.3391, "num_input_tokens_seen": 3656024, "step": 40600 }, { "epoch": 10.552234927234927, "grad_norm": 1.3087680339813232, "learning_rate": 2.695288605263368e-05, "loss": 0.58, "num_input_tokens_seen": 3656488, "step": 40605 }, { "epoch": 10.553534303534304, "grad_norm": 3.657775402069092, "learning_rate": 2.6947233728278852e-05, "loss": 0.3303, "num_input_tokens_seen": 3656904, "step": 40610 }, { "epoch": 10.55483367983368, "grad_norm": 4.431763172149658, "learning_rate": 2.694158130377593e-05, "loss": 0.4617, "num_input_tokens_seen": 3657336, "step": 40615 }, { "epoch": 10.556133056133056, "grad_norm": 2.5016205310821533, "learning_rate": 2.6935928779415626e-05, "loss": 0.3564, "num_input_tokens_seen": 3657800, "step": 40620 }, { "epoch": 10.557432432432432, "grad_norm": 2.46342396736145, "learning_rate": 2.693027615548864e-05, "loss": 0.3424, "num_input_tokens_seen": 3658264, "step": 40625 }, { "epoch": 10.558731808731808, "grad_norm": 3.4937126636505127, "learning_rate": 2.6924623432285707e-05, "loss": 0.3312, "num_input_tokens_seen": 3658712, "step": 40630 }, { "epoch": 10.560031185031185, "grad_norm": 2.203035593032837, "learning_rate": 2.6918970610097543e-05, "loss": 0.2136, "num_input_tokens_seen": 3659176, "step": 40635 }, { "epoch": 10.56133056133056, "grad_norm": 3.21458101272583, "learning_rate": 2.691331768921489e-05, "loss": 0.2753, "num_input_tokens_seen": 3659608, "step": 40640 }, { "epoch": 10.562629937629938, "grad_norm": 1.704566240310669, "learning_rate": 2.6907664669928463e-05, "loss": 0.2391, "num_input_tokens_seen": 3660024, "step": 40645 }, { "epoch": 10.563929313929314, "grad_norm": 3.506216287612915, "learning_rate": 2.690201155252903e-05, "loss": 0.2633, "num_input_tokens_seen": 3660520, "step": 40650 }, { "epoch": 10.56522869022869, "grad_norm": 5.299206256866455, "learning_rate": 2.689635833730731e-05, "loss": 0.2373, "num_input_tokens_seen": 3660952, "step": 40655 }, { "epoch": 10.566528066528067, "grad_norm": 2.48272967338562, "learning_rate": 2.689070502455406e-05, "loss": 0.1764, "num_input_tokens_seen": 3661384, "step": 40660 }, { "epoch": 10.567827442827443, "grad_norm": 2.441380262374878, "learning_rate": 2.6885051614560042e-05, "loss": 0.3138, "num_input_tokens_seen": 3661800, "step": 40665 }, { "epoch": 10.569126819126819, "grad_norm": 4.282463550567627, "learning_rate": 2.6879398107616017e-05, "loss": 0.4197, "num_input_tokens_seen": 3662232, "step": 40670 }, { "epoch": 10.570426195426196, "grad_norm": 2.512326955795288, "learning_rate": 2.6873744504012742e-05, "loss": 0.2777, "num_input_tokens_seen": 3662712, "step": 40675 }, { "epoch": 10.571725571725572, "grad_norm": 0.9857288599014282, "learning_rate": 2.6868090804040998e-05, "loss": 0.2755, "num_input_tokens_seen": 3663144, "step": 40680 }, { "epoch": 10.573024948024948, "grad_norm": 5.228057384490967, "learning_rate": 2.686243700799155e-05, "loss": 0.2467, "num_input_tokens_seen": 3663592, "step": 40685 }, { "epoch": 10.574324324324325, "grad_norm": 2.7524843215942383, "learning_rate": 2.6856783116155183e-05, "loss": 0.1522, "num_input_tokens_seen": 3664040, "step": 40690 }, { "epoch": 10.575623700623701, "grad_norm": 1.9212795495986938, "learning_rate": 2.6851129128822677e-05, "loss": 0.244, "num_input_tokens_seen": 3664456, "step": 40695 }, { "epoch": 10.576923076923077, "grad_norm": 0.6485753655433655, "learning_rate": 2.684547504628483e-05, "loss": 0.1941, "num_input_tokens_seen": 3664888, "step": 40700 }, { "epoch": 10.578222453222454, "grad_norm": 3.8307459354400635, "learning_rate": 2.6839820868832433e-05, "loss": 0.2017, "num_input_tokens_seen": 3665320, "step": 40705 }, { "epoch": 10.579521829521829, "grad_norm": 0.9191867709159851, "learning_rate": 2.683416659675629e-05, "loss": 0.2896, "num_input_tokens_seen": 3665784, "step": 40710 }, { "epoch": 10.580821205821206, "grad_norm": 7.933493137359619, "learning_rate": 2.6828512230347197e-05, "loss": 0.3017, "num_input_tokens_seen": 3666216, "step": 40715 }, { "epoch": 10.582120582120583, "grad_norm": 4.103466510772705, "learning_rate": 2.682285776989597e-05, "loss": 0.3085, "num_input_tokens_seen": 3666728, "step": 40720 }, { "epoch": 10.583419958419958, "grad_norm": 4.906108856201172, "learning_rate": 2.681720321569342e-05, "loss": 0.354, "num_input_tokens_seen": 3667176, "step": 40725 }, { "epoch": 10.584719334719335, "grad_norm": 0.5156373977661133, "learning_rate": 2.6811548568030364e-05, "loss": 0.3543, "num_input_tokens_seen": 3667640, "step": 40730 }, { "epoch": 10.586018711018712, "grad_norm": 3.804900884628296, "learning_rate": 2.6805893827197632e-05, "loss": 0.2519, "num_input_tokens_seen": 3668104, "step": 40735 }, { "epoch": 10.587318087318087, "grad_norm": 3.0375537872314453, "learning_rate": 2.680023899348605e-05, "loss": 0.2869, "num_input_tokens_seen": 3668584, "step": 40740 }, { "epoch": 10.588617463617464, "grad_norm": 1.2958872318267822, "learning_rate": 2.6794584067186456e-05, "loss": 0.2221, "num_input_tokens_seen": 3669016, "step": 40745 }, { "epoch": 10.58991683991684, "grad_norm": 6.413658142089844, "learning_rate": 2.6788929048589672e-05, "loss": 0.2584, "num_input_tokens_seen": 3669480, "step": 40750 }, { "epoch": 10.591216216216216, "grad_norm": 0.8546732664108276, "learning_rate": 2.6783273937986563e-05, "loss": 0.1872, "num_input_tokens_seen": 3669928, "step": 40755 }, { "epoch": 10.592515592515593, "grad_norm": 1.2942314147949219, "learning_rate": 2.677761873566797e-05, "loss": 0.1528, "num_input_tokens_seen": 3670392, "step": 40760 }, { "epoch": 10.59381496881497, "grad_norm": 4.415756702423096, "learning_rate": 2.6771963441924735e-05, "loss": 0.2749, "num_input_tokens_seen": 3670856, "step": 40765 }, { "epoch": 10.595114345114345, "grad_norm": 4.404716491699219, "learning_rate": 2.6766308057047723e-05, "loss": 0.3316, "num_input_tokens_seen": 3671336, "step": 40770 }, { "epoch": 10.596413721413722, "grad_norm": 1.3819944858551025, "learning_rate": 2.67606525813278e-05, "loss": 0.3921, "num_input_tokens_seen": 3671784, "step": 40775 }, { "epoch": 10.597713097713097, "grad_norm": 4.52131462097168, "learning_rate": 2.6754997015055827e-05, "loss": 0.2873, "num_input_tokens_seen": 3672216, "step": 40780 }, { "epoch": 10.599012474012474, "grad_norm": 2.224318265914917, "learning_rate": 2.6749341358522674e-05, "loss": 0.1295, "num_input_tokens_seen": 3672696, "step": 40785 }, { "epoch": 10.60031185031185, "grad_norm": 1.1570584774017334, "learning_rate": 2.6743685612019216e-05, "loss": 0.279, "num_input_tokens_seen": 3673112, "step": 40790 }, { "epoch": 10.601611226611226, "grad_norm": 3.2736353874206543, "learning_rate": 2.673802977583634e-05, "loss": 0.3699, "num_input_tokens_seen": 3673592, "step": 40795 }, { "epoch": 10.602910602910603, "grad_norm": 2.7479231357574463, "learning_rate": 2.673237385026493e-05, "loss": 0.2096, "num_input_tokens_seen": 3674024, "step": 40800 }, { "epoch": 10.60420997920998, "grad_norm": 0.8595014810562134, "learning_rate": 2.672671783559586e-05, "loss": 0.1733, "num_input_tokens_seen": 3674504, "step": 40805 }, { "epoch": 10.605509355509355, "grad_norm": 2.5080292224884033, "learning_rate": 2.672106173212005e-05, "loss": 0.1608, "num_input_tokens_seen": 3674968, "step": 40810 }, { "epoch": 10.606808731808732, "grad_norm": 0.13978461921215057, "learning_rate": 2.6715405540128386e-05, "loss": 0.2907, "num_input_tokens_seen": 3675384, "step": 40815 }, { "epoch": 10.608108108108109, "grad_norm": 2.290977716445923, "learning_rate": 2.6709749259911765e-05, "loss": 0.3663, "num_input_tokens_seen": 3675816, "step": 40820 }, { "epoch": 10.609407484407484, "grad_norm": 3.711747169494629, "learning_rate": 2.67040928917611e-05, "loss": 0.4187, "num_input_tokens_seen": 3676280, "step": 40825 }, { "epoch": 10.61070686070686, "grad_norm": 2.4847288131713867, "learning_rate": 2.6698436435967313e-05, "loss": 0.3244, "num_input_tokens_seen": 3676712, "step": 40830 }, { "epoch": 10.612006237006238, "grad_norm": 4.7850189208984375, "learning_rate": 2.6692779892821308e-05, "loss": 0.1941, "num_input_tokens_seen": 3677160, "step": 40835 }, { "epoch": 10.613305613305613, "grad_norm": 2.3604443073272705, "learning_rate": 2.6687123262614007e-05, "loss": 0.2865, "num_input_tokens_seen": 3677640, "step": 40840 }, { "epoch": 10.61460498960499, "grad_norm": 2.277341842651367, "learning_rate": 2.6681466545636353e-05, "loss": 0.2472, "num_input_tokens_seen": 3678088, "step": 40845 }, { "epoch": 10.615904365904367, "grad_norm": 3.9017257690429688, "learning_rate": 2.6675809742179255e-05, "loss": 0.3439, "num_input_tokens_seen": 3678520, "step": 40850 }, { "epoch": 10.617203742203742, "grad_norm": 4.459859371185303, "learning_rate": 2.6670152852533653e-05, "loss": 0.3742, "num_input_tokens_seen": 3678984, "step": 40855 }, { "epoch": 10.618503118503119, "grad_norm": 3.4206225872039795, "learning_rate": 2.666449587699049e-05, "loss": 0.3302, "num_input_tokens_seen": 3679448, "step": 40860 }, { "epoch": 10.619802494802494, "grad_norm": 3.263131618499756, "learning_rate": 2.665883881584072e-05, "loss": 0.2538, "num_input_tokens_seen": 3679896, "step": 40865 }, { "epoch": 10.621101871101871, "grad_norm": 2.1358277797698975, "learning_rate": 2.665318166937527e-05, "loss": 0.1979, "num_input_tokens_seen": 3680328, "step": 40870 }, { "epoch": 10.622401247401248, "grad_norm": 0.9328548908233643, "learning_rate": 2.66475244378851e-05, "loss": 0.1496, "num_input_tokens_seen": 3680776, "step": 40875 }, { "epoch": 10.623700623700623, "grad_norm": 4.599078178405762, "learning_rate": 2.6641867121661178e-05, "loss": 0.2246, "num_input_tokens_seen": 3681240, "step": 40880 }, { "epoch": 10.625, "grad_norm": 2.5189321041107178, "learning_rate": 2.6636209720994454e-05, "loss": 0.196, "num_input_tokens_seen": 3681672, "step": 40885 }, { "epoch": 10.626299376299377, "grad_norm": 4.275198459625244, "learning_rate": 2.6630552236175897e-05, "loss": 0.1787, "num_input_tokens_seen": 3682168, "step": 40890 }, { "epoch": 10.627598752598752, "grad_norm": 1.331957221031189, "learning_rate": 2.6624894667496474e-05, "loss": 0.1191, "num_input_tokens_seen": 3682616, "step": 40895 }, { "epoch": 10.628898128898129, "grad_norm": 1.0441665649414062, "learning_rate": 2.661923701524716e-05, "loss": 0.2493, "num_input_tokens_seen": 3683112, "step": 40900 }, { "epoch": 10.630197505197506, "grad_norm": 4.859422206878662, "learning_rate": 2.661357927971894e-05, "loss": 0.485, "num_input_tokens_seen": 3683560, "step": 40905 }, { "epoch": 10.631496881496881, "grad_norm": 1.659424066543579, "learning_rate": 2.660792146120279e-05, "loss": 0.5456, "num_input_tokens_seen": 3684024, "step": 40910 }, { "epoch": 10.632796257796258, "grad_norm": 1.1394233703613281, "learning_rate": 2.6602263559989697e-05, "loss": 0.0745, "num_input_tokens_seen": 3684472, "step": 40915 }, { "epoch": 10.634095634095633, "grad_norm": 3.028966188430786, "learning_rate": 2.659660557637066e-05, "loss": 0.3342, "num_input_tokens_seen": 3684952, "step": 40920 }, { "epoch": 10.63539501039501, "grad_norm": 0.3522202968597412, "learning_rate": 2.659094751063666e-05, "loss": 0.255, "num_input_tokens_seen": 3685400, "step": 40925 }, { "epoch": 10.636694386694387, "grad_norm": 2.314598560333252, "learning_rate": 2.658528936307871e-05, "loss": 0.3075, "num_input_tokens_seen": 3685848, "step": 40930 }, { "epoch": 10.637993762993762, "grad_norm": 1.2179354429244995, "learning_rate": 2.6579631133987802e-05, "loss": 0.3329, "num_input_tokens_seen": 3686312, "step": 40935 }, { "epoch": 10.63929313929314, "grad_norm": 1.678728461265564, "learning_rate": 2.6573972823654957e-05, "loss": 0.1973, "num_input_tokens_seen": 3686776, "step": 40940 }, { "epoch": 10.640592515592516, "grad_norm": 2.4357528686523438, "learning_rate": 2.6568314432371183e-05, "loss": 0.2118, "num_input_tokens_seen": 3687240, "step": 40945 }, { "epoch": 10.641891891891891, "grad_norm": 2.182910203933716, "learning_rate": 2.656265596042749e-05, "loss": 0.2161, "num_input_tokens_seen": 3687720, "step": 40950 }, { "epoch": 10.643191268191268, "grad_norm": 1.8477643728256226, "learning_rate": 2.655699740811491e-05, "loss": 0.2375, "num_input_tokens_seen": 3688168, "step": 40955 }, { "epoch": 10.644490644490645, "grad_norm": 3.499523401260376, "learning_rate": 2.655133877572446e-05, "loss": 0.2506, "num_input_tokens_seen": 3688584, "step": 40960 }, { "epoch": 10.64579002079002, "grad_norm": 1.8425272703170776, "learning_rate": 2.6545680063547164e-05, "loss": 0.1825, "num_input_tokens_seen": 3689032, "step": 40965 }, { "epoch": 10.647089397089397, "grad_norm": 4.1941447257995605, "learning_rate": 2.6540021271874067e-05, "loss": 0.345, "num_input_tokens_seen": 3689464, "step": 40970 }, { "epoch": 10.648388773388774, "grad_norm": 0.9299718737602234, "learning_rate": 2.65343624009962e-05, "loss": 0.2134, "num_input_tokens_seen": 3689928, "step": 40975 }, { "epoch": 10.64968814968815, "grad_norm": 2.167616128921509, "learning_rate": 2.6528703451204606e-05, "loss": 0.1074, "num_input_tokens_seen": 3690376, "step": 40980 }, { "epoch": 10.650987525987526, "grad_norm": 1.154989242553711, "learning_rate": 2.6523044422790326e-05, "loss": 0.1011, "num_input_tokens_seen": 3690856, "step": 40985 }, { "epoch": 10.652286902286903, "grad_norm": 3.450000762939453, "learning_rate": 2.6517385316044412e-05, "loss": 0.4082, "num_input_tokens_seen": 3691272, "step": 40990 }, { "epoch": 10.653586278586278, "grad_norm": 0.7875811457633972, "learning_rate": 2.651172613125792e-05, "loss": 0.3788, "num_input_tokens_seen": 3691720, "step": 40995 }, { "epoch": 10.654885654885655, "grad_norm": 2.3536458015441895, "learning_rate": 2.6506066868721897e-05, "loss": 0.1997, "num_input_tokens_seen": 3692184, "step": 41000 }, { "epoch": 10.65618503118503, "grad_norm": 0.27984902262687683, "learning_rate": 2.6500407528727422e-05, "loss": 0.1067, "num_input_tokens_seen": 3692632, "step": 41005 }, { "epoch": 10.657484407484407, "grad_norm": 2.9485347270965576, "learning_rate": 2.6494748111565542e-05, "loss": 0.5089, "num_input_tokens_seen": 3693112, "step": 41010 }, { "epoch": 10.658783783783784, "grad_norm": 0.15260197222232819, "learning_rate": 2.648908861752734e-05, "loss": 0.1548, "num_input_tokens_seen": 3693544, "step": 41015 }, { "epoch": 10.66008316008316, "grad_norm": 4.3763227462768555, "learning_rate": 2.648342904690388e-05, "loss": 0.3495, "num_input_tokens_seen": 3693976, "step": 41020 }, { "epoch": 10.661382536382536, "grad_norm": 7.7514495849609375, "learning_rate": 2.6477769399986245e-05, "loss": 0.2784, "num_input_tokens_seen": 3694408, "step": 41025 }, { "epoch": 10.662681912681913, "grad_norm": 3.5293281078338623, "learning_rate": 2.6472109677065515e-05, "loss": 0.2061, "num_input_tokens_seen": 3694888, "step": 41030 }, { "epoch": 10.663981288981288, "grad_norm": 4.257354736328125, "learning_rate": 2.6466449878432776e-05, "loss": 0.3434, "num_input_tokens_seen": 3695320, "step": 41035 }, { "epoch": 10.665280665280665, "grad_norm": 0.3910292088985443, "learning_rate": 2.6460790004379105e-05, "loss": 0.1755, "num_input_tokens_seen": 3695800, "step": 41040 }, { "epoch": 10.666580041580042, "grad_norm": 3.069242000579834, "learning_rate": 2.6455130055195613e-05, "loss": 0.1522, "num_input_tokens_seen": 3696280, "step": 41045 }, { "epoch": 10.667879417879417, "grad_norm": 3.3504221439361572, "learning_rate": 2.644947003117339e-05, "loss": 0.2549, "num_input_tokens_seen": 3696696, "step": 41050 }, { "epoch": 10.669178794178794, "grad_norm": 0.40971678495407104, "learning_rate": 2.6443809932603526e-05, "loss": 0.2144, "num_input_tokens_seen": 3697160, "step": 41055 }, { "epoch": 10.670478170478171, "grad_norm": 0.6102396845817566, "learning_rate": 2.6438149759777137e-05, "loss": 0.3069, "num_input_tokens_seen": 3697640, "step": 41060 }, { "epoch": 10.671777546777546, "grad_norm": 0.2974849045276642, "learning_rate": 2.6432489512985326e-05, "loss": 0.2258, "num_input_tokens_seen": 3698104, "step": 41065 }, { "epoch": 10.673076923076923, "grad_norm": 0.6262699961662292, "learning_rate": 2.6426829192519213e-05, "loss": 0.3082, "num_input_tokens_seen": 3698552, "step": 41070 }, { "epoch": 10.674376299376299, "grad_norm": 0.8883246183395386, "learning_rate": 2.64211687986699e-05, "loss": 0.3958, "num_input_tokens_seen": 3699016, "step": 41075 }, { "epoch": 10.675675675675675, "grad_norm": 4.067606449127197, "learning_rate": 2.6415508331728517e-05, "loss": 0.2199, "num_input_tokens_seen": 3699448, "step": 41080 }, { "epoch": 10.676975051975052, "grad_norm": 3.6699819564819336, "learning_rate": 2.6409847791986188e-05, "loss": 0.3287, "num_input_tokens_seen": 3699864, "step": 41085 }, { "epoch": 10.678274428274428, "grad_norm": 1.7066603899002075, "learning_rate": 2.640418717973403e-05, "loss": 0.1422, "num_input_tokens_seen": 3700296, "step": 41090 }, { "epoch": 10.679573804573804, "grad_norm": 2.8954885005950928, "learning_rate": 2.6398526495263182e-05, "loss": 0.3029, "num_input_tokens_seen": 3700744, "step": 41095 }, { "epoch": 10.680873180873181, "grad_norm": 1.471684217453003, "learning_rate": 2.639286573886478e-05, "loss": 0.1805, "num_input_tokens_seen": 3701192, "step": 41100 }, { "epoch": 10.682172557172557, "grad_norm": 1.5426559448242188, "learning_rate": 2.6387204910829956e-05, "loss": 0.2094, "num_input_tokens_seen": 3701672, "step": 41105 }, { "epoch": 10.683471933471933, "grad_norm": 2.4812965393066406, "learning_rate": 2.6381544011449854e-05, "loss": 0.1966, "num_input_tokens_seen": 3702152, "step": 41110 }, { "epoch": 10.68477130977131, "grad_norm": 2.6491944789886475, "learning_rate": 2.637588304101562e-05, "loss": 0.275, "num_input_tokens_seen": 3702600, "step": 41115 }, { "epoch": 10.686070686070686, "grad_norm": 3.0671331882476807, "learning_rate": 2.6370221999818407e-05, "loss": 0.2625, "num_input_tokens_seen": 3703016, "step": 41120 }, { "epoch": 10.687370062370062, "grad_norm": 4.03106689453125, "learning_rate": 2.6364560888149352e-05, "loss": 0.441, "num_input_tokens_seen": 3703432, "step": 41125 }, { "epoch": 10.68866943866944, "grad_norm": 2.864074230194092, "learning_rate": 2.6358899706299633e-05, "loss": 0.2316, "num_input_tokens_seen": 3703864, "step": 41130 }, { "epoch": 10.689968814968815, "grad_norm": 0.8349457383155823, "learning_rate": 2.6353238454560398e-05, "loss": 0.3121, "num_input_tokens_seen": 3704312, "step": 41135 }, { "epoch": 10.691268191268192, "grad_norm": 1.222899079322815, "learning_rate": 2.634757713322281e-05, "loss": 0.2273, "num_input_tokens_seen": 3704760, "step": 41140 }, { "epoch": 10.692567567567568, "grad_norm": 0.3749253749847412, "learning_rate": 2.6341915742578037e-05, "loss": 0.1195, "num_input_tokens_seen": 3705192, "step": 41145 }, { "epoch": 10.693866943866944, "grad_norm": 1.579838514328003, "learning_rate": 2.633625428291726e-05, "loss": 0.131, "num_input_tokens_seen": 3705624, "step": 41150 }, { "epoch": 10.69516632016632, "grad_norm": 1.0469915866851807, "learning_rate": 2.633059275453164e-05, "loss": 0.138, "num_input_tokens_seen": 3706072, "step": 41155 }, { "epoch": 10.696465696465696, "grad_norm": 5.86428689956665, "learning_rate": 2.6324931157712362e-05, "loss": 0.4057, "num_input_tokens_seen": 3706536, "step": 41160 }, { "epoch": 10.697765072765073, "grad_norm": 5.900808811187744, "learning_rate": 2.6319269492750598e-05, "loss": 0.3112, "num_input_tokens_seen": 3706968, "step": 41165 }, { "epoch": 10.69906444906445, "grad_norm": 5.3768815994262695, "learning_rate": 2.6313607759937548e-05, "loss": 0.4447, "num_input_tokens_seen": 3707416, "step": 41170 }, { "epoch": 10.700363825363825, "grad_norm": 3.6634528636932373, "learning_rate": 2.6307945959564394e-05, "loss": 0.3515, "num_input_tokens_seen": 3707880, "step": 41175 }, { "epoch": 10.701663201663202, "grad_norm": 0.711354672908783, "learning_rate": 2.630228409192232e-05, "loss": 0.1655, "num_input_tokens_seen": 3708328, "step": 41180 }, { "epoch": 10.702962577962579, "grad_norm": 7.39969539642334, "learning_rate": 2.629662215730253e-05, "loss": 0.3803, "num_input_tokens_seen": 3708792, "step": 41185 }, { "epoch": 10.704261954261954, "grad_norm": 4.715324878692627, "learning_rate": 2.6290960155996218e-05, "loss": 0.2894, "num_input_tokens_seen": 3709240, "step": 41190 }, { "epoch": 10.70556133056133, "grad_norm": 1.8708082437515259, "learning_rate": 2.628529808829459e-05, "loss": 0.1067, "num_input_tokens_seen": 3709688, "step": 41195 }, { "epoch": 10.706860706860708, "grad_norm": 2.5347373485565186, "learning_rate": 2.6279635954488845e-05, "loss": 0.2556, "num_input_tokens_seen": 3710136, "step": 41200 }, { "epoch": 10.708160083160083, "grad_norm": 0.6210907697677612, "learning_rate": 2.627397375487021e-05, "loss": 0.141, "num_input_tokens_seen": 3710648, "step": 41205 }, { "epoch": 10.70945945945946, "grad_norm": 0.388741135597229, "learning_rate": 2.626831148972987e-05, "loss": 0.1675, "num_input_tokens_seen": 3711112, "step": 41210 }, { "epoch": 10.710758835758837, "grad_norm": 5.392002582550049, "learning_rate": 2.6262649159359053e-05, "loss": 0.3446, "num_input_tokens_seen": 3711576, "step": 41215 }, { "epoch": 10.712058212058212, "grad_norm": 1.7313891649246216, "learning_rate": 2.6256986764048992e-05, "loss": 0.3743, "num_input_tokens_seen": 3712040, "step": 41220 }, { "epoch": 10.713357588357589, "grad_norm": 0.2204097956418991, "learning_rate": 2.6251324304090892e-05, "loss": 0.3958, "num_input_tokens_seen": 3712520, "step": 41225 }, { "epoch": 10.714656964656964, "grad_norm": 1.0903986692428589, "learning_rate": 2.624566177977599e-05, "loss": 0.1549, "num_input_tokens_seen": 3712984, "step": 41230 }, { "epoch": 10.71595634095634, "grad_norm": 3.158831834793091, "learning_rate": 2.6239999191395494e-05, "loss": 0.183, "num_input_tokens_seen": 3713448, "step": 41235 }, { "epoch": 10.717255717255718, "grad_norm": 0.3855641782283783, "learning_rate": 2.623433653924067e-05, "loss": 0.1459, "num_input_tokens_seen": 3713896, "step": 41240 }, { "epoch": 10.718555093555093, "grad_norm": 0.3258817791938782, "learning_rate": 2.6228673823602723e-05, "loss": 0.1841, "num_input_tokens_seen": 3714360, "step": 41245 }, { "epoch": 10.71985446985447, "grad_norm": 5.4599409103393555, "learning_rate": 2.6223011044772904e-05, "loss": 0.3198, "num_input_tokens_seen": 3714856, "step": 41250 }, { "epoch": 10.721153846153847, "grad_norm": 2.7924301624298096, "learning_rate": 2.621734820304246e-05, "loss": 0.346, "num_input_tokens_seen": 3715320, "step": 41255 }, { "epoch": 10.722453222453222, "grad_norm": 0.387178897857666, "learning_rate": 2.6211685298702632e-05, "loss": 0.333, "num_input_tokens_seen": 3715768, "step": 41260 }, { "epoch": 10.723752598752599, "grad_norm": 3.579815626144409, "learning_rate": 2.6206022332044667e-05, "loss": 0.3361, "num_input_tokens_seen": 3716232, "step": 41265 }, { "epoch": 10.725051975051976, "grad_norm": 0.4243595004081726, "learning_rate": 2.620035930335981e-05, "loss": 0.119, "num_input_tokens_seen": 3716664, "step": 41270 }, { "epoch": 10.72635135135135, "grad_norm": 4.521749496459961, "learning_rate": 2.619469621293933e-05, "loss": 0.351, "num_input_tokens_seen": 3717112, "step": 41275 }, { "epoch": 10.727650727650728, "grad_norm": 1.3038663864135742, "learning_rate": 2.618903306107448e-05, "loss": 0.3359, "num_input_tokens_seen": 3717544, "step": 41280 }, { "epoch": 10.728950103950105, "grad_norm": 3.4838035106658936, "learning_rate": 2.618336984805652e-05, "loss": 0.2227, "num_input_tokens_seen": 3718008, "step": 41285 }, { "epoch": 10.73024948024948, "grad_norm": 1.2503200769424438, "learning_rate": 2.6177706574176714e-05, "loss": 0.1621, "num_input_tokens_seen": 3718456, "step": 41290 }, { "epoch": 10.731548856548857, "grad_norm": 0.3633861243724823, "learning_rate": 2.617204323972633e-05, "loss": 0.3098, "num_input_tokens_seen": 3718920, "step": 41295 }, { "epoch": 10.732848232848234, "grad_norm": 0.4502524435520172, "learning_rate": 2.6166379844996643e-05, "loss": 0.261, "num_input_tokens_seen": 3719320, "step": 41300 }, { "epoch": 10.734147609147609, "grad_norm": 0.47034773230552673, "learning_rate": 2.6160716390278923e-05, "loss": 0.1724, "num_input_tokens_seen": 3719720, "step": 41305 }, { "epoch": 10.735446985446986, "grad_norm": 4.364715576171875, "learning_rate": 2.6155052875864443e-05, "loss": 0.4872, "num_input_tokens_seen": 3720216, "step": 41310 }, { "epoch": 10.736746361746361, "grad_norm": 3.170379400253296, "learning_rate": 2.6149389302044492e-05, "loss": 0.4024, "num_input_tokens_seen": 3720664, "step": 41315 }, { "epoch": 10.738045738045738, "grad_norm": 1.328641414642334, "learning_rate": 2.6143725669110343e-05, "loss": 0.1117, "num_input_tokens_seen": 3721128, "step": 41320 }, { "epoch": 10.739345114345115, "grad_norm": 0.5185123085975647, "learning_rate": 2.6138061977353286e-05, "loss": 0.3085, "num_input_tokens_seen": 3721560, "step": 41325 }, { "epoch": 10.74064449064449, "grad_norm": 1.235748291015625, "learning_rate": 2.6132398227064615e-05, "loss": 0.2591, "num_input_tokens_seen": 3722024, "step": 41330 }, { "epoch": 10.741943866943867, "grad_norm": 2.493290424346924, "learning_rate": 2.6126734418535613e-05, "loss": 0.4276, "num_input_tokens_seen": 3722456, "step": 41335 }, { "epoch": 10.743243243243244, "grad_norm": 1.689613938331604, "learning_rate": 2.612107055205758e-05, "loss": 0.2532, "num_input_tokens_seen": 3722920, "step": 41340 }, { "epoch": 10.744542619542619, "grad_norm": 2.561589241027832, "learning_rate": 2.6115406627921825e-05, "loss": 0.1553, "num_input_tokens_seen": 3723336, "step": 41345 }, { "epoch": 10.745841995841996, "grad_norm": 4.305463790893555, "learning_rate": 2.6109742646419628e-05, "loss": 0.3704, "num_input_tokens_seen": 3723816, "step": 41350 }, { "epoch": 10.747141372141373, "grad_norm": 0.5018516778945923, "learning_rate": 2.6104078607842308e-05, "loss": 0.4383, "num_input_tokens_seen": 3724280, "step": 41355 }, { "epoch": 10.748440748440748, "grad_norm": 2.5362765789031982, "learning_rate": 2.6098414512481163e-05, "loss": 0.4236, "num_input_tokens_seen": 3724712, "step": 41360 }, { "epoch": 10.749740124740125, "grad_norm": 1.415175437927246, "learning_rate": 2.609275036062751e-05, "loss": 0.2805, "num_input_tokens_seen": 3725160, "step": 41365 }, { "epoch": 10.7510395010395, "grad_norm": 1.124493956565857, "learning_rate": 2.6087086152572665e-05, "loss": 0.1197, "num_input_tokens_seen": 3725624, "step": 41370 }, { "epoch": 10.752338877338877, "grad_norm": 1.0475510358810425, "learning_rate": 2.6081421888607928e-05, "loss": 0.2885, "num_input_tokens_seen": 3726056, "step": 41375 }, { "epoch": 10.753638253638254, "grad_norm": 3.62178373336792, "learning_rate": 2.6075757569024633e-05, "loss": 0.3565, "num_input_tokens_seen": 3726504, "step": 41380 }, { "epoch": 10.75493762993763, "grad_norm": 1.9319261312484741, "learning_rate": 2.6070093194114094e-05, "loss": 0.1425, "num_input_tokens_seen": 3726952, "step": 41385 }, { "epoch": 10.756237006237006, "grad_norm": 0.46475735306739807, "learning_rate": 2.6064428764167637e-05, "loss": 0.184, "num_input_tokens_seen": 3727384, "step": 41390 }, { "epoch": 10.757536382536383, "grad_norm": 1.5638651847839355, "learning_rate": 2.6058764279476583e-05, "loss": 0.3493, "num_input_tokens_seen": 3727816, "step": 41395 }, { "epoch": 10.758835758835758, "grad_norm": 5.219425678253174, "learning_rate": 2.6053099740332275e-05, "loss": 0.4299, "num_input_tokens_seen": 3728280, "step": 41400 }, { "epoch": 10.760135135135135, "grad_norm": 1.9685336351394653, "learning_rate": 2.6047435147026034e-05, "loss": 0.1927, "num_input_tokens_seen": 3728712, "step": 41405 }, { "epoch": 10.761434511434512, "grad_norm": 1.480846643447876, "learning_rate": 2.6041770499849206e-05, "loss": 0.1572, "num_input_tokens_seen": 3729176, "step": 41410 }, { "epoch": 10.762733887733887, "grad_norm": 2.3841307163238525, "learning_rate": 2.6036105799093112e-05, "loss": 0.3107, "num_input_tokens_seen": 3729624, "step": 41415 }, { "epoch": 10.764033264033264, "grad_norm": 3.100611925125122, "learning_rate": 2.6030441045049115e-05, "loss": 0.2595, "num_input_tokens_seen": 3730072, "step": 41420 }, { "epoch": 10.765332640332641, "grad_norm": 3.481851816177368, "learning_rate": 2.6024776238008543e-05, "loss": 0.247, "num_input_tokens_seen": 3730488, "step": 41425 }, { "epoch": 10.766632016632016, "grad_norm": 1.057996392250061, "learning_rate": 2.6019111378262745e-05, "loss": 0.1761, "num_input_tokens_seen": 3730952, "step": 41430 }, { "epoch": 10.767931392931393, "grad_norm": 2.2143566608428955, "learning_rate": 2.601344646610308e-05, "loss": 0.2613, "num_input_tokens_seen": 3731400, "step": 41435 }, { "epoch": 10.76923076923077, "grad_norm": 1.130173921585083, "learning_rate": 2.600778150182089e-05, "loss": 0.2952, "num_input_tokens_seen": 3731832, "step": 41440 }, { "epoch": 10.770530145530145, "grad_norm": 2.0011980533599854, "learning_rate": 2.600211648570753e-05, "loss": 0.3554, "num_input_tokens_seen": 3732312, "step": 41445 }, { "epoch": 10.771829521829522, "grad_norm": 2.5513625144958496, "learning_rate": 2.599645141805435e-05, "loss": 0.2953, "num_input_tokens_seen": 3732792, "step": 41450 }, { "epoch": 10.773128898128899, "grad_norm": 0.3091510534286499, "learning_rate": 2.5990786299152725e-05, "loss": 0.138, "num_input_tokens_seen": 3733256, "step": 41455 }, { "epoch": 10.774428274428274, "grad_norm": 2.7230656147003174, "learning_rate": 2.5985121129294016e-05, "loss": 0.1615, "num_input_tokens_seen": 3733704, "step": 41460 }, { "epoch": 10.775727650727651, "grad_norm": 4.297686576843262, "learning_rate": 2.597945590876958e-05, "loss": 0.3215, "num_input_tokens_seen": 3734152, "step": 41465 }, { "epoch": 10.777027027027026, "grad_norm": 1.694417119026184, "learning_rate": 2.5973790637870786e-05, "loss": 0.2878, "num_input_tokens_seen": 3734584, "step": 41470 }, { "epoch": 10.778326403326403, "grad_norm": 1.8671047687530518, "learning_rate": 2.596812531688901e-05, "loss": 0.3382, "num_input_tokens_seen": 3735032, "step": 41475 }, { "epoch": 10.77962577962578, "grad_norm": 0.7586629986763, "learning_rate": 2.5962459946115618e-05, "loss": 0.0829, "num_input_tokens_seen": 3735464, "step": 41480 }, { "epoch": 10.780925155925155, "grad_norm": 4.125550746917725, "learning_rate": 2.5956794525841986e-05, "loss": 0.2142, "num_input_tokens_seen": 3735912, "step": 41485 }, { "epoch": 10.782224532224532, "grad_norm": 4.470313549041748, "learning_rate": 2.59511290563595e-05, "loss": 0.4263, "num_input_tokens_seen": 3736344, "step": 41490 }, { "epoch": 10.78352390852391, "grad_norm": 3.734752655029297, "learning_rate": 2.5945463537959542e-05, "loss": 0.2276, "num_input_tokens_seen": 3736808, "step": 41495 }, { "epoch": 10.784823284823284, "grad_norm": 4.393395900726318, "learning_rate": 2.593979797093348e-05, "loss": 0.3948, "num_input_tokens_seen": 3737224, "step": 41500 }, { "epoch": 10.786122661122661, "grad_norm": 0.9700783491134644, "learning_rate": 2.593413235557271e-05, "loss": 0.219, "num_input_tokens_seen": 3737688, "step": 41505 }, { "epoch": 10.787422037422038, "grad_norm": 2.113365650177002, "learning_rate": 2.5928466692168617e-05, "loss": 0.1955, "num_input_tokens_seen": 3738136, "step": 41510 }, { "epoch": 10.788721413721413, "grad_norm": 2.1372129917144775, "learning_rate": 2.5922800981012596e-05, "loss": 0.2954, "num_input_tokens_seen": 3738584, "step": 41515 }, { "epoch": 10.79002079002079, "grad_norm": 1.5674564838409424, "learning_rate": 2.5917135222396027e-05, "loss": 0.339, "num_input_tokens_seen": 3739048, "step": 41520 }, { "epoch": 10.791320166320165, "grad_norm": 3.0339083671569824, "learning_rate": 2.5911469416610322e-05, "loss": 0.2856, "num_input_tokens_seen": 3739496, "step": 41525 }, { "epoch": 10.792619542619542, "grad_norm": 1.2158023118972778, "learning_rate": 2.5905803563946872e-05, "loss": 0.3309, "num_input_tokens_seen": 3739944, "step": 41530 }, { "epoch": 10.79391891891892, "grad_norm": 2.5460703372955322, "learning_rate": 2.5900137664697078e-05, "loss": 0.2589, "num_input_tokens_seen": 3740408, "step": 41535 }, { "epoch": 10.795218295218294, "grad_norm": 0.6449465751647949, "learning_rate": 2.5894471719152336e-05, "loss": 0.1597, "num_input_tokens_seen": 3740872, "step": 41540 }, { "epoch": 10.796517671517671, "grad_norm": 2.8521478176116943, "learning_rate": 2.588880572760406e-05, "loss": 0.1506, "num_input_tokens_seen": 3741384, "step": 41545 }, { "epoch": 10.797817047817048, "grad_norm": 1.2783311605453491, "learning_rate": 2.5883139690343656e-05, "loss": 0.1747, "num_input_tokens_seen": 3741832, "step": 41550 }, { "epoch": 10.799116424116423, "grad_norm": 2.6975831985473633, "learning_rate": 2.5877473607662528e-05, "loss": 0.2608, "num_input_tokens_seen": 3742280, "step": 41555 }, { "epoch": 10.8004158004158, "grad_norm": 3.1944241523742676, "learning_rate": 2.5871807479852084e-05, "loss": 0.2188, "num_input_tokens_seen": 3742728, "step": 41560 }, { "epoch": 10.801715176715177, "grad_norm": 0.8782366514205933, "learning_rate": 2.586614130720376e-05, "loss": 0.426, "num_input_tokens_seen": 3743192, "step": 41565 }, { "epoch": 10.803014553014552, "grad_norm": 0.41332095861434937, "learning_rate": 2.5860475090008956e-05, "loss": 0.2403, "num_input_tokens_seen": 3743640, "step": 41570 }, { "epoch": 10.80431392931393, "grad_norm": 3.1463842391967773, "learning_rate": 2.5854808828559085e-05, "loss": 0.2658, "num_input_tokens_seen": 3744104, "step": 41575 }, { "epoch": 10.805613305613306, "grad_norm": 0.674805223941803, "learning_rate": 2.5849142523145588e-05, "loss": 0.2026, "num_input_tokens_seen": 3744552, "step": 41580 }, { "epoch": 10.806912681912682, "grad_norm": 1.165733814239502, "learning_rate": 2.5843476174059872e-05, "loss": 0.2938, "num_input_tokens_seen": 3745000, "step": 41585 }, { "epoch": 10.808212058212058, "grad_norm": 3.249505043029785, "learning_rate": 2.5837809781593357e-05, "loss": 0.2103, "num_input_tokens_seen": 3745464, "step": 41590 }, { "epoch": 10.809511434511435, "grad_norm": 2.13677978515625, "learning_rate": 2.5832143346037496e-05, "loss": 0.2398, "num_input_tokens_seen": 3745880, "step": 41595 }, { "epoch": 10.81081081081081, "grad_norm": 2.3529326915740967, "learning_rate": 2.5826476867683707e-05, "loss": 0.2341, "num_input_tokens_seen": 3746312, "step": 41600 }, { "epoch": 10.812110187110187, "grad_norm": 0.8703547120094299, "learning_rate": 2.5820810346823416e-05, "loss": 0.1397, "num_input_tokens_seen": 3746808, "step": 41605 }, { "epoch": 10.813409563409563, "grad_norm": 7.297679901123047, "learning_rate": 2.5815143783748057e-05, "loss": 0.2549, "num_input_tokens_seen": 3747272, "step": 41610 }, { "epoch": 10.81470893970894, "grad_norm": 0.5213196277618408, "learning_rate": 2.580947717874908e-05, "loss": 0.0944, "num_input_tokens_seen": 3747720, "step": 41615 }, { "epoch": 10.816008316008316, "grad_norm": 5.3246893882751465, "learning_rate": 2.5803810532117918e-05, "loss": 0.4094, "num_input_tokens_seen": 3748152, "step": 41620 }, { "epoch": 10.817307692307692, "grad_norm": 5.448361873626709, "learning_rate": 2.5798143844146005e-05, "loss": 0.128, "num_input_tokens_seen": 3748616, "step": 41625 }, { "epoch": 10.818607068607069, "grad_norm": 2.9471282958984375, "learning_rate": 2.5792477115124793e-05, "loss": 0.3487, "num_input_tokens_seen": 3749064, "step": 41630 }, { "epoch": 10.819906444906445, "grad_norm": 3.310640573501587, "learning_rate": 2.578681034534572e-05, "loss": 0.3538, "num_input_tokens_seen": 3749496, "step": 41635 }, { "epoch": 10.82120582120582, "grad_norm": 2.879662036895752, "learning_rate": 2.5781143535100237e-05, "loss": 0.0902, "num_input_tokens_seen": 3749928, "step": 41640 }, { "epoch": 10.822505197505198, "grad_norm": 4.722325801849365, "learning_rate": 2.5775476684679796e-05, "loss": 0.3034, "num_input_tokens_seen": 3750424, "step": 41645 }, { "epoch": 10.823804573804575, "grad_norm": 4.617786407470703, "learning_rate": 2.5769809794375843e-05, "loss": 0.2922, "num_input_tokens_seen": 3750872, "step": 41650 }, { "epoch": 10.82510395010395, "grad_norm": 0.31235411763191223, "learning_rate": 2.5764142864479835e-05, "loss": 0.1094, "num_input_tokens_seen": 3751352, "step": 41655 }, { "epoch": 10.826403326403327, "grad_norm": 4.002870082855225, "learning_rate": 2.575847589528323e-05, "loss": 0.213, "num_input_tokens_seen": 3751800, "step": 41660 }, { "epoch": 10.827702702702704, "grad_norm": 2.7193005084991455, "learning_rate": 2.5752808887077477e-05, "loss": 0.3266, "num_input_tokens_seen": 3752248, "step": 41665 }, { "epoch": 10.829002079002079, "grad_norm": 2.8432517051696777, "learning_rate": 2.574714184015405e-05, "loss": 0.3373, "num_input_tokens_seen": 3752712, "step": 41670 }, { "epoch": 10.830301455301456, "grad_norm": 2.532999038696289, "learning_rate": 2.57414747548044e-05, "loss": 0.1192, "num_input_tokens_seen": 3753224, "step": 41675 }, { "epoch": 10.83160083160083, "grad_norm": 0.3918384313583374, "learning_rate": 2.5735807631319993e-05, "loss": 0.0735, "num_input_tokens_seen": 3753656, "step": 41680 }, { "epoch": 10.832900207900208, "grad_norm": 5.851533889770508, "learning_rate": 2.5730140469992286e-05, "loss": 0.3347, "num_input_tokens_seen": 3754104, "step": 41685 }, { "epoch": 10.834199584199585, "grad_norm": 0.12931567430496216, "learning_rate": 2.5724473271112763e-05, "loss": 0.1139, "num_input_tokens_seen": 3754536, "step": 41690 }, { "epoch": 10.83549896049896, "grad_norm": 0.9723628163337708, "learning_rate": 2.571880603497289e-05, "loss": 0.2673, "num_input_tokens_seen": 3755032, "step": 41695 }, { "epoch": 10.836798336798337, "grad_norm": 0.36615145206451416, "learning_rate": 2.5713138761864127e-05, "loss": 0.4192, "num_input_tokens_seen": 3755432, "step": 41700 }, { "epoch": 10.838097713097714, "grad_norm": 3.5204179286956787, "learning_rate": 2.570747145207796e-05, "loss": 0.34, "num_input_tokens_seen": 3755864, "step": 41705 }, { "epoch": 10.839397089397089, "grad_norm": 2.7637343406677246, "learning_rate": 2.5701804105905854e-05, "loss": 0.1838, "num_input_tokens_seen": 3756312, "step": 41710 }, { "epoch": 10.840696465696466, "grad_norm": 1.836700201034546, "learning_rate": 2.5696136723639286e-05, "loss": 0.4022, "num_input_tokens_seen": 3756776, "step": 41715 }, { "epoch": 10.841995841995843, "grad_norm": 4.496695518493652, "learning_rate": 2.569046930556974e-05, "loss": 0.2767, "num_input_tokens_seen": 3757272, "step": 41720 }, { "epoch": 10.843295218295218, "grad_norm": 2.4132349491119385, "learning_rate": 2.5684801851988704e-05, "loss": 0.3668, "num_input_tokens_seen": 3757720, "step": 41725 }, { "epoch": 10.844594594594595, "grad_norm": 1.8760522603988647, "learning_rate": 2.5679134363187652e-05, "loss": 0.0457, "num_input_tokens_seen": 3758168, "step": 41730 }, { "epoch": 10.845893970893972, "grad_norm": 5.749300003051758, "learning_rate": 2.567346683945806e-05, "loss": 0.214, "num_input_tokens_seen": 3758584, "step": 41735 }, { "epoch": 10.847193347193347, "grad_norm": 0.602820873260498, "learning_rate": 2.5667799281091427e-05, "loss": 0.4529, "num_input_tokens_seen": 3759048, "step": 41740 }, { "epoch": 10.848492723492724, "grad_norm": 4.700620651245117, "learning_rate": 2.5662131688379242e-05, "loss": 0.284, "num_input_tokens_seen": 3759480, "step": 41745 }, { "epoch": 10.8497920997921, "grad_norm": 0.8458952307701111, "learning_rate": 2.5656464061612982e-05, "loss": 0.3706, "num_input_tokens_seen": 3759928, "step": 41750 }, { "epoch": 10.851091476091476, "grad_norm": 1.903273582458496, "learning_rate": 2.565079640108415e-05, "loss": 0.0988, "num_input_tokens_seen": 3760360, "step": 41755 }, { "epoch": 10.852390852390853, "grad_norm": 0.7466446161270142, "learning_rate": 2.564512870708424e-05, "loss": 0.1987, "num_input_tokens_seen": 3760808, "step": 41760 }, { "epoch": 10.853690228690228, "grad_norm": 2.9237494468688965, "learning_rate": 2.5639460979904744e-05, "loss": 0.1672, "num_input_tokens_seen": 3761240, "step": 41765 }, { "epoch": 10.854989604989605, "grad_norm": 5.823208808898926, "learning_rate": 2.5633793219837148e-05, "loss": 0.3027, "num_input_tokens_seen": 3761656, "step": 41770 }, { "epoch": 10.856288981288982, "grad_norm": 6.354865074157715, "learning_rate": 2.562812542717296e-05, "loss": 0.3653, "num_input_tokens_seen": 3762088, "step": 41775 }, { "epoch": 10.857588357588357, "grad_norm": 5.544859886169434, "learning_rate": 2.5622457602203688e-05, "loss": 0.4564, "num_input_tokens_seen": 3762568, "step": 41780 }, { "epoch": 10.858887733887734, "grad_norm": 1.0246657133102417, "learning_rate": 2.5616789745220822e-05, "loss": 0.2046, "num_input_tokens_seen": 3763032, "step": 41785 }, { "epoch": 10.86018711018711, "grad_norm": 1.1380481719970703, "learning_rate": 2.5611121856515857e-05, "loss": 0.1347, "num_input_tokens_seen": 3763512, "step": 41790 }, { "epoch": 10.861486486486486, "grad_norm": 1.2174092531204224, "learning_rate": 2.560545393638032e-05, "loss": 0.3223, "num_input_tokens_seen": 3763944, "step": 41795 }, { "epoch": 10.862785862785863, "grad_norm": 5.142277240753174, "learning_rate": 2.5599785985105705e-05, "loss": 0.1993, "num_input_tokens_seen": 3764360, "step": 41800 }, { "epoch": 10.86408523908524, "grad_norm": 3.767493486404419, "learning_rate": 2.5594118002983523e-05, "loss": 0.4427, "num_input_tokens_seen": 3764808, "step": 41805 }, { "epoch": 10.865384615384615, "grad_norm": 4.027649879455566, "learning_rate": 2.5588449990305278e-05, "loss": 0.1512, "num_input_tokens_seen": 3765272, "step": 41810 }, { "epoch": 10.866683991683992, "grad_norm": 0.09351688623428345, "learning_rate": 2.5582781947362495e-05, "loss": 0.327, "num_input_tokens_seen": 3765720, "step": 41815 }, { "epoch": 10.867983367983367, "grad_norm": 0.8193649053573608, "learning_rate": 2.557711387444668e-05, "loss": 0.0863, "num_input_tokens_seen": 3766152, "step": 41820 }, { "epoch": 10.869282744282744, "grad_norm": 4.6671319007873535, "learning_rate": 2.5571445771849327e-05, "loss": 0.3049, "num_input_tokens_seen": 3766584, "step": 41825 }, { "epoch": 10.870582120582121, "grad_norm": 1.0443146228790283, "learning_rate": 2.5565777639861992e-05, "loss": 0.4318, "num_input_tokens_seen": 3767048, "step": 41830 }, { "epoch": 10.871881496881496, "grad_norm": 1.6649630069732666, "learning_rate": 2.5560109478776162e-05, "loss": 0.3951, "num_input_tokens_seen": 3767512, "step": 41835 }, { "epoch": 10.873180873180873, "grad_norm": 0.9037665724754333, "learning_rate": 2.5554441288883364e-05, "loss": 0.2988, "num_input_tokens_seen": 3767960, "step": 41840 }, { "epoch": 10.87448024948025, "grad_norm": 0.9857718348503113, "learning_rate": 2.5548773070475118e-05, "loss": 0.2441, "num_input_tokens_seen": 3768408, "step": 41845 }, { "epoch": 10.875779625779625, "grad_norm": 4.761181354522705, "learning_rate": 2.554310482384295e-05, "loss": 0.2792, "num_input_tokens_seen": 3768840, "step": 41850 }, { "epoch": 10.877079002079002, "grad_norm": 2.479609966278076, "learning_rate": 2.553743654927838e-05, "loss": 0.2469, "num_input_tokens_seen": 3769256, "step": 41855 }, { "epoch": 10.878378378378379, "grad_norm": 6.047396659851074, "learning_rate": 2.553176824707293e-05, "loss": 0.2301, "num_input_tokens_seen": 3769704, "step": 41860 }, { "epoch": 10.879677754677754, "grad_norm": 2.8746604919433594, "learning_rate": 2.552609991751813e-05, "loss": 0.2384, "num_input_tokens_seen": 3770152, "step": 41865 }, { "epoch": 10.880977130977131, "grad_norm": 4.027523994445801, "learning_rate": 2.552043156090551e-05, "loss": 0.2332, "num_input_tokens_seen": 3770584, "step": 41870 }, { "epoch": 10.882276507276508, "grad_norm": 2.5251247882843018, "learning_rate": 2.55147631775266e-05, "loss": 0.2061, "num_input_tokens_seen": 3771048, "step": 41875 }, { "epoch": 10.883575883575883, "grad_norm": 2.624332904815674, "learning_rate": 2.550909476767292e-05, "loss": 0.2677, "num_input_tokens_seen": 3771512, "step": 41880 }, { "epoch": 10.88487525987526, "grad_norm": 2.75270938873291, "learning_rate": 2.550342633163601e-05, "loss": 0.1554, "num_input_tokens_seen": 3771960, "step": 41885 }, { "epoch": 10.886174636174637, "grad_norm": 5.178008556365967, "learning_rate": 2.54977578697074e-05, "loss": 0.3303, "num_input_tokens_seen": 3772424, "step": 41890 }, { "epoch": 10.887474012474012, "grad_norm": 2.9043383598327637, "learning_rate": 2.549208938217863e-05, "loss": 0.1856, "num_input_tokens_seen": 3772824, "step": 41895 }, { "epoch": 10.888773388773389, "grad_norm": 1.6887353658676147, "learning_rate": 2.5486420869341232e-05, "loss": 0.1617, "num_input_tokens_seen": 3773288, "step": 41900 }, { "epoch": 10.890072765072766, "grad_norm": 3.3967044353485107, "learning_rate": 2.548075233148674e-05, "loss": 0.3429, "num_input_tokens_seen": 3773752, "step": 41905 }, { "epoch": 10.891372141372141, "grad_norm": 1.3106975555419922, "learning_rate": 2.5475083768906694e-05, "loss": 0.0591, "num_input_tokens_seen": 3774216, "step": 41910 }, { "epoch": 10.892671517671518, "grad_norm": 5.946140766143799, "learning_rate": 2.546941518189263e-05, "loss": 0.2165, "num_input_tokens_seen": 3774680, "step": 41915 }, { "epoch": 10.893970893970893, "grad_norm": 0.2911527454853058, "learning_rate": 2.5463746570736103e-05, "loss": 0.2812, "num_input_tokens_seen": 3775112, "step": 41920 }, { "epoch": 10.89527027027027, "grad_norm": 3.7826905250549316, "learning_rate": 2.545807793572864e-05, "loss": 0.3399, "num_input_tokens_seen": 3775576, "step": 41925 }, { "epoch": 10.896569646569647, "grad_norm": 3.4703335762023926, "learning_rate": 2.5452409277161793e-05, "loss": 0.3745, "num_input_tokens_seen": 3776056, "step": 41930 }, { "epoch": 10.897869022869022, "grad_norm": 8.512590408325195, "learning_rate": 2.5446740595327096e-05, "loss": 0.3359, "num_input_tokens_seen": 3776520, "step": 41935 }, { "epoch": 10.8991683991684, "grad_norm": 1.2309608459472656, "learning_rate": 2.5441071890516112e-05, "loss": 0.2345, "num_input_tokens_seen": 3776968, "step": 41940 }, { "epoch": 10.900467775467776, "grad_norm": 1.5294996500015259, "learning_rate": 2.5435403163020373e-05, "loss": 0.2344, "num_input_tokens_seen": 3777416, "step": 41945 }, { "epoch": 10.901767151767151, "grad_norm": 1.8235336542129517, "learning_rate": 2.5429734413131427e-05, "loss": 0.2174, "num_input_tokens_seen": 3777832, "step": 41950 }, { "epoch": 10.903066528066528, "grad_norm": 1.338881015777588, "learning_rate": 2.5424065641140837e-05, "loss": 0.3476, "num_input_tokens_seen": 3778296, "step": 41955 }, { "epoch": 10.904365904365905, "grad_norm": 0.8905272483825684, "learning_rate": 2.541839684734015e-05, "loss": 0.1599, "num_input_tokens_seen": 3778712, "step": 41960 }, { "epoch": 10.90566528066528, "grad_norm": 0.5106415152549744, "learning_rate": 2.54127280320209e-05, "loss": 0.1531, "num_input_tokens_seen": 3779128, "step": 41965 }, { "epoch": 10.906964656964657, "grad_norm": 0.5038330554962158, "learning_rate": 2.540705919547466e-05, "loss": 0.114, "num_input_tokens_seen": 3779592, "step": 41970 }, { "epoch": 10.908264033264032, "grad_norm": 1.9544177055358887, "learning_rate": 2.540139033799297e-05, "loss": 0.3049, "num_input_tokens_seen": 3780056, "step": 41975 }, { "epoch": 10.90956340956341, "grad_norm": 0.7705890536308289, "learning_rate": 2.5395721459867393e-05, "loss": 0.3851, "num_input_tokens_seen": 3780520, "step": 41980 }, { "epoch": 10.910862785862786, "grad_norm": 0.5912392139434814, "learning_rate": 2.5390052561389478e-05, "loss": 0.2167, "num_input_tokens_seen": 3780952, "step": 41985 }, { "epoch": 10.912162162162161, "grad_norm": 0.6571342349052429, "learning_rate": 2.538438364285079e-05, "loss": 0.22, "num_input_tokens_seen": 3781400, "step": 41990 }, { "epoch": 10.913461538461538, "grad_norm": 3.2365283966064453, "learning_rate": 2.5378714704542883e-05, "loss": 0.2285, "num_input_tokens_seen": 3781880, "step": 41995 }, { "epoch": 10.914760914760915, "grad_norm": 1.722356915473938, "learning_rate": 2.5373045746757313e-05, "loss": 0.2255, "num_input_tokens_seen": 3782344, "step": 42000 }, { "epoch": 10.91606029106029, "grad_norm": 1.9401320219039917, "learning_rate": 2.5367376769785645e-05, "loss": 0.3091, "num_input_tokens_seen": 3782792, "step": 42005 }, { "epoch": 10.917359667359667, "grad_norm": 2.0400068759918213, "learning_rate": 2.5361707773919436e-05, "loss": 0.3292, "num_input_tokens_seen": 3783224, "step": 42010 }, { "epoch": 10.918659043659044, "grad_norm": 1.323804259300232, "learning_rate": 2.5356038759450252e-05, "loss": 0.0985, "num_input_tokens_seen": 3783672, "step": 42015 }, { "epoch": 10.91995841995842, "grad_norm": 7.0104289054870605, "learning_rate": 2.5350369726669652e-05, "loss": 0.3888, "num_input_tokens_seen": 3784104, "step": 42020 }, { "epoch": 10.921257796257796, "grad_norm": 1.4152048826217651, "learning_rate": 2.5344700675869203e-05, "loss": 0.3665, "num_input_tokens_seen": 3784584, "step": 42025 }, { "epoch": 10.922557172557173, "grad_norm": 0.9876789450645447, "learning_rate": 2.533903160734047e-05, "loss": 0.1756, "num_input_tokens_seen": 3785048, "step": 42030 }, { "epoch": 10.923856548856548, "grad_norm": 0.5598794221878052, "learning_rate": 2.5333362521375013e-05, "loss": 0.159, "num_input_tokens_seen": 3785496, "step": 42035 }, { "epoch": 10.925155925155925, "grad_norm": 0.11652688682079315, "learning_rate": 2.5327693418264397e-05, "loss": 0.1384, "num_input_tokens_seen": 3785944, "step": 42040 }, { "epoch": 10.926455301455302, "grad_norm": 6.216135025024414, "learning_rate": 2.53220242983002e-05, "loss": 0.2291, "num_input_tokens_seen": 3786424, "step": 42045 }, { "epoch": 10.927754677754677, "grad_norm": 4.33622407913208, "learning_rate": 2.531635516177399e-05, "loss": 0.448, "num_input_tokens_seen": 3786872, "step": 42050 }, { "epoch": 10.929054054054054, "grad_norm": 0.7141099572181702, "learning_rate": 2.5310686008977326e-05, "loss": 0.165, "num_input_tokens_seen": 3787320, "step": 42055 }, { "epoch": 10.93035343035343, "grad_norm": 1.6849561929702759, "learning_rate": 2.530501684020178e-05, "loss": 0.0933, "num_input_tokens_seen": 3787768, "step": 42060 }, { "epoch": 10.931652806652806, "grad_norm": 3.739871025085449, "learning_rate": 2.529934765573893e-05, "loss": 0.2142, "num_input_tokens_seen": 3788184, "step": 42065 }, { "epoch": 10.932952182952183, "grad_norm": 0.9702435731887817, "learning_rate": 2.5293678455880343e-05, "loss": 0.2494, "num_input_tokens_seen": 3788632, "step": 42070 }, { "epoch": 10.934251559251559, "grad_norm": 0.39101722836494446, "learning_rate": 2.528800924091758e-05, "loss": 0.1891, "num_input_tokens_seen": 3789096, "step": 42075 }, { "epoch": 10.935550935550935, "grad_norm": 0.35855019092559814, "learning_rate": 2.528234001114224e-05, "loss": 0.3653, "num_input_tokens_seen": 3789576, "step": 42080 }, { "epoch": 10.936850311850312, "grad_norm": 3.3427369594573975, "learning_rate": 2.527667076684588e-05, "loss": 0.2307, "num_input_tokens_seen": 3790008, "step": 42085 }, { "epoch": 10.938149688149688, "grad_norm": 1.937848687171936, "learning_rate": 2.527100150832008e-05, "loss": 0.3684, "num_input_tokens_seen": 3790440, "step": 42090 }, { "epoch": 10.939449064449065, "grad_norm": 0.8810349702835083, "learning_rate": 2.526533223585641e-05, "loss": 0.1253, "num_input_tokens_seen": 3790904, "step": 42095 }, { "epoch": 10.940748440748441, "grad_norm": 2.330446481704712, "learning_rate": 2.525966294974645e-05, "loss": 0.3663, "num_input_tokens_seen": 3791320, "step": 42100 }, { "epoch": 10.942047817047817, "grad_norm": 0.5352229475975037, "learning_rate": 2.525399365028177e-05, "loss": 0.2114, "num_input_tokens_seen": 3791768, "step": 42105 }, { "epoch": 10.943347193347194, "grad_norm": 1.126496434211731, "learning_rate": 2.5248324337753953e-05, "loss": 0.4161, "num_input_tokens_seen": 3792216, "step": 42110 }, { "epoch": 10.94464656964657, "grad_norm": 0.8747829794883728, "learning_rate": 2.524265501245458e-05, "loss": 0.4144, "num_input_tokens_seen": 3792680, "step": 42115 }, { "epoch": 10.945945945945946, "grad_norm": 0.9499819874763489, "learning_rate": 2.523698567467523e-05, "loss": 0.1516, "num_input_tokens_seen": 3793144, "step": 42120 }, { "epoch": 10.947245322245323, "grad_norm": 2.351149320602417, "learning_rate": 2.5231316324707482e-05, "loss": 0.2897, "num_input_tokens_seen": 3793592, "step": 42125 }, { "epoch": 10.948544698544698, "grad_norm": 0.6422273516654968, "learning_rate": 2.5225646962842904e-05, "loss": 0.0835, "num_input_tokens_seen": 3794072, "step": 42130 }, { "epoch": 10.949844074844075, "grad_norm": 0.7156645655632019, "learning_rate": 2.5219977589373093e-05, "loss": 0.182, "num_input_tokens_seen": 3794520, "step": 42135 }, { "epoch": 10.951143451143452, "grad_norm": 3.2734017372131348, "learning_rate": 2.5214308204589626e-05, "loss": 0.3465, "num_input_tokens_seen": 3794984, "step": 42140 }, { "epoch": 10.952442827442827, "grad_norm": 0.423209547996521, "learning_rate": 2.520863880878408e-05, "loss": 0.1122, "num_input_tokens_seen": 3795416, "step": 42145 }, { "epoch": 10.953742203742204, "grad_norm": 0.5684783458709717, "learning_rate": 2.5202969402248033e-05, "loss": 0.1384, "num_input_tokens_seen": 3795880, "step": 42150 }, { "epoch": 10.95504158004158, "grad_norm": 6.731879234313965, "learning_rate": 2.519729998527309e-05, "loss": 0.2889, "num_input_tokens_seen": 3796344, "step": 42155 }, { "epoch": 10.956340956340956, "grad_norm": 0.23801983892917633, "learning_rate": 2.5191630558150816e-05, "loss": 0.332, "num_input_tokens_seen": 3796760, "step": 42160 }, { "epoch": 10.957640332640333, "grad_norm": 0.5482392311096191, "learning_rate": 2.5185961121172785e-05, "loss": 0.3307, "num_input_tokens_seen": 3797224, "step": 42165 }, { "epoch": 10.95893970893971, "grad_norm": 3.517015218734741, "learning_rate": 2.5180291674630608e-05, "loss": 0.1767, "num_input_tokens_seen": 3797656, "step": 42170 }, { "epoch": 10.960239085239085, "grad_norm": 2.788146734237671, "learning_rate": 2.5174622218815852e-05, "loss": 0.3708, "num_input_tokens_seen": 3798104, "step": 42175 }, { "epoch": 10.961538461538462, "grad_norm": 2.1270411014556885, "learning_rate": 2.5168952754020115e-05, "loss": 0.34, "num_input_tokens_seen": 3798504, "step": 42180 }, { "epoch": 10.962837837837839, "grad_norm": 1.6989954710006714, "learning_rate": 2.5163283280534964e-05, "loss": 0.29, "num_input_tokens_seen": 3798968, "step": 42185 }, { "epoch": 10.964137214137214, "grad_norm": 2.158017158508301, "learning_rate": 2.5157613798652008e-05, "loss": 0.3275, "num_input_tokens_seen": 3799400, "step": 42190 }, { "epoch": 10.96543659043659, "grad_norm": 1.6267950534820557, "learning_rate": 2.5151944308662824e-05, "loss": 0.1753, "num_input_tokens_seen": 3799880, "step": 42195 }, { "epoch": 10.966735966735968, "grad_norm": 2.779015302658081, "learning_rate": 2.5146274810858988e-05, "loss": 0.3419, "num_input_tokens_seen": 3800344, "step": 42200 }, { "epoch": 10.968035343035343, "grad_norm": 0.8942886590957642, "learning_rate": 2.5140605305532104e-05, "loss": 0.1839, "num_input_tokens_seen": 3800776, "step": 42205 }, { "epoch": 10.96933471933472, "grad_norm": 2.958570718765259, "learning_rate": 2.5134935792973757e-05, "loss": 0.1634, "num_input_tokens_seen": 3801224, "step": 42210 }, { "epoch": 10.970634095634095, "grad_norm": 0.9087955355644226, "learning_rate": 2.512926627347553e-05, "loss": 0.0569, "num_input_tokens_seen": 3801688, "step": 42215 }, { "epoch": 10.971933471933472, "grad_norm": 1.207458734512329, "learning_rate": 2.512359674732902e-05, "loss": 0.205, "num_input_tokens_seen": 3802120, "step": 42220 }, { "epoch": 10.973232848232849, "grad_norm": 2.216860055923462, "learning_rate": 2.511792721482581e-05, "loss": 0.5008, "num_input_tokens_seen": 3802584, "step": 42225 }, { "epoch": 10.974532224532224, "grad_norm": 3.7537930011749268, "learning_rate": 2.5112257676257484e-05, "loss": 0.49, "num_input_tokens_seen": 3803080, "step": 42230 }, { "epoch": 10.9758316008316, "grad_norm": 2.4254884719848633, "learning_rate": 2.5106588131915636e-05, "loss": 0.2984, "num_input_tokens_seen": 3803496, "step": 42235 }, { "epoch": 10.977130977130978, "grad_norm": 4.081188201904297, "learning_rate": 2.5100918582091864e-05, "loss": 0.3165, "num_input_tokens_seen": 3803928, "step": 42240 }, { "epoch": 10.978430353430353, "grad_norm": 0.30860552191734314, "learning_rate": 2.5095249027077757e-05, "loss": 0.1046, "num_input_tokens_seen": 3804360, "step": 42245 }, { "epoch": 10.97972972972973, "grad_norm": 0.5871478915214539, "learning_rate": 2.5089579467164898e-05, "loss": 0.1765, "num_input_tokens_seen": 3804840, "step": 42250 }, { "epoch": 10.981029106029107, "grad_norm": 3.5407888889312744, "learning_rate": 2.5083909902644875e-05, "loss": 0.4251, "num_input_tokens_seen": 3805272, "step": 42255 }, { "epoch": 10.982328482328482, "grad_norm": 0.9811127185821533, "learning_rate": 2.5078240333809287e-05, "loss": 0.2963, "num_input_tokens_seen": 3805768, "step": 42260 }, { "epoch": 10.983627858627859, "grad_norm": 0.4624450206756592, "learning_rate": 2.5072570760949733e-05, "loss": 0.3403, "num_input_tokens_seen": 3806200, "step": 42265 }, { "epoch": 10.984927234927234, "grad_norm": 1.6027123928070068, "learning_rate": 2.506690118435779e-05, "loss": 0.3338, "num_input_tokens_seen": 3806632, "step": 42270 }, { "epoch": 10.986226611226611, "grad_norm": 1.079444408416748, "learning_rate": 2.5061231604325046e-05, "loss": 0.1923, "num_input_tokens_seen": 3807048, "step": 42275 }, { "epoch": 10.987525987525988, "grad_norm": 1.0663292407989502, "learning_rate": 2.505556202114311e-05, "loss": 0.1286, "num_input_tokens_seen": 3807512, "step": 42280 }, { "epoch": 10.988825363825363, "grad_norm": 1.1357290744781494, "learning_rate": 2.5049892435103573e-05, "loss": 0.1856, "num_input_tokens_seen": 3807944, "step": 42285 }, { "epoch": 10.99012474012474, "grad_norm": 1.686509609222412, "learning_rate": 2.5044222846498012e-05, "loss": 0.1558, "num_input_tokens_seen": 3808392, "step": 42290 }, { "epoch": 10.991424116424117, "grad_norm": 0.29955965280532837, "learning_rate": 2.5038553255618026e-05, "loss": 0.2328, "num_input_tokens_seen": 3808872, "step": 42295 }, { "epoch": 10.992723492723492, "grad_norm": 1.8489536046981812, "learning_rate": 2.5032883662755213e-05, "loss": 0.2707, "num_input_tokens_seen": 3809288, "step": 42300 }, { "epoch": 10.994022869022869, "grad_norm": 0.38203632831573486, "learning_rate": 2.502721406820116e-05, "loss": 0.3039, "num_input_tokens_seen": 3809736, "step": 42305 }, { "epoch": 10.995322245322246, "grad_norm": 0.18690958619117737, "learning_rate": 2.502154447224746e-05, "loss": 0.2289, "num_input_tokens_seen": 3810200, "step": 42310 }, { "epoch": 10.996621621621621, "grad_norm": 0.6940978765487671, "learning_rate": 2.5015874875185708e-05, "loss": 0.0686, "num_input_tokens_seen": 3810600, "step": 42315 }, { "epoch": 10.997920997920998, "grad_norm": 0.32331499457359314, "learning_rate": 2.5010205277307498e-05, "loss": 0.1823, "num_input_tokens_seen": 3811032, "step": 42320 }, { "epoch": 10.999220374220375, "grad_norm": 2.7465713024139404, "learning_rate": 2.500453567890442e-05, "loss": 0.2284, "num_input_tokens_seen": 3811480, "step": 42325 }, { "epoch": 11.0, "eval_loss": 0.2862201929092407, "eval_runtime": 13.1944, "eval_samples_per_second": 64.876, "eval_steps_per_second": 32.438, "num_input_tokens_seen": 3811696, "step": 42328 }, { "epoch": 11.00051975051975, "grad_norm": 0.11923190951347351, "learning_rate": 2.4998866080268067e-05, "loss": 0.2435, "num_input_tokens_seen": 3811904, "step": 42330 }, { "epoch": 11.001819126819127, "grad_norm": 2.3503782749176025, "learning_rate": 2.4993196481690038e-05, "loss": 0.3411, "num_input_tokens_seen": 3812352, "step": 42335 }, { "epoch": 11.003118503118504, "grad_norm": 2.158823013305664, "learning_rate": 2.498752688346191e-05, "loss": 0.3118, "num_input_tokens_seen": 3812784, "step": 42340 }, { "epoch": 11.004417879417879, "grad_norm": 2.3610002994537354, "learning_rate": 2.4981857285875295e-05, "loss": 0.3588, "num_input_tokens_seen": 3813232, "step": 42345 }, { "epoch": 11.005717255717256, "grad_norm": 2.7958943843841553, "learning_rate": 2.4976187689221765e-05, "loss": 0.2678, "num_input_tokens_seen": 3813696, "step": 42350 }, { "epoch": 11.007016632016631, "grad_norm": 0.5296095609664917, "learning_rate": 2.4970518093792944e-05, "loss": 0.2236, "num_input_tokens_seen": 3814144, "step": 42355 }, { "epoch": 11.008316008316008, "grad_norm": 1.0824964046478271, "learning_rate": 2.496484849988039e-05, "loss": 0.1661, "num_input_tokens_seen": 3814560, "step": 42360 }, { "epoch": 11.009615384615385, "grad_norm": 1.1509113311767578, "learning_rate": 2.495917890777572e-05, "loss": 0.1507, "num_input_tokens_seen": 3815008, "step": 42365 }, { "epoch": 11.01091476091476, "grad_norm": 0.8932845592498779, "learning_rate": 2.495350931777051e-05, "loss": 0.1594, "num_input_tokens_seen": 3815472, "step": 42370 }, { "epoch": 11.012214137214137, "grad_norm": 0.7590469121932983, "learning_rate": 2.4947839730156372e-05, "loss": 0.2255, "num_input_tokens_seen": 3815888, "step": 42375 }, { "epoch": 11.013513513513514, "grad_norm": 3.3553779125213623, "learning_rate": 2.4942170145224874e-05, "loss": 0.2564, "num_input_tokens_seen": 3816336, "step": 42380 }, { "epoch": 11.01481288981289, "grad_norm": 0.16250301897525787, "learning_rate": 2.4936500563267627e-05, "loss": 0.2615, "num_input_tokens_seen": 3816832, "step": 42385 }, { "epoch": 11.016112266112266, "grad_norm": 2.8083019256591797, "learning_rate": 2.493083098457622e-05, "loss": 0.1725, "num_input_tokens_seen": 3817296, "step": 42390 }, { "epoch": 11.017411642411643, "grad_norm": 1.9291800260543823, "learning_rate": 2.492516140944224e-05, "loss": 0.2777, "num_input_tokens_seen": 3817760, "step": 42395 }, { "epoch": 11.018711018711018, "grad_norm": 2.459650993347168, "learning_rate": 2.4919491838157278e-05, "loss": 0.3057, "num_input_tokens_seen": 3818224, "step": 42400 }, { "epoch": 11.020010395010395, "grad_norm": 1.483722448348999, "learning_rate": 2.4913822271012923e-05, "loss": 0.3606, "num_input_tokens_seen": 3818688, "step": 42405 }, { "epoch": 11.021309771309772, "grad_norm": 1.8817546367645264, "learning_rate": 2.4908152708300784e-05, "loss": 0.1461, "num_input_tokens_seen": 3819168, "step": 42410 }, { "epoch": 11.022609147609147, "grad_norm": 0.9986880421638489, "learning_rate": 2.4902483150312428e-05, "loss": 0.1783, "num_input_tokens_seen": 3819616, "step": 42415 }, { "epoch": 11.023908523908524, "grad_norm": 2.5772557258605957, "learning_rate": 2.489681359733946e-05, "loss": 0.2611, "num_input_tokens_seen": 3820096, "step": 42420 }, { "epoch": 11.025207900207901, "grad_norm": 1.467538595199585, "learning_rate": 2.4891144049673464e-05, "loss": 0.1447, "num_input_tokens_seen": 3820544, "step": 42425 }, { "epoch": 11.026507276507276, "grad_norm": 1.4255712032318115, "learning_rate": 2.4885474507606045e-05, "loss": 0.2007, "num_input_tokens_seen": 3821008, "step": 42430 }, { "epoch": 11.027806652806653, "grad_norm": 2.0403225421905518, "learning_rate": 2.4879804971428768e-05, "loss": 0.3271, "num_input_tokens_seen": 3821456, "step": 42435 }, { "epoch": 11.029106029106028, "grad_norm": 1.7948530912399292, "learning_rate": 2.487413544143325e-05, "loss": 0.1985, "num_input_tokens_seen": 3821904, "step": 42440 }, { "epoch": 11.030405405405405, "grad_norm": 5.565356731414795, "learning_rate": 2.4868465917911053e-05, "loss": 0.2054, "num_input_tokens_seen": 3822352, "step": 42445 }, { "epoch": 11.031704781704782, "grad_norm": 3.227409839630127, "learning_rate": 2.4862796401153792e-05, "loss": 0.2025, "num_input_tokens_seen": 3822800, "step": 42450 }, { "epoch": 11.033004158004157, "grad_norm": 2.511624574661255, "learning_rate": 2.4857126891453046e-05, "loss": 0.133, "num_input_tokens_seen": 3823248, "step": 42455 }, { "epoch": 11.034303534303534, "grad_norm": 1.3852334022521973, "learning_rate": 2.485145738910039e-05, "loss": 0.206, "num_input_tokens_seen": 3823728, "step": 42460 }, { "epoch": 11.035602910602911, "grad_norm": 1.0391770601272583, "learning_rate": 2.4845787894387425e-05, "loss": 0.1798, "num_input_tokens_seen": 3824192, "step": 42465 }, { "epoch": 11.036902286902286, "grad_norm": 1.036550521850586, "learning_rate": 2.4840118407605734e-05, "loss": 0.104, "num_input_tokens_seen": 3824624, "step": 42470 }, { "epoch": 11.038201663201663, "grad_norm": 3.891164779663086, "learning_rate": 2.4834448929046918e-05, "loss": 0.3622, "num_input_tokens_seen": 3825088, "step": 42475 }, { "epoch": 11.03950103950104, "grad_norm": 1.2818074226379395, "learning_rate": 2.482877945900254e-05, "loss": 0.1412, "num_input_tokens_seen": 3825520, "step": 42480 }, { "epoch": 11.040800415800415, "grad_norm": 0.8084231019020081, "learning_rate": 2.4823109997764206e-05, "loss": 0.1585, "num_input_tokens_seen": 3825952, "step": 42485 }, { "epoch": 11.042099792099792, "grad_norm": 2.8867743015289307, "learning_rate": 2.4817440545623486e-05, "loss": 0.0966, "num_input_tokens_seen": 3826384, "step": 42490 }, { "epoch": 11.04339916839917, "grad_norm": 5.064302921295166, "learning_rate": 2.4811771102871985e-05, "loss": 0.3961, "num_input_tokens_seen": 3826816, "step": 42495 }, { "epoch": 11.044698544698544, "grad_norm": 7.500986576080322, "learning_rate": 2.4806101669801266e-05, "loss": 0.4114, "num_input_tokens_seen": 3827264, "step": 42500 }, { "epoch": 11.045997920997921, "grad_norm": 5.776702404022217, "learning_rate": 2.4800432246702928e-05, "loss": 0.4136, "num_input_tokens_seen": 3827744, "step": 42505 }, { "epoch": 11.047297297297296, "grad_norm": 5.9852423667907715, "learning_rate": 2.479476283386855e-05, "loss": 0.2671, "num_input_tokens_seen": 3828208, "step": 42510 }, { "epoch": 11.048596673596673, "grad_norm": 2.738248586654663, "learning_rate": 2.478909343158972e-05, "loss": 0.2634, "num_input_tokens_seen": 3828704, "step": 42515 }, { "epoch": 11.04989604989605, "grad_norm": 0.5804296135902405, "learning_rate": 2.4783424040158018e-05, "loss": 0.2196, "num_input_tokens_seen": 3829152, "step": 42520 }, { "epoch": 11.051195426195425, "grad_norm": 3.6643078327178955, "learning_rate": 2.4777754659865015e-05, "loss": 0.4359, "num_input_tokens_seen": 3829584, "step": 42525 }, { "epoch": 11.052494802494802, "grad_norm": 3.9646291732788086, "learning_rate": 2.4772085291002318e-05, "loss": 0.3182, "num_input_tokens_seen": 3830032, "step": 42530 }, { "epoch": 11.05379417879418, "grad_norm": 0.6003473997116089, "learning_rate": 2.476641593386148e-05, "loss": 0.1378, "num_input_tokens_seen": 3830448, "step": 42535 }, { "epoch": 11.055093555093555, "grad_norm": 2.8608059883117676, "learning_rate": 2.47607465887341e-05, "loss": 0.2673, "num_input_tokens_seen": 3830896, "step": 42540 }, { "epoch": 11.056392931392931, "grad_norm": 0.31611236929893494, "learning_rate": 2.4755077255911743e-05, "loss": 0.2021, "num_input_tokens_seen": 3831328, "step": 42545 }, { "epoch": 11.057692307692308, "grad_norm": 1.467620611190796, "learning_rate": 2.4749407935686014e-05, "loss": 0.3015, "num_input_tokens_seen": 3831776, "step": 42550 }, { "epoch": 11.058991683991684, "grad_norm": 4.357539176940918, "learning_rate": 2.4743738628348463e-05, "loss": 0.132, "num_input_tokens_seen": 3832240, "step": 42555 }, { "epoch": 11.06029106029106, "grad_norm": 0.5453603863716125, "learning_rate": 2.473806933419068e-05, "loss": 0.2313, "num_input_tokens_seen": 3832672, "step": 42560 }, { "epoch": 11.061590436590437, "grad_norm": 1.481557846069336, "learning_rate": 2.4732400053504243e-05, "loss": 0.0778, "num_input_tokens_seen": 3833136, "step": 42565 }, { "epoch": 11.062889812889813, "grad_norm": 6.322305202484131, "learning_rate": 2.4726730786580735e-05, "loss": 0.2173, "num_input_tokens_seen": 3833584, "step": 42570 }, { "epoch": 11.06418918918919, "grad_norm": 7.5001397132873535, "learning_rate": 2.4721061533711716e-05, "loss": 0.3106, "num_input_tokens_seen": 3834064, "step": 42575 }, { "epoch": 11.065488565488565, "grad_norm": 0.48551636934280396, "learning_rate": 2.4715392295188772e-05, "loss": 0.057, "num_input_tokens_seen": 3834528, "step": 42580 }, { "epoch": 11.066787941787942, "grad_norm": 1.7294442653656006, "learning_rate": 2.4709723071303485e-05, "loss": 0.2131, "num_input_tokens_seen": 3834976, "step": 42585 }, { "epoch": 11.068087318087318, "grad_norm": 0.1119549423456192, "learning_rate": 2.4704053862347402e-05, "loss": 0.0241, "num_input_tokens_seen": 3835424, "step": 42590 }, { "epoch": 11.069386694386694, "grad_norm": 0.3459295332431793, "learning_rate": 2.469838466861212e-05, "loss": 0.4571, "num_input_tokens_seen": 3835904, "step": 42595 }, { "epoch": 11.07068607068607, "grad_norm": 5.2767109870910645, "learning_rate": 2.4692715490389202e-05, "loss": 0.3894, "num_input_tokens_seen": 3836384, "step": 42600 }, { "epoch": 11.071985446985448, "grad_norm": 7.425130367279053, "learning_rate": 2.4687046327970227e-05, "loss": 0.366, "num_input_tokens_seen": 3836832, "step": 42605 }, { "epoch": 11.073284823284823, "grad_norm": 0.6371113061904907, "learning_rate": 2.4681377181646752e-05, "loss": 0.2217, "num_input_tokens_seen": 3837248, "step": 42610 }, { "epoch": 11.0745841995842, "grad_norm": 5.287148475646973, "learning_rate": 2.4675708051710355e-05, "loss": 0.4056, "num_input_tokens_seen": 3837728, "step": 42615 }, { "epoch": 11.075883575883577, "grad_norm": 1.4354453086853027, "learning_rate": 2.46700389384526e-05, "loss": 0.2957, "num_input_tokens_seen": 3838192, "step": 42620 }, { "epoch": 11.077182952182952, "grad_norm": 2.7859950065612793, "learning_rate": 2.4664369842165068e-05, "loss": 0.1769, "num_input_tokens_seen": 3838672, "step": 42625 }, { "epoch": 11.078482328482329, "grad_norm": 0.7281127572059631, "learning_rate": 2.465870076313931e-05, "loss": 0.1281, "num_input_tokens_seen": 3839120, "step": 42630 }, { "epoch": 11.079781704781706, "grad_norm": 3.699324607849121, "learning_rate": 2.4653031701666902e-05, "loss": 0.5147, "num_input_tokens_seen": 3839552, "step": 42635 }, { "epoch": 11.08108108108108, "grad_norm": 0.3874177932739258, "learning_rate": 2.46473626580394e-05, "loss": 0.1912, "num_input_tokens_seen": 3839984, "step": 42640 }, { "epoch": 11.082380457380458, "grad_norm": 5.877848148345947, "learning_rate": 2.4641693632548385e-05, "loss": 0.365, "num_input_tokens_seen": 3840416, "step": 42645 }, { "epoch": 11.083679833679835, "grad_norm": 1.3810874223709106, "learning_rate": 2.4636024625485403e-05, "loss": 0.2678, "num_input_tokens_seen": 3840848, "step": 42650 }, { "epoch": 11.08497920997921, "grad_norm": 3.2344493865966797, "learning_rate": 2.463035563714202e-05, "loss": 0.2857, "num_input_tokens_seen": 3841376, "step": 42655 }, { "epoch": 11.086278586278587, "grad_norm": 2.243886709213257, "learning_rate": 2.462468666780981e-05, "loss": 0.178, "num_input_tokens_seen": 3841824, "step": 42660 }, { "epoch": 11.087577962577962, "grad_norm": 0.4934665262699127, "learning_rate": 2.4619017717780316e-05, "loss": 0.1287, "num_input_tokens_seen": 3842256, "step": 42665 }, { "epoch": 11.088877338877339, "grad_norm": 1.2412097454071045, "learning_rate": 2.461334878734511e-05, "loss": 0.2201, "num_input_tokens_seen": 3842688, "step": 42670 }, { "epoch": 11.090176715176716, "grad_norm": 1.7261046171188354, "learning_rate": 2.4607679876795738e-05, "loss": 0.2011, "num_input_tokens_seen": 3843120, "step": 42675 }, { "epoch": 11.09147609147609, "grad_norm": 5.2775115966796875, "learning_rate": 2.4602010986423782e-05, "loss": 0.2326, "num_input_tokens_seen": 3843552, "step": 42680 }, { "epoch": 11.092775467775468, "grad_norm": 0.48422971367836, "learning_rate": 2.459634211652076e-05, "loss": 0.0885, "num_input_tokens_seen": 3843968, "step": 42685 }, { "epoch": 11.094074844074845, "grad_norm": 4.183984279632568, "learning_rate": 2.4590673267378273e-05, "loss": 0.2723, "num_input_tokens_seen": 3844432, "step": 42690 }, { "epoch": 11.09537422037422, "grad_norm": 4.994154930114746, "learning_rate": 2.4585004439287838e-05, "loss": 0.3865, "num_input_tokens_seen": 3844880, "step": 42695 }, { "epoch": 11.096673596673597, "grad_norm": 4.633088111877441, "learning_rate": 2.4579335632541026e-05, "loss": 0.3519, "num_input_tokens_seen": 3845344, "step": 42700 }, { "epoch": 11.097972972972974, "grad_norm": 0.6406814455986023, "learning_rate": 2.4573666847429384e-05, "loss": 0.1531, "num_input_tokens_seen": 3845808, "step": 42705 }, { "epoch": 11.099272349272349, "grad_norm": 7.0434889793396, "learning_rate": 2.456799808424447e-05, "loss": 0.4157, "num_input_tokens_seen": 3846240, "step": 42710 }, { "epoch": 11.100571725571726, "grad_norm": 2.0124008655548096, "learning_rate": 2.4562329343277825e-05, "loss": 0.1966, "num_input_tokens_seen": 3846672, "step": 42715 }, { "epoch": 11.101871101871103, "grad_norm": 0.6971806287765503, "learning_rate": 2.4556660624820998e-05, "loss": 0.1186, "num_input_tokens_seen": 3847152, "step": 42720 }, { "epoch": 11.103170478170478, "grad_norm": 0.1672798991203308, "learning_rate": 2.4550991929165553e-05, "loss": 0.2862, "num_input_tokens_seen": 3847600, "step": 42725 }, { "epoch": 11.104469854469855, "grad_norm": 2.687438488006592, "learning_rate": 2.4545323256603007e-05, "loss": 0.3622, "num_input_tokens_seen": 3848016, "step": 42730 }, { "epoch": 11.10576923076923, "grad_norm": 1.0492769479751587, "learning_rate": 2.4539654607424927e-05, "loss": 0.2779, "num_input_tokens_seen": 3848512, "step": 42735 }, { "epoch": 11.107068607068607, "grad_norm": 0.8038819432258606, "learning_rate": 2.453398598192285e-05, "loss": 0.1333, "num_input_tokens_seen": 3848960, "step": 42740 }, { "epoch": 11.108367983367984, "grad_norm": 3.341492176055908, "learning_rate": 2.4528317380388328e-05, "loss": 0.3361, "num_input_tokens_seen": 3849392, "step": 42745 }, { "epoch": 11.109667359667359, "grad_norm": 0.7444812059402466, "learning_rate": 2.4522648803112886e-05, "loss": 0.214, "num_input_tokens_seen": 3849808, "step": 42750 }, { "epoch": 11.110966735966736, "grad_norm": 0.4261057376861572, "learning_rate": 2.4516980250388077e-05, "loss": 0.1335, "num_input_tokens_seen": 3850304, "step": 42755 }, { "epoch": 11.112266112266113, "grad_norm": 2.512019157409668, "learning_rate": 2.4511311722505433e-05, "loss": 0.1632, "num_input_tokens_seen": 3850752, "step": 42760 }, { "epoch": 11.113565488565488, "grad_norm": 1.1866871118545532, "learning_rate": 2.4505643219756504e-05, "loss": 0.2006, "num_input_tokens_seen": 3851232, "step": 42765 }, { "epoch": 11.114864864864865, "grad_norm": 3.564999580383301, "learning_rate": 2.449997474243281e-05, "loss": 0.2363, "num_input_tokens_seen": 3851712, "step": 42770 }, { "epoch": 11.116164241164242, "grad_norm": 3.5414650440216064, "learning_rate": 2.44943062908259e-05, "loss": 0.2854, "num_input_tokens_seen": 3852176, "step": 42775 }, { "epoch": 11.117463617463617, "grad_norm": 0.508133590221405, "learning_rate": 2.4488637865227306e-05, "loss": 0.2267, "num_input_tokens_seen": 3852624, "step": 42780 }, { "epoch": 11.118762993762994, "grad_norm": 3.6212639808654785, "learning_rate": 2.4482969465928543e-05, "loss": 0.498, "num_input_tokens_seen": 3853056, "step": 42785 }, { "epoch": 11.12006237006237, "grad_norm": 4.502221584320068, "learning_rate": 2.4477301093221163e-05, "loss": 0.2536, "num_input_tokens_seen": 3853536, "step": 42790 }, { "epoch": 11.121361746361746, "grad_norm": 1.6245485544204712, "learning_rate": 2.4471632747396687e-05, "loss": 0.1905, "num_input_tokens_seen": 3854032, "step": 42795 }, { "epoch": 11.122661122661123, "grad_norm": 3.2231876850128174, "learning_rate": 2.4465964428746652e-05, "loss": 0.3209, "num_input_tokens_seen": 3854480, "step": 42800 }, { "epoch": 11.123960498960498, "grad_norm": 2.442178726196289, "learning_rate": 2.4460296137562565e-05, "loss": 0.2606, "num_input_tokens_seen": 3854944, "step": 42805 }, { "epoch": 11.125259875259875, "grad_norm": 2.374061346054077, "learning_rate": 2.4454627874135974e-05, "loss": 0.1854, "num_input_tokens_seen": 3855392, "step": 42810 }, { "epoch": 11.126559251559252, "grad_norm": 4.216822147369385, "learning_rate": 2.444895963875839e-05, "loss": 0.219, "num_input_tokens_seen": 3855840, "step": 42815 }, { "epoch": 11.127858627858627, "grad_norm": 0.4242197275161743, "learning_rate": 2.4443291431721345e-05, "loss": 0.2304, "num_input_tokens_seen": 3856320, "step": 42820 }, { "epoch": 11.129158004158004, "grad_norm": 1.3916879892349243, "learning_rate": 2.443762325331635e-05, "loss": 0.2346, "num_input_tokens_seen": 3856752, "step": 42825 }, { "epoch": 11.130457380457381, "grad_norm": 3.0262434482574463, "learning_rate": 2.4431955103834933e-05, "loss": 0.1257, "num_input_tokens_seen": 3857216, "step": 42830 }, { "epoch": 11.131756756756756, "grad_norm": 8.911115646362305, "learning_rate": 2.4426286983568602e-05, "loss": 0.232, "num_input_tokens_seen": 3857680, "step": 42835 }, { "epoch": 11.133056133056133, "grad_norm": 1.582801103591919, "learning_rate": 2.4420618892808895e-05, "loss": 0.1682, "num_input_tokens_seen": 3858128, "step": 42840 }, { "epoch": 11.13435550935551, "grad_norm": 7.539074420928955, "learning_rate": 2.441495083184731e-05, "loss": 0.2983, "num_input_tokens_seen": 3858592, "step": 42845 }, { "epoch": 11.135654885654885, "grad_norm": 11.838862419128418, "learning_rate": 2.4409282800975352e-05, "loss": 0.5179, "num_input_tokens_seen": 3859072, "step": 42850 }, { "epoch": 11.136954261954262, "grad_norm": 0.5531953573226929, "learning_rate": 2.4403614800484563e-05, "loss": 0.1484, "num_input_tokens_seen": 3859520, "step": 42855 }, { "epoch": 11.138253638253639, "grad_norm": 0.4597133994102478, "learning_rate": 2.4397946830666422e-05, "loss": 0.1847, "num_input_tokens_seen": 3859952, "step": 42860 }, { "epoch": 11.139553014553014, "grad_norm": 2.465818166732788, "learning_rate": 2.4392278891812455e-05, "loss": 0.3095, "num_input_tokens_seen": 3860400, "step": 42865 }, { "epoch": 11.140852390852391, "grad_norm": 3.8982138633728027, "learning_rate": 2.4386610984214163e-05, "loss": 0.24, "num_input_tokens_seen": 3860832, "step": 42870 }, { "epoch": 11.142151767151766, "grad_norm": 2.1756439208984375, "learning_rate": 2.438094310816307e-05, "loss": 0.2309, "num_input_tokens_seen": 3861280, "step": 42875 }, { "epoch": 11.143451143451143, "grad_norm": 3.7215380668640137, "learning_rate": 2.4375275263950654e-05, "loss": 0.3674, "num_input_tokens_seen": 3861760, "step": 42880 }, { "epoch": 11.14475051975052, "grad_norm": 1.7606157064437866, "learning_rate": 2.4369607451868435e-05, "loss": 0.3283, "num_input_tokens_seen": 3862208, "step": 42885 }, { "epoch": 11.146049896049895, "grad_norm": 0.4979439973831177, "learning_rate": 2.4363939672207904e-05, "loss": 0.1245, "num_input_tokens_seen": 3862640, "step": 42890 }, { "epoch": 11.147349272349272, "grad_norm": 0.2853378653526306, "learning_rate": 2.4358271925260574e-05, "loss": 0.2912, "num_input_tokens_seen": 3863072, "step": 42895 }, { "epoch": 11.14864864864865, "grad_norm": 5.231906414031982, "learning_rate": 2.4352604211317924e-05, "loss": 0.17, "num_input_tokens_seen": 3863488, "step": 42900 }, { "epoch": 11.149948024948024, "grad_norm": 2.615589141845703, "learning_rate": 2.4346936530671465e-05, "loss": 0.111, "num_input_tokens_seen": 3863952, "step": 42905 }, { "epoch": 11.151247401247401, "grad_norm": 1.3795169591903687, "learning_rate": 2.434126888361269e-05, "loss": 0.2505, "num_input_tokens_seen": 3864368, "step": 42910 }, { "epoch": 11.152546777546778, "grad_norm": 1.1138402223587036, "learning_rate": 2.433560127043308e-05, "loss": 0.1347, "num_input_tokens_seen": 3864784, "step": 42915 }, { "epoch": 11.153846153846153, "grad_norm": 3.017143487930298, "learning_rate": 2.4329933691424137e-05, "loss": 0.1955, "num_input_tokens_seen": 3865280, "step": 42920 }, { "epoch": 11.15514553014553, "grad_norm": 0.8927382230758667, "learning_rate": 2.432426614687734e-05, "loss": 0.2663, "num_input_tokens_seen": 3865728, "step": 42925 }, { "epoch": 11.156444906444907, "grad_norm": 0.34570565819740295, "learning_rate": 2.431859863708419e-05, "loss": 0.1794, "num_input_tokens_seen": 3866176, "step": 42930 }, { "epoch": 11.157744282744282, "grad_norm": 6.786512851715088, "learning_rate": 2.431293116233616e-05, "loss": 0.2944, "num_input_tokens_seen": 3866576, "step": 42935 }, { "epoch": 11.15904365904366, "grad_norm": 0.2183835357427597, "learning_rate": 2.4307263722924744e-05, "loss": 0.2443, "num_input_tokens_seen": 3867024, "step": 42940 }, { "epoch": 11.160343035343036, "grad_norm": 0.5230621695518494, "learning_rate": 2.430159631914141e-05, "loss": 0.3265, "num_input_tokens_seen": 3867488, "step": 42945 }, { "epoch": 11.161642411642411, "grad_norm": 1.0216628313064575, "learning_rate": 2.4295928951277653e-05, "loss": 0.2353, "num_input_tokens_seen": 3867968, "step": 42950 }, { "epoch": 11.162941787941788, "grad_norm": 5.066307067871094, "learning_rate": 2.429026161962494e-05, "loss": 0.5051, "num_input_tokens_seen": 3868432, "step": 42955 }, { "epoch": 11.164241164241163, "grad_norm": 7.623213291168213, "learning_rate": 2.4284594324474763e-05, "loss": 0.357, "num_input_tokens_seen": 3868896, "step": 42960 }, { "epoch": 11.16554054054054, "grad_norm": 0.5707219243049622, "learning_rate": 2.427892706611857e-05, "loss": 0.1772, "num_input_tokens_seen": 3869344, "step": 42965 }, { "epoch": 11.166839916839917, "grad_norm": 5.304512977600098, "learning_rate": 2.427325984484786e-05, "loss": 0.4932, "num_input_tokens_seen": 3869760, "step": 42970 }, { "epoch": 11.168139293139292, "grad_norm": 4.2139387130737305, "learning_rate": 2.4267592660954096e-05, "loss": 0.1923, "num_input_tokens_seen": 3870208, "step": 42975 }, { "epoch": 11.16943866943867, "grad_norm": 1.4496206045150757, "learning_rate": 2.4261925514728733e-05, "loss": 0.14, "num_input_tokens_seen": 3870736, "step": 42980 }, { "epoch": 11.170738045738046, "grad_norm": 5.173951148986816, "learning_rate": 2.4256258406463253e-05, "loss": 0.2441, "num_input_tokens_seen": 3871200, "step": 42985 }, { "epoch": 11.172037422037421, "grad_norm": 4.048989772796631, "learning_rate": 2.425059133644911e-05, "loss": 0.308, "num_input_tokens_seen": 3871664, "step": 42990 }, { "epoch": 11.173336798336798, "grad_norm": 1.040972352027893, "learning_rate": 2.4244924304977785e-05, "loss": 0.2005, "num_input_tokens_seen": 3872112, "step": 42995 }, { "epoch": 11.174636174636175, "grad_norm": 2.2009127140045166, "learning_rate": 2.4239257312340712e-05, "loss": 0.2289, "num_input_tokens_seen": 3872576, "step": 43000 }, { "epoch": 11.17593555093555, "grad_norm": 4.651636123657227, "learning_rate": 2.4233590358829374e-05, "loss": 0.2514, "num_input_tokens_seen": 3873024, "step": 43005 }, { "epoch": 11.177234927234927, "grad_norm": 6.070191383361816, "learning_rate": 2.422792344473521e-05, "loss": 0.394, "num_input_tokens_seen": 3873472, "step": 43010 }, { "epoch": 11.178534303534304, "grad_norm": 0.5138832926750183, "learning_rate": 2.4222256570349692e-05, "loss": 0.2335, "num_input_tokens_seen": 3873936, "step": 43015 }, { "epoch": 11.17983367983368, "grad_norm": 4.3226213455200195, "learning_rate": 2.421658973596426e-05, "loss": 0.2108, "num_input_tokens_seen": 3874400, "step": 43020 }, { "epoch": 11.181133056133056, "grad_norm": 3.0291149616241455, "learning_rate": 2.4210922941870367e-05, "loss": 0.3731, "num_input_tokens_seen": 3874832, "step": 43025 }, { "epoch": 11.182432432432432, "grad_norm": 2.1407439708709717, "learning_rate": 2.420525618835946e-05, "loss": 0.1921, "num_input_tokens_seen": 3875264, "step": 43030 }, { "epoch": 11.183731808731808, "grad_norm": 2.6986052989959717, "learning_rate": 2.4199589475723e-05, "loss": 0.1286, "num_input_tokens_seen": 3875728, "step": 43035 }, { "epoch": 11.185031185031185, "grad_norm": 3.955451011657715, "learning_rate": 2.4193922804252416e-05, "loss": 0.3959, "num_input_tokens_seen": 3876192, "step": 43040 }, { "epoch": 11.18633056133056, "grad_norm": 1.313134789466858, "learning_rate": 2.4188256174239146e-05, "loss": 0.0669, "num_input_tokens_seen": 3876624, "step": 43045 }, { "epoch": 11.187629937629938, "grad_norm": 1.8589671850204468, "learning_rate": 2.4182589585974653e-05, "loss": 0.2809, "num_input_tokens_seen": 3877072, "step": 43050 }, { "epoch": 11.188929313929314, "grad_norm": 3.9048941135406494, "learning_rate": 2.4176923039750347e-05, "loss": 0.5073, "num_input_tokens_seen": 3877488, "step": 43055 }, { "epoch": 11.19022869022869, "grad_norm": 2.780388593673706, "learning_rate": 2.4171256535857684e-05, "loss": 0.3199, "num_input_tokens_seen": 3877936, "step": 43060 }, { "epoch": 11.191528066528067, "grad_norm": 4.88160514831543, "learning_rate": 2.4165590074588085e-05, "loss": 0.2109, "num_input_tokens_seen": 3878368, "step": 43065 }, { "epoch": 11.192827442827443, "grad_norm": 0.12151890993118286, "learning_rate": 2.4159923656233e-05, "loss": 0.1987, "num_input_tokens_seen": 3878784, "step": 43070 }, { "epoch": 11.194126819126819, "grad_norm": 2.749476194381714, "learning_rate": 2.4154257281083837e-05, "loss": 0.2036, "num_input_tokens_seen": 3879264, "step": 43075 }, { "epoch": 11.195426195426196, "grad_norm": 2.7084391117095947, "learning_rate": 2.4148590949432035e-05, "loss": 0.3448, "num_input_tokens_seen": 3879696, "step": 43080 }, { "epoch": 11.196725571725572, "grad_norm": 0.9437779784202576, "learning_rate": 2.4142924661569013e-05, "loss": 0.1787, "num_input_tokens_seen": 3880176, "step": 43085 }, { "epoch": 11.198024948024948, "grad_norm": 0.9745856523513794, "learning_rate": 2.4137258417786206e-05, "loss": 0.1988, "num_input_tokens_seen": 3880640, "step": 43090 }, { "epoch": 11.199324324324325, "grad_norm": 4.1662750244140625, "learning_rate": 2.4131592218375017e-05, "loss": 0.2189, "num_input_tokens_seen": 3881120, "step": 43095 }, { "epoch": 11.200623700623701, "grad_norm": 4.162607669830322, "learning_rate": 2.4125926063626875e-05, "loss": 0.1412, "num_input_tokens_seen": 3881600, "step": 43100 }, { "epoch": 11.201923076923077, "grad_norm": 0.45359987020492554, "learning_rate": 2.41202599538332e-05, "loss": 0.4343, "num_input_tokens_seen": 3882048, "step": 43105 }, { "epoch": 11.203222453222454, "grad_norm": 0.8909082412719727, "learning_rate": 2.4114593889285385e-05, "loss": 0.2712, "num_input_tokens_seen": 3882512, "step": 43110 }, { "epoch": 11.204521829521829, "grad_norm": 5.368185520172119, "learning_rate": 2.4108927870274863e-05, "loss": 0.3787, "num_input_tokens_seen": 3882976, "step": 43115 }, { "epoch": 11.205821205821206, "grad_norm": 2.990017890930176, "learning_rate": 2.4103261897093028e-05, "loss": 0.1682, "num_input_tokens_seen": 3883424, "step": 43120 }, { "epoch": 11.207120582120583, "grad_norm": 4.438772678375244, "learning_rate": 2.4097595970031304e-05, "loss": 0.3169, "num_input_tokens_seen": 3883856, "step": 43125 }, { "epoch": 11.208419958419958, "grad_norm": 1.3881899118423462, "learning_rate": 2.409193008938107e-05, "loss": 0.2043, "num_input_tokens_seen": 3884272, "step": 43130 }, { "epoch": 11.209719334719335, "grad_norm": 2.7622482776641846, "learning_rate": 2.408626425543375e-05, "loss": 0.2298, "num_input_tokens_seen": 3884720, "step": 43135 }, { "epoch": 11.211018711018712, "grad_norm": 3.5840985774993896, "learning_rate": 2.4080598468480732e-05, "loss": 0.1514, "num_input_tokens_seen": 3885216, "step": 43140 }, { "epoch": 11.212318087318087, "grad_norm": 5.6749444007873535, "learning_rate": 2.4074932728813422e-05, "loss": 0.3789, "num_input_tokens_seen": 3885664, "step": 43145 }, { "epoch": 11.213617463617464, "grad_norm": 0.9263845682144165, "learning_rate": 2.40692670367232e-05, "loss": 0.2918, "num_input_tokens_seen": 3886112, "step": 43150 }, { "epoch": 11.21491683991684, "grad_norm": 3.774143934249878, "learning_rate": 2.406360139250147e-05, "loss": 0.3597, "num_input_tokens_seen": 3886576, "step": 43155 }, { "epoch": 11.216216216216216, "grad_norm": 2.8124938011169434, "learning_rate": 2.4057935796439613e-05, "loss": 0.4732, "num_input_tokens_seen": 3887008, "step": 43160 }, { "epoch": 11.217515592515593, "grad_norm": 3.6154491901397705, "learning_rate": 2.405227024882903e-05, "loss": 0.257, "num_input_tokens_seen": 3887488, "step": 43165 }, { "epoch": 11.21881496881497, "grad_norm": 2.599717617034912, "learning_rate": 2.4046604749961093e-05, "loss": 0.2873, "num_input_tokens_seen": 3887952, "step": 43170 }, { "epoch": 11.220114345114345, "grad_norm": 0.6255558133125305, "learning_rate": 2.404093930012718e-05, "loss": 0.0879, "num_input_tokens_seen": 3888400, "step": 43175 }, { "epoch": 11.221413721413722, "grad_norm": 0.1936023086309433, "learning_rate": 2.4035273899618683e-05, "loss": 0.3629, "num_input_tokens_seen": 3888880, "step": 43180 }, { "epoch": 11.222713097713097, "grad_norm": 0.9475414752960205, "learning_rate": 2.402960854872697e-05, "loss": 0.2368, "num_input_tokens_seen": 3889376, "step": 43185 }, { "epoch": 11.224012474012474, "grad_norm": 0.4882851541042328, "learning_rate": 2.402394324774343e-05, "loss": 0.4148, "num_input_tokens_seen": 3889840, "step": 43190 }, { "epoch": 11.22531185031185, "grad_norm": 1.4811553955078125, "learning_rate": 2.4018277996959412e-05, "loss": 0.1905, "num_input_tokens_seen": 3890336, "step": 43195 }, { "epoch": 11.226611226611226, "grad_norm": 4.612390518188477, "learning_rate": 2.4012612796666302e-05, "loss": 0.3359, "num_input_tokens_seen": 3890784, "step": 43200 }, { "epoch": 11.227910602910603, "grad_norm": 1.949976921081543, "learning_rate": 2.400694764715546e-05, "loss": 0.2469, "num_input_tokens_seen": 3891200, "step": 43205 }, { "epoch": 11.22920997920998, "grad_norm": 3.3813607692718506, "learning_rate": 2.4001282548718258e-05, "loss": 0.1285, "num_input_tokens_seen": 3891664, "step": 43210 }, { "epoch": 11.230509355509355, "grad_norm": 1.046003818511963, "learning_rate": 2.399561750164604e-05, "loss": 0.2129, "num_input_tokens_seen": 3892096, "step": 43215 }, { "epoch": 11.231808731808732, "grad_norm": 0.5375886559486389, "learning_rate": 2.3989952506230187e-05, "loss": 0.3228, "num_input_tokens_seen": 3892576, "step": 43220 }, { "epoch": 11.233108108108109, "grad_norm": 1.0714373588562012, "learning_rate": 2.3984287562762037e-05, "loss": 0.0257, "num_input_tokens_seen": 3892976, "step": 43225 }, { "epoch": 11.234407484407484, "grad_norm": 8.450736999511719, "learning_rate": 2.397862267153296e-05, "loss": 0.2847, "num_input_tokens_seen": 3893408, "step": 43230 }, { "epoch": 11.23570686070686, "grad_norm": 0.7243627309799194, "learning_rate": 2.3972957832834295e-05, "loss": 0.1484, "num_input_tokens_seen": 3893856, "step": 43235 }, { "epoch": 11.237006237006238, "grad_norm": 0.7750981450080872, "learning_rate": 2.3967293046957383e-05, "loss": 0.019, "num_input_tokens_seen": 3894256, "step": 43240 }, { "epoch": 11.238305613305613, "grad_norm": 5.4073944091796875, "learning_rate": 2.3961628314193595e-05, "loss": 0.5246, "num_input_tokens_seen": 3894736, "step": 43245 }, { "epoch": 11.23960498960499, "grad_norm": 9.025887489318848, "learning_rate": 2.3955963634834244e-05, "loss": 0.6324, "num_input_tokens_seen": 3895168, "step": 43250 }, { "epoch": 11.240904365904365, "grad_norm": 0.16731323301792145, "learning_rate": 2.395029900917069e-05, "loss": 0.1507, "num_input_tokens_seen": 3895632, "step": 43255 }, { "epoch": 11.242203742203742, "grad_norm": 5.473321914672852, "learning_rate": 2.3944634437494256e-05, "loss": 0.3152, "num_input_tokens_seen": 3896048, "step": 43260 }, { "epoch": 11.243503118503119, "grad_norm": 10.482754707336426, "learning_rate": 2.39389699200963e-05, "loss": 0.3448, "num_input_tokens_seen": 3896528, "step": 43265 }, { "epoch": 11.244802494802494, "grad_norm": 7.02183723449707, "learning_rate": 2.393330545726812e-05, "loss": 0.3239, "num_input_tokens_seen": 3897088, "step": 43270 }, { "epoch": 11.246101871101871, "grad_norm": 4.33080530166626, "learning_rate": 2.392764104930107e-05, "loss": 0.2929, "num_input_tokens_seen": 3897568, "step": 43275 }, { "epoch": 11.247401247401248, "grad_norm": 5.733090877532959, "learning_rate": 2.3921976696486468e-05, "loss": 0.3194, "num_input_tokens_seen": 3898016, "step": 43280 }, { "epoch": 11.248700623700623, "grad_norm": 0.26526379585266113, "learning_rate": 2.3916312399115646e-05, "loss": 0.1486, "num_input_tokens_seen": 3898480, "step": 43285 }, { "epoch": 11.25, "grad_norm": 0.6865307092666626, "learning_rate": 2.39106481574799e-05, "loss": 0.1653, "num_input_tokens_seen": 3898960, "step": 43290 }, { "epoch": 11.251299376299377, "grad_norm": 3.1746819019317627, "learning_rate": 2.3904983971870573e-05, "loss": 0.4652, "num_input_tokens_seen": 3899392, "step": 43295 }, { "epoch": 11.252598752598752, "grad_norm": 8.643607139587402, "learning_rate": 2.3899319842578972e-05, "loss": 0.3771, "num_input_tokens_seen": 3899808, "step": 43300 }, { "epoch": 11.253898128898129, "grad_norm": 0.8132534027099609, "learning_rate": 2.3893655769896396e-05, "loss": 0.2078, "num_input_tokens_seen": 3900224, "step": 43305 }, { "epoch": 11.255197505197506, "grad_norm": 6.652132511138916, "learning_rate": 2.3887991754114173e-05, "loss": 0.2824, "num_input_tokens_seen": 3900656, "step": 43310 }, { "epoch": 11.256496881496881, "grad_norm": 0.557861328125, "learning_rate": 2.388232779552359e-05, "loss": 0.0598, "num_input_tokens_seen": 3901072, "step": 43315 }, { "epoch": 11.257796257796258, "grad_norm": 0.36972877383232117, "learning_rate": 2.3876663894415974e-05, "loss": 0.4059, "num_input_tokens_seen": 3901504, "step": 43320 }, { "epoch": 11.259095634095633, "grad_norm": 0.794636607170105, "learning_rate": 2.3871000051082594e-05, "loss": 0.1257, "num_input_tokens_seen": 3901968, "step": 43325 }, { "epoch": 11.26039501039501, "grad_norm": 0.9260558485984802, "learning_rate": 2.3865336265814773e-05, "loss": 0.1468, "num_input_tokens_seen": 3902416, "step": 43330 }, { "epoch": 11.261694386694387, "grad_norm": 7.465297698974609, "learning_rate": 2.385967253890379e-05, "loss": 0.1734, "num_input_tokens_seen": 3902848, "step": 43335 }, { "epoch": 11.262993762993762, "grad_norm": 4.477358341217041, "learning_rate": 2.385400887064095e-05, "loss": 0.3412, "num_input_tokens_seen": 3903296, "step": 43340 }, { "epoch": 11.26429313929314, "grad_norm": 1.8194167613983154, "learning_rate": 2.384834526131752e-05, "loss": 0.5133, "num_input_tokens_seen": 3903728, "step": 43345 }, { "epoch": 11.265592515592516, "grad_norm": 0.2037779688835144, "learning_rate": 2.38426817112248e-05, "loss": 0.2353, "num_input_tokens_seen": 3904176, "step": 43350 }, { "epoch": 11.266891891891891, "grad_norm": 0.5274785757064819, "learning_rate": 2.3837018220654066e-05, "loss": 0.2741, "num_input_tokens_seen": 3904608, "step": 43355 }, { "epoch": 11.268191268191268, "grad_norm": 9.992986679077148, "learning_rate": 2.3831354789896612e-05, "loss": 0.3468, "num_input_tokens_seen": 3905072, "step": 43360 }, { "epoch": 11.269490644490645, "grad_norm": 2.578167676925659, "learning_rate": 2.3825691419243694e-05, "loss": 0.3102, "num_input_tokens_seen": 3905504, "step": 43365 }, { "epoch": 11.27079002079002, "grad_norm": 6.255338191986084, "learning_rate": 2.3820028108986586e-05, "loss": 0.3539, "num_input_tokens_seen": 3905936, "step": 43370 }, { "epoch": 11.272089397089397, "grad_norm": 3.6621270179748535, "learning_rate": 2.3814364859416574e-05, "loss": 0.2201, "num_input_tokens_seen": 3906400, "step": 43375 }, { "epoch": 11.273388773388774, "grad_norm": 4.003225803375244, "learning_rate": 2.38087016708249e-05, "loss": 0.1596, "num_input_tokens_seen": 3906864, "step": 43380 }, { "epoch": 11.27468814968815, "grad_norm": 1.7988697290420532, "learning_rate": 2.3803038543502847e-05, "loss": 0.1179, "num_input_tokens_seen": 3907328, "step": 43385 }, { "epoch": 11.275987525987526, "grad_norm": 1.8251131772994995, "learning_rate": 2.3797375477741665e-05, "loss": 0.1232, "num_input_tokens_seen": 3907792, "step": 43390 }, { "epoch": 11.277286902286903, "grad_norm": 2.131486415863037, "learning_rate": 2.3791712473832627e-05, "loss": 0.4665, "num_input_tokens_seen": 3908240, "step": 43395 }, { "epoch": 11.278586278586278, "grad_norm": 2.5727779865264893, "learning_rate": 2.3786049532066957e-05, "loss": 0.1959, "num_input_tokens_seen": 3908720, "step": 43400 }, { "epoch": 11.279885654885655, "grad_norm": 4.340233325958252, "learning_rate": 2.3780386652735934e-05, "loss": 0.411, "num_input_tokens_seen": 3909168, "step": 43405 }, { "epoch": 11.28118503118503, "grad_norm": 0.9048608541488647, "learning_rate": 2.3774723836130787e-05, "loss": 0.1984, "num_input_tokens_seen": 3909616, "step": 43410 }, { "epoch": 11.282484407484407, "grad_norm": 3.351393461227417, "learning_rate": 2.376906108254277e-05, "loss": 0.2408, "num_input_tokens_seen": 3910080, "step": 43415 }, { "epoch": 11.283783783783784, "grad_norm": 4.60866117477417, "learning_rate": 2.3763398392263118e-05, "loss": 0.3627, "num_input_tokens_seen": 3910544, "step": 43420 }, { "epoch": 11.28508316008316, "grad_norm": 1.6687861680984497, "learning_rate": 2.3757735765583083e-05, "loss": 0.3566, "num_input_tokens_seen": 3910992, "step": 43425 }, { "epoch": 11.286382536382536, "grad_norm": 3.02221417427063, "learning_rate": 2.3752073202793882e-05, "loss": 0.2417, "num_input_tokens_seen": 3911456, "step": 43430 }, { "epoch": 11.287681912681913, "grad_norm": 8.075928688049316, "learning_rate": 2.3746410704186744e-05, "loss": 0.2568, "num_input_tokens_seen": 3911920, "step": 43435 }, { "epoch": 11.288981288981288, "grad_norm": 3.317873954772949, "learning_rate": 2.3740748270052917e-05, "loss": 0.32, "num_input_tokens_seen": 3912400, "step": 43440 }, { "epoch": 11.290280665280665, "grad_norm": 7.965487957000732, "learning_rate": 2.3735085900683602e-05, "loss": 0.2718, "num_input_tokens_seen": 3912816, "step": 43445 }, { "epoch": 11.291580041580042, "grad_norm": 2.034287452697754, "learning_rate": 2.3729423596370036e-05, "loss": 0.2172, "num_input_tokens_seen": 3913248, "step": 43450 }, { "epoch": 11.292879417879417, "grad_norm": 1.5035099983215332, "learning_rate": 2.372376135740343e-05, "loss": 0.1981, "num_input_tokens_seen": 3913664, "step": 43455 }, { "epoch": 11.294178794178794, "grad_norm": 4.33463191986084, "learning_rate": 2.3718099184075008e-05, "loss": 0.2198, "num_input_tokens_seen": 3914080, "step": 43460 }, { "epoch": 11.295478170478171, "grad_norm": 0.9037827849388123, "learning_rate": 2.371243707667596e-05, "loss": 0.1558, "num_input_tokens_seen": 3914544, "step": 43465 }, { "epoch": 11.296777546777546, "grad_norm": 0.22588323056697845, "learning_rate": 2.3706775035497517e-05, "loss": 0.1419, "num_input_tokens_seen": 3914976, "step": 43470 }, { "epoch": 11.298076923076923, "grad_norm": 1.4790681600570679, "learning_rate": 2.3701113060830865e-05, "loss": 0.2383, "num_input_tokens_seen": 3915392, "step": 43475 }, { "epoch": 11.299376299376299, "grad_norm": 6.240270137786865, "learning_rate": 2.3695451152967225e-05, "loss": 0.2574, "num_input_tokens_seen": 3915824, "step": 43480 }, { "epoch": 11.300675675675675, "grad_norm": 3.3059685230255127, "learning_rate": 2.3689789312197772e-05, "loss": 0.2452, "num_input_tokens_seen": 3916304, "step": 43485 }, { "epoch": 11.301975051975052, "grad_norm": 4.840027809143066, "learning_rate": 2.3684127538813714e-05, "loss": 0.4442, "num_input_tokens_seen": 3916768, "step": 43490 }, { "epoch": 11.303274428274428, "grad_norm": 4.676412105560303, "learning_rate": 2.3678465833106243e-05, "loss": 0.4289, "num_input_tokens_seen": 3917248, "step": 43495 }, { "epoch": 11.304573804573804, "grad_norm": 4.7609686851501465, "learning_rate": 2.367280419536653e-05, "loss": 0.1429, "num_input_tokens_seen": 3917696, "step": 43500 }, { "epoch": 11.305873180873181, "grad_norm": 0.6775404214859009, "learning_rate": 2.366714262588577e-05, "loss": 0.141, "num_input_tokens_seen": 3918128, "step": 43505 }, { "epoch": 11.307172557172557, "grad_norm": 6.956228256225586, "learning_rate": 2.3661481124955142e-05, "loss": 0.3843, "num_input_tokens_seen": 3918592, "step": 43510 }, { "epoch": 11.308471933471933, "grad_norm": 0.29274195432662964, "learning_rate": 2.3655819692865832e-05, "loss": 0.1915, "num_input_tokens_seen": 3919040, "step": 43515 }, { "epoch": 11.30977130977131, "grad_norm": 0.7609712481498718, "learning_rate": 2.3650158329908993e-05, "loss": 0.1139, "num_input_tokens_seen": 3919488, "step": 43520 }, { "epoch": 11.311070686070686, "grad_norm": 1.324439525604248, "learning_rate": 2.364449703637581e-05, "loss": 0.2254, "num_input_tokens_seen": 3919936, "step": 43525 }, { "epoch": 11.312370062370062, "grad_norm": 0.972686767578125, "learning_rate": 2.3638835812557433e-05, "loss": 0.2528, "num_input_tokens_seen": 3920400, "step": 43530 }, { "epoch": 11.31366943866944, "grad_norm": 0.5637104511260986, "learning_rate": 2.363317465874505e-05, "loss": 0.1358, "num_input_tokens_seen": 3920816, "step": 43535 }, { "epoch": 11.314968814968815, "grad_norm": 4.719266891479492, "learning_rate": 2.362751357522979e-05, "loss": 0.1104, "num_input_tokens_seen": 3921248, "step": 43540 }, { "epoch": 11.316268191268192, "grad_norm": 6.246837139129639, "learning_rate": 2.362185256230283e-05, "loss": 0.3113, "num_input_tokens_seen": 3921744, "step": 43545 }, { "epoch": 11.317567567567568, "grad_norm": 6.3845133781433105, "learning_rate": 2.3616191620255307e-05, "loss": 0.2788, "num_input_tokens_seen": 3922192, "step": 43550 }, { "epoch": 11.318866943866944, "grad_norm": 4.654322624206543, "learning_rate": 2.3610530749378386e-05, "loss": 0.2993, "num_input_tokens_seen": 3922624, "step": 43555 }, { "epoch": 11.32016632016632, "grad_norm": 7.15776252746582, "learning_rate": 2.3604869949963192e-05, "loss": 0.4235, "num_input_tokens_seen": 3923072, "step": 43560 }, { "epoch": 11.321465696465696, "grad_norm": 1.0551865100860596, "learning_rate": 2.3599209222300874e-05, "loss": 0.2605, "num_input_tokens_seen": 3923552, "step": 43565 }, { "epoch": 11.322765072765073, "grad_norm": 7.058123588562012, "learning_rate": 2.359354856668257e-05, "loss": 0.2321, "num_input_tokens_seen": 3924000, "step": 43570 }, { "epoch": 11.32406444906445, "grad_norm": 0.6523234248161316, "learning_rate": 2.3587887983399407e-05, "loss": 0.1953, "num_input_tokens_seen": 3924448, "step": 43575 }, { "epoch": 11.325363825363825, "grad_norm": 0.959464430809021, "learning_rate": 2.3582227472742518e-05, "loss": 0.2727, "num_input_tokens_seen": 3924912, "step": 43580 }, { "epoch": 11.326663201663202, "grad_norm": 5.837760925292969, "learning_rate": 2.3576567035003027e-05, "loss": 0.3947, "num_input_tokens_seen": 3925344, "step": 43585 }, { "epoch": 11.327962577962579, "grad_norm": 0.4076293706893921, "learning_rate": 2.3570906670472068e-05, "loss": 0.4504, "num_input_tokens_seen": 3925840, "step": 43590 }, { "epoch": 11.329261954261954, "grad_norm": 0.6245130300521851, "learning_rate": 2.3565246379440737e-05, "loss": 0.1133, "num_input_tokens_seen": 3926288, "step": 43595 }, { "epoch": 11.33056133056133, "grad_norm": 0.5171809792518616, "learning_rate": 2.3559586162200164e-05, "loss": 0.134, "num_input_tokens_seen": 3926704, "step": 43600 }, { "epoch": 11.331860706860708, "grad_norm": 0.8661254048347473, "learning_rate": 2.355392601904145e-05, "loss": 0.0703, "num_input_tokens_seen": 3927152, "step": 43605 }, { "epoch": 11.333160083160083, "grad_norm": 4.90920877456665, "learning_rate": 2.3548265950255717e-05, "loss": 0.4291, "num_input_tokens_seen": 3927584, "step": 43610 }, { "epoch": 11.33445945945946, "grad_norm": 2.9441659450531006, "learning_rate": 2.3542605956134044e-05, "loss": 0.1568, "num_input_tokens_seen": 3928048, "step": 43615 }, { "epoch": 11.335758835758837, "grad_norm": 0.29880115389823914, "learning_rate": 2.353694603696755e-05, "loss": 0.162, "num_input_tokens_seen": 3928464, "step": 43620 }, { "epoch": 11.337058212058212, "grad_norm": 0.9653662443161011, "learning_rate": 2.353128619304733e-05, "loss": 0.1478, "num_input_tokens_seen": 3928944, "step": 43625 }, { "epoch": 11.338357588357589, "grad_norm": 5.713562488555908, "learning_rate": 2.3525626424664456e-05, "loss": 0.3338, "num_input_tokens_seen": 3929360, "step": 43630 }, { "epoch": 11.339656964656964, "grad_norm": 4.483511447906494, "learning_rate": 2.3519966732110037e-05, "loss": 0.3832, "num_input_tokens_seen": 3929824, "step": 43635 }, { "epoch": 11.34095634095634, "grad_norm": 3.3080203533172607, "learning_rate": 2.3514307115675138e-05, "loss": 0.2606, "num_input_tokens_seen": 3930272, "step": 43640 }, { "epoch": 11.342255717255718, "grad_norm": 4.508114814758301, "learning_rate": 2.3508647575650858e-05, "loss": 0.4245, "num_input_tokens_seen": 3930720, "step": 43645 }, { "epoch": 11.343555093555093, "grad_norm": 2.4983601570129395, "learning_rate": 2.3502988112328253e-05, "loss": 0.3643, "num_input_tokens_seen": 3931184, "step": 43650 }, { "epoch": 11.34485446985447, "grad_norm": 4.6337809562683105, "learning_rate": 2.3497328725998406e-05, "loss": 0.2391, "num_input_tokens_seen": 3931616, "step": 43655 }, { "epoch": 11.346153846153847, "grad_norm": 3.032838821411133, "learning_rate": 2.349166941695238e-05, "loss": 0.1273, "num_input_tokens_seen": 3932064, "step": 43660 }, { "epoch": 11.347453222453222, "grad_norm": 0.5480709671974182, "learning_rate": 2.3486010185481248e-05, "loss": 0.1568, "num_input_tokens_seen": 3932528, "step": 43665 }, { "epoch": 11.348752598752599, "grad_norm": 3.661836862564087, "learning_rate": 2.3480351031876054e-05, "loss": 0.2897, "num_input_tokens_seen": 3932976, "step": 43670 }, { "epoch": 11.350051975051976, "grad_norm": 4.638038635253906, "learning_rate": 2.3474691956427875e-05, "loss": 0.268, "num_input_tokens_seen": 3933392, "step": 43675 }, { "epoch": 11.35135135135135, "grad_norm": 3.5534026622772217, "learning_rate": 2.346903295942774e-05, "loss": 0.226, "num_input_tokens_seen": 3933824, "step": 43680 }, { "epoch": 11.352650727650728, "grad_norm": 1.1392674446105957, "learning_rate": 2.346337404116671e-05, "loss": 0.253, "num_input_tokens_seen": 3934288, "step": 43685 }, { "epoch": 11.353950103950105, "grad_norm": 1.1937880516052246, "learning_rate": 2.345771520193583e-05, "loss": 0.2183, "num_input_tokens_seen": 3934704, "step": 43690 }, { "epoch": 11.35524948024948, "grad_norm": 3.3801229000091553, "learning_rate": 2.3452056442026127e-05, "loss": 0.251, "num_input_tokens_seen": 3935152, "step": 43695 }, { "epoch": 11.356548856548857, "grad_norm": 1.8816035985946655, "learning_rate": 2.344639776172865e-05, "loss": 0.123, "num_input_tokens_seen": 3935584, "step": 43700 }, { "epoch": 11.357848232848234, "grad_norm": 0.9561656713485718, "learning_rate": 2.3440739161334417e-05, "loss": 0.0882, "num_input_tokens_seen": 3936064, "step": 43705 }, { "epoch": 11.359147609147609, "grad_norm": 1.2974388599395752, "learning_rate": 2.3435080641134478e-05, "loss": 0.4284, "num_input_tokens_seen": 3936560, "step": 43710 }, { "epoch": 11.360446985446986, "grad_norm": 0.925127387046814, "learning_rate": 2.3429422201419827e-05, "loss": 0.0726, "num_input_tokens_seen": 3936976, "step": 43715 }, { "epoch": 11.361746361746361, "grad_norm": 7.7768378257751465, "learning_rate": 2.34237638424815e-05, "loss": 0.3879, "num_input_tokens_seen": 3937440, "step": 43720 }, { "epoch": 11.363045738045738, "grad_norm": 7.103767395019531, "learning_rate": 2.3418105564610508e-05, "loss": 0.3374, "num_input_tokens_seen": 3937888, "step": 43725 }, { "epoch": 11.364345114345115, "grad_norm": 4.576688766479492, "learning_rate": 2.341244736809787e-05, "loss": 0.3754, "num_input_tokens_seen": 3938320, "step": 43730 }, { "epoch": 11.36564449064449, "grad_norm": 0.402191698551178, "learning_rate": 2.3406789253234575e-05, "loss": 0.3414, "num_input_tokens_seen": 3938752, "step": 43735 }, { "epoch": 11.366943866943867, "grad_norm": 4.059560298919678, "learning_rate": 2.3401131220311643e-05, "loss": 0.2263, "num_input_tokens_seen": 3939232, "step": 43740 }, { "epoch": 11.368243243243244, "grad_norm": 2.680626630783081, "learning_rate": 2.3395473269620056e-05, "loss": 0.2703, "num_input_tokens_seen": 3939696, "step": 43745 }, { "epoch": 11.369542619542619, "grad_norm": 8.60914421081543, "learning_rate": 2.3389815401450827e-05, "loss": 0.2411, "num_input_tokens_seen": 3940144, "step": 43750 }, { "epoch": 11.370841995841996, "grad_norm": 1.6130422353744507, "learning_rate": 2.338415761609493e-05, "loss": 0.4106, "num_input_tokens_seen": 3940608, "step": 43755 }, { "epoch": 11.372141372141373, "grad_norm": 1.8320187330245972, "learning_rate": 2.337849991384335e-05, "loss": 0.1981, "num_input_tokens_seen": 3941056, "step": 43760 }, { "epoch": 11.373440748440748, "grad_norm": 1.4159001111984253, "learning_rate": 2.3372842294987083e-05, "loss": 0.2341, "num_input_tokens_seen": 3941536, "step": 43765 }, { "epoch": 11.374740124740125, "grad_norm": 1.1708087921142578, "learning_rate": 2.3367184759817087e-05, "loss": 0.2849, "num_input_tokens_seen": 3941984, "step": 43770 }, { "epoch": 11.3760395010395, "grad_norm": 2.471914052963257, "learning_rate": 2.336152730862435e-05, "loss": 0.2889, "num_input_tokens_seen": 3942400, "step": 43775 }, { "epoch": 11.377338877338877, "grad_norm": 3.6165289878845215, "learning_rate": 2.3355869941699822e-05, "loss": 0.135, "num_input_tokens_seen": 3942832, "step": 43780 }, { "epoch": 11.378638253638254, "grad_norm": 1.9951223134994507, "learning_rate": 2.3350212659334493e-05, "loss": 0.1364, "num_input_tokens_seen": 3943360, "step": 43785 }, { "epoch": 11.37993762993763, "grad_norm": 13.642547607421875, "learning_rate": 2.3344555461819297e-05, "loss": 0.3155, "num_input_tokens_seen": 3943792, "step": 43790 }, { "epoch": 11.381237006237006, "grad_norm": 4.043295383453369, "learning_rate": 2.3338898349445203e-05, "loss": 0.1451, "num_input_tokens_seen": 3944208, "step": 43795 }, { "epoch": 11.382536382536383, "grad_norm": 1.4300599098205566, "learning_rate": 2.3333241322503157e-05, "loss": 0.2807, "num_input_tokens_seen": 3944656, "step": 43800 }, { "epoch": 11.383835758835758, "grad_norm": 5.970213890075684, "learning_rate": 2.3327584381284115e-05, "loss": 0.645, "num_input_tokens_seen": 3945152, "step": 43805 }, { "epoch": 11.385135135135135, "grad_norm": 0.6542975902557373, "learning_rate": 2.3321927526079e-05, "loss": 0.106, "num_input_tokens_seen": 3945568, "step": 43810 }, { "epoch": 11.386434511434512, "grad_norm": 2.2927560806274414, "learning_rate": 2.3316270757178764e-05, "loss": 0.3419, "num_input_tokens_seen": 3946016, "step": 43815 }, { "epoch": 11.387733887733887, "grad_norm": 2.1411821842193604, "learning_rate": 2.3310614074874346e-05, "loss": 0.2585, "num_input_tokens_seen": 3946480, "step": 43820 }, { "epoch": 11.389033264033264, "grad_norm": 0.6208267211914062, "learning_rate": 2.330495747945665e-05, "loss": 0.2425, "num_input_tokens_seen": 3946928, "step": 43825 }, { "epoch": 11.390332640332641, "grad_norm": 0.5631834268569946, "learning_rate": 2.3299300971216623e-05, "loss": 0.3275, "num_input_tokens_seen": 3947360, "step": 43830 }, { "epoch": 11.391632016632016, "grad_norm": 1.000537395477295, "learning_rate": 2.329364455044517e-05, "loss": 0.1302, "num_input_tokens_seen": 3947776, "step": 43835 }, { "epoch": 11.392931392931393, "grad_norm": 0.3168438673019409, "learning_rate": 2.3287988217433224e-05, "loss": 0.0729, "num_input_tokens_seen": 3948240, "step": 43840 }, { "epoch": 11.39423076923077, "grad_norm": 3.3977389335632324, "learning_rate": 2.3282331972471673e-05, "loss": 0.2169, "num_input_tokens_seen": 3948688, "step": 43845 }, { "epoch": 11.395530145530145, "grad_norm": 4.107855319976807, "learning_rate": 2.327667581585144e-05, "loss": 0.3502, "num_input_tokens_seen": 3949152, "step": 43850 }, { "epoch": 11.396829521829522, "grad_norm": 0.1828298717737198, "learning_rate": 2.3271019747863414e-05, "loss": 0.362, "num_input_tokens_seen": 3949616, "step": 43855 }, { "epoch": 11.398128898128897, "grad_norm": 3.961595296859741, "learning_rate": 2.3265363768798512e-05, "loss": 0.1403, "num_input_tokens_seen": 3950064, "step": 43860 }, { "epoch": 11.399428274428274, "grad_norm": 0.7259796857833862, "learning_rate": 2.32597078789476e-05, "loss": 0.0748, "num_input_tokens_seen": 3950560, "step": 43865 }, { "epoch": 11.400727650727651, "grad_norm": 1.291370153427124, "learning_rate": 2.3254052078601588e-05, "loss": 0.1103, "num_input_tokens_seen": 3950976, "step": 43870 }, { "epoch": 11.402027027027026, "grad_norm": 6.453426361083984, "learning_rate": 2.324839636805134e-05, "loss": 0.1898, "num_input_tokens_seen": 3951408, "step": 43875 }, { "epoch": 11.403326403326403, "grad_norm": 3.0696046352386475, "learning_rate": 2.3242740747587762e-05, "loss": 0.3416, "num_input_tokens_seen": 3951888, "step": 43880 }, { "epoch": 11.40462577962578, "grad_norm": 4.387760162353516, "learning_rate": 2.3237085217501697e-05, "loss": 0.2529, "num_input_tokens_seen": 3952400, "step": 43885 }, { "epoch": 11.405925155925155, "grad_norm": 2.618041753768921, "learning_rate": 2.3231429778084034e-05, "loss": 0.0574, "num_input_tokens_seen": 3952848, "step": 43890 }, { "epoch": 11.407224532224532, "grad_norm": 0.3621775805950165, "learning_rate": 2.322577442962564e-05, "loss": 0.278, "num_input_tokens_seen": 3953312, "step": 43895 }, { "epoch": 11.40852390852391, "grad_norm": 0.5313878655433655, "learning_rate": 2.3220119172417348e-05, "loss": 0.3409, "num_input_tokens_seen": 3953744, "step": 43900 }, { "epoch": 11.409823284823284, "grad_norm": 0.8736481070518494, "learning_rate": 2.321446400675005e-05, "loss": 0.0766, "num_input_tokens_seen": 3954208, "step": 43905 }, { "epoch": 11.411122661122661, "grad_norm": 7.074672698974609, "learning_rate": 2.320880893291457e-05, "loss": 0.4248, "num_input_tokens_seen": 3954672, "step": 43910 }, { "epoch": 11.412422037422038, "grad_norm": 2.862135887145996, "learning_rate": 2.3203153951201764e-05, "loss": 0.2268, "num_input_tokens_seen": 3955120, "step": 43915 }, { "epoch": 11.413721413721413, "grad_norm": 2.7005114555358887, "learning_rate": 2.319749906190247e-05, "loss": 0.3224, "num_input_tokens_seen": 3955568, "step": 43920 }, { "epoch": 11.41502079002079, "grad_norm": 0.8521463871002197, "learning_rate": 2.3191844265307532e-05, "loss": 0.1957, "num_input_tokens_seen": 3955984, "step": 43925 }, { "epoch": 11.416320166320165, "grad_norm": 0.31569162011146545, "learning_rate": 2.3186189561707773e-05, "loss": 0.1483, "num_input_tokens_seen": 3956432, "step": 43930 }, { "epoch": 11.417619542619542, "grad_norm": 4.063239574432373, "learning_rate": 2.3180534951394022e-05, "loss": 0.2839, "num_input_tokens_seen": 3956864, "step": 43935 }, { "epoch": 11.41891891891892, "grad_norm": 0.7757665514945984, "learning_rate": 2.3174880434657097e-05, "loss": 0.2133, "num_input_tokens_seen": 3957296, "step": 43940 }, { "epoch": 11.420218295218294, "grad_norm": 7.949016571044922, "learning_rate": 2.3169226011787835e-05, "loss": 0.3156, "num_input_tokens_seen": 3957760, "step": 43945 }, { "epoch": 11.421517671517671, "grad_norm": 1.3915197849273682, "learning_rate": 2.316357168307702e-05, "loss": 0.1482, "num_input_tokens_seen": 3958192, "step": 43950 }, { "epoch": 11.422817047817048, "grad_norm": 0.5182886123657227, "learning_rate": 2.3157917448815475e-05, "loss": 0.2641, "num_input_tokens_seen": 3958608, "step": 43955 }, { "epoch": 11.424116424116423, "grad_norm": 0.5412570238113403, "learning_rate": 2.315226330929401e-05, "loss": 0.1724, "num_input_tokens_seen": 3959024, "step": 43960 }, { "epoch": 11.4254158004158, "grad_norm": 0.2700720429420471, "learning_rate": 2.31466092648034e-05, "loss": 0.1857, "num_input_tokens_seen": 3959488, "step": 43965 }, { "epoch": 11.426715176715177, "grad_norm": 0.8445931077003479, "learning_rate": 2.314095531563446e-05, "loss": 0.2801, "num_input_tokens_seen": 3959952, "step": 43970 }, { "epoch": 11.428014553014552, "grad_norm": 1.3038218021392822, "learning_rate": 2.313530146207796e-05, "loss": 0.2793, "num_input_tokens_seen": 3960384, "step": 43975 }, { "epoch": 11.42931392931393, "grad_norm": 0.8549911379814148, "learning_rate": 2.3129647704424706e-05, "loss": 0.026, "num_input_tokens_seen": 3960832, "step": 43980 }, { "epoch": 11.430613305613306, "grad_norm": 0.33569061756134033, "learning_rate": 2.3123994042965453e-05, "loss": 0.2183, "num_input_tokens_seen": 3961296, "step": 43985 }, { "epoch": 11.431912681912682, "grad_norm": 10.374342918395996, "learning_rate": 2.3118340477990987e-05, "loss": 0.4997, "num_input_tokens_seen": 3961744, "step": 43990 }, { "epoch": 11.433212058212058, "grad_norm": 6.063970565795898, "learning_rate": 2.3112687009792068e-05, "loss": 0.3404, "num_input_tokens_seen": 3962240, "step": 43995 }, { "epoch": 11.434511434511435, "grad_norm": 0.66560959815979, "learning_rate": 2.3107033638659476e-05, "loss": 0.3542, "num_input_tokens_seen": 3962720, "step": 44000 }, { "epoch": 11.43581081081081, "grad_norm": 4.408219814300537, "learning_rate": 2.3101380364883946e-05, "loss": 0.4345, "num_input_tokens_seen": 3963136, "step": 44005 }, { "epoch": 11.437110187110187, "grad_norm": 4.808623790740967, "learning_rate": 2.309572718875625e-05, "loss": 0.2551, "num_input_tokens_seen": 3963600, "step": 44010 }, { "epoch": 11.438409563409563, "grad_norm": 1.3615871667861938, "learning_rate": 2.3090074110567124e-05, "loss": 0.3579, "num_input_tokens_seen": 3964064, "step": 44015 }, { "epoch": 11.43970893970894, "grad_norm": 4.990614891052246, "learning_rate": 2.3084421130607323e-05, "loss": 0.3316, "num_input_tokens_seen": 3964512, "step": 44020 }, { "epoch": 11.441008316008316, "grad_norm": 2.1650562286376953, "learning_rate": 2.3078768249167575e-05, "loss": 0.3177, "num_input_tokens_seen": 3964928, "step": 44025 }, { "epoch": 11.442307692307692, "grad_norm": 1.5112245082855225, "learning_rate": 2.3073115466538614e-05, "loss": 0.1171, "num_input_tokens_seen": 3965360, "step": 44030 }, { "epoch": 11.443607068607069, "grad_norm": 3.231436252593994, "learning_rate": 2.3067462783011183e-05, "loss": 0.1241, "num_input_tokens_seen": 3965824, "step": 44035 }, { "epoch": 11.444906444906445, "grad_norm": 1.61289381980896, "learning_rate": 2.3061810198875978e-05, "loss": 0.0751, "num_input_tokens_seen": 3966240, "step": 44040 }, { "epoch": 11.44620582120582, "grad_norm": 3.8852615356445312, "learning_rate": 2.3056157714423736e-05, "loss": 0.3591, "num_input_tokens_seen": 3966688, "step": 44045 }, { "epoch": 11.447505197505198, "grad_norm": 0.3544216752052307, "learning_rate": 2.3050505329945163e-05, "loss": 0.2018, "num_input_tokens_seen": 3967152, "step": 44050 }, { "epoch": 11.448804573804575, "grad_norm": 1.6495829820632935, "learning_rate": 2.304485304573098e-05, "loss": 0.3532, "num_input_tokens_seen": 3967584, "step": 44055 }, { "epoch": 11.45010395010395, "grad_norm": 2.625499725341797, "learning_rate": 2.3039200862071863e-05, "loss": 0.2308, "num_input_tokens_seen": 3968032, "step": 44060 }, { "epoch": 11.451403326403327, "grad_norm": 7.625828266143799, "learning_rate": 2.3033548779258535e-05, "loss": 0.1259, "num_input_tokens_seen": 3968480, "step": 44065 }, { "epoch": 11.452702702702704, "grad_norm": 0.407266765832901, "learning_rate": 2.302789679758167e-05, "loss": 0.0741, "num_input_tokens_seen": 3968944, "step": 44070 }, { "epoch": 11.454002079002079, "grad_norm": 0.5045433044433594, "learning_rate": 2.3022244917331974e-05, "loss": 0.1999, "num_input_tokens_seen": 3969440, "step": 44075 }, { "epoch": 11.455301455301456, "grad_norm": 0.63333660364151, "learning_rate": 2.3016593138800104e-05, "loss": 0.2263, "num_input_tokens_seen": 3969872, "step": 44080 }, { "epoch": 11.45660083160083, "grad_norm": 5.813002586364746, "learning_rate": 2.3010941462276755e-05, "loss": 0.3595, "num_input_tokens_seen": 3970336, "step": 44085 }, { "epoch": 11.457900207900208, "grad_norm": 2.868190288543701, "learning_rate": 2.30052898880526e-05, "loss": 0.3104, "num_input_tokens_seen": 3970816, "step": 44090 }, { "epoch": 11.459199584199585, "grad_norm": 0.5755570530891418, "learning_rate": 2.2999638416418283e-05, "loss": 0.3647, "num_input_tokens_seen": 3971248, "step": 44095 }, { "epoch": 11.46049896049896, "grad_norm": 0.7174464464187622, "learning_rate": 2.299398704766449e-05, "loss": 0.1323, "num_input_tokens_seen": 3971712, "step": 44100 }, { "epoch": 11.461798336798337, "grad_norm": 3.3605313301086426, "learning_rate": 2.2988335782081855e-05, "loss": 0.3615, "num_input_tokens_seen": 3972128, "step": 44105 }, { "epoch": 11.463097713097714, "grad_norm": 0.7067080736160278, "learning_rate": 2.2982684619961048e-05, "loss": 0.2768, "num_input_tokens_seen": 3972592, "step": 44110 }, { "epoch": 11.464397089397089, "grad_norm": 4.888877868652344, "learning_rate": 2.2977033561592694e-05, "loss": 0.404, "num_input_tokens_seen": 3973040, "step": 44115 }, { "epoch": 11.465696465696466, "grad_norm": 0.4592474699020386, "learning_rate": 2.297138260726745e-05, "loss": 0.1467, "num_input_tokens_seen": 3973488, "step": 44120 }, { "epoch": 11.466995841995843, "grad_norm": 4.195497989654541, "learning_rate": 2.2965731757275936e-05, "loss": 0.1274, "num_input_tokens_seen": 3973936, "step": 44125 }, { "epoch": 11.468295218295218, "grad_norm": 5.537315845489502, "learning_rate": 2.2960081011908797e-05, "loss": 0.2042, "num_input_tokens_seen": 3974384, "step": 44130 }, { "epoch": 11.469594594594595, "grad_norm": 2.598907470703125, "learning_rate": 2.295443037145663e-05, "loss": 0.1289, "num_input_tokens_seen": 3974800, "step": 44135 }, { "epoch": 11.470893970893972, "grad_norm": 3.6618316173553467, "learning_rate": 2.2948779836210088e-05, "loss": 0.2432, "num_input_tokens_seen": 3975248, "step": 44140 }, { "epoch": 11.472193347193347, "grad_norm": 1.8031607866287231, "learning_rate": 2.294312940645975e-05, "loss": 0.2823, "num_input_tokens_seen": 3975696, "step": 44145 }, { "epoch": 11.473492723492724, "grad_norm": 2.052170753479004, "learning_rate": 2.2937479082496243e-05, "loss": 0.157, "num_input_tokens_seen": 3976128, "step": 44150 }, { "epoch": 11.4747920997921, "grad_norm": 0.09661190956830978, "learning_rate": 2.293182886461017e-05, "loss": 0.4562, "num_input_tokens_seen": 3976560, "step": 44155 }, { "epoch": 11.476091476091476, "grad_norm": 1.7748527526855469, "learning_rate": 2.292617875309211e-05, "loss": 0.2393, "num_input_tokens_seen": 3976992, "step": 44160 }, { "epoch": 11.477390852390853, "grad_norm": 2.6793787479400635, "learning_rate": 2.2920528748232668e-05, "loss": 0.274, "num_input_tokens_seen": 3977440, "step": 44165 }, { "epoch": 11.478690228690228, "grad_norm": 1.0563995838165283, "learning_rate": 2.291487885032242e-05, "loss": 0.2316, "num_input_tokens_seen": 3977888, "step": 44170 }, { "epoch": 11.479989604989605, "grad_norm": 1.2885242700576782, "learning_rate": 2.290922905965196e-05, "loss": 0.2095, "num_input_tokens_seen": 3978320, "step": 44175 }, { "epoch": 11.481288981288982, "grad_norm": 1.0104787349700928, "learning_rate": 2.2903579376511842e-05, "loss": 0.1495, "num_input_tokens_seen": 3978800, "step": 44180 }, { "epoch": 11.482588357588357, "grad_norm": 3.562079906463623, "learning_rate": 2.289792980119265e-05, "loss": 0.3276, "num_input_tokens_seen": 3979280, "step": 44185 }, { "epoch": 11.483887733887734, "grad_norm": 12.269722938537598, "learning_rate": 2.2892280333984938e-05, "loss": 0.2587, "num_input_tokens_seen": 3979760, "step": 44190 }, { "epoch": 11.48518711018711, "grad_norm": 0.11734846234321594, "learning_rate": 2.288663097517928e-05, "loss": 0.1048, "num_input_tokens_seen": 3980208, "step": 44195 }, { "epoch": 11.486486486486486, "grad_norm": 7.294980049133301, "learning_rate": 2.2880981725066205e-05, "loss": 0.4423, "num_input_tokens_seen": 3980624, "step": 44200 }, { "epoch": 11.487785862785863, "grad_norm": 0.5861764550209045, "learning_rate": 2.2875332583936276e-05, "loss": 0.497, "num_input_tokens_seen": 3981056, "step": 44205 }, { "epoch": 11.48908523908524, "grad_norm": 0.23464469611644745, "learning_rate": 2.286968355208002e-05, "loss": 0.0267, "num_input_tokens_seen": 3981536, "step": 44210 }, { "epoch": 11.490384615384615, "grad_norm": 4.678337097167969, "learning_rate": 2.2864034629787993e-05, "loss": 0.1366, "num_input_tokens_seen": 3981968, "step": 44215 }, { "epoch": 11.491683991683992, "grad_norm": 6.237514972686768, "learning_rate": 2.2858385817350704e-05, "loss": 0.5351, "num_input_tokens_seen": 3982416, "step": 44220 }, { "epoch": 11.492983367983369, "grad_norm": 4.283920764923096, "learning_rate": 2.2852737115058682e-05, "loss": 0.4252, "num_input_tokens_seen": 3982880, "step": 44225 }, { "epoch": 11.494282744282744, "grad_norm": 0.12607848644256592, "learning_rate": 2.2847088523202457e-05, "loss": 0.0444, "num_input_tokens_seen": 3983312, "step": 44230 }, { "epoch": 11.495582120582121, "grad_norm": 3.6729555130004883, "learning_rate": 2.284144004207252e-05, "loss": 0.1002, "num_input_tokens_seen": 3983792, "step": 44235 }, { "epoch": 11.496881496881496, "grad_norm": 0.9196068048477173, "learning_rate": 2.2835791671959397e-05, "loss": 0.2139, "num_input_tokens_seen": 3984208, "step": 44240 }, { "epoch": 11.498180873180873, "grad_norm": 0.2387286275625229, "learning_rate": 2.2830143413153576e-05, "loss": 0.1791, "num_input_tokens_seen": 3984640, "step": 44245 }, { "epoch": 11.49948024948025, "grad_norm": 6.010965347290039, "learning_rate": 2.2824495265945568e-05, "loss": 0.4928, "num_input_tokens_seen": 3985104, "step": 44250 }, { "epoch": 11.500779625779625, "grad_norm": 2.9523472785949707, "learning_rate": 2.2818847230625846e-05, "loss": 0.0961, "num_input_tokens_seen": 3985536, "step": 44255 }, { "epoch": 11.502079002079002, "grad_norm": 1.47666597366333, "learning_rate": 2.2813199307484904e-05, "loss": 0.2718, "num_input_tokens_seen": 3985984, "step": 44260 }, { "epoch": 11.503378378378379, "grad_norm": 5.065033435821533, "learning_rate": 2.280755149681321e-05, "loss": 0.4683, "num_input_tokens_seen": 3986432, "step": 44265 }, { "epoch": 11.504677754677754, "grad_norm": 7.809203147888184, "learning_rate": 2.2801903798901256e-05, "loss": 0.3585, "num_input_tokens_seen": 3986880, "step": 44270 }, { "epoch": 11.505977130977131, "grad_norm": 4.369736194610596, "learning_rate": 2.2796256214039483e-05, "loss": 0.373, "num_input_tokens_seen": 3987312, "step": 44275 }, { "epoch": 11.507276507276508, "grad_norm": 0.9004629850387573, "learning_rate": 2.2790608742518372e-05, "loss": 0.1483, "num_input_tokens_seen": 3987808, "step": 44280 }, { "epoch": 11.508575883575883, "grad_norm": 0.32521647214889526, "learning_rate": 2.2784961384628374e-05, "loss": 0.2936, "num_input_tokens_seen": 3988272, "step": 44285 }, { "epoch": 11.50987525987526, "grad_norm": 0.5535052418708801, "learning_rate": 2.2779314140659923e-05, "loss": 0.194, "num_input_tokens_seen": 3988720, "step": 44290 }, { "epoch": 11.511174636174637, "grad_norm": 1.2304472923278809, "learning_rate": 2.277366701090348e-05, "loss": 0.1114, "num_input_tokens_seen": 3989152, "step": 44295 }, { "epoch": 11.512474012474012, "grad_norm": 4.059344291687012, "learning_rate": 2.276801999564947e-05, "loss": 0.2694, "num_input_tokens_seen": 3989584, "step": 44300 }, { "epoch": 11.513773388773389, "grad_norm": 0.1556776463985443, "learning_rate": 2.276237309518834e-05, "loss": 0.0623, "num_input_tokens_seen": 3990112, "step": 44305 }, { "epoch": 11.515072765072766, "grad_norm": 4.623854637145996, "learning_rate": 2.2756726309810496e-05, "loss": 0.3675, "num_input_tokens_seen": 3990576, "step": 44310 }, { "epoch": 11.516372141372141, "grad_norm": 3.393983840942383, "learning_rate": 2.2751079639806376e-05, "loss": 0.1525, "num_input_tokens_seen": 3990992, "step": 44315 }, { "epoch": 11.517671517671518, "grad_norm": 0.1879797726869583, "learning_rate": 2.2745433085466374e-05, "loss": 0.1706, "num_input_tokens_seen": 3991440, "step": 44320 }, { "epoch": 11.518970893970893, "grad_norm": 9.168889045715332, "learning_rate": 2.2739786647080924e-05, "loss": 0.5704, "num_input_tokens_seen": 3991936, "step": 44325 }, { "epoch": 11.52027027027027, "grad_norm": 5.305960655212402, "learning_rate": 2.2734140324940398e-05, "loss": 0.1841, "num_input_tokens_seen": 3992368, "step": 44330 }, { "epoch": 11.521569646569647, "grad_norm": 7.348409652709961, "learning_rate": 2.2728494119335214e-05, "loss": 0.517, "num_input_tokens_seen": 3992816, "step": 44335 }, { "epoch": 11.522869022869022, "grad_norm": 1.3584617376327515, "learning_rate": 2.2722848030555745e-05, "loss": 0.1733, "num_input_tokens_seen": 3993264, "step": 44340 }, { "epoch": 11.5241683991684, "grad_norm": 1.4642218351364136, "learning_rate": 2.27172020588924e-05, "loss": 0.216, "num_input_tokens_seen": 3993712, "step": 44345 }, { "epoch": 11.525467775467776, "grad_norm": 1.720219373703003, "learning_rate": 2.271155620463553e-05, "loss": 0.1712, "num_input_tokens_seen": 3994128, "step": 44350 }, { "epoch": 11.526767151767151, "grad_norm": 1.720627784729004, "learning_rate": 2.2705910468075516e-05, "loss": 0.1592, "num_input_tokens_seen": 3994592, "step": 44355 }, { "epoch": 11.528066528066528, "grad_norm": 2.0699338912963867, "learning_rate": 2.270026484950273e-05, "loss": 0.2301, "num_input_tokens_seen": 3995040, "step": 44360 }, { "epoch": 11.529365904365905, "grad_norm": 1.8277777433395386, "learning_rate": 2.2694619349207523e-05, "loss": 0.3041, "num_input_tokens_seen": 3995504, "step": 44365 }, { "epoch": 11.53066528066528, "grad_norm": 0.8211451172828674, "learning_rate": 2.268897396748025e-05, "loss": 0.1841, "num_input_tokens_seen": 3995952, "step": 44370 }, { "epoch": 11.531964656964657, "grad_norm": 5.499149799346924, "learning_rate": 2.2683328704611255e-05, "loss": 0.2388, "num_input_tokens_seen": 3996432, "step": 44375 }, { "epoch": 11.533264033264032, "grad_norm": 1.1430758237838745, "learning_rate": 2.26776835608909e-05, "loss": 0.2997, "num_input_tokens_seen": 3996848, "step": 44380 }, { "epoch": 11.53456340956341, "grad_norm": 2.310809850692749, "learning_rate": 2.2672038536609487e-05, "loss": 0.2617, "num_input_tokens_seen": 3997264, "step": 44385 }, { "epoch": 11.535862785862786, "grad_norm": 4.790080547332764, "learning_rate": 2.266639363205738e-05, "loss": 0.1499, "num_input_tokens_seen": 3997744, "step": 44390 }, { "epoch": 11.537162162162161, "grad_norm": 3.3084185123443604, "learning_rate": 2.266074884752487e-05, "loss": 0.372, "num_input_tokens_seen": 3998192, "step": 44395 }, { "epoch": 11.538461538461538, "grad_norm": 2.2884364128112793, "learning_rate": 2.2655104183302294e-05, "loss": 0.4931, "num_input_tokens_seen": 3998608, "step": 44400 }, { "epoch": 11.539760914760915, "grad_norm": 4.181210041046143, "learning_rate": 2.2649459639679954e-05, "loss": 0.1067, "num_input_tokens_seen": 3999056, "step": 44405 }, { "epoch": 11.54106029106029, "grad_norm": 5.1941022872924805, "learning_rate": 2.2643815216948166e-05, "loss": 0.2773, "num_input_tokens_seen": 3999584, "step": 44410 }, { "epoch": 11.542359667359667, "grad_norm": 0.4935092628002167, "learning_rate": 2.2638170915397214e-05, "loss": 0.2481, "num_input_tokens_seen": 4000032, "step": 44415 }, { "epoch": 11.543659043659044, "grad_norm": 0.45047634840011597, "learning_rate": 2.2632526735317387e-05, "loss": 0.2198, "num_input_tokens_seen": 4000512, "step": 44420 }, { "epoch": 11.54495841995842, "grad_norm": 0.25863826274871826, "learning_rate": 2.2626882676998994e-05, "loss": 0.3143, "num_input_tokens_seen": 4000928, "step": 44425 }, { "epoch": 11.546257796257796, "grad_norm": 4.488259792327881, "learning_rate": 2.2621238740732284e-05, "loss": 0.165, "num_input_tokens_seen": 4001408, "step": 44430 }, { "epoch": 11.547557172557173, "grad_norm": 2.7541654109954834, "learning_rate": 2.261559492680755e-05, "loss": 0.205, "num_input_tokens_seen": 4001808, "step": 44435 }, { "epoch": 11.548856548856548, "grad_norm": 1.3577394485473633, "learning_rate": 2.260995123551505e-05, "loss": 0.3714, "num_input_tokens_seen": 4002288, "step": 44440 }, { "epoch": 11.550155925155925, "grad_norm": 0.6601695418357849, "learning_rate": 2.260430766714506e-05, "loss": 0.3401, "num_input_tokens_seen": 4002736, "step": 44445 }, { "epoch": 11.551455301455302, "grad_norm": 2.212385416030884, "learning_rate": 2.259866422198781e-05, "loss": 0.2461, "num_input_tokens_seen": 4003152, "step": 44450 }, { "epoch": 11.552754677754677, "grad_norm": 2.3532590866088867, "learning_rate": 2.2593020900333563e-05, "loss": 0.1652, "num_input_tokens_seen": 4003568, "step": 44455 }, { "epoch": 11.554054054054054, "grad_norm": 2.3901913166046143, "learning_rate": 2.2587377702472555e-05, "loss": 0.3002, "num_input_tokens_seen": 4004016, "step": 44460 }, { "epoch": 11.55535343035343, "grad_norm": 0.5269133448600769, "learning_rate": 2.2581734628695034e-05, "loss": 0.1413, "num_input_tokens_seen": 4004432, "step": 44465 }, { "epoch": 11.556652806652806, "grad_norm": 3.9288480281829834, "learning_rate": 2.2576091679291205e-05, "loss": 0.2393, "num_input_tokens_seen": 4004880, "step": 44470 }, { "epoch": 11.557952182952183, "grad_norm": 3.206977367401123, "learning_rate": 2.257044885455131e-05, "loss": 0.4497, "num_input_tokens_seen": 4005328, "step": 44475 }, { "epoch": 11.559251559251559, "grad_norm": 0.6799969673156738, "learning_rate": 2.2564806154765565e-05, "loss": 0.1676, "num_input_tokens_seen": 4005776, "step": 44480 }, { "epoch": 11.560550935550935, "grad_norm": 3.7807610034942627, "learning_rate": 2.2559163580224163e-05, "loss": 0.1363, "num_input_tokens_seen": 4006192, "step": 44485 }, { "epoch": 11.561850311850312, "grad_norm": 5.206502437591553, "learning_rate": 2.255352113121732e-05, "loss": 0.3314, "num_input_tokens_seen": 4006640, "step": 44490 }, { "epoch": 11.563149688149688, "grad_norm": 1.1848642826080322, "learning_rate": 2.2547878808035223e-05, "loss": 0.0838, "num_input_tokens_seen": 4007120, "step": 44495 }, { "epoch": 11.564449064449065, "grad_norm": 2.5221574306488037, "learning_rate": 2.254223661096808e-05, "loss": 0.044, "num_input_tokens_seen": 4007552, "step": 44500 }, { "epoch": 11.565748440748441, "grad_norm": 4.09886360168457, "learning_rate": 2.253659454030605e-05, "loss": 0.3941, "num_input_tokens_seen": 4007984, "step": 44505 }, { "epoch": 11.567047817047817, "grad_norm": 3.8334968090057373, "learning_rate": 2.2530952596339334e-05, "loss": 0.2514, "num_input_tokens_seen": 4008464, "step": 44510 }, { "epoch": 11.568347193347194, "grad_norm": 8.423591613769531, "learning_rate": 2.2525310779358084e-05, "loss": 0.3421, "num_input_tokens_seen": 4008928, "step": 44515 }, { "epoch": 11.56964656964657, "grad_norm": 2.6376967430114746, "learning_rate": 2.251966908965248e-05, "loss": 0.2899, "num_input_tokens_seen": 4009376, "step": 44520 }, { "epoch": 11.570945945945946, "grad_norm": 2.687683582305908, "learning_rate": 2.2514027527512664e-05, "loss": 0.279, "num_input_tokens_seen": 4009840, "step": 44525 }, { "epoch": 11.572245322245323, "grad_norm": 4.348421096801758, "learning_rate": 2.2508386093228798e-05, "loss": 0.2884, "num_input_tokens_seen": 4010256, "step": 44530 }, { "epoch": 11.573544698544698, "grad_norm": 4.101791858673096, "learning_rate": 2.2502744787091015e-05, "loss": 0.2967, "num_input_tokens_seen": 4010688, "step": 44535 }, { "epoch": 11.574844074844075, "grad_norm": 1.6260253190994263, "learning_rate": 2.2497103609389475e-05, "loss": 0.1278, "num_input_tokens_seen": 4011168, "step": 44540 }, { "epoch": 11.576143451143452, "grad_norm": 2.6506245136260986, "learning_rate": 2.2491462560414287e-05, "loss": 0.2428, "num_input_tokens_seen": 4011632, "step": 44545 }, { "epoch": 11.577442827442827, "grad_norm": 8.356532096862793, "learning_rate": 2.248582164045558e-05, "loss": 0.2401, "num_input_tokens_seen": 4012080, "step": 44550 }, { "epoch": 11.578742203742204, "grad_norm": 2.6752429008483887, "learning_rate": 2.248018084980348e-05, "loss": 0.2447, "num_input_tokens_seen": 4012528, "step": 44555 }, { "epoch": 11.58004158004158, "grad_norm": 5.944462776184082, "learning_rate": 2.2474540188748088e-05, "loss": 0.2539, "num_input_tokens_seen": 4013008, "step": 44560 }, { "epoch": 11.581340956340956, "grad_norm": 0.6797380447387695, "learning_rate": 2.246889965757952e-05, "loss": 0.1814, "num_input_tokens_seen": 4013456, "step": 44565 }, { "epoch": 11.582640332640333, "grad_norm": 5.765392303466797, "learning_rate": 2.2463259256587855e-05, "loss": 0.1894, "num_input_tokens_seen": 4013952, "step": 44570 }, { "epoch": 11.58393970893971, "grad_norm": 0.5145164728164673, "learning_rate": 2.2457618986063217e-05, "loss": 0.166, "num_input_tokens_seen": 4014400, "step": 44575 }, { "epoch": 11.585239085239085, "grad_norm": 0.36224445700645447, "learning_rate": 2.2451978846295654e-05, "loss": 0.0633, "num_input_tokens_seen": 4014848, "step": 44580 }, { "epoch": 11.586538461538462, "grad_norm": 6.331992149353027, "learning_rate": 2.2446338837575268e-05, "loss": 0.2199, "num_input_tokens_seen": 4015296, "step": 44585 }, { "epoch": 11.587837837837839, "grad_norm": 0.39489638805389404, "learning_rate": 2.2440698960192115e-05, "loss": 0.0986, "num_input_tokens_seen": 4015760, "step": 44590 }, { "epoch": 11.589137214137214, "grad_norm": 0.21505065262317657, "learning_rate": 2.243505921443628e-05, "loss": 0.0912, "num_input_tokens_seen": 4016208, "step": 44595 }, { "epoch": 11.59043659043659, "grad_norm": 5.1292805671691895, "learning_rate": 2.2429419600597796e-05, "loss": 0.1504, "num_input_tokens_seen": 4016672, "step": 44600 }, { "epoch": 11.591735966735968, "grad_norm": 1.0509434938430786, "learning_rate": 2.242378011896673e-05, "loss": 0.4012, "num_input_tokens_seen": 4017152, "step": 44605 }, { "epoch": 11.593035343035343, "grad_norm": 0.43198662996292114, "learning_rate": 2.2418140769833125e-05, "loss": 0.2061, "num_input_tokens_seen": 4017600, "step": 44610 }, { "epoch": 11.59433471933472, "grad_norm": 10.988152503967285, "learning_rate": 2.2412501553487003e-05, "loss": 0.6149, "num_input_tokens_seen": 4018048, "step": 44615 }, { "epoch": 11.595634095634095, "grad_norm": 1.8935065269470215, "learning_rate": 2.240686247021841e-05, "loss": 0.2238, "num_input_tokens_seen": 4018512, "step": 44620 }, { "epoch": 11.596933471933472, "grad_norm": 3.3370656967163086, "learning_rate": 2.2401223520317362e-05, "loss": 0.3365, "num_input_tokens_seen": 4018976, "step": 44625 }, { "epoch": 11.598232848232849, "grad_norm": 4.437946796417236, "learning_rate": 2.239558470407389e-05, "loss": 0.291, "num_input_tokens_seen": 4019392, "step": 44630 }, { "epoch": 11.599532224532224, "grad_norm": 0.7160483598709106, "learning_rate": 2.2389946021777976e-05, "loss": 0.1027, "num_input_tokens_seen": 4019840, "step": 44635 }, { "epoch": 11.6008316008316, "grad_norm": 6.008934020996094, "learning_rate": 2.238430747371965e-05, "loss": 0.1914, "num_input_tokens_seen": 4020256, "step": 44640 }, { "epoch": 11.602130977130978, "grad_norm": 5.142937660217285, "learning_rate": 2.237866906018889e-05, "loss": 0.2519, "num_input_tokens_seen": 4020672, "step": 44645 }, { "epoch": 11.603430353430353, "grad_norm": 0.7283177375793457, "learning_rate": 2.2373030781475697e-05, "loss": 0.1248, "num_input_tokens_seen": 4021120, "step": 44650 }, { "epoch": 11.60472972972973, "grad_norm": 6.498376369476318, "learning_rate": 2.236739263787004e-05, "loss": 0.1908, "num_input_tokens_seen": 4021584, "step": 44655 }, { "epoch": 11.606029106029107, "grad_norm": 10.138989448547363, "learning_rate": 2.236175462966192e-05, "loss": 0.2101, "num_input_tokens_seen": 4022048, "step": 44660 }, { "epoch": 11.607328482328482, "grad_norm": 2.15677547454834, "learning_rate": 2.235611675714127e-05, "loss": 0.1125, "num_input_tokens_seen": 4022496, "step": 44665 }, { "epoch": 11.608627858627859, "grad_norm": 3.9802653789520264, "learning_rate": 2.2350479020598074e-05, "loss": 0.4746, "num_input_tokens_seen": 4022976, "step": 44670 }, { "epoch": 11.609927234927234, "grad_norm": 4.093607425689697, "learning_rate": 2.2344841420322287e-05, "loss": 0.1423, "num_input_tokens_seen": 4023424, "step": 44675 }, { "epoch": 11.611226611226611, "grad_norm": 8.577143669128418, "learning_rate": 2.233920395660384e-05, "loss": 0.5336, "num_input_tokens_seen": 4023872, "step": 44680 }, { "epoch": 11.612525987525988, "grad_norm": 0.32015490531921387, "learning_rate": 2.233356662973269e-05, "loss": 0.0456, "num_input_tokens_seen": 4024352, "step": 44685 }, { "epoch": 11.613825363825363, "grad_norm": 5.676109790802002, "learning_rate": 2.2327929439998755e-05, "loss": 0.2499, "num_input_tokens_seen": 4024800, "step": 44690 }, { "epoch": 11.61512474012474, "grad_norm": 8.60830307006836, "learning_rate": 2.232229238769198e-05, "loss": 0.1788, "num_input_tokens_seen": 4025248, "step": 44695 }, { "epoch": 11.616424116424117, "grad_norm": 4.585947513580322, "learning_rate": 2.231665547310226e-05, "loss": 0.1701, "num_input_tokens_seen": 4025696, "step": 44700 }, { "epoch": 11.617723492723492, "grad_norm": 4.836632251739502, "learning_rate": 2.2311018696519532e-05, "loss": 0.2032, "num_input_tokens_seen": 4026128, "step": 44705 }, { "epoch": 11.619022869022869, "grad_norm": 3.994296073913574, "learning_rate": 2.230538205823368e-05, "loss": 0.3947, "num_input_tokens_seen": 4026576, "step": 44710 }, { "epoch": 11.620322245322246, "grad_norm": 0.8546560406684875, "learning_rate": 2.229974555853462e-05, "loss": 0.2482, "num_input_tokens_seen": 4027008, "step": 44715 }, { "epoch": 11.621621621621621, "grad_norm": 9.676445007324219, "learning_rate": 2.2294109197712223e-05, "loss": 0.4792, "num_input_tokens_seen": 4027504, "step": 44720 }, { "epoch": 11.622920997920998, "grad_norm": 4.145840644836426, "learning_rate": 2.228847297605639e-05, "loss": 0.2783, "num_input_tokens_seen": 4027936, "step": 44725 }, { "epoch": 11.624220374220375, "grad_norm": 1.5550265312194824, "learning_rate": 2.228283689385698e-05, "loss": 0.2243, "num_input_tokens_seen": 4028384, "step": 44730 }, { "epoch": 11.62551975051975, "grad_norm": 13.516754150390625, "learning_rate": 2.2277200951403887e-05, "loss": 0.3171, "num_input_tokens_seen": 4028864, "step": 44735 }, { "epoch": 11.626819126819127, "grad_norm": 3.5130701065063477, "learning_rate": 2.227156514898695e-05, "loss": 0.3205, "num_input_tokens_seen": 4029360, "step": 44740 }, { "epoch": 11.628118503118504, "grad_norm": 3.3943724632263184, "learning_rate": 2.2265929486896028e-05, "loss": 0.408, "num_input_tokens_seen": 4029808, "step": 44745 }, { "epoch": 11.629417879417879, "grad_norm": 3.6872012615203857, "learning_rate": 2.2260293965420982e-05, "loss": 0.4869, "num_input_tokens_seen": 4030224, "step": 44750 }, { "epoch": 11.630717255717256, "grad_norm": 0.2948959767818451, "learning_rate": 2.2254658584851633e-05, "loss": 0.0311, "num_input_tokens_seen": 4030688, "step": 44755 }, { "epoch": 11.632016632016633, "grad_norm": 0.4023982584476471, "learning_rate": 2.224902334547783e-05, "loss": 0.4247, "num_input_tokens_seen": 4031152, "step": 44760 }, { "epoch": 11.633316008316008, "grad_norm": 3.592643976211548, "learning_rate": 2.2243388247589382e-05, "loss": 0.232, "num_input_tokens_seen": 4031600, "step": 44765 }, { "epoch": 11.634615384615385, "grad_norm": 5.524733066558838, "learning_rate": 2.2237753291476133e-05, "loss": 0.3151, "num_input_tokens_seen": 4032080, "step": 44770 }, { "epoch": 11.63591476091476, "grad_norm": 4.614160537719727, "learning_rate": 2.2232118477427867e-05, "loss": 0.1533, "num_input_tokens_seen": 4032528, "step": 44775 }, { "epoch": 11.637214137214137, "grad_norm": 2.9276111125946045, "learning_rate": 2.2226483805734404e-05, "loss": 0.283, "num_input_tokens_seen": 4032960, "step": 44780 }, { "epoch": 11.638513513513514, "grad_norm": 2.3527987003326416, "learning_rate": 2.222084927668553e-05, "loss": 0.5956, "num_input_tokens_seen": 4033424, "step": 44785 }, { "epoch": 11.63981288981289, "grad_norm": 0.8906258344650269, "learning_rate": 2.2215214890571053e-05, "loss": 0.3115, "num_input_tokens_seen": 4033872, "step": 44790 }, { "epoch": 11.641112266112266, "grad_norm": 6.083507537841797, "learning_rate": 2.2209580647680735e-05, "loss": 0.1251, "num_input_tokens_seen": 4034400, "step": 44795 }, { "epoch": 11.642411642411643, "grad_norm": 2.261449098587036, "learning_rate": 2.2203946548304358e-05, "loss": 0.1681, "num_input_tokens_seen": 4034848, "step": 44800 }, { "epoch": 11.643711018711018, "grad_norm": 3.150339365005493, "learning_rate": 2.2198312592731695e-05, "loss": 0.3774, "num_input_tokens_seen": 4035312, "step": 44805 }, { "epoch": 11.645010395010395, "grad_norm": 1.9678430557250977, "learning_rate": 2.2192678781252492e-05, "loss": 0.1436, "num_input_tokens_seen": 4035760, "step": 44810 }, { "epoch": 11.646309771309772, "grad_norm": 0.2112901210784912, "learning_rate": 2.2187045114156513e-05, "loss": 0.2097, "num_input_tokens_seen": 4036208, "step": 44815 }, { "epoch": 11.647609147609147, "grad_norm": 0.8402948379516602, "learning_rate": 2.218141159173349e-05, "loss": 0.0979, "num_input_tokens_seen": 4036640, "step": 44820 }, { "epoch": 11.648908523908524, "grad_norm": 7.5333170890808105, "learning_rate": 2.2175778214273185e-05, "loss": 0.4602, "num_input_tokens_seen": 4037072, "step": 44825 }, { "epoch": 11.6502079002079, "grad_norm": 0.12452735006809235, "learning_rate": 2.2170144982065303e-05, "loss": 0.2324, "num_input_tokens_seen": 4037520, "step": 44830 }, { "epoch": 11.651507276507276, "grad_norm": 0.4459519386291504, "learning_rate": 2.216451189539958e-05, "loss": 0.1828, "num_input_tokens_seen": 4037952, "step": 44835 }, { "epoch": 11.652806652806653, "grad_norm": 0.057890184223651886, "learning_rate": 2.2158878954565717e-05, "loss": 0.1535, "num_input_tokens_seen": 4038432, "step": 44840 }, { "epoch": 11.654106029106028, "grad_norm": 0.4663446247577667, "learning_rate": 2.2153246159853446e-05, "loss": 0.0254, "num_input_tokens_seen": 4038832, "step": 44845 }, { "epoch": 11.655405405405405, "grad_norm": 0.11943992227315903, "learning_rate": 2.2147613511552443e-05, "loss": 0.1318, "num_input_tokens_seen": 4039264, "step": 44850 }, { "epoch": 11.656704781704782, "grad_norm": 0.3630470931529999, "learning_rate": 2.2141981009952414e-05, "loss": 0.0117, "num_input_tokens_seen": 4039680, "step": 44855 }, { "epoch": 11.658004158004157, "grad_norm": 6.911283493041992, "learning_rate": 2.2136348655343033e-05, "loss": 0.4031, "num_input_tokens_seen": 4040128, "step": 44860 }, { "epoch": 11.659303534303534, "grad_norm": 0.08852630108594894, "learning_rate": 2.2130716448014e-05, "loss": 0.2123, "num_input_tokens_seen": 4040592, "step": 44865 }, { "epoch": 11.660602910602911, "grad_norm": 0.17210358381271362, "learning_rate": 2.2125084388254962e-05, "loss": 0.4567, "num_input_tokens_seen": 4041040, "step": 44870 }, { "epoch": 11.661902286902286, "grad_norm": 0.4183543622493744, "learning_rate": 2.2119452476355577e-05, "loss": 0.6216, "num_input_tokens_seen": 4041552, "step": 44875 }, { "epoch": 11.663201663201663, "grad_norm": 0.16137701272964478, "learning_rate": 2.211382071260553e-05, "loss": 0.0139, "num_input_tokens_seen": 4042000, "step": 44880 }, { "epoch": 11.66450103950104, "grad_norm": 0.6676397919654846, "learning_rate": 2.210818909729443e-05, "loss": 0.1692, "num_input_tokens_seen": 4042464, "step": 44885 }, { "epoch": 11.665800415800415, "grad_norm": 0.45631149411201477, "learning_rate": 2.210255763071195e-05, "loss": 0.1553, "num_input_tokens_seen": 4042912, "step": 44890 }, { "epoch": 11.667099792099792, "grad_norm": 4.415797233581543, "learning_rate": 2.20969263131477e-05, "loss": 0.4951, "num_input_tokens_seen": 4043376, "step": 44895 }, { "epoch": 11.66839916839917, "grad_norm": 3.9523684978485107, "learning_rate": 2.2091295144891317e-05, "loss": 0.5766, "num_input_tokens_seen": 4043840, "step": 44900 }, { "epoch": 11.669698544698544, "grad_norm": 4.225253105163574, "learning_rate": 2.20856641262324e-05, "loss": 0.5207, "num_input_tokens_seen": 4044288, "step": 44905 }, { "epoch": 11.670997920997921, "grad_norm": 2.3303346633911133, "learning_rate": 2.2080033257460586e-05, "loss": 0.3373, "num_input_tokens_seen": 4044736, "step": 44910 }, { "epoch": 11.672297297297296, "grad_norm": 0.130095973610878, "learning_rate": 2.207440253886545e-05, "loss": 0.1575, "num_input_tokens_seen": 4045216, "step": 44915 }, { "epoch": 11.673596673596673, "grad_norm": 3.961961269378662, "learning_rate": 2.2068771970736595e-05, "loss": 0.4291, "num_input_tokens_seen": 4045648, "step": 44920 }, { "epoch": 11.67489604989605, "grad_norm": 5.607157230377197, "learning_rate": 2.2063141553363603e-05, "loss": 0.2977, "num_input_tokens_seen": 4046128, "step": 44925 }, { "epoch": 11.676195426195425, "grad_norm": 0.8513463735580444, "learning_rate": 2.2057511287036064e-05, "loss": 0.2553, "num_input_tokens_seen": 4046576, "step": 44930 }, { "epoch": 11.677494802494802, "grad_norm": 5.607776165008545, "learning_rate": 2.2051881172043538e-05, "loss": 0.2458, "num_input_tokens_seen": 4047072, "step": 44935 }, { "epoch": 11.67879417879418, "grad_norm": 3.219994306564331, "learning_rate": 2.2046251208675578e-05, "loss": 0.3264, "num_input_tokens_seen": 4047520, "step": 44940 }, { "epoch": 11.680093555093555, "grad_norm": 5.046657085418701, "learning_rate": 2.204062139722176e-05, "loss": 0.2351, "num_input_tokens_seen": 4047968, "step": 44945 }, { "epoch": 11.681392931392931, "grad_norm": 2.1195132732391357, "learning_rate": 2.2034991737971608e-05, "loss": 0.1597, "num_input_tokens_seen": 4048448, "step": 44950 }, { "epoch": 11.682692307692308, "grad_norm": 4.740317344665527, "learning_rate": 2.2029362231214677e-05, "loss": 0.1676, "num_input_tokens_seen": 4048944, "step": 44955 }, { "epoch": 11.683991683991684, "grad_norm": 0.4682474136352539, "learning_rate": 2.202373287724049e-05, "loss": 0.2836, "num_input_tokens_seen": 4049392, "step": 44960 }, { "epoch": 11.68529106029106, "grad_norm": 2.356391191482544, "learning_rate": 2.2018103676338583e-05, "loss": 0.3198, "num_input_tokens_seen": 4049840, "step": 44965 }, { "epoch": 11.686590436590437, "grad_norm": 8.920246124267578, "learning_rate": 2.2012474628798448e-05, "loss": 0.1019, "num_input_tokens_seen": 4050304, "step": 44970 }, { "epoch": 11.687889812889813, "grad_norm": 5.4789299964904785, "learning_rate": 2.2006845734909614e-05, "loss": 0.3557, "num_input_tokens_seen": 4050736, "step": 44975 }, { "epoch": 11.68918918918919, "grad_norm": 5.323229789733887, "learning_rate": 2.2001216994961565e-05, "loss": 0.3721, "num_input_tokens_seen": 4051184, "step": 44980 }, { "epoch": 11.690488565488565, "grad_norm": 1.0760109424591064, "learning_rate": 2.199558840924381e-05, "loss": 0.2975, "num_input_tokens_seen": 4051632, "step": 44985 }, { "epoch": 11.691787941787942, "grad_norm": 5.898285388946533, "learning_rate": 2.198995997804581e-05, "loss": 0.2828, "num_input_tokens_seen": 4052096, "step": 44990 }, { "epoch": 11.693087318087318, "grad_norm": 1.0174028873443604, "learning_rate": 2.198433170165706e-05, "loss": 0.3265, "num_input_tokens_seen": 4052512, "step": 44995 }, { "epoch": 11.694386694386694, "grad_norm": 3.7822422981262207, "learning_rate": 2.1978703580367024e-05, "loss": 0.2421, "num_input_tokens_seen": 4052928, "step": 45000 }, { "epoch": 11.69568607068607, "grad_norm": 2.011286735534668, "learning_rate": 2.1973075614465147e-05, "loss": 0.292, "num_input_tokens_seen": 4053344, "step": 45005 }, { "epoch": 11.696985446985448, "grad_norm": 1.6685564517974854, "learning_rate": 2.1967447804240895e-05, "loss": 0.1408, "num_input_tokens_seen": 4053808, "step": 45010 }, { "epoch": 11.698284823284823, "grad_norm": 3.3542723655700684, "learning_rate": 2.1961820149983706e-05, "loss": 0.1452, "num_input_tokens_seen": 4054240, "step": 45015 }, { "epoch": 11.6995841995842, "grad_norm": 5.956518173217773, "learning_rate": 2.1956192651983028e-05, "loss": 0.1831, "num_input_tokens_seen": 4054688, "step": 45020 }, { "epoch": 11.700883575883577, "grad_norm": 0.82234787940979, "learning_rate": 2.1950565310528266e-05, "loss": 0.1378, "num_input_tokens_seen": 4055104, "step": 45025 }, { "epoch": 11.702182952182952, "grad_norm": 3.7935831546783447, "learning_rate": 2.1944938125908857e-05, "loss": 0.2168, "num_input_tokens_seen": 4055600, "step": 45030 }, { "epoch": 11.703482328482329, "grad_norm": 4.416140079498291, "learning_rate": 2.1939311098414202e-05, "loss": 0.2259, "num_input_tokens_seen": 4056096, "step": 45035 }, { "epoch": 11.704781704781706, "grad_norm": 4.997784614562988, "learning_rate": 2.1933684228333722e-05, "loss": 0.2292, "num_input_tokens_seen": 4056496, "step": 45040 }, { "epoch": 11.70608108108108, "grad_norm": 1.722916603088379, "learning_rate": 2.1928057515956788e-05, "loss": 0.2139, "num_input_tokens_seen": 4056944, "step": 45045 }, { "epoch": 11.707380457380458, "grad_norm": 0.9944283366203308, "learning_rate": 2.1922430961572806e-05, "loss": 0.3119, "num_input_tokens_seen": 4057408, "step": 45050 }, { "epoch": 11.708679833679835, "grad_norm": 1.4764224290847778, "learning_rate": 2.1916804565471143e-05, "loss": 0.1379, "num_input_tokens_seen": 4057824, "step": 45055 }, { "epoch": 11.70997920997921, "grad_norm": 7.348563194274902, "learning_rate": 2.1911178327941183e-05, "loss": 0.5591, "num_input_tokens_seen": 4058256, "step": 45060 }, { "epoch": 11.711278586278587, "grad_norm": 5.443587779998779, "learning_rate": 2.1905552249272284e-05, "loss": 0.2286, "num_input_tokens_seen": 4058688, "step": 45065 }, { "epoch": 11.712577962577962, "grad_norm": 9.091466903686523, "learning_rate": 2.1899926329753783e-05, "loss": 0.2896, "num_input_tokens_seen": 4059120, "step": 45070 }, { "epoch": 11.713877338877339, "grad_norm": 9.07482624053955, "learning_rate": 2.1894300569675056e-05, "loss": 0.2157, "num_input_tokens_seen": 4059568, "step": 45075 }, { "epoch": 11.715176715176716, "grad_norm": 8.076530456542969, "learning_rate": 2.1888674969325414e-05, "loss": 0.2154, "num_input_tokens_seen": 4060000, "step": 45080 }, { "epoch": 11.71647609147609, "grad_norm": 0.7462867498397827, "learning_rate": 2.1883049528994208e-05, "loss": 0.5, "num_input_tokens_seen": 4060448, "step": 45085 }, { "epoch": 11.717775467775468, "grad_norm": 3.1960318088531494, "learning_rate": 2.1877424248970743e-05, "loss": 0.2524, "num_input_tokens_seen": 4060880, "step": 45090 }, { "epoch": 11.719074844074845, "grad_norm": 0.3982001841068268, "learning_rate": 2.1871799129544355e-05, "loss": 0.4448, "num_input_tokens_seen": 4061328, "step": 45095 }, { "epoch": 11.72037422037422, "grad_norm": 7.725228309631348, "learning_rate": 2.1866174171004324e-05, "loss": 0.0818, "num_input_tokens_seen": 4061792, "step": 45100 }, { "epoch": 11.721673596673597, "grad_norm": 7.973264217376709, "learning_rate": 2.186054937363996e-05, "loss": 0.2743, "num_input_tokens_seen": 4062288, "step": 45105 }, { "epoch": 11.722972972972974, "grad_norm": 1.0662938356399536, "learning_rate": 2.1854924737740546e-05, "loss": 0.1061, "num_input_tokens_seen": 4062736, "step": 45110 }, { "epoch": 11.724272349272349, "grad_norm": 0.25091421604156494, "learning_rate": 2.1849300263595377e-05, "loss": 0.3817, "num_input_tokens_seen": 4063232, "step": 45115 }, { "epoch": 11.725571725571726, "grad_norm": 11.783750534057617, "learning_rate": 2.1843675951493696e-05, "loss": 0.2926, "num_input_tokens_seen": 4063696, "step": 45120 }, { "epoch": 11.726871101871101, "grad_norm": 7.101750373840332, "learning_rate": 2.1838051801724805e-05, "loss": 0.2415, "num_input_tokens_seen": 4064144, "step": 45125 }, { "epoch": 11.728170478170478, "grad_norm": 2.141079902648926, "learning_rate": 2.1832427814577936e-05, "loss": 0.1965, "num_input_tokens_seen": 4064560, "step": 45130 }, { "epoch": 11.729469854469855, "grad_norm": 0.8080512881278992, "learning_rate": 2.1826803990342327e-05, "loss": 0.2501, "num_input_tokens_seen": 4064992, "step": 45135 }, { "epoch": 11.73076923076923, "grad_norm": 4.780800819396973, "learning_rate": 2.1821180329307242e-05, "loss": 0.2131, "num_input_tokens_seen": 4065440, "step": 45140 }, { "epoch": 11.732068607068607, "grad_norm": 6.750481128692627, "learning_rate": 2.1815556831761886e-05, "loss": 0.2409, "num_input_tokens_seen": 4065872, "step": 45145 }, { "epoch": 11.733367983367984, "grad_norm": 1.3772566318511963, "learning_rate": 2.1809933497995504e-05, "loss": 0.2086, "num_input_tokens_seen": 4066272, "step": 45150 }, { "epoch": 11.734667359667359, "grad_norm": 1.3655201196670532, "learning_rate": 2.1804310328297293e-05, "loss": 0.0829, "num_input_tokens_seen": 4066704, "step": 45155 }, { "epoch": 11.735966735966736, "grad_norm": 6.012420654296875, "learning_rate": 2.179868732295647e-05, "loss": 0.2695, "num_input_tokens_seen": 4067168, "step": 45160 }, { "epoch": 11.737266112266113, "grad_norm": 0.562736988067627, "learning_rate": 2.1793064482262218e-05, "loss": 0.0259, "num_input_tokens_seen": 4067616, "step": 45165 }, { "epoch": 11.738565488565488, "grad_norm": 0.3998231291770935, "learning_rate": 2.178744180650374e-05, "loss": 0.2363, "num_input_tokens_seen": 4068032, "step": 45170 }, { "epoch": 11.739864864864865, "grad_norm": 0.8904289603233337, "learning_rate": 2.1781819295970196e-05, "loss": 0.2303, "num_input_tokens_seen": 4068480, "step": 45175 }, { "epoch": 11.741164241164242, "grad_norm": 0.441782146692276, "learning_rate": 2.1776196950950787e-05, "loss": 0.3357, "num_input_tokens_seen": 4068912, "step": 45180 }, { "epoch": 11.742463617463617, "grad_norm": 5.782259464263916, "learning_rate": 2.1770574771734642e-05, "loss": 0.3158, "num_input_tokens_seen": 4069344, "step": 45185 }, { "epoch": 11.743762993762994, "grad_norm": 6.0477614402771, "learning_rate": 2.176495275861094e-05, "loss": 0.5404, "num_input_tokens_seen": 4069776, "step": 45190 }, { "epoch": 11.74506237006237, "grad_norm": 6.6669158935546875, "learning_rate": 2.175933091186882e-05, "loss": 0.6293, "num_input_tokens_seen": 4070224, "step": 45195 }, { "epoch": 11.746361746361746, "grad_norm": 3.109283685684204, "learning_rate": 2.1753709231797403e-05, "loss": 0.2558, "num_input_tokens_seen": 4070656, "step": 45200 }, { "epoch": 11.747661122661123, "grad_norm": 0.5515692830085754, "learning_rate": 2.174808771868584e-05, "loss": 0.2552, "num_input_tokens_seen": 4071072, "step": 45205 }, { "epoch": 11.7489604989605, "grad_norm": 1.0802847146987915, "learning_rate": 2.1742466372823233e-05, "loss": 0.1988, "num_input_tokens_seen": 4071536, "step": 45210 }, { "epoch": 11.750259875259875, "grad_norm": 7.822587490081787, "learning_rate": 2.173684519449872e-05, "loss": 0.2834, "num_input_tokens_seen": 4071968, "step": 45215 }, { "epoch": 11.751559251559252, "grad_norm": 6.284854412078857, "learning_rate": 2.1731224184001365e-05, "loss": 0.305, "num_input_tokens_seen": 4072432, "step": 45220 }, { "epoch": 11.752858627858627, "grad_norm": 1.5576651096343994, "learning_rate": 2.1725603341620293e-05, "loss": 0.1859, "num_input_tokens_seen": 4072848, "step": 45225 }, { "epoch": 11.754158004158004, "grad_norm": 7.262653827667236, "learning_rate": 2.171998266764457e-05, "loss": 0.3516, "num_input_tokens_seen": 4073296, "step": 45230 }, { "epoch": 11.755457380457381, "grad_norm": 4.538121700286865, "learning_rate": 2.1714362162363293e-05, "loss": 0.2077, "num_input_tokens_seen": 4073728, "step": 45235 }, { "epoch": 11.756756756756756, "grad_norm": 9.230142593383789, "learning_rate": 2.170874182606551e-05, "loss": 0.3635, "num_input_tokens_seen": 4074160, "step": 45240 }, { "epoch": 11.758056133056133, "grad_norm": 1.8070064783096313, "learning_rate": 2.170312165904029e-05, "loss": 0.2713, "num_input_tokens_seen": 4074592, "step": 45245 }, { "epoch": 11.75935550935551, "grad_norm": 1.5675857067108154, "learning_rate": 2.169750166157668e-05, "loss": 0.1555, "num_input_tokens_seen": 4075040, "step": 45250 }, { "epoch": 11.760654885654885, "grad_norm": 7.160696506500244, "learning_rate": 2.169188183396373e-05, "loss": 0.3033, "num_input_tokens_seen": 4075488, "step": 45255 }, { "epoch": 11.761954261954262, "grad_norm": 1.726701021194458, "learning_rate": 2.1686262176490467e-05, "loss": 0.1931, "num_input_tokens_seen": 4075904, "step": 45260 }, { "epoch": 11.763253638253639, "grad_norm": 1.0052083730697632, "learning_rate": 2.168064268944591e-05, "loss": 0.3621, "num_input_tokens_seen": 4076352, "step": 45265 }, { "epoch": 11.764553014553014, "grad_norm": 7.435225486755371, "learning_rate": 2.1675023373119085e-05, "loss": 0.2606, "num_input_tokens_seen": 4076784, "step": 45270 }, { "epoch": 11.765852390852391, "grad_norm": 4.329103946685791, "learning_rate": 2.1669404227798988e-05, "loss": 0.1289, "num_input_tokens_seen": 4077264, "step": 45275 }, { "epoch": 11.767151767151766, "grad_norm": 4.570298194885254, "learning_rate": 2.166378525377463e-05, "loss": 0.2611, "num_input_tokens_seen": 4077680, "step": 45280 }, { "epoch": 11.768451143451143, "grad_norm": 4.686921119689941, "learning_rate": 2.165816645133498e-05, "loss": 0.1755, "num_input_tokens_seen": 4078144, "step": 45285 }, { "epoch": 11.76975051975052, "grad_norm": 2.60932993888855, "learning_rate": 2.1652547820769046e-05, "loss": 0.146, "num_input_tokens_seen": 4078560, "step": 45290 }, { "epoch": 11.771049896049895, "grad_norm": 5.842402458190918, "learning_rate": 2.1646929362365774e-05, "loss": 0.679, "num_input_tokens_seen": 4079008, "step": 45295 }, { "epoch": 11.772349272349272, "grad_norm": 8.263123512268066, "learning_rate": 2.1641311076414145e-05, "loss": 0.3677, "num_input_tokens_seen": 4079472, "step": 45300 }, { "epoch": 11.77364864864865, "grad_norm": 5.483224391937256, "learning_rate": 2.1635692963203098e-05, "loss": 0.1946, "num_input_tokens_seen": 4079920, "step": 45305 }, { "epoch": 11.774948024948024, "grad_norm": 5.143537998199463, "learning_rate": 2.1630075023021596e-05, "loss": 0.1862, "num_input_tokens_seen": 4080336, "step": 45310 }, { "epoch": 11.776247401247401, "grad_norm": 1.4518274068832397, "learning_rate": 2.1624457256158554e-05, "loss": 0.1683, "num_input_tokens_seen": 4080784, "step": 45315 }, { "epoch": 11.777546777546778, "grad_norm": 0.8983792662620544, "learning_rate": 2.1618839662902916e-05, "loss": 0.2289, "num_input_tokens_seen": 4081216, "step": 45320 }, { "epoch": 11.778846153846153, "grad_norm": 5.043601989746094, "learning_rate": 2.1613222243543597e-05, "loss": 0.3163, "num_input_tokens_seen": 4081680, "step": 45325 }, { "epoch": 11.78014553014553, "grad_norm": 9.074570655822754, "learning_rate": 2.1607604998369495e-05, "loss": 0.2711, "num_input_tokens_seen": 4082112, "step": 45330 }, { "epoch": 11.781444906444907, "grad_norm": 4.956896781921387, "learning_rate": 2.1601987927669524e-05, "loss": 0.1067, "num_input_tokens_seen": 4082560, "step": 45335 }, { "epoch": 11.782744282744282, "grad_norm": 0.19710451364517212, "learning_rate": 2.159637103173256e-05, "loss": 0.2021, "num_input_tokens_seen": 4082992, "step": 45340 }, { "epoch": 11.78404365904366, "grad_norm": 11.110321998596191, "learning_rate": 2.159075431084751e-05, "loss": 0.5363, "num_input_tokens_seen": 4083488, "step": 45345 }, { "epoch": 11.785343035343036, "grad_norm": 0.17466452717781067, "learning_rate": 2.158513776530322e-05, "loss": 0.3327, "num_input_tokens_seen": 4083936, "step": 45350 }, { "epoch": 11.786642411642411, "grad_norm": 6.263943672180176, "learning_rate": 2.1579521395388573e-05, "loss": 0.3722, "num_input_tokens_seen": 4084432, "step": 45355 }, { "epoch": 11.787941787941788, "grad_norm": 0.762385368347168, "learning_rate": 2.157390520139241e-05, "loss": 0.1749, "num_input_tokens_seen": 4084880, "step": 45360 }, { "epoch": 11.789241164241163, "grad_norm": 1.9690557718276978, "learning_rate": 2.1568289183603598e-05, "loss": 0.1049, "num_input_tokens_seen": 4085296, "step": 45365 }, { "epoch": 11.79054054054054, "grad_norm": 4.642643451690674, "learning_rate": 2.1562673342310946e-05, "loss": 0.4353, "num_input_tokens_seen": 4085744, "step": 45370 }, { "epoch": 11.791839916839917, "grad_norm": 4.22018575668335, "learning_rate": 2.1557057677803313e-05, "loss": 0.2265, "num_input_tokens_seen": 4086176, "step": 45375 }, { "epoch": 11.793139293139292, "grad_norm": 7.434584617614746, "learning_rate": 2.1551442190369493e-05, "loss": 0.1386, "num_input_tokens_seen": 4086592, "step": 45380 }, { "epoch": 11.79443866943867, "grad_norm": 0.853719174861908, "learning_rate": 2.154582688029831e-05, "loss": 0.3176, "num_input_tokens_seen": 4087040, "step": 45385 }, { "epoch": 11.795738045738046, "grad_norm": 0.5157082080841064, "learning_rate": 2.1540211747878563e-05, "loss": 0.3025, "num_input_tokens_seen": 4087504, "step": 45390 }, { "epoch": 11.797037422037421, "grad_norm": 8.406194686889648, "learning_rate": 2.1534596793399032e-05, "loss": 0.5708, "num_input_tokens_seen": 4087968, "step": 45395 }, { "epoch": 11.798336798336798, "grad_norm": 4.687753677368164, "learning_rate": 2.1528982017148515e-05, "loss": 0.2096, "num_input_tokens_seen": 4088400, "step": 45400 }, { "epoch": 11.799636174636175, "grad_norm": 0.26402702927589417, "learning_rate": 2.152336741941577e-05, "loss": 0.1115, "num_input_tokens_seen": 4088848, "step": 45405 }, { "epoch": 11.80093555093555, "grad_norm": 6.263082504272461, "learning_rate": 2.1517753000489586e-05, "loss": 0.3877, "num_input_tokens_seen": 4089312, "step": 45410 }, { "epoch": 11.802234927234927, "grad_norm": 1.165220856666565, "learning_rate": 2.1512138760658684e-05, "loss": 0.1907, "num_input_tokens_seen": 4089760, "step": 45415 }, { "epoch": 11.803534303534304, "grad_norm": 0.16255980730056763, "learning_rate": 2.1506524700211838e-05, "loss": 0.1404, "num_input_tokens_seen": 4090240, "step": 45420 }, { "epoch": 11.80483367983368, "grad_norm": 3.9900126457214355, "learning_rate": 2.1500910819437766e-05, "loss": 0.2486, "num_input_tokens_seen": 4090672, "step": 45425 }, { "epoch": 11.806133056133056, "grad_norm": 7.75413703918457, "learning_rate": 2.1495297118625215e-05, "loss": 0.2105, "num_input_tokens_seen": 4091184, "step": 45430 }, { "epoch": 11.807432432432432, "grad_norm": 0.7286781072616577, "learning_rate": 2.148968359806288e-05, "loss": 0.1106, "num_input_tokens_seen": 4091632, "step": 45435 }, { "epoch": 11.808731808731808, "grad_norm": 0.8682584166526794, "learning_rate": 2.1484070258039488e-05, "loss": 0.4124, "num_input_tokens_seen": 4092048, "step": 45440 }, { "epoch": 11.810031185031185, "grad_norm": 6.981697082519531, "learning_rate": 2.1478457098843724e-05, "loss": 0.3048, "num_input_tokens_seen": 4092512, "step": 45445 }, { "epoch": 11.81133056133056, "grad_norm": 7.750202178955078, "learning_rate": 2.1472844120764295e-05, "loss": 0.4372, "num_input_tokens_seen": 4092960, "step": 45450 }, { "epoch": 11.812629937629938, "grad_norm": 6.136624813079834, "learning_rate": 2.146723132408987e-05, "loss": 0.2679, "num_input_tokens_seen": 4093424, "step": 45455 }, { "epoch": 11.813929313929314, "grad_norm": 3.939504861831665, "learning_rate": 2.1461618709109116e-05, "loss": 0.1344, "num_input_tokens_seen": 4093872, "step": 45460 }, { "epoch": 11.81522869022869, "grad_norm": 7.0432562828063965, "learning_rate": 2.1456006276110717e-05, "loss": 0.4258, "num_input_tokens_seen": 4094352, "step": 45465 }, { "epoch": 11.816528066528067, "grad_norm": 6.021924018859863, "learning_rate": 2.1450394025383295e-05, "loss": 0.332, "num_input_tokens_seen": 4094784, "step": 45470 }, { "epoch": 11.817827442827443, "grad_norm": 1.5220766067504883, "learning_rate": 2.1444781957215515e-05, "loss": 0.3312, "num_input_tokens_seen": 4095248, "step": 45475 }, { "epoch": 11.819126819126819, "grad_norm": 0.8938331007957458, "learning_rate": 2.1439170071896e-05, "loss": 0.1107, "num_input_tokens_seen": 4095696, "step": 45480 }, { "epoch": 11.820426195426196, "grad_norm": 0.37646669149398804, "learning_rate": 2.1433558369713394e-05, "loss": 0.1076, "num_input_tokens_seen": 4096128, "step": 45485 }, { "epoch": 11.821725571725572, "grad_norm": 3.4817254543304443, "learning_rate": 2.1427946850956286e-05, "loss": 0.2701, "num_input_tokens_seen": 4096576, "step": 45490 }, { "epoch": 11.823024948024948, "grad_norm": 4.8225789070129395, "learning_rate": 2.14223355159133e-05, "loss": 0.2624, "num_input_tokens_seen": 4097040, "step": 45495 }, { "epoch": 11.824324324324325, "grad_norm": 4.750308513641357, "learning_rate": 2.141672436487302e-05, "loss": 0.1696, "num_input_tokens_seen": 4097472, "step": 45500 }, { "epoch": 11.825623700623701, "grad_norm": 2.274949550628662, "learning_rate": 2.141111339812405e-05, "loss": 0.1169, "num_input_tokens_seen": 4097904, "step": 45505 }, { "epoch": 11.826923076923077, "grad_norm": 5.182999134063721, "learning_rate": 2.1405502615954945e-05, "loss": 0.2725, "num_input_tokens_seen": 4098320, "step": 45510 }, { "epoch": 11.828222453222454, "grad_norm": 0.7707659006118774, "learning_rate": 2.139989201865429e-05, "loss": 0.1748, "num_input_tokens_seen": 4098768, "step": 45515 }, { "epoch": 11.829521829521829, "grad_norm": 0.139680877327919, "learning_rate": 2.1394281606510635e-05, "loss": 0.0455, "num_input_tokens_seen": 4099200, "step": 45520 }, { "epoch": 11.830821205821206, "grad_norm": 6.648404598236084, "learning_rate": 2.1388671379812543e-05, "loss": 0.5663, "num_input_tokens_seen": 4099664, "step": 45525 }, { "epoch": 11.832120582120583, "grad_norm": 9.40778923034668, "learning_rate": 2.1383061338848533e-05, "loss": 0.4036, "num_input_tokens_seen": 4100128, "step": 45530 }, { "epoch": 11.833419958419958, "grad_norm": 2.05515193939209, "learning_rate": 2.137745148390714e-05, "loss": 0.3273, "num_input_tokens_seen": 4100544, "step": 45535 }, { "epoch": 11.834719334719335, "grad_norm": 0.2424919456243515, "learning_rate": 2.13718418152769e-05, "loss": 0.1328, "num_input_tokens_seen": 4100976, "step": 45540 }, { "epoch": 11.836018711018712, "grad_norm": 3.095123052597046, "learning_rate": 2.13662323332463e-05, "loss": 0.2302, "num_input_tokens_seen": 4101456, "step": 45545 }, { "epoch": 11.837318087318087, "grad_norm": 4.944365501403809, "learning_rate": 2.1360623038103858e-05, "loss": 0.4989, "num_input_tokens_seen": 4101904, "step": 45550 }, { "epoch": 11.838617463617464, "grad_norm": 7.881525993347168, "learning_rate": 2.1355013930138054e-05, "loss": 0.3252, "num_input_tokens_seen": 4102320, "step": 45555 }, { "epoch": 11.83991683991684, "grad_norm": 3.8411507606506348, "learning_rate": 2.1349405009637388e-05, "loss": 0.212, "num_input_tokens_seen": 4102768, "step": 45560 }, { "epoch": 11.841216216216216, "grad_norm": 0.20144878327846527, "learning_rate": 2.134379627689031e-05, "loss": 0.2321, "num_input_tokens_seen": 4103168, "step": 45565 }, { "epoch": 11.842515592515593, "grad_norm": 5.823935031890869, "learning_rate": 2.13381877321853e-05, "loss": 0.2318, "num_input_tokens_seen": 4103616, "step": 45570 }, { "epoch": 11.84381496881497, "grad_norm": 0.5295219421386719, "learning_rate": 2.1332579375810794e-05, "loss": 0.1896, "num_input_tokens_seen": 4104048, "step": 45575 }, { "epoch": 11.845114345114345, "grad_norm": 0.6143555641174316, "learning_rate": 2.1326971208055258e-05, "loss": 0.2119, "num_input_tokens_seen": 4104496, "step": 45580 }, { "epoch": 11.846413721413722, "grad_norm": 5.730975151062012, "learning_rate": 2.1321363229207096e-05, "loss": 0.2263, "num_input_tokens_seen": 4104960, "step": 45585 }, { "epoch": 11.847713097713097, "grad_norm": 4.174559593200684, "learning_rate": 2.131575543955476e-05, "loss": 0.4231, "num_input_tokens_seen": 4105424, "step": 45590 }, { "epoch": 11.849012474012474, "grad_norm": 0.8544387221336365, "learning_rate": 2.131014783938666e-05, "loss": 0.1078, "num_input_tokens_seen": 4105872, "step": 45595 }, { "epoch": 11.85031185031185, "grad_norm": 2.5366265773773193, "learning_rate": 2.130454042899117e-05, "loss": 0.1359, "num_input_tokens_seen": 4106336, "step": 45600 }, { "epoch": 11.851611226611226, "grad_norm": 5.292912006378174, "learning_rate": 2.1298933208656718e-05, "loss": 0.1819, "num_input_tokens_seen": 4106784, "step": 45605 }, { "epoch": 11.852910602910603, "grad_norm": 4.71028470993042, "learning_rate": 2.1293326178671676e-05, "loss": 0.3892, "num_input_tokens_seen": 4107280, "step": 45610 }, { "epoch": 11.85420997920998, "grad_norm": 2.62825345993042, "learning_rate": 2.1287719339324426e-05, "loss": 0.2619, "num_input_tokens_seen": 4107696, "step": 45615 }, { "epoch": 11.855509355509355, "grad_norm": 0.13667839765548706, "learning_rate": 2.128211269090331e-05, "loss": 0.2983, "num_input_tokens_seen": 4108144, "step": 45620 }, { "epoch": 11.856808731808732, "grad_norm": 4.764989376068115, "learning_rate": 2.127650623369672e-05, "loss": 0.3437, "num_input_tokens_seen": 4108592, "step": 45625 }, { "epoch": 11.858108108108109, "grad_norm": 2.4894115924835205, "learning_rate": 2.127089996799297e-05, "loss": 0.0956, "num_input_tokens_seen": 4109072, "step": 45630 }, { "epoch": 11.859407484407484, "grad_norm": 3.911344528198242, "learning_rate": 2.1265293894080412e-05, "loss": 0.2232, "num_input_tokens_seen": 4109472, "step": 45635 }, { "epoch": 11.86070686070686, "grad_norm": 0.5365197658538818, "learning_rate": 2.1259688012247364e-05, "loss": 0.176, "num_input_tokens_seen": 4109936, "step": 45640 }, { "epoch": 11.862006237006238, "grad_norm": 4.919029235839844, "learning_rate": 2.1254082322782157e-05, "loss": 0.2595, "num_input_tokens_seen": 4110416, "step": 45645 }, { "epoch": 11.863305613305613, "grad_norm": 3.0987496376037598, "learning_rate": 2.124847682597307e-05, "loss": 0.1639, "num_input_tokens_seen": 4110848, "step": 45650 }, { "epoch": 11.86460498960499, "grad_norm": 5.707505702972412, "learning_rate": 2.1242871522108422e-05, "loss": 0.3611, "num_input_tokens_seen": 4111296, "step": 45655 }, { "epoch": 11.865904365904367, "grad_norm": 1.6952617168426514, "learning_rate": 2.12372664114765e-05, "loss": 0.2843, "num_input_tokens_seen": 4111760, "step": 45660 }, { "epoch": 11.867203742203742, "grad_norm": 3.5380706787109375, "learning_rate": 2.123166149436556e-05, "loss": 0.2749, "num_input_tokens_seen": 4112240, "step": 45665 }, { "epoch": 11.868503118503119, "grad_norm": 3.323683977127075, "learning_rate": 2.1226056771063883e-05, "loss": 0.1785, "num_input_tokens_seen": 4112656, "step": 45670 }, { "epoch": 11.869802494802494, "grad_norm": 3.501230239868164, "learning_rate": 2.1220452241859718e-05, "loss": 0.3445, "num_input_tokens_seen": 4113104, "step": 45675 }, { "epoch": 11.871101871101871, "grad_norm": 1.918104887008667, "learning_rate": 2.1214847907041326e-05, "loss": 0.2244, "num_input_tokens_seen": 4113536, "step": 45680 }, { "epoch": 11.872401247401248, "grad_norm": 0.5686607360839844, "learning_rate": 2.1209243766896923e-05, "loss": 0.1309, "num_input_tokens_seen": 4113968, "step": 45685 }, { "epoch": 11.873700623700623, "grad_norm": 0.47378048300743103, "learning_rate": 2.1203639821714748e-05, "loss": 0.1953, "num_input_tokens_seen": 4114400, "step": 45690 }, { "epoch": 11.875, "grad_norm": 2.47540545463562, "learning_rate": 2.119803607178301e-05, "loss": 0.1782, "num_input_tokens_seen": 4114896, "step": 45695 }, { "epoch": 11.876299376299377, "grad_norm": 1.8887841701507568, "learning_rate": 2.119243251738993e-05, "loss": 0.1857, "num_input_tokens_seen": 4115376, "step": 45700 }, { "epoch": 11.877598752598752, "grad_norm": 3.0528883934020996, "learning_rate": 2.1186829158823686e-05, "loss": 0.2667, "num_input_tokens_seen": 4115808, "step": 45705 }, { "epoch": 11.878898128898129, "grad_norm": 0.8559607863426208, "learning_rate": 2.1181225996372477e-05, "loss": 0.1732, "num_input_tokens_seen": 4116272, "step": 45710 }, { "epoch": 11.880197505197506, "grad_norm": 2.076937437057495, "learning_rate": 2.1175623030324468e-05, "loss": 0.1146, "num_input_tokens_seen": 4116720, "step": 45715 }, { "epoch": 11.881496881496881, "grad_norm": 1.5132248401641846, "learning_rate": 2.117002026096784e-05, "loss": 0.3013, "num_input_tokens_seen": 4117216, "step": 45720 }, { "epoch": 11.882796257796258, "grad_norm": 4.466894626617432, "learning_rate": 2.116441768859074e-05, "loss": 0.2516, "num_input_tokens_seen": 4117664, "step": 45725 }, { "epoch": 11.884095634095633, "grad_norm": 8.791643142700195, "learning_rate": 2.11588153134813e-05, "loss": 0.5683, "num_input_tokens_seen": 4118112, "step": 45730 }, { "epoch": 11.88539501039501, "grad_norm": 9.611876487731934, "learning_rate": 2.115321313592768e-05, "loss": 0.2976, "num_input_tokens_seen": 4118576, "step": 45735 }, { "epoch": 11.886694386694387, "grad_norm": 0.9128137230873108, "learning_rate": 2.114761115621799e-05, "loss": 0.2205, "num_input_tokens_seen": 4119008, "step": 45740 }, { "epoch": 11.887993762993762, "grad_norm": 7.743868827819824, "learning_rate": 2.114200937464035e-05, "loss": 0.5161, "num_input_tokens_seen": 4119456, "step": 45745 }, { "epoch": 11.88929313929314, "grad_norm": 3.009242057800293, "learning_rate": 2.1136407791482862e-05, "loss": 0.353, "num_input_tokens_seen": 4119872, "step": 45750 }, { "epoch": 11.890592515592516, "grad_norm": 3.1803953647613525, "learning_rate": 2.1130806407033633e-05, "loss": 0.3132, "num_input_tokens_seen": 4120336, "step": 45755 }, { "epoch": 11.891891891891891, "grad_norm": 1.7960518598556519, "learning_rate": 2.1125205221580725e-05, "loss": 0.1298, "num_input_tokens_seen": 4120784, "step": 45760 }, { "epoch": 11.893191268191268, "grad_norm": 2.920703649520874, "learning_rate": 2.1119604235412233e-05, "loss": 0.2428, "num_input_tokens_seen": 4121248, "step": 45765 }, { "epoch": 11.894490644490645, "grad_norm": 4.293121337890625, "learning_rate": 2.1114003448816206e-05, "loss": 0.271, "num_input_tokens_seen": 4121696, "step": 45770 }, { "epoch": 11.89579002079002, "grad_norm": 6.168631553649902, "learning_rate": 2.1108402862080716e-05, "loss": 0.1251, "num_input_tokens_seen": 4122192, "step": 45775 }, { "epoch": 11.897089397089397, "grad_norm": 6.247742176055908, "learning_rate": 2.1102802475493786e-05, "loss": 0.2245, "num_input_tokens_seen": 4122640, "step": 45780 }, { "epoch": 11.898388773388774, "grad_norm": 2.12453293800354, "learning_rate": 2.1097202289343464e-05, "loss": 0.4158, "num_input_tokens_seen": 4123072, "step": 45785 }, { "epoch": 11.89968814968815, "grad_norm": 6.624774932861328, "learning_rate": 2.109160230391777e-05, "loss": 0.248, "num_input_tokens_seen": 4123552, "step": 45790 }, { "epoch": 11.900987525987526, "grad_norm": 1.0859345197677612, "learning_rate": 2.1086002519504707e-05, "loss": 0.3457, "num_input_tokens_seen": 4124016, "step": 45795 }, { "epoch": 11.902286902286903, "grad_norm": 4.983315944671631, "learning_rate": 2.1080402936392292e-05, "loss": 0.2175, "num_input_tokens_seen": 4124448, "step": 45800 }, { "epoch": 11.903586278586278, "grad_norm": 4.215297222137451, "learning_rate": 2.10748035548685e-05, "loss": 0.2597, "num_input_tokens_seen": 4124912, "step": 45805 }, { "epoch": 11.904885654885655, "grad_norm": 5.114809513092041, "learning_rate": 2.1069204375221334e-05, "loss": 0.3457, "num_input_tokens_seen": 4125328, "step": 45810 }, { "epoch": 11.90618503118503, "grad_norm": 0.8494316339492798, "learning_rate": 2.1063605397738743e-05, "loss": 0.1021, "num_input_tokens_seen": 4125760, "step": 45815 }, { "epoch": 11.907484407484407, "grad_norm": 1.0545604228973389, "learning_rate": 2.10580066227087e-05, "loss": 0.172, "num_input_tokens_seen": 4126224, "step": 45820 }, { "epoch": 11.908783783783784, "grad_norm": 7.821484565734863, "learning_rate": 2.1052408050419152e-05, "loss": 0.4495, "num_input_tokens_seen": 4126624, "step": 45825 }, { "epoch": 11.91008316008316, "grad_norm": 4.893515110015869, "learning_rate": 2.104680968115805e-05, "loss": 0.0929, "num_input_tokens_seen": 4127056, "step": 45830 }, { "epoch": 11.911382536382536, "grad_norm": 8.84161376953125, "learning_rate": 2.1041211515213304e-05, "loss": 0.2424, "num_input_tokens_seen": 4127504, "step": 45835 }, { "epoch": 11.912681912681913, "grad_norm": 4.64796781539917, "learning_rate": 2.103561355287285e-05, "loss": 0.3039, "num_input_tokens_seen": 4127968, "step": 45840 }, { "epoch": 11.913981288981288, "grad_norm": 4.12613582611084, "learning_rate": 2.103001579442458e-05, "loss": 0.1429, "num_input_tokens_seen": 4128448, "step": 45845 }, { "epoch": 11.915280665280665, "grad_norm": 2.5274059772491455, "learning_rate": 2.1024418240156413e-05, "loss": 0.214, "num_input_tokens_seen": 4128864, "step": 45850 }, { "epoch": 11.916580041580042, "grad_norm": 14.092799186706543, "learning_rate": 2.1018820890356224e-05, "loss": 0.2568, "num_input_tokens_seen": 4129312, "step": 45855 }, { "epoch": 11.917879417879417, "grad_norm": 8.584785461425781, "learning_rate": 2.101322374531188e-05, "loss": 0.7127, "num_input_tokens_seen": 4129744, "step": 45860 }, { "epoch": 11.919178794178794, "grad_norm": 5.879669189453125, "learning_rate": 2.1007626805311272e-05, "loss": 0.3815, "num_input_tokens_seen": 4130224, "step": 45865 }, { "epoch": 11.920478170478171, "grad_norm": 8.541300773620605, "learning_rate": 2.1002030070642224e-05, "loss": 0.3531, "num_input_tokens_seen": 4130688, "step": 45870 }, { "epoch": 11.921777546777546, "grad_norm": 0.998473048210144, "learning_rate": 2.099643354159262e-05, "loss": 0.7763, "num_input_tokens_seen": 4131136, "step": 45875 }, { "epoch": 11.923076923076923, "grad_norm": 5.1133317947387695, "learning_rate": 2.0990837218450265e-05, "loss": 0.2613, "num_input_tokens_seen": 4131552, "step": 45880 }, { "epoch": 11.924376299376299, "grad_norm": 1.2685171365737915, "learning_rate": 2.0985241101502996e-05, "loss": 0.2372, "num_input_tokens_seen": 4132016, "step": 45885 }, { "epoch": 11.925675675675675, "grad_norm": 1.755781888961792, "learning_rate": 2.0979645191038623e-05, "loss": 0.2464, "num_input_tokens_seen": 4132464, "step": 45890 }, { "epoch": 11.926975051975052, "grad_norm": 2.880396842956543, "learning_rate": 2.0974049487344953e-05, "loss": 0.4431, "num_input_tokens_seen": 4132896, "step": 45895 }, { "epoch": 11.928274428274428, "grad_norm": 3.764310598373413, "learning_rate": 2.0968453990709768e-05, "loss": 0.2426, "num_input_tokens_seen": 4133344, "step": 45900 }, { "epoch": 11.929573804573804, "grad_norm": 2.584688663482666, "learning_rate": 2.0962858701420866e-05, "loss": 0.3299, "num_input_tokens_seen": 4133824, "step": 45905 }, { "epoch": 11.930873180873181, "grad_norm": 0.9201558828353882, "learning_rate": 2.0957263619766e-05, "loss": 0.2142, "num_input_tokens_seen": 4134304, "step": 45910 }, { "epoch": 11.932172557172557, "grad_norm": 11.684093475341797, "learning_rate": 2.0951668746032953e-05, "loss": 0.37, "num_input_tokens_seen": 4134768, "step": 45915 }, { "epoch": 11.933471933471933, "grad_norm": 6.035831928253174, "learning_rate": 2.0946074080509453e-05, "loss": 0.2154, "num_input_tokens_seen": 4135232, "step": 45920 }, { "epoch": 11.93477130977131, "grad_norm": 5.427606105804443, "learning_rate": 2.0940479623483246e-05, "loss": 0.1817, "num_input_tokens_seen": 4135696, "step": 45925 }, { "epoch": 11.936070686070686, "grad_norm": 5.202140808105469, "learning_rate": 2.0934885375242068e-05, "loss": 0.1798, "num_input_tokens_seen": 4136112, "step": 45930 }, { "epoch": 11.937370062370062, "grad_norm": 0.7730783224105835, "learning_rate": 2.092929133607362e-05, "loss": 0.3763, "num_input_tokens_seen": 4136560, "step": 45935 }, { "epoch": 11.93866943866944, "grad_norm": 1.2943722009658813, "learning_rate": 2.0923697506265625e-05, "loss": 0.1001, "num_input_tokens_seen": 4137008, "step": 45940 }, { "epoch": 11.939968814968815, "grad_norm": 4.314658164978027, "learning_rate": 2.0918103886105768e-05, "loss": 0.2924, "num_input_tokens_seen": 4137440, "step": 45945 }, { "epoch": 11.941268191268192, "grad_norm": 7.196037769317627, "learning_rate": 2.091251047588175e-05, "loss": 0.5767, "num_input_tokens_seen": 4137904, "step": 45950 }, { "epoch": 11.942567567567568, "grad_norm": 4.179629325866699, "learning_rate": 2.0906917275881224e-05, "loss": 0.2133, "num_input_tokens_seen": 4138320, "step": 45955 }, { "epoch": 11.943866943866944, "grad_norm": 7.103030204772949, "learning_rate": 2.090132428639187e-05, "loss": 0.3538, "num_input_tokens_seen": 4138736, "step": 45960 }, { "epoch": 11.94516632016632, "grad_norm": 5.611041069030762, "learning_rate": 2.089573150770133e-05, "loss": 0.4233, "num_input_tokens_seen": 4139184, "step": 45965 }, { "epoch": 11.946465696465696, "grad_norm": 9.144351959228516, "learning_rate": 2.089013894009726e-05, "loss": 0.2448, "num_input_tokens_seen": 4139664, "step": 45970 }, { "epoch": 11.947765072765073, "grad_norm": 1.651275873184204, "learning_rate": 2.088454658386727e-05, "loss": 0.2699, "num_input_tokens_seen": 4140112, "step": 45975 }, { "epoch": 11.94906444906445, "grad_norm": 4.339052200317383, "learning_rate": 2.0878954439299003e-05, "loss": 0.1416, "num_input_tokens_seen": 4140592, "step": 45980 }, { "epoch": 11.950363825363825, "grad_norm": 6.015580177307129, "learning_rate": 2.0873362506680057e-05, "loss": 0.1943, "num_input_tokens_seen": 4141104, "step": 45985 }, { "epoch": 11.951663201663202, "grad_norm": 10.139862060546875, "learning_rate": 2.0867770786298023e-05, "loss": 0.2343, "num_input_tokens_seen": 4141616, "step": 45990 }, { "epoch": 11.952962577962579, "grad_norm": 1.1752232313156128, "learning_rate": 2.0862179278440507e-05, "loss": 0.1152, "num_input_tokens_seen": 4142096, "step": 45995 }, { "epoch": 11.954261954261954, "grad_norm": 1.3809280395507812, "learning_rate": 2.0856587983395064e-05, "loss": 0.1579, "num_input_tokens_seen": 4142528, "step": 46000 }, { "epoch": 11.95556133056133, "grad_norm": 8.531806945800781, "learning_rate": 2.0850996901449284e-05, "loss": 0.4035, "num_input_tokens_seen": 4142944, "step": 46005 }, { "epoch": 11.956860706860708, "grad_norm": 7.990804672241211, "learning_rate": 2.0845406032890698e-05, "loss": 0.277, "num_input_tokens_seen": 4143392, "step": 46010 }, { "epoch": 11.958160083160083, "grad_norm": 6.80515718460083, "learning_rate": 2.0839815378006865e-05, "loss": 0.2869, "num_input_tokens_seen": 4143840, "step": 46015 }, { "epoch": 11.95945945945946, "grad_norm": 1.9271057844161987, "learning_rate": 2.083422493708531e-05, "loss": 0.2103, "num_input_tokens_seen": 4144304, "step": 46020 }, { "epoch": 11.960758835758837, "grad_norm": 4.045886993408203, "learning_rate": 2.0828634710413565e-05, "loss": 0.2954, "num_input_tokens_seen": 4144736, "step": 46025 }, { "epoch": 11.962058212058212, "grad_norm": 3.5370278358459473, "learning_rate": 2.0823044698279126e-05, "loss": 0.1454, "num_input_tokens_seen": 4145200, "step": 46030 }, { "epoch": 11.963357588357589, "grad_norm": 3.6843490600585938, "learning_rate": 2.0817454900969508e-05, "loss": 0.3743, "num_input_tokens_seen": 4145664, "step": 46035 }, { "epoch": 11.964656964656964, "grad_norm": 6.40044641494751, "learning_rate": 2.081186531877218e-05, "loss": 0.183, "num_input_tokens_seen": 4146080, "step": 46040 }, { "epoch": 11.96595634095634, "grad_norm": 8.769394874572754, "learning_rate": 2.0806275951974647e-05, "loss": 0.2635, "num_input_tokens_seen": 4146496, "step": 46045 }, { "epoch": 11.967255717255718, "grad_norm": 3.473545789718628, "learning_rate": 2.0800686800864353e-05, "loss": 0.228, "num_input_tokens_seen": 4146928, "step": 46050 }, { "epoch": 11.968555093555093, "grad_norm": 3.821376085281372, "learning_rate": 2.0795097865728755e-05, "loss": 0.4353, "num_input_tokens_seen": 4147344, "step": 46055 }, { "epoch": 11.96985446985447, "grad_norm": 6.995368003845215, "learning_rate": 2.0789509146855316e-05, "loss": 0.2053, "num_input_tokens_seen": 4147760, "step": 46060 }, { "epoch": 11.971153846153847, "grad_norm": 0.5106738805770874, "learning_rate": 2.078392064453144e-05, "loss": 0.3361, "num_input_tokens_seen": 4148256, "step": 46065 }, { "epoch": 11.972453222453222, "grad_norm": 3.600172996520996, "learning_rate": 2.0778332359044576e-05, "loss": 0.3083, "num_input_tokens_seen": 4148688, "step": 46070 }, { "epoch": 11.973752598752599, "grad_norm": 3.12788987159729, "learning_rate": 2.0772744290682113e-05, "loss": 0.237, "num_input_tokens_seen": 4149152, "step": 46075 }, { "epoch": 11.975051975051976, "grad_norm": 1.7661110162734985, "learning_rate": 2.076715643973148e-05, "loss": 0.2837, "num_input_tokens_seen": 4149584, "step": 46080 }, { "epoch": 11.97635135135135, "grad_norm": 2.855365753173828, "learning_rate": 2.076156880648003e-05, "loss": 0.2324, "num_input_tokens_seen": 4150000, "step": 46085 }, { "epoch": 11.977650727650728, "grad_norm": 1.8350636959075928, "learning_rate": 2.075598139121516e-05, "loss": 0.2007, "num_input_tokens_seen": 4150464, "step": 46090 }, { "epoch": 11.978950103950105, "grad_norm": 6.691379070281982, "learning_rate": 2.0750394194224232e-05, "loss": 0.4343, "num_input_tokens_seen": 4150896, "step": 46095 }, { "epoch": 11.98024948024948, "grad_norm": 0.11626799404621124, "learning_rate": 2.0744807215794614e-05, "loss": 0.1682, "num_input_tokens_seen": 4151360, "step": 46100 }, { "epoch": 11.981548856548857, "grad_norm": 3.3043062686920166, "learning_rate": 2.073922045621362e-05, "loss": 0.3765, "num_input_tokens_seen": 4151776, "step": 46105 }, { "epoch": 11.982848232848234, "grad_norm": 0.5947103500366211, "learning_rate": 2.073363391576862e-05, "loss": 0.2099, "num_input_tokens_seen": 4152256, "step": 46110 }, { "epoch": 11.984147609147609, "grad_norm": 2.959822177886963, "learning_rate": 2.0728047594746912e-05, "loss": 0.2649, "num_input_tokens_seen": 4152720, "step": 46115 }, { "epoch": 11.985446985446986, "grad_norm": 0.6896659135818481, "learning_rate": 2.0722461493435794e-05, "loss": 0.3409, "num_input_tokens_seen": 4153136, "step": 46120 }, { "epoch": 11.986746361746361, "grad_norm": 1.4657536745071411, "learning_rate": 2.0716875612122596e-05, "loss": 0.1165, "num_input_tokens_seen": 4153600, "step": 46125 }, { "epoch": 11.988045738045738, "grad_norm": 3.551570177078247, "learning_rate": 2.071128995109458e-05, "loss": 0.1521, "num_input_tokens_seen": 4154080, "step": 46130 }, { "epoch": 11.989345114345115, "grad_norm": 4.052803993225098, "learning_rate": 2.0705704510639035e-05, "loss": 0.2066, "num_input_tokens_seen": 4154544, "step": 46135 }, { "epoch": 11.99064449064449, "grad_norm": 0.3488479256629944, "learning_rate": 2.0700119291043215e-05, "loss": 0.3007, "num_input_tokens_seen": 4154944, "step": 46140 }, { "epoch": 11.991943866943867, "grad_norm": 1.4174095392227173, "learning_rate": 2.0694534292594392e-05, "loss": 0.1536, "num_input_tokens_seen": 4155424, "step": 46145 }, { "epoch": 11.993243243243244, "grad_norm": 2.688108444213867, "learning_rate": 2.0688949515579785e-05, "loss": 0.2133, "num_input_tokens_seen": 4155888, "step": 46150 }, { "epoch": 11.994542619542619, "grad_norm": 3.701430082321167, "learning_rate": 2.068336496028664e-05, "loss": 0.2207, "num_input_tokens_seen": 4156336, "step": 46155 }, { "epoch": 11.995841995841996, "grad_norm": 4.810431957244873, "learning_rate": 2.0677780627002166e-05, "loss": 0.3826, "num_input_tokens_seen": 4156784, "step": 46160 }, { "epoch": 11.997141372141373, "grad_norm": 3.1331441402435303, "learning_rate": 2.0672196516013587e-05, "loss": 0.4042, "num_input_tokens_seen": 4157248, "step": 46165 }, { "epoch": 11.998440748440748, "grad_norm": 0.15890143811702728, "learning_rate": 2.066661262760808e-05, "loss": 0.1922, "num_input_tokens_seen": 4157728, "step": 46170 }, { "epoch": 11.999740124740125, "grad_norm": 0.8522543907165527, "learning_rate": 2.0661028962072836e-05, "loss": 0.0849, "num_input_tokens_seen": 4158128, "step": 46175 }, { "epoch": 12.0, "eval_loss": 0.2743130028247833, "eval_runtime": 13.1417, "eval_samples_per_second": 65.136, "eval_steps_per_second": 32.568, "num_input_tokens_seen": 4158168, "step": 46176 }, { "epoch": 12.001039501039502, "grad_norm": 2.2527050971984863, "learning_rate": 2.065544551969504e-05, "loss": 0.0922, "num_input_tokens_seen": 4158536, "step": 46180 }, { "epoch": 12.002338877338877, "grad_norm": 2.3289408683776855, "learning_rate": 2.0649862300761833e-05, "loss": 0.1746, "num_input_tokens_seen": 4158968, "step": 46185 }, { "epoch": 12.003638253638254, "grad_norm": 4.122756004333496, "learning_rate": 2.064427930556038e-05, "loss": 0.2245, "num_input_tokens_seen": 4159416, "step": 46190 }, { "epoch": 12.00493762993763, "grad_norm": 2.715284824371338, "learning_rate": 2.063869653437781e-05, "loss": 0.3747, "num_input_tokens_seen": 4159848, "step": 46195 }, { "epoch": 12.006237006237006, "grad_norm": 5.840455055236816, "learning_rate": 2.063311398750127e-05, "loss": 0.3108, "num_input_tokens_seen": 4160280, "step": 46200 }, { "epoch": 12.007536382536383, "grad_norm": 0.15614116191864014, "learning_rate": 2.0627531665217844e-05, "loss": 0.2724, "num_input_tokens_seen": 4160744, "step": 46205 }, { "epoch": 12.008835758835758, "grad_norm": 0.26955994963645935, "learning_rate": 2.0621949567814663e-05, "loss": 0.2015, "num_input_tokens_seen": 4161192, "step": 46210 }, { "epoch": 12.010135135135135, "grad_norm": 0.9961588382720947, "learning_rate": 2.061636769557881e-05, "loss": 0.2151, "num_input_tokens_seen": 4161640, "step": 46215 }, { "epoch": 12.011434511434512, "grad_norm": 5.746662616729736, "learning_rate": 2.061078604879737e-05, "loss": 0.3001, "num_input_tokens_seen": 4162104, "step": 46220 }, { "epoch": 12.012733887733887, "grad_norm": 4.449621677398682, "learning_rate": 2.0605204627757403e-05, "loss": 0.1857, "num_input_tokens_seen": 4162536, "step": 46225 }, { "epoch": 12.014033264033264, "grad_norm": 5.37693452835083, "learning_rate": 2.0599623432745976e-05, "loss": 0.4462, "num_input_tokens_seen": 4163000, "step": 46230 }, { "epoch": 12.015332640332641, "grad_norm": 4.758779525756836, "learning_rate": 2.059404246405013e-05, "loss": 0.2057, "num_input_tokens_seen": 4163464, "step": 46235 }, { "epoch": 12.016632016632016, "grad_norm": 2.276247978210449, "learning_rate": 2.058846172195691e-05, "loss": 0.4208, "num_input_tokens_seen": 4163912, "step": 46240 }, { "epoch": 12.017931392931393, "grad_norm": 0.7212021350860596, "learning_rate": 2.0582881206753323e-05, "loss": 0.1011, "num_input_tokens_seen": 4164344, "step": 46245 }, { "epoch": 12.01923076923077, "grad_norm": 8.347296714782715, "learning_rate": 2.0577300918726382e-05, "loss": 0.2519, "num_input_tokens_seen": 4164776, "step": 46250 }, { "epoch": 12.020530145530145, "grad_norm": 8.16822624206543, "learning_rate": 2.0571720858163105e-05, "loss": 0.191, "num_input_tokens_seen": 4165224, "step": 46255 }, { "epoch": 12.021829521829522, "grad_norm": 3.4111833572387695, "learning_rate": 2.056614102535046e-05, "loss": 0.2391, "num_input_tokens_seen": 4165672, "step": 46260 }, { "epoch": 12.023128898128897, "grad_norm": 6.787723541259766, "learning_rate": 2.0560561420575434e-05, "loss": 0.25, "num_input_tokens_seen": 4166136, "step": 46265 }, { "epoch": 12.024428274428274, "grad_norm": 5.749077320098877, "learning_rate": 2.055498204412498e-05, "loss": 0.1578, "num_input_tokens_seen": 4166600, "step": 46270 }, { "epoch": 12.025727650727651, "grad_norm": 3.1637492179870605, "learning_rate": 2.0549402896286072e-05, "loss": 0.1297, "num_input_tokens_seen": 4167048, "step": 46275 }, { "epoch": 12.027027027027026, "grad_norm": 3.3731436729431152, "learning_rate": 2.0543823977345626e-05, "loss": 0.2984, "num_input_tokens_seen": 4167480, "step": 46280 }, { "epoch": 12.028326403326403, "grad_norm": 4.891641616821289, "learning_rate": 2.0538245287590585e-05, "loss": 0.2256, "num_input_tokens_seen": 4167928, "step": 46285 }, { "epoch": 12.02962577962578, "grad_norm": 10.954526901245117, "learning_rate": 2.0532666827307862e-05, "loss": 0.1993, "num_input_tokens_seen": 4168456, "step": 46290 }, { "epoch": 12.030925155925155, "grad_norm": 3.430067539215088, "learning_rate": 2.0527088596784374e-05, "loss": 0.1234, "num_input_tokens_seen": 4168856, "step": 46295 }, { "epoch": 12.032224532224532, "grad_norm": 5.542466163635254, "learning_rate": 2.0521510596306994e-05, "loss": 0.1265, "num_input_tokens_seen": 4169304, "step": 46300 }, { "epoch": 12.03352390852391, "grad_norm": 8.204692840576172, "learning_rate": 2.051593282616262e-05, "loss": 0.3331, "num_input_tokens_seen": 4169720, "step": 46305 }, { "epoch": 12.034823284823284, "grad_norm": 6.262103080749512, "learning_rate": 2.0510355286638124e-05, "loss": 0.1153, "num_input_tokens_seen": 4170168, "step": 46310 }, { "epoch": 12.036122661122661, "grad_norm": 0.5770502090454102, "learning_rate": 2.0504777978020346e-05, "loss": 0.3041, "num_input_tokens_seen": 4170616, "step": 46315 }, { "epoch": 12.037422037422038, "grad_norm": 13.59311580657959, "learning_rate": 2.0499200900596148e-05, "loss": 0.1744, "num_input_tokens_seen": 4171016, "step": 46320 }, { "epoch": 12.038721413721413, "grad_norm": 3.2102293968200684, "learning_rate": 2.0493624054652357e-05, "loss": 0.2806, "num_input_tokens_seen": 4171464, "step": 46325 }, { "epoch": 12.04002079002079, "grad_norm": 0.857323408126831, "learning_rate": 2.048804744047581e-05, "loss": 0.0926, "num_input_tokens_seen": 4171896, "step": 46330 }, { "epoch": 12.041320166320165, "grad_norm": 0.03515113890171051, "learning_rate": 2.0482471058353292e-05, "loss": 0.1034, "num_input_tokens_seen": 4172344, "step": 46335 }, { "epoch": 12.042619542619542, "grad_norm": 0.18552720546722412, "learning_rate": 2.0476894908571626e-05, "loss": 0.1332, "num_input_tokens_seen": 4172808, "step": 46340 }, { "epoch": 12.04391891891892, "grad_norm": 0.2945356070995331, "learning_rate": 2.0471318991417578e-05, "loss": 0.3275, "num_input_tokens_seen": 4173304, "step": 46345 }, { "epoch": 12.045218295218294, "grad_norm": 3.9184277057647705, "learning_rate": 2.046574330717795e-05, "loss": 0.3394, "num_input_tokens_seen": 4173768, "step": 46350 }, { "epoch": 12.046517671517671, "grad_norm": 3.308603286743164, "learning_rate": 2.0460167856139467e-05, "loss": 0.2277, "num_input_tokens_seen": 4174184, "step": 46355 }, { "epoch": 12.047817047817048, "grad_norm": 1.6785101890563965, "learning_rate": 2.0454592638588917e-05, "loss": 0.2232, "num_input_tokens_seen": 4174616, "step": 46360 }, { "epoch": 12.049116424116423, "grad_norm": 3.377708673477173, "learning_rate": 2.0449017654813013e-05, "loss": 0.1084, "num_input_tokens_seen": 4175112, "step": 46365 }, { "epoch": 12.0504158004158, "grad_norm": 11.414217948913574, "learning_rate": 2.04434429050985e-05, "loss": 0.6514, "num_input_tokens_seen": 4175624, "step": 46370 }, { "epoch": 12.051715176715177, "grad_norm": 7.508973121643066, "learning_rate": 2.0437868389732086e-05, "loss": 0.4479, "num_input_tokens_seen": 4176024, "step": 46375 }, { "epoch": 12.053014553014552, "grad_norm": 6.751091480255127, "learning_rate": 2.0432294109000463e-05, "loss": 0.3399, "num_input_tokens_seen": 4176488, "step": 46380 }, { "epoch": 12.05431392931393, "grad_norm": 0.4304884672164917, "learning_rate": 2.0426720063190335e-05, "loss": 0.363, "num_input_tokens_seen": 4176936, "step": 46385 }, { "epoch": 12.055613305613306, "grad_norm": 3.50146746635437, "learning_rate": 2.0421146252588375e-05, "loss": 0.1392, "num_input_tokens_seen": 4177384, "step": 46390 }, { "epoch": 12.056912681912682, "grad_norm": 6.434546947479248, "learning_rate": 2.0415572677481252e-05, "loss": 0.2871, "num_input_tokens_seen": 4177832, "step": 46395 }, { "epoch": 12.058212058212058, "grad_norm": 1.3768583536148071, "learning_rate": 2.0409999338155617e-05, "loss": 0.0905, "num_input_tokens_seen": 4178264, "step": 46400 }, { "epoch": 12.059511434511435, "grad_norm": 10.320619583129883, "learning_rate": 2.0404426234898115e-05, "loss": 0.1198, "num_input_tokens_seen": 4178744, "step": 46405 }, { "epoch": 12.06081081081081, "grad_norm": 2.528503894805908, "learning_rate": 2.039885336799537e-05, "loss": 0.2213, "num_input_tokens_seen": 4179208, "step": 46410 }, { "epoch": 12.062110187110187, "grad_norm": 10.147758483886719, "learning_rate": 2.0393280737734017e-05, "loss": 0.1911, "num_input_tokens_seen": 4179688, "step": 46415 }, { "epoch": 12.063409563409563, "grad_norm": 0.12216616421937943, "learning_rate": 2.038770834440064e-05, "loss": 0.0583, "num_input_tokens_seen": 4180104, "step": 46420 }, { "epoch": 12.06470893970894, "grad_norm": 0.18211404979228973, "learning_rate": 2.0382136188281846e-05, "loss": 0.1874, "num_input_tokens_seen": 4180536, "step": 46425 }, { "epoch": 12.066008316008316, "grad_norm": 2.412984848022461, "learning_rate": 2.0376564269664207e-05, "loss": 0.3825, "num_input_tokens_seen": 4181000, "step": 46430 }, { "epoch": 12.067307692307692, "grad_norm": 2.1231210231781006, "learning_rate": 2.0370992588834306e-05, "loss": 0.1474, "num_input_tokens_seen": 4181448, "step": 46435 }, { "epoch": 12.068607068607069, "grad_norm": 2.655726194381714, "learning_rate": 2.036542114607869e-05, "loss": 0.2473, "num_input_tokens_seen": 4181912, "step": 46440 }, { "epoch": 12.069906444906445, "grad_norm": 2.2519984245300293, "learning_rate": 2.0359849941683894e-05, "loss": 0.1845, "num_input_tokens_seen": 4182344, "step": 46445 }, { "epoch": 12.07120582120582, "grad_norm": 8.491887092590332, "learning_rate": 2.0354278975936477e-05, "loss": 0.265, "num_input_tokens_seen": 4182792, "step": 46450 }, { "epoch": 12.072505197505198, "grad_norm": 0.11913653463125229, "learning_rate": 2.0348708249122933e-05, "loss": 0.3344, "num_input_tokens_seen": 4183240, "step": 46455 }, { "epoch": 12.073804573804575, "grad_norm": 3.303372859954834, "learning_rate": 2.0343137761529785e-05, "loss": 0.2507, "num_input_tokens_seen": 4183720, "step": 46460 }, { "epoch": 12.07510395010395, "grad_norm": 0.9159818291664124, "learning_rate": 2.033756751344352e-05, "loss": 0.311, "num_input_tokens_seen": 4184168, "step": 46465 }, { "epoch": 12.076403326403327, "grad_norm": 9.546374320983887, "learning_rate": 2.0331997505150632e-05, "loss": 0.3792, "num_input_tokens_seen": 4184600, "step": 46470 }, { "epoch": 12.077702702702704, "grad_norm": 6.770151615142822, "learning_rate": 2.0326427736937576e-05, "loss": 0.1529, "num_input_tokens_seen": 4185016, "step": 46475 }, { "epoch": 12.079002079002079, "grad_norm": 7.192610740661621, "learning_rate": 2.0320858209090827e-05, "loss": 0.3086, "num_input_tokens_seen": 4185464, "step": 46480 }, { "epoch": 12.080301455301456, "grad_norm": 1.8364579677581787, "learning_rate": 2.0315288921896815e-05, "loss": 0.0843, "num_input_tokens_seen": 4185912, "step": 46485 }, { "epoch": 12.08160083160083, "grad_norm": 1.0329066514968872, "learning_rate": 2.0309719875641995e-05, "loss": 0.128, "num_input_tokens_seen": 4186376, "step": 46490 }, { "epoch": 12.082900207900208, "grad_norm": 0.7550274729728699, "learning_rate": 2.0304151070612763e-05, "loss": 0.178, "num_input_tokens_seen": 4186824, "step": 46495 }, { "epoch": 12.084199584199585, "grad_norm": 2.240579843521118, "learning_rate": 2.029858250709555e-05, "loss": 0.3249, "num_input_tokens_seen": 4187304, "step": 46500 }, { "epoch": 12.08549896049896, "grad_norm": 2.669510841369629, "learning_rate": 2.0293014185376747e-05, "loss": 0.0652, "num_input_tokens_seen": 4187768, "step": 46505 }, { "epoch": 12.086798336798337, "grad_norm": 0.10779096931219101, "learning_rate": 2.0287446105742723e-05, "loss": 0.0476, "num_input_tokens_seen": 4188232, "step": 46510 }, { "epoch": 12.088097713097714, "grad_norm": 0.8348109126091003, "learning_rate": 2.0281878268479865e-05, "loss": 0.1659, "num_input_tokens_seen": 4188696, "step": 46515 }, { "epoch": 12.089397089397089, "grad_norm": 0.45810920000076294, "learning_rate": 2.0276310673874525e-05, "loss": 0.2385, "num_input_tokens_seen": 4189160, "step": 46520 }, { "epoch": 12.090696465696466, "grad_norm": 11.66124153137207, "learning_rate": 2.027074332221306e-05, "loss": 0.4437, "num_input_tokens_seen": 4189624, "step": 46525 }, { "epoch": 12.091995841995843, "grad_norm": 5.724422931671143, "learning_rate": 2.0265176213781793e-05, "loss": 0.2597, "num_input_tokens_seen": 4190072, "step": 46530 }, { "epoch": 12.093295218295218, "grad_norm": 9.05970573425293, "learning_rate": 2.0259609348867053e-05, "loss": 0.26, "num_input_tokens_seen": 4190488, "step": 46535 }, { "epoch": 12.094594594594595, "grad_norm": 0.30370256304740906, "learning_rate": 2.025404272775514e-05, "loss": 0.1595, "num_input_tokens_seen": 4190936, "step": 46540 }, { "epoch": 12.095893970893972, "grad_norm": 0.7290006875991821, "learning_rate": 2.0248476350732368e-05, "loss": 0.4296, "num_input_tokens_seen": 4191384, "step": 46545 }, { "epoch": 12.097193347193347, "grad_norm": 15.271376609802246, "learning_rate": 2.0242910218085e-05, "loss": 0.2024, "num_input_tokens_seen": 4191800, "step": 46550 }, { "epoch": 12.098492723492724, "grad_norm": 12.151119232177734, "learning_rate": 2.023734433009932e-05, "loss": 0.2014, "num_input_tokens_seen": 4192232, "step": 46555 }, { "epoch": 12.0997920997921, "grad_norm": 0.10398455709218979, "learning_rate": 2.023177868706158e-05, "loss": 0.3091, "num_input_tokens_seen": 4192664, "step": 46560 }, { "epoch": 12.101091476091476, "grad_norm": 1.0629273653030396, "learning_rate": 2.0226213289258043e-05, "loss": 0.491, "num_input_tokens_seen": 4193096, "step": 46565 }, { "epoch": 12.102390852390853, "grad_norm": 0.49400267004966736, "learning_rate": 2.0220648136974927e-05, "loss": 0.2483, "num_input_tokens_seen": 4193528, "step": 46570 }, { "epoch": 12.103690228690228, "grad_norm": 8.035466194152832, "learning_rate": 2.021508323049845e-05, "loss": 0.2245, "num_input_tokens_seen": 4193976, "step": 46575 }, { "epoch": 12.104989604989605, "grad_norm": 10.956523895263672, "learning_rate": 2.0209518570114837e-05, "loss": 0.2246, "num_input_tokens_seen": 4194408, "step": 46580 }, { "epoch": 12.106288981288982, "grad_norm": 0.33738937973976135, "learning_rate": 2.0203954156110262e-05, "loss": 0.0397, "num_input_tokens_seen": 4194872, "step": 46585 }, { "epoch": 12.107588357588357, "grad_norm": 6.405430793762207, "learning_rate": 2.0198389988770927e-05, "loss": 0.1719, "num_input_tokens_seen": 4195304, "step": 46590 }, { "epoch": 12.108887733887734, "grad_norm": 7.984939098358154, "learning_rate": 2.0192826068382988e-05, "loss": 0.5185, "num_input_tokens_seen": 4195736, "step": 46595 }, { "epoch": 12.11018711018711, "grad_norm": 15.740752220153809, "learning_rate": 2.0187262395232622e-05, "loss": 0.2937, "num_input_tokens_seen": 4196168, "step": 46600 }, { "epoch": 12.111486486486486, "grad_norm": 0.5660198330879211, "learning_rate": 2.018169896960595e-05, "loss": 0.2187, "num_input_tokens_seen": 4196584, "step": 46605 }, { "epoch": 12.112785862785863, "grad_norm": 6.966472625732422, "learning_rate": 2.0176135791789127e-05, "loss": 0.3754, "num_input_tokens_seen": 4197032, "step": 46610 }, { "epoch": 12.11408523908524, "grad_norm": 0.2944363057613373, "learning_rate": 2.0170572862068253e-05, "loss": 0.2451, "num_input_tokens_seen": 4197544, "step": 46615 }, { "epoch": 12.115384615384615, "grad_norm": 1.6913150548934937, "learning_rate": 2.0165010180729453e-05, "loss": 0.2692, "num_input_tokens_seen": 4198008, "step": 46620 }, { "epoch": 12.116683991683992, "grad_norm": 1.172234296798706, "learning_rate": 2.0159447748058805e-05, "loss": 0.0855, "num_input_tokens_seen": 4198440, "step": 46625 }, { "epoch": 12.117983367983369, "grad_norm": 2.2166621685028076, "learning_rate": 2.0153885564342405e-05, "loss": 0.4258, "num_input_tokens_seen": 4198888, "step": 46630 }, { "epoch": 12.119282744282744, "grad_norm": 3.638129234313965, "learning_rate": 2.0148323629866315e-05, "loss": 0.2077, "num_input_tokens_seen": 4199336, "step": 46635 }, { "epoch": 12.120582120582121, "grad_norm": 5.440376281738281, "learning_rate": 2.0142761944916576e-05, "loss": 0.2663, "num_input_tokens_seen": 4199752, "step": 46640 }, { "epoch": 12.121881496881496, "grad_norm": 0.17712482810020447, "learning_rate": 2.0137200509779262e-05, "loss": 0.2943, "num_input_tokens_seen": 4200200, "step": 46645 }, { "epoch": 12.123180873180873, "grad_norm": 11.070590019226074, "learning_rate": 2.013163932474037e-05, "loss": 0.3241, "num_input_tokens_seen": 4200648, "step": 46650 }, { "epoch": 12.12448024948025, "grad_norm": 7.635962009429932, "learning_rate": 2.012607839008594e-05, "loss": 0.1673, "num_input_tokens_seen": 4201096, "step": 46655 }, { "epoch": 12.125779625779625, "grad_norm": 2.2883005142211914, "learning_rate": 2.012051770610196e-05, "loss": 0.1355, "num_input_tokens_seen": 4201544, "step": 46660 }, { "epoch": 12.127079002079002, "grad_norm": 0.27256253361701965, "learning_rate": 2.0114957273074442e-05, "loss": 0.2333, "num_input_tokens_seen": 4201976, "step": 46665 }, { "epoch": 12.128378378378379, "grad_norm": 8.088513374328613, "learning_rate": 2.010939709128934e-05, "loss": 0.3311, "num_input_tokens_seen": 4202408, "step": 46670 }, { "epoch": 12.129677754677754, "grad_norm": 8.645899772644043, "learning_rate": 2.010383716103264e-05, "loss": 0.4093, "num_input_tokens_seen": 4202856, "step": 46675 }, { "epoch": 12.130977130977131, "grad_norm": 0.830302357673645, "learning_rate": 2.009827748259028e-05, "loss": 0.0745, "num_input_tokens_seen": 4203320, "step": 46680 }, { "epoch": 12.132276507276508, "grad_norm": 0.7484582662582397, "learning_rate": 2.0092718056248216e-05, "loss": 0.1302, "num_input_tokens_seen": 4203752, "step": 46685 }, { "epoch": 12.133575883575883, "grad_norm": 0.35589227080345154, "learning_rate": 2.0087158882292352e-05, "loss": 0.1724, "num_input_tokens_seen": 4204248, "step": 46690 }, { "epoch": 12.13487525987526, "grad_norm": 2.475616693496704, "learning_rate": 2.008159996100862e-05, "loss": 0.0359, "num_input_tokens_seen": 4204696, "step": 46695 }, { "epoch": 12.136174636174637, "grad_norm": 12.5101957321167, "learning_rate": 2.0076041292682922e-05, "loss": 0.2191, "num_input_tokens_seen": 4205128, "step": 46700 }, { "epoch": 12.137474012474012, "grad_norm": 8.558263778686523, "learning_rate": 2.0070482877601127e-05, "loss": 0.3094, "num_input_tokens_seen": 4205544, "step": 46705 }, { "epoch": 12.138773388773389, "grad_norm": 0.6390611529350281, "learning_rate": 2.0064924716049125e-05, "loss": 0.3245, "num_input_tokens_seen": 4205992, "step": 46710 }, { "epoch": 12.140072765072764, "grad_norm": 6.907789707183838, "learning_rate": 2.0059366808312764e-05, "loss": 0.2651, "num_input_tokens_seen": 4206440, "step": 46715 }, { "epoch": 12.141372141372141, "grad_norm": 0.11238148808479309, "learning_rate": 2.005380915467792e-05, "loss": 0.3682, "num_input_tokens_seen": 4206856, "step": 46720 }, { "epoch": 12.142671517671518, "grad_norm": 0.5957747101783752, "learning_rate": 2.0048251755430398e-05, "loss": 0.1524, "num_input_tokens_seen": 4207304, "step": 46725 }, { "epoch": 12.143970893970893, "grad_norm": 9.286856651306152, "learning_rate": 2.0042694610856032e-05, "loss": 0.4712, "num_input_tokens_seen": 4207752, "step": 46730 }, { "epoch": 12.14527027027027, "grad_norm": 5.997462749481201, "learning_rate": 2.0037137721240633e-05, "loss": 0.1531, "num_input_tokens_seen": 4208200, "step": 46735 }, { "epoch": 12.146569646569647, "grad_norm": 1.0710644721984863, "learning_rate": 2.0031581086870006e-05, "loss": 0.329, "num_input_tokens_seen": 4208632, "step": 46740 }, { "epoch": 12.147869022869022, "grad_norm": 9.89770793914795, "learning_rate": 2.002602470802991e-05, "loss": 0.389, "num_input_tokens_seen": 4209064, "step": 46745 }, { "epoch": 12.1491683991684, "grad_norm": 3.126311779022217, "learning_rate": 2.002046858500614e-05, "loss": 0.1901, "num_input_tokens_seen": 4209496, "step": 46750 }, { "epoch": 12.150467775467776, "grad_norm": 2.751357078552246, "learning_rate": 2.0014912718084432e-05, "loss": 0.1001, "num_input_tokens_seen": 4209928, "step": 46755 }, { "epoch": 12.151767151767151, "grad_norm": 9.214273452758789, "learning_rate": 2.0009357107550553e-05, "loss": 0.2576, "num_input_tokens_seen": 4210392, "step": 46760 }, { "epoch": 12.153066528066528, "grad_norm": 8.397010803222656, "learning_rate": 2.0003801753690214e-05, "loss": 0.1341, "num_input_tokens_seen": 4210808, "step": 46765 }, { "epoch": 12.154365904365905, "grad_norm": 2.5330748558044434, "learning_rate": 1.999824665678913e-05, "loss": 0.0545, "num_input_tokens_seen": 4211256, "step": 46770 }, { "epoch": 12.15566528066528, "grad_norm": 0.12750747799873352, "learning_rate": 1.9992691817133024e-05, "loss": 0.3399, "num_input_tokens_seen": 4211736, "step": 46775 }, { "epoch": 12.156964656964657, "grad_norm": 8.012123107910156, "learning_rate": 1.9987137235007565e-05, "loss": 0.1544, "num_input_tokens_seen": 4212200, "step": 46780 }, { "epoch": 12.158264033264032, "grad_norm": 6.641829967498779, "learning_rate": 1.998158291069845e-05, "loss": 0.5723, "num_input_tokens_seen": 4212648, "step": 46785 }, { "epoch": 12.15956340956341, "grad_norm": 2.544844150543213, "learning_rate": 1.9976028844491326e-05, "loss": 0.1306, "num_input_tokens_seen": 4213080, "step": 46790 }, { "epoch": 12.160862785862786, "grad_norm": 2.295041561126709, "learning_rate": 1.9970475036671864e-05, "loss": 0.1873, "num_input_tokens_seen": 4213512, "step": 46795 }, { "epoch": 12.162162162162161, "grad_norm": 19.5306453704834, "learning_rate": 1.996492148752568e-05, "loss": 0.408, "num_input_tokens_seen": 4213960, "step": 46800 }, { "epoch": 12.163461538461538, "grad_norm": 5.228544235229492, "learning_rate": 1.995936819733841e-05, "loss": 0.1963, "num_input_tokens_seen": 4214424, "step": 46805 }, { "epoch": 12.164760914760915, "grad_norm": 6.645438194274902, "learning_rate": 1.995381516639566e-05, "loss": 0.3705, "num_input_tokens_seen": 4214920, "step": 46810 }, { "epoch": 12.16606029106029, "grad_norm": 9.536645889282227, "learning_rate": 1.994826239498304e-05, "loss": 0.3101, "num_input_tokens_seen": 4215352, "step": 46815 }, { "epoch": 12.167359667359667, "grad_norm": 8.130240440368652, "learning_rate": 1.994270988338612e-05, "loss": 0.1337, "num_input_tokens_seen": 4215800, "step": 46820 }, { "epoch": 12.168659043659044, "grad_norm": 5.718014717102051, "learning_rate": 1.993715763189048e-05, "loss": 0.4194, "num_input_tokens_seen": 4216280, "step": 46825 }, { "epoch": 12.16995841995842, "grad_norm": 4.261713027954102, "learning_rate": 1.9931605640781676e-05, "loss": 0.6489, "num_input_tokens_seen": 4216696, "step": 46830 }, { "epoch": 12.171257796257796, "grad_norm": 7.792886734008789, "learning_rate": 1.9926053910345242e-05, "loss": 0.4903, "num_input_tokens_seen": 4217128, "step": 46835 }, { "epoch": 12.172557172557173, "grad_norm": 2.4329025745391846, "learning_rate": 1.992050244086672e-05, "loss": 0.1911, "num_input_tokens_seen": 4217592, "step": 46840 }, { "epoch": 12.173856548856548, "grad_norm": 2.502795696258545, "learning_rate": 1.991495123263162e-05, "loss": 0.1104, "num_input_tokens_seen": 4218088, "step": 46845 }, { "epoch": 12.175155925155925, "grad_norm": 1.0822564363479614, "learning_rate": 1.9909400285925464e-05, "loss": 0.1876, "num_input_tokens_seen": 4218568, "step": 46850 }, { "epoch": 12.176455301455302, "grad_norm": 1.9966806173324585, "learning_rate": 1.990384960103371e-05, "loss": 0.1449, "num_input_tokens_seen": 4219000, "step": 46855 }, { "epoch": 12.177754677754677, "grad_norm": 0.09730890393257141, "learning_rate": 1.9898299178241868e-05, "loss": 0.1133, "num_input_tokens_seen": 4219448, "step": 46860 }, { "epoch": 12.179054054054054, "grad_norm": 1.355806827545166, "learning_rate": 1.9892749017835384e-05, "loss": 0.2391, "num_input_tokens_seen": 4219928, "step": 46865 }, { "epoch": 12.18035343035343, "grad_norm": 2.335968017578125, "learning_rate": 1.988719912009971e-05, "loss": 0.2041, "num_input_tokens_seen": 4220360, "step": 46870 }, { "epoch": 12.181652806652806, "grad_norm": 12.971534729003906, "learning_rate": 1.988164948532028e-05, "loss": 0.5748, "num_input_tokens_seen": 4220824, "step": 46875 }, { "epoch": 12.182952182952183, "grad_norm": 1.7736434936523438, "learning_rate": 1.9876100113782533e-05, "loss": 0.2207, "num_input_tokens_seen": 4221256, "step": 46880 }, { "epoch": 12.184251559251559, "grad_norm": 6.574595928192139, "learning_rate": 1.9870551005771857e-05, "loss": 0.1183, "num_input_tokens_seen": 4221736, "step": 46885 }, { "epoch": 12.185550935550935, "grad_norm": 12.683334350585938, "learning_rate": 1.9865002161573658e-05, "loss": 0.388, "num_input_tokens_seen": 4222200, "step": 46890 }, { "epoch": 12.186850311850312, "grad_norm": 2.4704830646514893, "learning_rate": 1.985945358147333e-05, "loss": 0.139, "num_input_tokens_seen": 4222680, "step": 46895 }, { "epoch": 12.188149688149688, "grad_norm": 1.8335745334625244, "learning_rate": 1.9853905265756215e-05, "loss": 0.0904, "num_input_tokens_seen": 4223144, "step": 46900 }, { "epoch": 12.189449064449065, "grad_norm": 10.000273704528809, "learning_rate": 1.984835721470769e-05, "loss": 0.1978, "num_input_tokens_seen": 4223592, "step": 46905 }, { "epoch": 12.190748440748441, "grad_norm": 1.0298775434494019, "learning_rate": 1.984280942861308e-05, "loss": 0.181, "num_input_tokens_seen": 4224040, "step": 46910 }, { "epoch": 12.192047817047817, "grad_norm": 9.091915130615234, "learning_rate": 1.983726190775774e-05, "loss": 0.4126, "num_input_tokens_seen": 4224472, "step": 46915 }, { "epoch": 12.193347193347194, "grad_norm": 6.914697647094727, "learning_rate": 1.983171465242695e-05, "loss": 0.1304, "num_input_tokens_seen": 4224936, "step": 46920 }, { "epoch": 12.19464656964657, "grad_norm": 4.058331489562988, "learning_rate": 1.9826167662906036e-05, "loss": 0.3199, "num_input_tokens_seen": 4225384, "step": 46925 }, { "epoch": 12.195945945945946, "grad_norm": 1.0294278860092163, "learning_rate": 1.9820620939480274e-05, "loss": 0.4072, "num_input_tokens_seen": 4225848, "step": 46930 }, { "epoch": 12.197245322245323, "grad_norm": 5.169346332550049, "learning_rate": 1.9815074482434945e-05, "loss": 0.3491, "num_input_tokens_seen": 4226264, "step": 46935 }, { "epoch": 12.198544698544698, "grad_norm": 8.22774600982666, "learning_rate": 1.9809528292055297e-05, "loss": 0.2334, "num_input_tokens_seen": 4226712, "step": 46940 }, { "epoch": 12.199844074844075, "grad_norm": 0.8051449060440063, "learning_rate": 1.9803982368626583e-05, "loss": 0.2326, "num_input_tokens_seen": 4227144, "step": 46945 }, { "epoch": 12.201143451143452, "grad_norm": 5.989831447601318, "learning_rate": 1.9798436712434033e-05, "loss": 0.1011, "num_input_tokens_seen": 4227576, "step": 46950 }, { "epoch": 12.202442827442827, "grad_norm": 0.5916569232940674, "learning_rate": 1.9792891323762874e-05, "loss": 0.1966, "num_input_tokens_seen": 4228040, "step": 46955 }, { "epoch": 12.203742203742204, "grad_norm": 4.186446189880371, "learning_rate": 1.9787346202898298e-05, "loss": 0.1512, "num_input_tokens_seen": 4228520, "step": 46960 }, { "epoch": 12.20504158004158, "grad_norm": 4.566736221313477, "learning_rate": 1.9781801350125497e-05, "loss": 0.1453, "num_input_tokens_seen": 4228968, "step": 46965 }, { "epoch": 12.206340956340956, "grad_norm": 0.18849733471870422, "learning_rate": 1.977625676572967e-05, "loss": 0.5323, "num_input_tokens_seen": 4229416, "step": 46970 }, { "epoch": 12.207640332640333, "grad_norm": 5.05592155456543, "learning_rate": 1.9770712449995943e-05, "loss": 0.1591, "num_input_tokens_seen": 4229848, "step": 46975 }, { "epoch": 12.20893970893971, "grad_norm": 0.7047784328460693, "learning_rate": 1.976516840320949e-05, "loss": 0.2001, "num_input_tokens_seen": 4230312, "step": 46980 }, { "epoch": 12.210239085239085, "grad_norm": 7.069403648376465, "learning_rate": 1.975962462565544e-05, "loss": 0.1572, "num_input_tokens_seen": 4230776, "step": 46985 }, { "epoch": 12.211538461538462, "grad_norm": 2.2426857948303223, "learning_rate": 1.9754081117618926e-05, "loss": 0.2356, "num_input_tokens_seen": 4231192, "step": 46990 }, { "epoch": 12.212837837837839, "grad_norm": 6.022212505340576, "learning_rate": 1.974853787938504e-05, "loss": 0.2126, "num_input_tokens_seen": 4231624, "step": 46995 }, { "epoch": 12.214137214137214, "grad_norm": 10.521141052246094, "learning_rate": 1.9742994911238882e-05, "loss": 0.2259, "num_input_tokens_seen": 4232088, "step": 47000 }, { "epoch": 12.21543659043659, "grad_norm": 2.4481914043426514, "learning_rate": 1.973745221346553e-05, "loss": 0.1499, "num_input_tokens_seen": 4232520, "step": 47005 }, { "epoch": 12.216735966735968, "grad_norm": 2.2632951736450195, "learning_rate": 1.9731909786350068e-05, "loss": 0.0878, "num_input_tokens_seen": 4233016, "step": 47010 }, { "epoch": 12.218035343035343, "grad_norm": 12.083375930786133, "learning_rate": 1.9726367630177518e-05, "loss": 0.4069, "num_input_tokens_seen": 4233464, "step": 47015 }, { "epoch": 12.21933471933472, "grad_norm": 5.812315464019775, "learning_rate": 1.9720825745232937e-05, "loss": 0.4537, "num_input_tokens_seen": 4233896, "step": 47020 }, { "epoch": 12.220634095634095, "grad_norm": 1.838659644126892, "learning_rate": 1.9715284131801353e-05, "loss": 0.3005, "num_input_tokens_seen": 4234328, "step": 47025 }, { "epoch": 12.221933471933472, "grad_norm": 0.310064435005188, "learning_rate": 1.9709742790167763e-05, "loss": 0.236, "num_input_tokens_seen": 4234760, "step": 47030 }, { "epoch": 12.223232848232849, "grad_norm": 2.4114990234375, "learning_rate": 1.9704201720617172e-05, "loss": 0.2544, "num_input_tokens_seen": 4235192, "step": 47035 }, { "epoch": 12.224532224532224, "grad_norm": 2.6757969856262207, "learning_rate": 1.9698660923434552e-05, "loss": 0.0666, "num_input_tokens_seen": 4235672, "step": 47040 }, { "epoch": 12.2258316008316, "grad_norm": 0.17837004363536835, "learning_rate": 1.9693120398904896e-05, "loss": 0.0139, "num_input_tokens_seen": 4236104, "step": 47045 }, { "epoch": 12.227130977130978, "grad_norm": 5.073970794677734, "learning_rate": 1.9687580147313134e-05, "loss": 0.1883, "num_input_tokens_seen": 4236568, "step": 47050 }, { "epoch": 12.228430353430353, "grad_norm": 10.614350318908691, "learning_rate": 1.9682040168944216e-05, "loss": 0.5268, "num_input_tokens_seen": 4236984, "step": 47055 }, { "epoch": 12.22972972972973, "grad_norm": 0.3922075927257538, "learning_rate": 1.9676500464083064e-05, "loss": 0.2191, "num_input_tokens_seen": 4237432, "step": 47060 }, { "epoch": 12.231029106029107, "grad_norm": 5.530266284942627, "learning_rate": 1.9670961033014605e-05, "loss": 0.634, "num_input_tokens_seen": 4237864, "step": 47065 }, { "epoch": 12.232328482328482, "grad_norm": 1.8194845914840698, "learning_rate": 1.966542187602371e-05, "loss": 0.1836, "num_input_tokens_seen": 4238280, "step": 47070 }, { "epoch": 12.233627858627859, "grad_norm": 3.6125600337982178, "learning_rate": 1.965988299339529e-05, "loss": 0.208, "num_input_tokens_seen": 4238728, "step": 47075 }, { "epoch": 12.234927234927236, "grad_norm": 2.3615331649780273, "learning_rate": 1.96543443854142e-05, "loss": 0.3165, "num_input_tokens_seen": 4239176, "step": 47080 }, { "epoch": 12.236226611226611, "grad_norm": 7.760458469390869, "learning_rate": 1.964880605236531e-05, "loss": 0.1471, "num_input_tokens_seen": 4239640, "step": 47085 }, { "epoch": 12.237525987525988, "grad_norm": 0.15734215080738068, "learning_rate": 1.9643267994533444e-05, "loss": 0.3963, "num_input_tokens_seen": 4240088, "step": 47090 }, { "epoch": 12.238825363825363, "grad_norm": 11.717860221862793, "learning_rate": 1.9637730212203433e-05, "loss": 0.3465, "num_input_tokens_seen": 4240568, "step": 47095 }, { "epoch": 12.24012474012474, "grad_norm": 0.543968677520752, "learning_rate": 1.963219270566011e-05, "loss": 0.1722, "num_input_tokens_seen": 4241000, "step": 47100 }, { "epoch": 12.241424116424117, "grad_norm": 0.5647592544555664, "learning_rate": 1.9626655475188238e-05, "loss": 0.2218, "num_input_tokens_seen": 4241464, "step": 47105 }, { "epoch": 12.242723492723492, "grad_norm": 2.9158196449279785, "learning_rate": 1.962111852107264e-05, "loss": 0.2762, "num_input_tokens_seen": 4241928, "step": 47110 }, { "epoch": 12.244022869022869, "grad_norm": 9.726140022277832, "learning_rate": 1.961558184359806e-05, "loss": 0.3093, "num_input_tokens_seen": 4242392, "step": 47115 }, { "epoch": 12.245322245322246, "grad_norm": 2.266282081604004, "learning_rate": 1.961004544304927e-05, "loss": 0.1592, "num_input_tokens_seen": 4242840, "step": 47120 }, { "epoch": 12.246621621621621, "grad_norm": 6.5982513427734375, "learning_rate": 1.9604509319711007e-05, "loss": 0.2291, "num_input_tokens_seen": 4243288, "step": 47125 }, { "epoch": 12.247920997920998, "grad_norm": 12.492488861083984, "learning_rate": 1.9598973473868004e-05, "loss": 0.1219, "num_input_tokens_seen": 4243704, "step": 47130 }, { "epoch": 12.249220374220375, "grad_norm": 13.248913764953613, "learning_rate": 1.959343790580496e-05, "loss": 0.383, "num_input_tokens_seen": 4244168, "step": 47135 }, { "epoch": 12.25051975051975, "grad_norm": 4.091067790985107, "learning_rate": 1.9587902615806595e-05, "loss": 0.1781, "num_input_tokens_seen": 4244584, "step": 47140 }, { "epoch": 12.251819126819127, "grad_norm": 0.7927879691123962, "learning_rate": 1.9582367604157577e-05, "loss": 0.4155, "num_input_tokens_seen": 4245000, "step": 47145 }, { "epoch": 12.253118503118504, "grad_norm": 5.049561500549316, "learning_rate": 1.957683287114259e-05, "loss": 0.3261, "num_input_tokens_seen": 4245432, "step": 47150 }, { "epoch": 12.254417879417879, "grad_norm": 9.070965766906738, "learning_rate": 1.957129841704628e-05, "loss": 0.381, "num_input_tokens_seen": 4245848, "step": 47155 }, { "epoch": 12.255717255717256, "grad_norm": 6.913297653198242, "learning_rate": 1.9565764242153296e-05, "loss": 0.408, "num_input_tokens_seen": 4246264, "step": 47160 }, { "epoch": 12.257016632016631, "grad_norm": 1.5960787534713745, "learning_rate": 1.9560230346748266e-05, "loss": 0.1726, "num_input_tokens_seen": 4246744, "step": 47165 }, { "epoch": 12.258316008316008, "grad_norm": 0.5941672325134277, "learning_rate": 1.9554696731115797e-05, "loss": 0.2239, "num_input_tokens_seen": 4247224, "step": 47170 }, { "epoch": 12.259615384615385, "grad_norm": 7.888181686401367, "learning_rate": 1.9549163395540495e-05, "loss": 0.2634, "num_input_tokens_seen": 4247640, "step": 47175 }, { "epoch": 12.26091476091476, "grad_norm": 10.906094551086426, "learning_rate": 1.9543630340306938e-05, "loss": 0.2621, "num_input_tokens_seen": 4248072, "step": 47180 }, { "epoch": 12.262214137214137, "grad_norm": 6.66096305847168, "learning_rate": 1.953809756569971e-05, "loss": 0.3438, "num_input_tokens_seen": 4248520, "step": 47185 }, { "epoch": 12.263513513513514, "grad_norm": 2.188615560531616, "learning_rate": 1.9532565072003348e-05, "loss": 0.0714, "num_input_tokens_seen": 4248984, "step": 47190 }, { "epoch": 12.26481288981289, "grad_norm": 0.9189915657043457, "learning_rate": 1.952703285950241e-05, "loss": 0.0975, "num_input_tokens_seen": 4249448, "step": 47195 }, { "epoch": 12.266112266112266, "grad_norm": 7.6322712898254395, "learning_rate": 1.9521500928481405e-05, "loss": 0.3429, "num_input_tokens_seen": 4249864, "step": 47200 }, { "epoch": 12.267411642411643, "grad_norm": 6.876007080078125, "learning_rate": 1.951596927922487e-05, "loss": 0.4039, "num_input_tokens_seen": 4250296, "step": 47205 }, { "epoch": 12.268711018711018, "grad_norm": 5.726874351501465, "learning_rate": 1.951043791201728e-05, "loss": 0.4376, "num_input_tokens_seen": 4250744, "step": 47210 }, { "epoch": 12.270010395010395, "grad_norm": 8.010948181152344, "learning_rate": 1.9504906827143136e-05, "loss": 0.3048, "num_input_tokens_seen": 4251160, "step": 47215 }, { "epoch": 12.271309771309772, "grad_norm": 2.2396814823150635, "learning_rate": 1.9499376024886888e-05, "loss": 0.2253, "num_input_tokens_seen": 4251576, "step": 47220 }, { "epoch": 12.272609147609147, "grad_norm": 6.468631744384766, "learning_rate": 1.9493845505533016e-05, "loss": 0.1217, "num_input_tokens_seen": 4252040, "step": 47225 }, { "epoch": 12.273908523908524, "grad_norm": 3.562533140182495, "learning_rate": 1.948831526936594e-05, "loss": 0.2542, "num_input_tokens_seen": 4252456, "step": 47230 }, { "epoch": 12.2752079002079, "grad_norm": 4.579392910003662, "learning_rate": 1.9482785316670082e-05, "loss": 0.2187, "num_input_tokens_seen": 4252920, "step": 47235 }, { "epoch": 12.276507276507276, "grad_norm": 3.069967031478882, "learning_rate": 1.9477255647729873e-05, "loss": 0.1408, "num_input_tokens_seen": 4253352, "step": 47240 }, { "epoch": 12.277806652806653, "grad_norm": 4.376865386962891, "learning_rate": 1.9471726262829688e-05, "loss": 0.0877, "num_input_tokens_seen": 4253800, "step": 47245 }, { "epoch": 12.279106029106028, "grad_norm": 4.98991584777832, "learning_rate": 1.9466197162253927e-05, "loss": 0.1572, "num_input_tokens_seen": 4254232, "step": 47250 }, { "epoch": 12.280405405405405, "grad_norm": 3.34440541267395, "learning_rate": 1.946066834628694e-05, "loss": 0.2979, "num_input_tokens_seen": 4254680, "step": 47255 }, { "epoch": 12.281704781704782, "grad_norm": 6.372241020202637, "learning_rate": 1.9455139815213097e-05, "loss": 0.3249, "num_input_tokens_seen": 4255144, "step": 47260 }, { "epoch": 12.283004158004157, "grad_norm": 3.7433815002441406, "learning_rate": 1.9449611569316717e-05, "loss": 0.403, "num_input_tokens_seen": 4255576, "step": 47265 }, { "epoch": 12.284303534303534, "grad_norm": 2.874462127685547, "learning_rate": 1.9444083608882135e-05, "loss": 0.1609, "num_input_tokens_seen": 4256008, "step": 47270 }, { "epoch": 12.285602910602911, "grad_norm": 4.131306171417236, "learning_rate": 1.943855593419365e-05, "loss": 0.3321, "num_input_tokens_seen": 4256440, "step": 47275 }, { "epoch": 12.286902286902286, "grad_norm": 1.7640225887298584, "learning_rate": 1.943302854553558e-05, "loss": 0.09, "num_input_tokens_seen": 4256872, "step": 47280 }, { "epoch": 12.288201663201663, "grad_norm": 3.1105806827545166, "learning_rate": 1.942750144319217e-05, "loss": 0.3756, "num_input_tokens_seen": 4257304, "step": 47285 }, { "epoch": 12.28950103950104, "grad_norm": 3.9493305683135986, "learning_rate": 1.942197462744771e-05, "loss": 0.3148, "num_input_tokens_seen": 4257784, "step": 47290 }, { "epoch": 12.290800415800415, "grad_norm": 3.665972948074341, "learning_rate": 1.9416448098586436e-05, "loss": 0.0896, "num_input_tokens_seen": 4258248, "step": 47295 }, { "epoch": 12.292099792099792, "grad_norm": 0.32006368041038513, "learning_rate": 1.9410921856892582e-05, "loss": 0.1349, "num_input_tokens_seen": 4258680, "step": 47300 }, { "epoch": 12.29339916839917, "grad_norm": 3.671827793121338, "learning_rate": 1.940539590265038e-05, "loss": 0.1622, "num_input_tokens_seen": 4259144, "step": 47305 }, { "epoch": 12.294698544698544, "grad_norm": 1.684194803237915, "learning_rate": 1.9399870236144015e-05, "loss": 0.191, "num_input_tokens_seen": 4259608, "step": 47310 }, { "epoch": 12.295997920997921, "grad_norm": 3.9636728763580322, "learning_rate": 1.9394344857657704e-05, "loss": 0.2342, "num_input_tokens_seen": 4260056, "step": 47315 }, { "epoch": 12.297297297297296, "grad_norm": 3.9649832248687744, "learning_rate": 1.9388819767475596e-05, "loss": 0.1947, "num_input_tokens_seen": 4260520, "step": 47320 }, { "epoch": 12.298596673596673, "grad_norm": 6.586145877838135, "learning_rate": 1.938329496588187e-05, "loss": 0.2581, "num_input_tokens_seen": 4260952, "step": 47325 }, { "epoch": 12.29989604989605, "grad_norm": 0.9329425096511841, "learning_rate": 1.937777045316066e-05, "loss": 0.1484, "num_input_tokens_seen": 4261384, "step": 47330 }, { "epoch": 12.301195426195425, "grad_norm": 12.827899932861328, "learning_rate": 1.9372246229596113e-05, "loss": 0.2042, "num_input_tokens_seen": 4261832, "step": 47335 }, { "epoch": 12.302494802494802, "grad_norm": 5.605079650878906, "learning_rate": 1.9366722295472318e-05, "loss": 0.1831, "num_input_tokens_seen": 4262248, "step": 47340 }, { "epoch": 12.30379417879418, "grad_norm": 10.105995178222656, "learning_rate": 1.9361198651073408e-05, "loss": 0.4669, "num_input_tokens_seen": 4262696, "step": 47345 }, { "epoch": 12.305093555093555, "grad_norm": 0.5234205722808838, "learning_rate": 1.9355675296683447e-05, "loss": 0.1158, "num_input_tokens_seen": 4263176, "step": 47350 }, { "epoch": 12.306392931392931, "grad_norm": 1.1511222124099731, "learning_rate": 1.9350152232586518e-05, "loss": 0.2378, "num_input_tokens_seen": 4263608, "step": 47355 }, { "epoch": 12.307692307692308, "grad_norm": 0.4306655824184418, "learning_rate": 1.9344629459066677e-05, "loss": 0.1069, "num_input_tokens_seen": 4264056, "step": 47360 }, { "epoch": 12.308991683991684, "grad_norm": 9.117521286010742, "learning_rate": 1.9339106976407952e-05, "loss": 0.2794, "num_input_tokens_seen": 4264520, "step": 47365 }, { "epoch": 12.31029106029106, "grad_norm": 11.424860000610352, "learning_rate": 1.9333584784894383e-05, "loss": 0.5124, "num_input_tokens_seen": 4264968, "step": 47370 }, { "epoch": 12.311590436590437, "grad_norm": 0.17830970883369446, "learning_rate": 1.9328062884809975e-05, "loss": 0.0983, "num_input_tokens_seen": 4265416, "step": 47375 }, { "epoch": 12.312889812889813, "grad_norm": 0.8019187450408936, "learning_rate": 1.932254127643874e-05, "loss": 0.0588, "num_input_tokens_seen": 4265928, "step": 47380 }, { "epoch": 12.31418918918919, "grad_norm": 3.6097991466522217, "learning_rate": 1.9317019960064632e-05, "loss": 0.2473, "num_input_tokens_seen": 4266360, "step": 47385 }, { "epoch": 12.315488565488565, "grad_norm": 1.3307713270187378, "learning_rate": 1.9311498935971638e-05, "loss": 0.0886, "num_input_tokens_seen": 4266808, "step": 47390 }, { "epoch": 12.316787941787942, "grad_norm": 1.2300184965133667, "learning_rate": 1.93059782044437e-05, "loss": 0.3753, "num_input_tokens_seen": 4267224, "step": 47395 }, { "epoch": 12.318087318087318, "grad_norm": 2.1958701610565186, "learning_rate": 1.930045776576477e-05, "loss": 0.2651, "num_input_tokens_seen": 4267672, "step": 47400 }, { "epoch": 12.319386694386694, "grad_norm": 5.854915142059326, "learning_rate": 1.9294937620218746e-05, "loss": 0.2314, "num_input_tokens_seen": 4268120, "step": 47405 }, { "epoch": 12.32068607068607, "grad_norm": 4.559421539306641, "learning_rate": 1.9289417768089553e-05, "loss": 0.2078, "num_input_tokens_seen": 4268536, "step": 47410 }, { "epoch": 12.321985446985448, "grad_norm": 3.5902466773986816, "learning_rate": 1.9283898209661066e-05, "loss": 0.277, "num_input_tokens_seen": 4268968, "step": 47415 }, { "epoch": 12.323284823284823, "grad_norm": 4.484269618988037, "learning_rate": 1.9278378945217186e-05, "loss": 0.2773, "num_input_tokens_seen": 4269384, "step": 47420 }, { "epoch": 12.3245841995842, "grad_norm": 4.094974994659424, "learning_rate": 1.9272859975041754e-05, "loss": 0.3682, "num_input_tokens_seen": 4269816, "step": 47425 }, { "epoch": 12.325883575883577, "grad_norm": 3.597973585128784, "learning_rate": 1.9267341299418615e-05, "loss": 0.2992, "num_input_tokens_seen": 4270248, "step": 47430 }, { "epoch": 12.327182952182952, "grad_norm": 0.5512571930885315, "learning_rate": 1.926182291863162e-05, "loss": 0.2006, "num_input_tokens_seen": 4270696, "step": 47435 }, { "epoch": 12.328482328482329, "grad_norm": 3.834929943084717, "learning_rate": 1.925630483296455e-05, "loss": 0.2237, "num_input_tokens_seen": 4271192, "step": 47440 }, { "epoch": 12.329781704781706, "grad_norm": 5.869855880737305, "learning_rate": 1.925078704270124e-05, "loss": 0.1857, "num_input_tokens_seen": 4271656, "step": 47445 }, { "epoch": 12.33108108108108, "grad_norm": 2.3616864681243896, "learning_rate": 1.924526954812545e-05, "loss": 0.08, "num_input_tokens_seen": 4272152, "step": 47450 }, { "epoch": 12.332380457380458, "grad_norm": 5.591568470001221, "learning_rate": 1.923975234952098e-05, "loss": 0.1903, "num_input_tokens_seen": 4272584, "step": 47455 }, { "epoch": 12.333679833679835, "grad_norm": 2.7575104236602783, "learning_rate": 1.9234235447171548e-05, "loss": 0.1086, "num_input_tokens_seen": 4273016, "step": 47460 }, { "epoch": 12.33497920997921, "grad_norm": 0.22047674655914307, "learning_rate": 1.9228718841360917e-05, "loss": 0.1838, "num_input_tokens_seen": 4273464, "step": 47465 }, { "epoch": 12.336278586278587, "grad_norm": 0.21291056275367737, "learning_rate": 1.9223202532372802e-05, "loss": 0.1594, "num_input_tokens_seen": 4273912, "step": 47470 }, { "epoch": 12.337577962577962, "grad_norm": 0.7400745749473572, "learning_rate": 1.921768652049093e-05, "loss": 0.1602, "num_input_tokens_seen": 4274360, "step": 47475 }, { "epoch": 12.338877338877339, "grad_norm": 12.58939266204834, "learning_rate": 1.9212170805998965e-05, "loss": 0.6146, "num_input_tokens_seen": 4274808, "step": 47480 }, { "epoch": 12.340176715176716, "grad_norm": 0.8632784485816956, "learning_rate": 1.9206655389180604e-05, "loss": 0.0082, "num_input_tokens_seen": 4275304, "step": 47485 }, { "epoch": 12.34147609147609, "grad_norm": 0.6449670195579529, "learning_rate": 1.920114027031952e-05, "loss": 0.1637, "num_input_tokens_seen": 4275768, "step": 47490 }, { "epoch": 12.342775467775468, "grad_norm": 1.1927837133407593, "learning_rate": 1.9195625449699334e-05, "loss": 0.3417, "num_input_tokens_seen": 4276216, "step": 47495 }, { "epoch": 12.344074844074845, "grad_norm": 4.521920204162598, "learning_rate": 1.9190110927603695e-05, "loss": 0.4548, "num_input_tokens_seen": 4276664, "step": 47500 }, { "epoch": 12.34537422037422, "grad_norm": 4.195981502532959, "learning_rate": 1.918459670431622e-05, "loss": 0.1874, "num_input_tokens_seen": 4277096, "step": 47505 }, { "epoch": 12.346673596673597, "grad_norm": 7.21063232421875, "learning_rate": 1.917908278012051e-05, "loss": 0.2871, "num_input_tokens_seen": 4277528, "step": 47510 }, { "epoch": 12.347972972972974, "grad_norm": 7.676538467407227, "learning_rate": 1.9173569155300148e-05, "loss": 0.4327, "num_input_tokens_seen": 4277944, "step": 47515 }, { "epoch": 12.349272349272349, "grad_norm": 0.12229203432798386, "learning_rate": 1.9168055830138706e-05, "loss": 0.1356, "num_input_tokens_seen": 4278344, "step": 47520 }, { "epoch": 12.350571725571726, "grad_norm": 2.0244011878967285, "learning_rate": 1.9162542804919736e-05, "loss": 0.0917, "num_input_tokens_seen": 4278776, "step": 47525 }, { "epoch": 12.351871101871103, "grad_norm": 4.708502292633057, "learning_rate": 1.9157030079926796e-05, "loss": 0.3034, "num_input_tokens_seen": 4279256, "step": 47530 }, { "epoch": 12.353170478170478, "grad_norm": 1.0647504329681396, "learning_rate": 1.9151517655443386e-05, "loss": 0.0844, "num_input_tokens_seen": 4279704, "step": 47535 }, { "epoch": 12.354469854469855, "grad_norm": 2.199524164199829, "learning_rate": 1.914600553175303e-05, "loss": 0.1507, "num_input_tokens_seen": 4280120, "step": 47540 }, { "epoch": 12.35576923076923, "grad_norm": 1.2932353019714355, "learning_rate": 1.9140493709139214e-05, "loss": 0.2969, "num_input_tokens_seen": 4280568, "step": 47545 }, { "epoch": 12.357068607068607, "grad_norm": 1.858178973197937, "learning_rate": 1.9134982187885433e-05, "loss": 0.1575, "num_input_tokens_seen": 4281000, "step": 47550 }, { "epoch": 12.358367983367984, "grad_norm": 3.8341727256774902, "learning_rate": 1.912947096827513e-05, "loss": 0.1706, "num_input_tokens_seen": 4281448, "step": 47555 }, { "epoch": 12.359667359667359, "grad_norm": 9.281519889831543, "learning_rate": 1.912396005059176e-05, "loss": 0.6045, "num_input_tokens_seen": 4281912, "step": 47560 }, { "epoch": 12.360966735966736, "grad_norm": 12.035759925842285, "learning_rate": 1.911844943511876e-05, "loss": 0.336, "num_input_tokens_seen": 4282344, "step": 47565 }, { "epoch": 12.362266112266113, "grad_norm": 0.7267513275146484, "learning_rate": 1.911293912213953e-05, "loss": 0.6087, "num_input_tokens_seen": 4282824, "step": 47570 }, { "epoch": 12.363565488565488, "grad_norm": 6.191726207733154, "learning_rate": 1.9107429111937493e-05, "loss": 0.429, "num_input_tokens_seen": 4283256, "step": 47575 }, { "epoch": 12.364864864864865, "grad_norm": 2.8836348056793213, "learning_rate": 1.9101919404796008e-05, "loss": 0.3235, "num_input_tokens_seen": 4283720, "step": 47580 }, { "epoch": 12.366164241164242, "grad_norm": 11.065245628356934, "learning_rate": 1.9096410000998475e-05, "loss": 0.3593, "num_input_tokens_seen": 4284216, "step": 47585 }, { "epoch": 12.367463617463617, "grad_norm": 0.4103145897388458, "learning_rate": 1.9090900900828217e-05, "loss": 0.1249, "num_input_tokens_seen": 4284648, "step": 47590 }, { "epoch": 12.368762993762994, "grad_norm": 0.6730096936225891, "learning_rate": 1.9085392104568606e-05, "loss": 0.0649, "num_input_tokens_seen": 4285080, "step": 47595 }, { "epoch": 12.37006237006237, "grad_norm": 0.8104211091995239, "learning_rate": 1.907988361250293e-05, "loss": 0.2228, "num_input_tokens_seen": 4285496, "step": 47600 }, { "epoch": 12.371361746361746, "grad_norm": 4.6544928550720215, "learning_rate": 1.907437542491452e-05, "loss": 0.2617, "num_input_tokens_seen": 4285992, "step": 47605 }, { "epoch": 12.372661122661123, "grad_norm": 3.5987982749938965, "learning_rate": 1.9068867542086656e-05, "loss": 0.1136, "num_input_tokens_seen": 4286456, "step": 47610 }, { "epoch": 12.3739604989605, "grad_norm": 5.198220252990723, "learning_rate": 1.9063359964302628e-05, "loss": 0.3204, "num_input_tokens_seen": 4286952, "step": 47615 }, { "epoch": 12.375259875259875, "grad_norm": 5.4086737632751465, "learning_rate": 1.9057852691845677e-05, "loss": 0.4006, "num_input_tokens_seen": 4287384, "step": 47620 }, { "epoch": 12.376559251559252, "grad_norm": 3.9744532108306885, "learning_rate": 1.905234572499905e-05, "loss": 0.1556, "num_input_tokens_seen": 4287816, "step": 47625 }, { "epoch": 12.377858627858627, "grad_norm": 7.230223655700684, "learning_rate": 1.9046839064045993e-05, "loss": 0.3758, "num_input_tokens_seen": 4288264, "step": 47630 }, { "epoch": 12.379158004158004, "grad_norm": 8.432494163513184, "learning_rate": 1.9041332709269697e-05, "loss": 0.3333, "num_input_tokens_seen": 4288696, "step": 47635 }, { "epoch": 12.380457380457381, "grad_norm": 0.9347290992736816, "learning_rate": 1.9035826660953374e-05, "loss": 0.063, "num_input_tokens_seen": 4289128, "step": 47640 }, { "epoch": 12.381756756756756, "grad_norm": 2.8690147399902344, "learning_rate": 1.9030320919380194e-05, "loss": 0.2939, "num_input_tokens_seen": 4289560, "step": 47645 }, { "epoch": 12.383056133056133, "grad_norm": 0.6699333190917969, "learning_rate": 1.902481548483334e-05, "loss": 0.1722, "num_input_tokens_seen": 4289992, "step": 47650 }, { "epoch": 12.38435550935551, "grad_norm": 7.569740295410156, "learning_rate": 1.901931035759594e-05, "loss": 0.2782, "num_input_tokens_seen": 4290424, "step": 47655 }, { "epoch": 12.385654885654885, "grad_norm": 3.2051093578338623, "learning_rate": 1.9013805537951144e-05, "loss": 0.2142, "num_input_tokens_seen": 4290920, "step": 47660 }, { "epoch": 12.386954261954262, "grad_norm": 11.1986665725708, "learning_rate": 1.900830102618206e-05, "loss": 0.3486, "num_input_tokens_seen": 4291368, "step": 47665 }, { "epoch": 12.388253638253639, "grad_norm": 8.721303939819336, "learning_rate": 1.9002796822571807e-05, "loss": 0.3997, "num_input_tokens_seen": 4291800, "step": 47670 }, { "epoch": 12.389553014553014, "grad_norm": 0.781363844871521, "learning_rate": 1.8997292927403448e-05, "loss": 0.1856, "num_input_tokens_seen": 4292264, "step": 47675 }, { "epoch": 12.390852390852391, "grad_norm": 2.322880268096924, "learning_rate": 1.8991789340960072e-05, "loss": 0.4569, "num_input_tokens_seen": 4292760, "step": 47680 }, { "epoch": 12.392151767151766, "grad_norm": 14.070368766784668, "learning_rate": 1.8986286063524733e-05, "loss": 0.2325, "num_input_tokens_seen": 4293224, "step": 47685 }, { "epoch": 12.393451143451143, "grad_norm": 1.8619022369384766, "learning_rate": 1.898078309538045e-05, "loss": 0.2896, "num_input_tokens_seen": 4293688, "step": 47690 }, { "epoch": 12.39475051975052, "grad_norm": 0.8918924331665039, "learning_rate": 1.8975280436810266e-05, "loss": 0.461, "num_input_tokens_seen": 4294104, "step": 47695 }, { "epoch": 12.396049896049895, "grad_norm": 3.0053555965423584, "learning_rate": 1.8969778088097175e-05, "loss": 0.2031, "num_input_tokens_seen": 4294568, "step": 47700 }, { "epoch": 12.397349272349272, "grad_norm": 1.3329229354858398, "learning_rate": 1.896427604952419e-05, "loss": 0.071, "num_input_tokens_seen": 4295032, "step": 47705 }, { "epoch": 12.39864864864865, "grad_norm": 0.12995314598083496, "learning_rate": 1.8958774321374256e-05, "loss": 0.0354, "num_input_tokens_seen": 4295480, "step": 47710 }, { "epoch": 12.399948024948024, "grad_norm": 4.351792812347412, "learning_rate": 1.8953272903930353e-05, "loss": 0.0609, "num_input_tokens_seen": 4295928, "step": 47715 }, { "epoch": 12.401247401247401, "grad_norm": 16.350337982177734, "learning_rate": 1.8947771797475414e-05, "loss": 0.3914, "num_input_tokens_seen": 4296392, "step": 47720 }, { "epoch": 12.402546777546778, "grad_norm": 10.429069519042969, "learning_rate": 1.894227100229238e-05, "loss": 0.3982, "num_input_tokens_seen": 4296856, "step": 47725 }, { "epoch": 12.403846153846153, "grad_norm": 0.09560329467058182, "learning_rate": 1.8936770518664145e-05, "loss": 0.0403, "num_input_tokens_seen": 4297288, "step": 47730 }, { "epoch": 12.40514553014553, "grad_norm": 9.652414321899414, "learning_rate": 1.8931270346873613e-05, "loss": 0.2928, "num_input_tokens_seen": 4297784, "step": 47735 }, { "epoch": 12.406444906444907, "grad_norm": 4.208939075469971, "learning_rate": 1.892577048720366e-05, "loss": 0.3908, "num_input_tokens_seen": 4298216, "step": 47740 }, { "epoch": 12.407744282744282, "grad_norm": 1.3322947025299072, "learning_rate": 1.892027093993716e-05, "loss": 0.2268, "num_input_tokens_seen": 4298648, "step": 47745 }, { "epoch": 12.40904365904366, "grad_norm": 0.23886504769325256, "learning_rate": 1.8914771705356944e-05, "loss": 0.5287, "num_input_tokens_seen": 4299096, "step": 47750 }, { "epoch": 12.410343035343036, "grad_norm": 1.71425461769104, "learning_rate": 1.8909272783745846e-05, "loss": 0.1777, "num_input_tokens_seen": 4299528, "step": 47755 }, { "epoch": 12.411642411642411, "grad_norm": 4.669065952301025, "learning_rate": 1.8903774175386695e-05, "loss": 0.0382, "num_input_tokens_seen": 4299976, "step": 47760 }, { "epoch": 12.412941787941788, "grad_norm": 4.40771484375, "learning_rate": 1.889827588056227e-05, "loss": 0.1968, "num_input_tokens_seen": 4300408, "step": 47765 }, { "epoch": 12.414241164241163, "grad_norm": 2.8722691535949707, "learning_rate": 1.8892777899555363e-05, "loss": 0.119, "num_input_tokens_seen": 4300856, "step": 47770 }, { "epoch": 12.41554054054054, "grad_norm": 1.3295105695724487, "learning_rate": 1.8887280232648742e-05, "loss": 0.0938, "num_input_tokens_seen": 4301272, "step": 47775 }, { "epoch": 12.416839916839917, "grad_norm": 1.7900925874710083, "learning_rate": 1.8881782880125163e-05, "loss": 0.4335, "num_input_tokens_seen": 4301720, "step": 47780 }, { "epoch": 12.418139293139292, "grad_norm": 10.616080284118652, "learning_rate": 1.887628584226734e-05, "loss": 0.369, "num_input_tokens_seen": 4302152, "step": 47785 }, { "epoch": 12.41943866943867, "grad_norm": 0.09089832752943039, "learning_rate": 1.8870789119358014e-05, "loss": 0.107, "num_input_tokens_seen": 4302600, "step": 47790 }, { "epoch": 12.420738045738046, "grad_norm": 0.06922129541635513, "learning_rate": 1.8865292711679866e-05, "loss": 0.3801, "num_input_tokens_seen": 4303032, "step": 47795 }, { "epoch": 12.422037422037421, "grad_norm": 8.735993385314941, "learning_rate": 1.8859796619515606e-05, "loss": 0.1819, "num_input_tokens_seen": 4303480, "step": 47800 }, { "epoch": 12.423336798336798, "grad_norm": 2.9786105155944824, "learning_rate": 1.8854300843147875e-05, "loss": 0.1456, "num_input_tokens_seen": 4303928, "step": 47805 }, { "epoch": 12.424636174636175, "grad_norm": 1.9163028001785278, "learning_rate": 1.8848805382859348e-05, "loss": 0.3261, "num_input_tokens_seen": 4304376, "step": 47810 }, { "epoch": 12.42593555093555, "grad_norm": 1.3565808534622192, "learning_rate": 1.884331023893266e-05, "loss": 0.305, "num_input_tokens_seen": 4304792, "step": 47815 }, { "epoch": 12.427234927234927, "grad_norm": 1.4011088609695435, "learning_rate": 1.883781541165041e-05, "loss": 0.2861, "num_input_tokens_seen": 4305272, "step": 47820 }, { "epoch": 12.428534303534304, "grad_norm": 3.324357748031616, "learning_rate": 1.8832320901295227e-05, "loss": 0.125, "num_input_tokens_seen": 4305688, "step": 47825 }, { "epoch": 12.42983367983368, "grad_norm": 0.7588304877281189, "learning_rate": 1.8826826708149687e-05, "loss": 0.1222, "num_input_tokens_seen": 4306136, "step": 47830 }, { "epoch": 12.431133056133056, "grad_norm": 0.12426922470331192, "learning_rate": 1.8821332832496367e-05, "loss": 0.1852, "num_input_tokens_seen": 4306600, "step": 47835 }, { "epoch": 12.432432432432432, "grad_norm": 11.214459419250488, "learning_rate": 1.881583927461781e-05, "loss": 0.0606, "num_input_tokens_seen": 4307032, "step": 47840 }, { "epoch": 12.433731808731808, "grad_norm": 0.3818230926990509, "learning_rate": 1.8810346034796582e-05, "loss": 0.0212, "num_input_tokens_seen": 4307464, "step": 47845 }, { "epoch": 12.435031185031185, "grad_norm": 1.7204924821853638, "learning_rate": 1.880485311331517e-05, "loss": 0.0974, "num_input_tokens_seen": 4307912, "step": 47850 }, { "epoch": 12.43633056133056, "grad_norm": 0.05596962943673134, "learning_rate": 1.879936051045611e-05, "loss": 0.3295, "num_input_tokens_seen": 4308376, "step": 47855 }, { "epoch": 12.437629937629938, "grad_norm": 1.8959699869155884, "learning_rate": 1.879386822650187e-05, "loss": 0.4736, "num_input_tokens_seen": 4308856, "step": 47860 }, { "epoch": 12.438929313929314, "grad_norm": 6.112311840057373, "learning_rate": 1.8788376261734948e-05, "loss": 0.1605, "num_input_tokens_seen": 4309288, "step": 47865 }, { "epoch": 12.44022869022869, "grad_norm": 1.4253178834915161, "learning_rate": 1.8782884616437777e-05, "loss": 0.0788, "num_input_tokens_seen": 4309704, "step": 47870 }, { "epoch": 12.441528066528067, "grad_norm": 0.314736545085907, "learning_rate": 1.877739329089281e-05, "loss": 0.3597, "num_input_tokens_seen": 4310120, "step": 47875 }, { "epoch": 12.442827442827443, "grad_norm": 9.060541152954102, "learning_rate": 1.8771902285382474e-05, "loss": 0.1943, "num_input_tokens_seen": 4310552, "step": 47880 }, { "epoch": 12.444126819126819, "grad_norm": 11.102787971496582, "learning_rate": 1.876641160018916e-05, "loss": 0.3356, "num_input_tokens_seen": 4311016, "step": 47885 }, { "epoch": 12.445426195426196, "grad_norm": 5.200123310089111, "learning_rate": 1.8760921235595275e-05, "loss": 0.2158, "num_input_tokens_seen": 4311432, "step": 47890 }, { "epoch": 12.446725571725572, "grad_norm": 0.7132435441017151, "learning_rate": 1.8755431191883183e-05, "loss": 0.3146, "num_input_tokens_seen": 4311896, "step": 47895 }, { "epoch": 12.448024948024948, "grad_norm": 2.480694055557251, "learning_rate": 1.8749941469335258e-05, "loss": 0.1138, "num_input_tokens_seen": 4312360, "step": 47900 }, { "epoch": 12.449324324324325, "grad_norm": 2.065319299697876, "learning_rate": 1.8744452068233825e-05, "loss": 0.2347, "num_input_tokens_seen": 4312808, "step": 47905 }, { "epoch": 12.450623700623701, "grad_norm": 12.7650785446167, "learning_rate": 1.873896298886122e-05, "loss": 0.3241, "num_input_tokens_seen": 4313256, "step": 47910 }, { "epoch": 12.451923076923077, "grad_norm": 3.4217329025268555, "learning_rate": 1.873347423149974e-05, "loss": 0.1044, "num_input_tokens_seen": 4313704, "step": 47915 }, { "epoch": 12.453222453222454, "grad_norm": 5.827894687652588, "learning_rate": 1.8727985796431697e-05, "loss": 0.0647, "num_input_tokens_seen": 4314152, "step": 47920 }, { "epoch": 12.454521829521829, "grad_norm": 1.8352465629577637, "learning_rate": 1.8722497683939343e-05, "loss": 0.3351, "num_input_tokens_seen": 4314568, "step": 47925 }, { "epoch": 12.455821205821206, "grad_norm": 0.7567836046218872, "learning_rate": 1.871700989430495e-05, "loss": 0.4956, "num_input_tokens_seen": 4314968, "step": 47930 }, { "epoch": 12.457120582120583, "grad_norm": 0.8078921437263489, "learning_rate": 1.8711522427810757e-05, "loss": 0.2629, "num_input_tokens_seen": 4315464, "step": 47935 }, { "epoch": 12.458419958419958, "grad_norm": 4.198396682739258, "learning_rate": 1.8706035284739e-05, "loss": 0.3066, "num_input_tokens_seen": 4315912, "step": 47940 }, { "epoch": 12.459719334719335, "grad_norm": 2.515109062194824, "learning_rate": 1.8700548465371874e-05, "loss": 0.1316, "num_input_tokens_seen": 4316392, "step": 47945 }, { "epoch": 12.461018711018712, "grad_norm": 5.676604747772217, "learning_rate": 1.8695061969991574e-05, "loss": 0.5037, "num_input_tokens_seen": 4316808, "step": 47950 }, { "epoch": 12.462318087318087, "grad_norm": 0.14225521683692932, "learning_rate": 1.8689575798880283e-05, "loss": 0.1062, "num_input_tokens_seen": 4317272, "step": 47955 }, { "epoch": 12.463617463617464, "grad_norm": 6.843587875366211, "learning_rate": 1.868408995232015e-05, "loss": 0.0752, "num_input_tokens_seen": 4317720, "step": 47960 }, { "epoch": 12.46491683991684, "grad_norm": 0.4934086799621582, "learning_rate": 1.867860443059332e-05, "loss": 0.1952, "num_input_tokens_seen": 4318184, "step": 47965 }, { "epoch": 12.466216216216216, "grad_norm": 8.103368759155273, "learning_rate": 1.8673119233981923e-05, "loss": 0.2368, "num_input_tokens_seen": 4318648, "step": 47970 }, { "epoch": 12.467515592515593, "grad_norm": 1.0682803392410278, "learning_rate": 1.866763436276807e-05, "loss": 0.3366, "num_input_tokens_seen": 4319144, "step": 47975 }, { "epoch": 12.46881496881497, "grad_norm": 10.015446662902832, "learning_rate": 1.8662149817233847e-05, "loss": 0.4906, "num_input_tokens_seen": 4319608, "step": 47980 }, { "epoch": 12.470114345114345, "grad_norm": 6.956386566162109, "learning_rate": 1.8656665597661333e-05, "loss": 0.4753, "num_input_tokens_seen": 4320056, "step": 47985 }, { "epoch": 12.471413721413722, "grad_norm": 1.1863456964492798, "learning_rate": 1.8651181704332578e-05, "loss": 0.071, "num_input_tokens_seen": 4320488, "step": 47990 }, { "epoch": 12.472713097713097, "grad_norm": 1.4838066101074219, "learning_rate": 1.8645698137529644e-05, "loss": 0.2273, "num_input_tokens_seen": 4320920, "step": 47995 }, { "epoch": 12.474012474012474, "grad_norm": 9.650426864624023, "learning_rate": 1.8640214897534532e-05, "loss": 0.4636, "num_input_tokens_seen": 4321400, "step": 48000 }, { "epoch": 12.47531185031185, "grad_norm": 0.6803253889083862, "learning_rate": 1.8634731984629263e-05, "loss": 0.1108, "num_input_tokens_seen": 4321832, "step": 48005 }, { "epoch": 12.476611226611226, "grad_norm": 2.3515894412994385, "learning_rate": 1.8629249399095835e-05, "loss": 0.0841, "num_input_tokens_seen": 4322296, "step": 48010 }, { "epoch": 12.477910602910603, "grad_norm": 0.10964205861091614, "learning_rate": 1.8623767141216207e-05, "loss": 0.3293, "num_input_tokens_seen": 4322696, "step": 48015 }, { "epoch": 12.47920997920998, "grad_norm": 13.907759666442871, "learning_rate": 1.8618285211272345e-05, "loss": 0.4278, "num_input_tokens_seen": 4323160, "step": 48020 }, { "epoch": 12.480509355509355, "grad_norm": 2.445180654525757, "learning_rate": 1.8612803609546187e-05, "loss": 0.185, "num_input_tokens_seen": 4323608, "step": 48025 }, { "epoch": 12.481808731808732, "grad_norm": 14.870885848999023, "learning_rate": 1.8607322336319667e-05, "loss": 0.2416, "num_input_tokens_seen": 4324024, "step": 48030 }, { "epoch": 12.483108108108109, "grad_norm": 9.87949275970459, "learning_rate": 1.8601841391874674e-05, "loss": 0.4004, "num_input_tokens_seen": 4324456, "step": 48035 }, { "epoch": 12.484407484407484, "grad_norm": 2.0166070461273193, "learning_rate": 1.859636077649312e-05, "loss": 0.1516, "num_input_tokens_seen": 4324904, "step": 48040 }, { "epoch": 12.48570686070686, "grad_norm": 0.029316376894712448, "learning_rate": 1.8590880490456854e-05, "loss": 0.2241, "num_input_tokens_seen": 4325352, "step": 48045 }, { "epoch": 12.487006237006238, "grad_norm": 0.9792629480361938, "learning_rate": 1.8585400534047758e-05, "loss": 0.1702, "num_input_tokens_seen": 4325784, "step": 48050 }, { "epoch": 12.488305613305613, "grad_norm": 8.400167465209961, "learning_rate": 1.857992090754765e-05, "loss": 0.3899, "num_input_tokens_seen": 4326232, "step": 48055 }, { "epoch": 12.48960498960499, "grad_norm": 6.907670974731445, "learning_rate": 1.857444161123837e-05, "loss": 0.2588, "num_input_tokens_seen": 4326680, "step": 48060 }, { "epoch": 12.490904365904367, "grad_norm": 11.79072093963623, "learning_rate": 1.85689626454017e-05, "loss": 0.291, "num_input_tokens_seen": 4327176, "step": 48065 }, { "epoch": 12.492203742203742, "grad_norm": 2.5814754962921143, "learning_rate": 1.856348401031946e-05, "loss": 0.4023, "num_input_tokens_seen": 4327656, "step": 48070 }, { "epoch": 12.493503118503119, "grad_norm": 5.191179275512695, "learning_rate": 1.85580057062734e-05, "loss": 0.2053, "num_input_tokens_seen": 4328136, "step": 48075 }, { "epoch": 12.494802494802494, "grad_norm": 1.341960072517395, "learning_rate": 1.855252773354527e-05, "loss": 0.1683, "num_input_tokens_seen": 4328600, "step": 48080 }, { "epoch": 12.496101871101871, "grad_norm": 5.9364013671875, "learning_rate": 1.8547050092416828e-05, "loss": 0.1649, "num_input_tokens_seen": 4329064, "step": 48085 }, { "epoch": 12.497401247401248, "grad_norm": 10.65316104888916, "learning_rate": 1.854157278316977e-05, "loss": 0.2239, "num_input_tokens_seen": 4329496, "step": 48090 }, { "epoch": 12.498700623700623, "grad_norm": 0.6869930624961853, "learning_rate": 1.8536095806085823e-05, "loss": 0.0552, "num_input_tokens_seen": 4329912, "step": 48095 }, { "epoch": 12.5, "grad_norm": 3.7482237815856934, "learning_rate": 1.8530619161446655e-05, "loss": 0.169, "num_input_tokens_seen": 4330344, "step": 48100 }, { "epoch": 12.501299376299377, "grad_norm": 4.6428704261779785, "learning_rate": 1.8525142849533946e-05, "loss": 0.1912, "num_input_tokens_seen": 4330776, "step": 48105 }, { "epoch": 12.502598752598752, "grad_norm": 0.5335671305656433, "learning_rate": 1.851966687062934e-05, "loss": 0.1438, "num_input_tokens_seen": 4331256, "step": 48110 }, { "epoch": 12.503898128898129, "grad_norm": 0.09656008332967758, "learning_rate": 1.8514191225014484e-05, "loss": 0.1191, "num_input_tokens_seen": 4331688, "step": 48115 }, { "epoch": 12.505197505197506, "grad_norm": 9.19533634185791, "learning_rate": 1.8508715912970978e-05, "loss": 0.148, "num_input_tokens_seen": 4332120, "step": 48120 }, { "epoch": 12.506496881496881, "grad_norm": 2.0040619373321533, "learning_rate": 1.8503240934780436e-05, "loss": 0.5795, "num_input_tokens_seen": 4332536, "step": 48125 }, { "epoch": 12.507796257796258, "grad_norm": 0.24025267362594604, "learning_rate": 1.8497766290724436e-05, "loss": 0.5817, "num_input_tokens_seen": 4333048, "step": 48130 }, { "epoch": 12.509095634095633, "grad_norm": 3.9513158798217773, "learning_rate": 1.8492291981084552e-05, "loss": 0.2091, "num_input_tokens_seen": 4333464, "step": 48135 }, { "epoch": 12.51039501039501, "grad_norm": 9.851156234741211, "learning_rate": 1.8486818006142327e-05, "loss": 0.1469, "num_input_tokens_seen": 4333992, "step": 48140 }, { "epoch": 12.511694386694387, "grad_norm": 4.574683666229248, "learning_rate": 1.8481344366179284e-05, "loss": 0.159, "num_input_tokens_seen": 4334472, "step": 48145 }, { "epoch": 12.512993762993762, "grad_norm": 4.058508396148682, "learning_rate": 1.8475871061476958e-05, "loss": 0.1443, "num_input_tokens_seen": 4334920, "step": 48150 }, { "epoch": 12.51429313929314, "grad_norm": 11.269693374633789, "learning_rate": 1.8470398092316823e-05, "loss": 0.152, "num_input_tokens_seen": 4335352, "step": 48155 }, { "epoch": 12.515592515592516, "grad_norm": 2.7839300632476807, "learning_rate": 1.8464925458980376e-05, "loss": 0.4751, "num_input_tokens_seen": 4335784, "step": 48160 }, { "epoch": 12.516891891891891, "grad_norm": 9.567276954650879, "learning_rate": 1.845945316174907e-05, "loss": 0.3703, "num_input_tokens_seen": 4336232, "step": 48165 }, { "epoch": 12.518191268191268, "grad_norm": 0.028506845235824585, "learning_rate": 1.8453981200904365e-05, "loss": 0.2761, "num_input_tokens_seen": 4336696, "step": 48170 }, { "epoch": 12.519490644490645, "grad_norm": 5.465214729309082, "learning_rate": 1.8448509576727667e-05, "loss": 0.4742, "num_input_tokens_seen": 4337144, "step": 48175 }, { "epoch": 12.52079002079002, "grad_norm": 2.751241445541382, "learning_rate": 1.84430382895004e-05, "loss": 0.2343, "num_input_tokens_seen": 4337560, "step": 48180 }, { "epoch": 12.522089397089397, "grad_norm": 14.838005065917969, "learning_rate": 1.843756733950396e-05, "loss": 0.3255, "num_input_tokens_seen": 4338024, "step": 48185 }, { "epoch": 12.523388773388774, "grad_norm": 16.64314079284668, "learning_rate": 1.8432096727019727e-05, "loss": 0.283, "num_input_tokens_seen": 4338472, "step": 48190 }, { "epoch": 12.52468814968815, "grad_norm": 10.936295509338379, "learning_rate": 1.8426626452329036e-05, "loss": 0.3383, "num_input_tokens_seen": 4338888, "step": 48195 }, { "epoch": 12.525987525987526, "grad_norm": 3.5725650787353516, "learning_rate": 1.8421156515713257e-05, "loss": 0.1484, "num_input_tokens_seen": 4339368, "step": 48200 }, { "epoch": 12.527286902286903, "grad_norm": 3.498561382293701, "learning_rate": 1.8415686917453707e-05, "loss": 0.2262, "num_input_tokens_seen": 4339800, "step": 48205 }, { "epoch": 12.528586278586278, "grad_norm": 4.98117208480835, "learning_rate": 1.8410217657831674e-05, "loss": 0.234, "num_input_tokens_seen": 4340264, "step": 48210 }, { "epoch": 12.529885654885655, "grad_norm": 3.7930538654327393, "learning_rate": 1.840474873712847e-05, "loss": 0.0734, "num_input_tokens_seen": 4340712, "step": 48215 }, { "epoch": 12.53118503118503, "grad_norm": 2.775529623031616, "learning_rate": 1.8399280155625347e-05, "loss": 0.2006, "num_input_tokens_seen": 4341176, "step": 48220 }, { "epoch": 12.532484407484407, "grad_norm": 1.5477312803268433, "learning_rate": 1.839381191360358e-05, "loss": 0.2316, "num_input_tokens_seen": 4341640, "step": 48225 }, { "epoch": 12.533783783783784, "grad_norm": 0.37524470686912537, "learning_rate": 1.8388344011344395e-05, "loss": 0.129, "num_input_tokens_seen": 4342120, "step": 48230 }, { "epoch": 12.53508316008316, "grad_norm": 4.375100612640381, "learning_rate": 1.838287644912901e-05, "loss": 0.2001, "num_input_tokens_seen": 4342536, "step": 48235 }, { "epoch": 12.536382536382536, "grad_norm": 0.637205183506012, "learning_rate": 1.8377409227238624e-05, "loss": 0.0561, "num_input_tokens_seen": 4342984, "step": 48240 }, { "epoch": 12.537681912681913, "grad_norm": 7.445860385894775, "learning_rate": 1.837194234595444e-05, "loss": 0.1452, "num_input_tokens_seen": 4343448, "step": 48245 }, { "epoch": 12.538981288981288, "grad_norm": 1.356534481048584, "learning_rate": 1.8366475805557602e-05, "loss": 0.3319, "num_input_tokens_seen": 4343880, "step": 48250 }, { "epoch": 12.540280665280665, "grad_norm": 3.845986843109131, "learning_rate": 1.836100960632927e-05, "loss": 0.1988, "num_input_tokens_seen": 4344376, "step": 48255 }, { "epoch": 12.541580041580042, "grad_norm": 7.639052391052246, "learning_rate": 1.8355543748550573e-05, "loss": 0.1799, "num_input_tokens_seen": 4344824, "step": 48260 }, { "epoch": 12.542879417879417, "grad_norm": 7.265125274658203, "learning_rate": 1.835007823250264e-05, "loss": 0.3078, "num_input_tokens_seen": 4345272, "step": 48265 }, { "epoch": 12.544178794178794, "grad_norm": 11.354650497436523, "learning_rate": 1.8344613058466547e-05, "loss": 0.1985, "num_input_tokens_seen": 4345752, "step": 48270 }, { "epoch": 12.545478170478171, "grad_norm": 0.16869491338729858, "learning_rate": 1.8339148226723378e-05, "loss": 0.1371, "num_input_tokens_seen": 4346184, "step": 48275 }, { "epoch": 12.546777546777546, "grad_norm": 2.8321101665496826, "learning_rate": 1.8333683737554207e-05, "loss": 0.147, "num_input_tokens_seen": 4346616, "step": 48280 }, { "epoch": 12.548076923076923, "grad_norm": 1.2408570051193237, "learning_rate": 1.832821959124006e-05, "loss": 0.1941, "num_input_tokens_seen": 4347112, "step": 48285 }, { "epoch": 12.549376299376299, "grad_norm": 10.0319242477417, "learning_rate": 1.8322755788061975e-05, "loss": 0.2061, "num_input_tokens_seen": 4347576, "step": 48290 }, { "epoch": 12.550675675675675, "grad_norm": 0.32323116064071655, "learning_rate": 1.831729232830095e-05, "loss": 0.1515, "num_input_tokens_seen": 4348008, "step": 48295 }, { "epoch": 12.551975051975052, "grad_norm": 4.861008644104004, "learning_rate": 1.8311829212237995e-05, "loss": 0.3311, "num_input_tokens_seen": 4348440, "step": 48300 }, { "epoch": 12.553274428274428, "grad_norm": 3.5194921493530273, "learning_rate": 1.8306366440154066e-05, "loss": 0.1771, "num_input_tokens_seen": 4348904, "step": 48305 }, { "epoch": 12.554573804573804, "grad_norm": 8.74453067779541, "learning_rate": 1.8300904012330127e-05, "loss": 0.4069, "num_input_tokens_seen": 4349336, "step": 48310 }, { "epoch": 12.555873180873181, "grad_norm": 14.95783519744873, "learning_rate": 1.829544192904711e-05, "loss": 0.3278, "num_input_tokens_seen": 4349784, "step": 48315 }, { "epoch": 12.557172557172557, "grad_norm": 10.071497917175293, "learning_rate": 1.828998019058595e-05, "loss": 0.2157, "num_input_tokens_seen": 4350232, "step": 48320 }, { "epoch": 12.558471933471933, "grad_norm": 9.163558006286621, "learning_rate": 1.828451879722752e-05, "loss": 0.4988, "num_input_tokens_seen": 4350680, "step": 48325 }, { "epoch": 12.55977130977131, "grad_norm": 6.321639060974121, "learning_rate": 1.8279057749252736e-05, "loss": 0.1696, "num_input_tokens_seen": 4351128, "step": 48330 }, { "epoch": 12.561070686070686, "grad_norm": 3.211073637008667, "learning_rate": 1.8273597046942453e-05, "loss": 0.1324, "num_input_tokens_seen": 4351560, "step": 48335 }, { "epoch": 12.562370062370062, "grad_norm": 4.300140857696533, "learning_rate": 1.8268136690577502e-05, "loss": 0.3559, "num_input_tokens_seen": 4352040, "step": 48340 }, { "epoch": 12.56366943866944, "grad_norm": 0.4144984483718872, "learning_rate": 1.826267668043875e-05, "loss": 0.2671, "num_input_tokens_seen": 4352472, "step": 48345 }, { "epoch": 12.564968814968815, "grad_norm": 0.6498139500617981, "learning_rate": 1.8257217016806982e-05, "loss": 0.0755, "num_input_tokens_seen": 4352920, "step": 48350 }, { "epoch": 12.566268191268192, "grad_norm": 1.2299021482467651, "learning_rate": 1.8251757699963006e-05, "loss": 0.2455, "num_input_tokens_seen": 4353384, "step": 48355 }, { "epoch": 12.567567567567568, "grad_norm": 0.1532190442085266, "learning_rate": 1.824629873018759e-05, "loss": 0.3831, "num_input_tokens_seen": 4353832, "step": 48360 }, { "epoch": 12.568866943866944, "grad_norm": 6.527109622955322, "learning_rate": 1.8240840107761516e-05, "loss": 0.1428, "num_input_tokens_seen": 4354296, "step": 48365 }, { "epoch": 12.57016632016632, "grad_norm": 0.6840643286705017, "learning_rate": 1.82353818329655e-05, "loss": 0.0217, "num_input_tokens_seen": 4354776, "step": 48370 }, { "epoch": 12.571465696465696, "grad_norm": 10.432648658752441, "learning_rate": 1.822992390608028e-05, "loss": 0.7209, "num_input_tokens_seen": 4355208, "step": 48375 }, { "epoch": 12.572765072765073, "grad_norm": 0.39386340975761414, "learning_rate": 1.8224466327386564e-05, "loss": 0.1511, "num_input_tokens_seen": 4355624, "step": 48380 }, { "epoch": 12.57406444906445, "grad_norm": 8.703587532043457, "learning_rate": 1.821900909716504e-05, "loss": 0.2548, "num_input_tokens_seen": 4356088, "step": 48385 }, { "epoch": 12.575363825363825, "grad_norm": 10.637154579162598, "learning_rate": 1.8213552215696367e-05, "loss": 0.3517, "num_input_tokens_seen": 4356552, "step": 48390 }, { "epoch": 12.576663201663202, "grad_norm": 0.24486172199249268, "learning_rate": 1.820809568326121e-05, "loss": 0.0764, "num_input_tokens_seen": 4357000, "step": 48395 }, { "epoch": 12.577962577962579, "grad_norm": 0.08333728462457657, "learning_rate": 1.8202639500140203e-05, "loss": 0.1188, "num_input_tokens_seen": 4357448, "step": 48400 }, { "epoch": 12.579261954261954, "grad_norm": 6.0479655265808105, "learning_rate": 1.8197183666613953e-05, "loss": 0.2892, "num_input_tokens_seen": 4357912, "step": 48405 }, { "epoch": 12.58056133056133, "grad_norm": 8.220104217529297, "learning_rate": 1.8191728182963068e-05, "loss": 0.2227, "num_input_tokens_seen": 4358328, "step": 48410 }, { "epoch": 12.581860706860708, "grad_norm": 6.797595500946045, "learning_rate": 1.8186273049468122e-05, "loss": 0.1558, "num_input_tokens_seen": 4358792, "step": 48415 }, { "epoch": 12.583160083160083, "grad_norm": 0.7010232210159302, "learning_rate": 1.818081826640969e-05, "loss": 0.2395, "num_input_tokens_seen": 4359240, "step": 48420 }, { "epoch": 12.58445945945946, "grad_norm": 7.826860427856445, "learning_rate": 1.81753638340683e-05, "loss": 0.1481, "num_input_tokens_seen": 4359704, "step": 48425 }, { "epoch": 12.585758835758837, "grad_norm": 2.2411129474639893, "learning_rate": 1.8169909752724494e-05, "loss": 0.3667, "num_input_tokens_seen": 4360152, "step": 48430 }, { "epoch": 12.587058212058212, "grad_norm": 0.730792760848999, "learning_rate": 1.8164456022658767e-05, "loss": 0.0871, "num_input_tokens_seen": 4360600, "step": 48435 }, { "epoch": 12.588357588357589, "grad_norm": 2.751741409301758, "learning_rate": 1.8159002644151625e-05, "loss": 0.2339, "num_input_tokens_seen": 4361064, "step": 48440 }, { "epoch": 12.589656964656964, "grad_norm": 2.587116003036499, "learning_rate": 1.815354961748352e-05, "loss": 0.0237, "num_input_tokens_seen": 4361544, "step": 48445 }, { "epoch": 12.59095634095634, "grad_norm": 17.368915557861328, "learning_rate": 1.8148096942934928e-05, "loss": 0.3492, "num_input_tokens_seen": 4362056, "step": 48450 }, { "epoch": 12.592255717255718, "grad_norm": 1.1785547733306885, "learning_rate": 1.8142644620786264e-05, "loss": 0.1179, "num_input_tokens_seen": 4362488, "step": 48455 }, { "epoch": 12.593555093555093, "grad_norm": 0.49978676438331604, "learning_rate": 1.8137192651317973e-05, "loss": 0.0505, "num_input_tokens_seen": 4362936, "step": 48460 }, { "epoch": 12.59485446985447, "grad_norm": 10.350739479064941, "learning_rate": 1.8131741034810435e-05, "loss": 0.1697, "num_input_tokens_seen": 4363400, "step": 48465 }, { "epoch": 12.596153846153847, "grad_norm": 9.39235782623291, "learning_rate": 1.812628977154403e-05, "loss": 0.1991, "num_input_tokens_seen": 4363864, "step": 48470 }, { "epoch": 12.597453222453222, "grad_norm": 4.342238903045654, "learning_rate": 1.8120838861799135e-05, "loss": 0.3568, "num_input_tokens_seen": 4364328, "step": 48475 }, { "epoch": 12.598752598752599, "grad_norm": 8.29540729522705, "learning_rate": 1.8115388305856082e-05, "loss": 0.1545, "num_input_tokens_seen": 4364760, "step": 48480 }, { "epoch": 12.600051975051976, "grad_norm": 13.05723762512207, "learning_rate": 1.810993810399521e-05, "loss": 0.5085, "num_input_tokens_seen": 4365224, "step": 48485 }, { "epoch": 12.60135135135135, "grad_norm": 13.534595489501953, "learning_rate": 1.8104488256496816e-05, "loss": 0.3994, "num_input_tokens_seen": 4365688, "step": 48490 }, { "epoch": 12.602650727650728, "grad_norm": 4.606783390045166, "learning_rate": 1.809903876364121e-05, "loss": 0.0347, "num_input_tokens_seen": 4366120, "step": 48495 }, { "epoch": 12.603950103950105, "grad_norm": 10.88502025604248, "learning_rate": 1.809358962570864e-05, "loss": 0.3574, "num_input_tokens_seen": 4366552, "step": 48500 }, { "epoch": 12.60524948024948, "grad_norm": 9.661307334899902, "learning_rate": 1.8088140842979383e-05, "loss": 0.649, "num_input_tokens_seen": 4367000, "step": 48505 }, { "epoch": 12.606548856548857, "grad_norm": 15.36205768585205, "learning_rate": 1.8082692415733654e-05, "loss": 0.3116, "num_input_tokens_seen": 4367448, "step": 48510 }, { "epoch": 12.607848232848234, "grad_norm": 4.482547760009766, "learning_rate": 1.8077244344251697e-05, "loss": 0.1312, "num_input_tokens_seen": 4367912, "step": 48515 }, { "epoch": 12.609147609147609, "grad_norm": 9.201501846313477, "learning_rate": 1.807179662881368e-05, "loss": 0.2751, "num_input_tokens_seen": 4368376, "step": 48520 }, { "epoch": 12.610446985446986, "grad_norm": 2.945882558822632, "learning_rate": 1.806634926969981e-05, "loss": 0.1402, "num_input_tokens_seen": 4368856, "step": 48525 }, { "epoch": 12.611746361746361, "grad_norm": 3.756758689880371, "learning_rate": 1.806090226719025e-05, "loss": 0.2063, "num_input_tokens_seen": 4369336, "step": 48530 }, { "epoch": 12.613045738045738, "grad_norm": 20.068647384643555, "learning_rate": 1.8055455621565117e-05, "loss": 0.2717, "num_input_tokens_seen": 4369768, "step": 48535 }, { "epoch": 12.614345114345115, "grad_norm": 0.5590720772743225, "learning_rate": 1.8050009333104566e-05, "loss": 0.1483, "num_input_tokens_seen": 4370216, "step": 48540 }, { "epoch": 12.61564449064449, "grad_norm": 13.049649238586426, "learning_rate": 1.8044563402088684e-05, "loss": 0.3515, "num_input_tokens_seen": 4370680, "step": 48545 }, { "epoch": 12.616943866943867, "grad_norm": 1.2862669229507446, "learning_rate": 1.8039117828797586e-05, "loss": 0.0502, "num_input_tokens_seen": 4371144, "step": 48550 }, { "epoch": 12.618243243243244, "grad_norm": 1.0846089124679565, "learning_rate": 1.8033672613511317e-05, "loss": 0.0735, "num_input_tokens_seen": 4371560, "step": 48555 }, { "epoch": 12.619542619542619, "grad_norm": 3.4705567359924316, "learning_rate": 1.8028227756509942e-05, "loss": 0.3173, "num_input_tokens_seen": 4372024, "step": 48560 }, { "epoch": 12.620841995841996, "grad_norm": 12.161914825439453, "learning_rate": 1.802278325807349e-05, "loss": 0.2856, "num_input_tokens_seen": 4372472, "step": 48565 }, { "epoch": 12.622141372141373, "grad_norm": 1.7900840044021606, "learning_rate": 1.801733911848199e-05, "loss": 0.3532, "num_input_tokens_seen": 4372904, "step": 48570 }, { "epoch": 12.623440748440748, "grad_norm": 15.138934135437012, "learning_rate": 1.8011895338015415e-05, "loss": 0.5184, "num_input_tokens_seen": 4373336, "step": 48575 }, { "epoch": 12.624740124740125, "grad_norm": 11.854889869689941, "learning_rate": 1.800645191695377e-05, "loss": 0.5869, "num_input_tokens_seen": 4373784, "step": 48580 }, { "epoch": 12.6260395010395, "grad_norm": 2.9876255989074707, "learning_rate": 1.8001008855577e-05, "loss": 0.4032, "num_input_tokens_seen": 4374200, "step": 48585 }, { "epoch": 12.627338877338877, "grad_norm": 2.9467756748199463, "learning_rate": 1.799556615416505e-05, "loss": 0.163, "num_input_tokens_seen": 4374648, "step": 48590 }, { "epoch": 12.628638253638254, "grad_norm": 1.1981687545776367, "learning_rate": 1.7990123812997854e-05, "loss": 0.062, "num_input_tokens_seen": 4375080, "step": 48595 }, { "epoch": 12.62993762993763, "grad_norm": 12.169642448425293, "learning_rate": 1.7984681832355295e-05, "loss": 0.4218, "num_input_tokens_seen": 4375528, "step": 48600 }, { "epoch": 12.631237006237006, "grad_norm": 8.501792907714844, "learning_rate": 1.7979240212517275e-05, "loss": 0.2831, "num_input_tokens_seen": 4375992, "step": 48605 }, { "epoch": 12.632536382536383, "grad_norm": 5.92280912399292, "learning_rate": 1.7973798953763654e-05, "loss": 0.2204, "num_input_tokens_seen": 4376488, "step": 48610 }, { "epoch": 12.633835758835758, "grad_norm": 0.19695092737674713, "learning_rate": 1.7968358056374295e-05, "loss": 0.102, "num_input_tokens_seen": 4376920, "step": 48615 }, { "epoch": 12.635135135135135, "grad_norm": 0.6383970975875854, "learning_rate": 1.7962917520629008e-05, "loss": 0.2225, "num_input_tokens_seen": 4377384, "step": 48620 }, { "epoch": 12.636434511434512, "grad_norm": 0.17562122642993927, "learning_rate": 1.795747734680762e-05, "loss": 0.3102, "num_input_tokens_seen": 4377800, "step": 48625 }, { "epoch": 12.637733887733887, "grad_norm": 3.0127036571502686, "learning_rate": 1.7952037535189916e-05, "loss": 0.0746, "num_input_tokens_seen": 4378232, "step": 48630 }, { "epoch": 12.639033264033264, "grad_norm": 6.601206302642822, "learning_rate": 1.7946598086055684e-05, "loss": 0.2894, "num_input_tokens_seen": 4378648, "step": 48635 }, { "epoch": 12.640332640332641, "grad_norm": 22.196849822998047, "learning_rate": 1.794115899968466e-05, "loss": 0.2114, "num_input_tokens_seen": 4379096, "step": 48640 }, { "epoch": 12.641632016632016, "grad_norm": 1.7506911754608154, "learning_rate": 1.7935720276356598e-05, "loss": 0.0464, "num_input_tokens_seen": 4379544, "step": 48645 }, { "epoch": 12.642931392931393, "grad_norm": 0.00946964230388403, "learning_rate": 1.7930281916351205e-05, "loss": 0.4364, "num_input_tokens_seen": 4379992, "step": 48650 }, { "epoch": 12.64423076923077, "grad_norm": 4.194024085998535, "learning_rate": 1.79248439199482e-05, "loss": 0.4985, "num_input_tokens_seen": 4380424, "step": 48655 }, { "epoch": 12.645530145530145, "grad_norm": 0.3864574432373047, "learning_rate": 1.7919406287427244e-05, "loss": 0.1554, "num_input_tokens_seen": 4380904, "step": 48660 }, { "epoch": 12.646829521829522, "grad_norm": 0.04772781953215599, "learning_rate": 1.7913969019068e-05, "loss": 0.3095, "num_input_tokens_seen": 4381352, "step": 48665 }, { "epoch": 12.648128898128899, "grad_norm": 11.410479545593262, "learning_rate": 1.7908532115150135e-05, "loss": 0.553, "num_input_tokens_seen": 4381816, "step": 48670 }, { "epoch": 12.649428274428274, "grad_norm": 9.306656837463379, "learning_rate": 1.790309557595324e-05, "loss": 0.2639, "num_input_tokens_seen": 4382248, "step": 48675 }, { "epoch": 12.650727650727651, "grad_norm": 8.222143173217773, "learning_rate": 1.789765940175695e-05, "loss": 0.2322, "num_input_tokens_seen": 4382728, "step": 48680 }, { "epoch": 12.652027027027026, "grad_norm": 0.7319682240486145, "learning_rate": 1.7892223592840834e-05, "loss": 0.2149, "num_input_tokens_seen": 4383144, "step": 48685 }, { "epoch": 12.653326403326403, "grad_norm": 4.528962135314941, "learning_rate": 1.788678814948448e-05, "loss": 0.222, "num_input_tokens_seen": 4383576, "step": 48690 }, { "epoch": 12.65462577962578, "grad_norm": 0.9925374984741211, "learning_rate": 1.7881353071967415e-05, "loss": 0.1568, "num_input_tokens_seen": 4384040, "step": 48695 }, { "epoch": 12.655925155925155, "grad_norm": 0.8207685351371765, "learning_rate": 1.7875918360569195e-05, "loss": 0.2641, "num_input_tokens_seen": 4384520, "step": 48700 }, { "epoch": 12.657224532224532, "grad_norm": 6.621394157409668, "learning_rate": 1.7870484015569306e-05, "loss": 0.0848, "num_input_tokens_seen": 4384984, "step": 48705 }, { "epoch": 12.65852390852391, "grad_norm": 9.091072082519531, "learning_rate": 1.786505003724727e-05, "loss": 0.3707, "num_input_tokens_seen": 4385432, "step": 48710 }, { "epoch": 12.659823284823284, "grad_norm": 11.689119338989258, "learning_rate": 1.7859616425882536e-05, "loss": 0.4363, "num_input_tokens_seen": 4385896, "step": 48715 }, { "epoch": 12.661122661122661, "grad_norm": 7.662652492523193, "learning_rate": 1.7854183181754574e-05, "loss": 0.0741, "num_input_tokens_seen": 4386440, "step": 48720 }, { "epoch": 12.662422037422038, "grad_norm": 5.2074055671691895, "learning_rate": 1.7848750305142824e-05, "loss": 0.2904, "num_input_tokens_seen": 4386888, "step": 48725 }, { "epoch": 12.663721413721413, "grad_norm": 0.1449497640132904, "learning_rate": 1.7843317796326688e-05, "loss": 0.0268, "num_input_tokens_seen": 4387320, "step": 48730 }, { "epoch": 12.66502079002079, "grad_norm": 0.5531530380249023, "learning_rate": 1.7837885655585578e-05, "loss": 0.2905, "num_input_tokens_seen": 4387768, "step": 48735 }, { "epoch": 12.666320166320165, "grad_norm": 1.6981273889541626, "learning_rate": 1.783245388319887e-05, "loss": 0.479, "num_input_tokens_seen": 4388200, "step": 48740 }, { "epoch": 12.667619542619542, "grad_norm": 5.663882255554199, "learning_rate": 1.7827022479445935e-05, "loss": 0.3655, "num_input_tokens_seen": 4388616, "step": 48745 }, { "epoch": 12.66891891891892, "grad_norm": 1.1053646802902222, "learning_rate": 1.7821591444606094e-05, "loss": 0.1411, "num_input_tokens_seen": 4389128, "step": 48750 }, { "epoch": 12.670218295218294, "grad_norm": 0.05310749262571335, "learning_rate": 1.781616077895869e-05, "loss": 0.2761, "num_input_tokens_seen": 4389608, "step": 48755 }, { "epoch": 12.671517671517671, "grad_norm": 4.755613803863525, "learning_rate": 1.7810730482783017e-05, "loss": 0.1218, "num_input_tokens_seen": 4390088, "step": 48760 }, { "epoch": 12.672817047817048, "grad_norm": 3.9117300510406494, "learning_rate": 1.7805300556358372e-05, "loss": 0.2898, "num_input_tokens_seen": 4390536, "step": 48765 }, { "epoch": 12.674116424116423, "grad_norm": 3.10736346244812, "learning_rate": 1.7799870999964004e-05, "loss": 0.1818, "num_input_tokens_seen": 4390984, "step": 48770 }, { "epoch": 12.6754158004158, "grad_norm": 1.9421441555023193, "learning_rate": 1.7794441813879172e-05, "loss": 0.2527, "num_input_tokens_seen": 4391432, "step": 48775 }, { "epoch": 12.676715176715177, "grad_norm": 8.995421409606934, "learning_rate": 1.7789012998383096e-05, "loss": 0.1397, "num_input_tokens_seen": 4391864, "step": 48780 }, { "epoch": 12.678014553014552, "grad_norm": 7.5055251121521, "learning_rate": 1.7783584553755006e-05, "loss": 0.2479, "num_input_tokens_seen": 4392280, "step": 48785 }, { "epoch": 12.67931392931393, "grad_norm": 0.25387445092201233, "learning_rate": 1.7778156480274066e-05, "loss": 0.1509, "num_input_tokens_seen": 4392728, "step": 48790 }, { "epoch": 12.680613305613306, "grad_norm": 3.5676393508911133, "learning_rate": 1.777272877821946e-05, "loss": 0.4261, "num_input_tokens_seen": 4393192, "step": 48795 }, { "epoch": 12.681912681912682, "grad_norm": 10.713658332824707, "learning_rate": 1.7767301447870342e-05, "loss": 0.1875, "num_input_tokens_seen": 4393656, "step": 48800 }, { "epoch": 12.683212058212058, "grad_norm": 13.567098617553711, "learning_rate": 1.776187448950583e-05, "loss": 0.3089, "num_input_tokens_seen": 4394072, "step": 48805 }, { "epoch": 12.684511434511435, "grad_norm": 2.925276517868042, "learning_rate": 1.7756447903405053e-05, "loss": 0.2825, "num_input_tokens_seen": 4394552, "step": 48810 }, { "epoch": 12.68581081081081, "grad_norm": 4.128438472747803, "learning_rate": 1.77510216898471e-05, "loss": 0.0914, "num_input_tokens_seen": 4394952, "step": 48815 }, { "epoch": 12.687110187110187, "grad_norm": 1.9307361841201782, "learning_rate": 1.7745595849111056e-05, "loss": 0.2538, "num_input_tokens_seen": 4395352, "step": 48820 }, { "epoch": 12.688409563409563, "grad_norm": 13.75729751586914, "learning_rate": 1.7740170381475953e-05, "loss": 0.1742, "num_input_tokens_seen": 4395816, "step": 48825 }, { "epoch": 12.68970893970894, "grad_norm": 2.5835676193237305, "learning_rate": 1.7734745287220854e-05, "loss": 0.2394, "num_input_tokens_seen": 4396232, "step": 48830 }, { "epoch": 12.691008316008316, "grad_norm": 11.237500190734863, "learning_rate": 1.772932056662476e-05, "loss": 0.3203, "num_input_tokens_seen": 4396664, "step": 48835 }, { "epoch": 12.692307692307692, "grad_norm": 3.231300115585327, "learning_rate": 1.772389621996668e-05, "loss": 0.0746, "num_input_tokens_seen": 4397112, "step": 48840 }, { "epoch": 12.693607068607069, "grad_norm": 8.271601676940918, "learning_rate": 1.7718472247525585e-05, "loss": 0.8697, "num_input_tokens_seen": 4397624, "step": 48845 }, { "epoch": 12.694906444906445, "grad_norm": 1.5368391275405884, "learning_rate": 1.771304864958045e-05, "loss": 0.5203, "num_input_tokens_seen": 4398072, "step": 48850 }, { "epoch": 12.69620582120582, "grad_norm": 3.982541561126709, "learning_rate": 1.7707625426410194e-05, "loss": 0.2258, "num_input_tokens_seen": 4398536, "step": 48855 }, { "epoch": 12.697505197505198, "grad_norm": 0.041166458278894424, "learning_rate": 1.7702202578293754e-05, "loss": 0.1798, "num_input_tokens_seen": 4398984, "step": 48860 }, { "epoch": 12.698804573804575, "grad_norm": 0.25700652599334717, "learning_rate": 1.769678010551003e-05, "loss": 0.0414, "num_input_tokens_seen": 4399432, "step": 48865 }, { "epoch": 12.70010395010395, "grad_norm": 4.745039939880371, "learning_rate": 1.76913580083379e-05, "loss": 0.2724, "num_input_tokens_seen": 4399864, "step": 48870 }, { "epoch": 12.701403326403327, "grad_norm": 0.44506120681762695, "learning_rate": 1.7685936287056228e-05, "loss": 0.116, "num_input_tokens_seen": 4400296, "step": 48875 }, { "epoch": 12.702702702702704, "grad_norm": 1.5783716440200806, "learning_rate": 1.768051494194386e-05, "loss": 0.3039, "num_input_tokens_seen": 4400728, "step": 48880 }, { "epoch": 12.704002079002079, "grad_norm": 2.780507802963257, "learning_rate": 1.767509397327963e-05, "loss": 0.1289, "num_input_tokens_seen": 4401192, "step": 48885 }, { "epoch": 12.705301455301456, "grad_norm": 0.7136800289154053, "learning_rate": 1.7669673381342328e-05, "loss": 0.3322, "num_input_tokens_seen": 4401672, "step": 48890 }, { "epoch": 12.70660083160083, "grad_norm": 0.41904398798942566, "learning_rate": 1.766425316641075e-05, "loss": 0.2773, "num_input_tokens_seen": 4402120, "step": 48895 }, { "epoch": 12.707900207900208, "grad_norm": 0.33940884470939636, "learning_rate": 1.7658833328763653e-05, "loss": 0.3505, "num_input_tokens_seen": 4402616, "step": 48900 }, { "epoch": 12.709199584199585, "grad_norm": 6.347043991088867, "learning_rate": 1.7653413868679804e-05, "loss": 0.2649, "num_input_tokens_seen": 4403048, "step": 48905 }, { "epoch": 12.71049896049896, "grad_norm": 1.7838079929351807, "learning_rate": 1.764799478643791e-05, "loss": 0.184, "num_input_tokens_seen": 4403480, "step": 48910 }, { "epoch": 12.711798336798337, "grad_norm": 0.19036482274532318, "learning_rate": 1.7642576082316695e-05, "loss": 0.0506, "num_input_tokens_seen": 4403944, "step": 48915 }, { "epoch": 12.713097713097714, "grad_norm": 9.874360084533691, "learning_rate": 1.7637157756594836e-05, "loss": 0.2075, "num_input_tokens_seen": 4404360, "step": 48920 }, { "epoch": 12.714397089397089, "grad_norm": 5.09797477722168, "learning_rate": 1.7631739809551014e-05, "loss": 0.1627, "num_input_tokens_seen": 4404840, "step": 48925 }, { "epoch": 12.715696465696466, "grad_norm": 11.78596305847168, "learning_rate": 1.7626322241463873e-05, "loss": 0.4625, "num_input_tokens_seen": 4405304, "step": 48930 }, { "epoch": 12.716995841995843, "grad_norm": 9.089554786682129, "learning_rate": 1.7620905052612035e-05, "loss": 0.2837, "num_input_tokens_seen": 4405736, "step": 48935 }, { "epoch": 12.718295218295218, "grad_norm": 3.6344473361968994, "learning_rate": 1.7615488243274135e-05, "loss": 0.2301, "num_input_tokens_seen": 4406152, "step": 48940 }, { "epoch": 12.719594594594595, "grad_norm": 11.119516372680664, "learning_rate": 1.761007181372874e-05, "loss": 0.3924, "num_input_tokens_seen": 4406632, "step": 48945 }, { "epoch": 12.720893970893972, "grad_norm": 2.6523616313934326, "learning_rate": 1.7604655764254435e-05, "loss": 0.3826, "num_input_tokens_seen": 4407112, "step": 48950 }, { "epoch": 12.722193347193347, "grad_norm": 0.20196881890296936, "learning_rate": 1.7599240095129764e-05, "loss": 0.3356, "num_input_tokens_seen": 4407544, "step": 48955 }, { "epoch": 12.723492723492724, "grad_norm": 3.289273977279663, "learning_rate": 1.759382480663328e-05, "loss": 0.2475, "num_input_tokens_seen": 4408008, "step": 48960 }, { "epoch": 12.7247920997921, "grad_norm": 8.382427215576172, "learning_rate": 1.7588409899043468e-05, "loss": 0.1722, "num_input_tokens_seen": 4408440, "step": 48965 }, { "epoch": 12.726091476091476, "grad_norm": 1.2080917358398438, "learning_rate": 1.7582995372638844e-05, "loss": 0.3528, "num_input_tokens_seen": 4408872, "step": 48970 }, { "epoch": 12.727390852390853, "grad_norm": 0.946793794631958, "learning_rate": 1.757758122769787e-05, "loss": 0.0902, "num_input_tokens_seen": 4409304, "step": 48975 }, { "epoch": 12.728690228690228, "grad_norm": 7.156267166137695, "learning_rate": 1.757216746449901e-05, "loss": 0.1045, "num_input_tokens_seen": 4409768, "step": 48980 }, { "epoch": 12.729989604989605, "grad_norm": 15.692107200622559, "learning_rate": 1.756675408332069e-05, "loss": 0.5262, "num_input_tokens_seen": 4410232, "step": 48985 }, { "epoch": 12.731288981288982, "grad_norm": 0.2320530265569687, "learning_rate": 1.7561341084441334e-05, "loss": 0.1872, "num_input_tokens_seen": 4410664, "step": 48990 }, { "epoch": 12.732588357588357, "grad_norm": 10.688507080078125, "learning_rate": 1.7555928468139337e-05, "loss": 0.5493, "num_input_tokens_seen": 4411160, "step": 48995 }, { "epoch": 12.733887733887734, "grad_norm": 1.8271416425704956, "learning_rate": 1.7550516234693065e-05, "loss": 0.0557, "num_input_tokens_seen": 4411608, "step": 49000 }, { "epoch": 12.73518711018711, "grad_norm": 0.5489606261253357, "learning_rate": 1.7545104384380885e-05, "loss": 0.0367, "num_input_tokens_seen": 4412040, "step": 49005 }, { "epoch": 12.736486486486486, "grad_norm": 0.03346553444862366, "learning_rate": 1.753969291748112e-05, "loss": 0.3503, "num_input_tokens_seen": 4412488, "step": 49010 }, { "epoch": 12.737785862785863, "grad_norm": 0.3367433547973633, "learning_rate": 1.753428183427211e-05, "loss": 0.1154, "num_input_tokens_seen": 4412920, "step": 49015 }, { "epoch": 12.73908523908524, "grad_norm": 1.4450809955596924, "learning_rate": 1.7528871135032127e-05, "loss": 0.0483, "num_input_tokens_seen": 4413384, "step": 49020 }, { "epoch": 12.740384615384615, "grad_norm": 1.7367905378341675, "learning_rate": 1.7523460820039464e-05, "loss": 0.3362, "num_input_tokens_seen": 4413816, "step": 49025 }, { "epoch": 12.741683991683992, "grad_norm": 11.719653129577637, "learning_rate": 1.7518050889572372e-05, "loss": 0.379, "num_input_tokens_seen": 4414280, "step": 49030 }, { "epoch": 12.742983367983367, "grad_norm": 0.04756821319460869, "learning_rate": 1.75126413439091e-05, "loss": 0.5843, "num_input_tokens_seen": 4414744, "step": 49035 }, { "epoch": 12.744282744282744, "grad_norm": 12.99576473236084, "learning_rate": 1.750723218332785e-05, "loss": 0.3959, "num_input_tokens_seen": 4415176, "step": 49040 }, { "epoch": 12.745582120582121, "grad_norm": 1.9597047567367554, "learning_rate": 1.750182340810683e-05, "loss": 0.1055, "num_input_tokens_seen": 4415592, "step": 49045 }, { "epoch": 12.746881496881496, "grad_norm": 0.403895765542984, "learning_rate": 1.7496415018524213e-05, "loss": 0.1572, "num_input_tokens_seen": 4416040, "step": 49050 }, { "epoch": 12.748180873180873, "grad_norm": 5.724686145782471, "learning_rate": 1.7491007014858172e-05, "loss": 0.2927, "num_input_tokens_seen": 4416520, "step": 49055 }, { "epoch": 12.74948024948025, "grad_norm": 0.16205209493637085, "learning_rate": 1.7485599397386827e-05, "loss": 0.3179, "num_input_tokens_seen": 4416968, "step": 49060 }, { "epoch": 12.750779625779625, "grad_norm": 1.8026784658432007, "learning_rate": 1.7480192166388303e-05, "loss": 0.4232, "num_input_tokens_seen": 4417448, "step": 49065 }, { "epoch": 12.752079002079002, "grad_norm": 0.3160232603549957, "learning_rate": 1.747478532214071e-05, "loss": 0.0037, "num_input_tokens_seen": 4417896, "step": 49070 }, { "epoch": 12.753378378378379, "grad_norm": 7.8825459480285645, "learning_rate": 1.7469378864922102e-05, "loss": 0.2064, "num_input_tokens_seen": 4418376, "step": 49075 }, { "epoch": 12.754677754677754, "grad_norm": 2.8983943462371826, "learning_rate": 1.7463972795010573e-05, "loss": 0.0967, "num_input_tokens_seen": 4418824, "step": 49080 }, { "epoch": 12.755977130977131, "grad_norm": 2.232785940170288, "learning_rate": 1.7458567112684133e-05, "loss": 0.3095, "num_input_tokens_seen": 4419288, "step": 49085 }, { "epoch": 12.757276507276508, "grad_norm": 8.035063743591309, "learning_rate": 1.7453161818220814e-05, "loss": 0.2477, "num_input_tokens_seen": 4419752, "step": 49090 }, { "epoch": 12.758575883575883, "grad_norm": 0.09451241046190262, "learning_rate": 1.744775691189861e-05, "loss": 0.1927, "num_input_tokens_seen": 4420200, "step": 49095 }, { "epoch": 12.75987525987526, "grad_norm": 7.812678337097168, "learning_rate": 1.7442352393995515e-05, "loss": 0.1614, "num_input_tokens_seen": 4420616, "step": 49100 }, { "epoch": 12.761174636174637, "grad_norm": 2.2263150215148926, "learning_rate": 1.7436948264789466e-05, "loss": 0.1914, "num_input_tokens_seen": 4421048, "step": 49105 }, { "epoch": 12.762474012474012, "grad_norm": 1.7578256130218506, "learning_rate": 1.7431544524558422e-05, "loss": 0.3844, "num_input_tokens_seen": 4421480, "step": 49110 }, { "epoch": 12.763773388773389, "grad_norm": 15.50760269165039, "learning_rate": 1.742614117358029e-05, "loss": 0.3999, "num_input_tokens_seen": 4421976, "step": 49115 }, { "epoch": 12.765072765072766, "grad_norm": 1.6972905397415161, "learning_rate": 1.7420738212132983e-05, "loss": 0.161, "num_input_tokens_seen": 4422392, "step": 49120 }, { "epoch": 12.766372141372141, "grad_norm": 2.515573501586914, "learning_rate": 1.7415335640494366e-05, "loss": 0.4247, "num_input_tokens_seen": 4422808, "step": 49125 }, { "epoch": 12.767671517671518, "grad_norm": 9.412361145019531, "learning_rate": 1.7409933458942303e-05, "loss": 0.5107, "num_input_tokens_seen": 4423256, "step": 49130 }, { "epoch": 12.768970893970893, "grad_norm": 8.729227066040039, "learning_rate": 1.740453166775464e-05, "loss": 0.2048, "num_input_tokens_seen": 4423752, "step": 49135 }, { "epoch": 12.77027027027027, "grad_norm": 11.230408668518066, "learning_rate": 1.739913026720918e-05, "loss": 0.2261, "num_input_tokens_seen": 4424168, "step": 49140 }, { "epoch": 12.771569646569647, "grad_norm": 1.2430498600006104, "learning_rate": 1.739372925758374e-05, "loss": 0.0764, "num_input_tokens_seen": 4424632, "step": 49145 }, { "epoch": 12.772869022869022, "grad_norm": 2.968522310256958, "learning_rate": 1.738832863915609e-05, "loss": 0.4189, "num_input_tokens_seen": 4425096, "step": 49150 }, { "epoch": 12.7741683991684, "grad_norm": 0.2623428404331207, "learning_rate": 1.7382928412204e-05, "loss": 0.1523, "num_input_tokens_seen": 4425528, "step": 49155 }, { "epoch": 12.775467775467776, "grad_norm": 4.982271671295166, "learning_rate": 1.737752857700519e-05, "loss": 0.1299, "num_input_tokens_seen": 4425976, "step": 49160 }, { "epoch": 12.776767151767151, "grad_norm": 5.575587272644043, "learning_rate": 1.7372129133837394e-05, "loss": 0.3129, "num_input_tokens_seen": 4426408, "step": 49165 }, { "epoch": 12.778066528066528, "grad_norm": 1.0195108652114868, "learning_rate": 1.7366730082978298e-05, "loss": 0.2661, "num_input_tokens_seen": 4426840, "step": 49170 }, { "epoch": 12.779365904365905, "grad_norm": 1.9539728164672852, "learning_rate": 1.73613314247056e-05, "loss": 0.2639, "num_input_tokens_seen": 4427320, "step": 49175 }, { "epoch": 12.78066528066528, "grad_norm": 1.9840824604034424, "learning_rate": 1.7355933159296934e-05, "loss": 0.2233, "num_input_tokens_seen": 4427768, "step": 49180 }, { "epoch": 12.781964656964657, "grad_norm": 1.1134737730026245, "learning_rate": 1.7350535287029957e-05, "loss": 0.1356, "num_input_tokens_seen": 4428248, "step": 49185 }, { "epoch": 12.783264033264032, "grad_norm": 3.4689621925354004, "learning_rate": 1.7345137808182282e-05, "loss": 0.2127, "num_input_tokens_seen": 4428696, "step": 49190 }, { "epoch": 12.78456340956341, "grad_norm": 0.27921560406684875, "learning_rate": 1.7339740723031497e-05, "loss": 0.0732, "num_input_tokens_seen": 4429096, "step": 49195 }, { "epoch": 12.785862785862786, "grad_norm": 3.4190802574157715, "learning_rate": 1.733434403185519e-05, "loss": 0.1584, "num_input_tokens_seen": 4429592, "step": 49200 }, { "epoch": 12.787162162162161, "grad_norm": 7.137631416320801, "learning_rate": 1.732894773493091e-05, "loss": 0.3493, "num_input_tokens_seen": 4430056, "step": 49205 }, { "epoch": 12.788461538461538, "grad_norm": 1.3695133924484253, "learning_rate": 1.7323551832536206e-05, "loss": 0.2667, "num_input_tokens_seen": 4430488, "step": 49210 }, { "epoch": 12.789760914760915, "grad_norm": 6.838953971862793, "learning_rate": 1.7318156324948577e-05, "loss": 0.1468, "num_input_tokens_seen": 4430920, "step": 49215 }, { "epoch": 12.79106029106029, "grad_norm": 3.718017816543579, "learning_rate": 1.7312761212445534e-05, "loss": 0.1431, "num_input_tokens_seen": 4431352, "step": 49220 }, { "epoch": 12.792359667359667, "grad_norm": 8.612848281860352, "learning_rate": 1.7307366495304545e-05, "loss": 0.5543, "num_input_tokens_seen": 4431784, "step": 49225 }, { "epoch": 12.793659043659044, "grad_norm": 8.561195373535156, "learning_rate": 1.7301972173803078e-05, "loss": 0.2586, "num_input_tokens_seen": 4432248, "step": 49230 }, { "epoch": 12.79495841995842, "grad_norm": 2.9806466102600098, "learning_rate": 1.7296578248218543e-05, "loss": 0.1866, "num_input_tokens_seen": 4432744, "step": 49235 }, { "epoch": 12.796257796257796, "grad_norm": 6.82840633392334, "learning_rate": 1.7291184718828376e-05, "loss": 0.4676, "num_input_tokens_seen": 4433160, "step": 49240 }, { "epoch": 12.797557172557173, "grad_norm": 11.698054313659668, "learning_rate": 1.728579158590996e-05, "loss": 0.2613, "num_input_tokens_seen": 4433608, "step": 49245 }, { "epoch": 12.798856548856548, "grad_norm": 7.464698791503906, "learning_rate": 1.7280398849740687e-05, "loss": 0.3087, "num_input_tokens_seen": 4434040, "step": 49250 }, { "epoch": 12.800155925155925, "grad_norm": 0.5178998708724976, "learning_rate": 1.727500651059789e-05, "loss": 0.1654, "num_input_tokens_seen": 4434488, "step": 49255 }, { "epoch": 12.801455301455302, "grad_norm": 2.16880202293396, "learning_rate": 1.7269614568758907e-05, "loss": 0.2891, "num_input_tokens_seen": 4434920, "step": 49260 }, { "epoch": 12.802754677754677, "grad_norm": 11.089598655700684, "learning_rate": 1.7264223024501064e-05, "loss": 0.2824, "num_input_tokens_seen": 4435352, "step": 49265 }, { "epoch": 12.804054054054054, "grad_norm": 7.755558967590332, "learning_rate": 1.7258831878101634e-05, "loss": 0.3202, "num_input_tokens_seen": 4435816, "step": 49270 }, { "epoch": 12.80535343035343, "grad_norm": 3.9686801433563232, "learning_rate": 1.7253441129837898e-05, "loss": 0.0952, "num_input_tokens_seen": 4436264, "step": 49275 }, { "epoch": 12.806652806652806, "grad_norm": 0.3598654270172119, "learning_rate": 1.7248050779987106e-05, "loss": 0.1928, "num_input_tokens_seen": 4436728, "step": 49280 }, { "epoch": 12.807952182952183, "grad_norm": 2.621840715408325, "learning_rate": 1.72426608288265e-05, "loss": 0.241, "num_input_tokens_seen": 4437224, "step": 49285 }, { "epoch": 12.809251559251559, "grad_norm": 4.682773113250732, "learning_rate": 1.7237271276633265e-05, "loss": 0.3766, "num_input_tokens_seen": 4437688, "step": 49290 }, { "epoch": 12.810550935550935, "grad_norm": 0.8722373247146606, "learning_rate": 1.7231882123684616e-05, "loss": 0.1287, "num_input_tokens_seen": 4438136, "step": 49295 }, { "epoch": 12.811850311850312, "grad_norm": 1.0840799808502197, "learning_rate": 1.7226493370257708e-05, "loss": 0.1955, "num_input_tokens_seen": 4438568, "step": 49300 }, { "epoch": 12.813149688149688, "grad_norm": 0.8814947605133057, "learning_rate": 1.72211050166297e-05, "loss": 0.1757, "num_input_tokens_seen": 4439048, "step": 49305 }, { "epoch": 12.814449064449065, "grad_norm": 6.1045331954956055, "learning_rate": 1.72157170630777e-05, "loss": 0.402, "num_input_tokens_seen": 4439496, "step": 49310 }, { "epoch": 12.815748440748441, "grad_norm": 0.45746728777885437, "learning_rate": 1.721032950987885e-05, "loss": 0.2212, "num_input_tokens_seen": 4439960, "step": 49315 }, { "epoch": 12.817047817047817, "grad_norm": 4.608283042907715, "learning_rate": 1.7204942357310217e-05, "loss": 0.139, "num_input_tokens_seen": 4440424, "step": 49320 }, { "epoch": 12.818347193347194, "grad_norm": 0.9688106775283813, "learning_rate": 1.719955560564885e-05, "loss": 0.2349, "num_input_tokens_seen": 4440856, "step": 49325 }, { "epoch": 12.81964656964657, "grad_norm": 4.597433090209961, "learning_rate": 1.7194169255171824e-05, "loss": 0.1373, "num_input_tokens_seen": 4441288, "step": 49330 }, { "epoch": 12.820945945945946, "grad_norm": 11.679105758666992, "learning_rate": 1.7188783306156146e-05, "loss": 0.5554, "num_input_tokens_seen": 4441832, "step": 49335 }, { "epoch": 12.822245322245323, "grad_norm": 5.271842956542969, "learning_rate": 1.718339775887883e-05, "loss": 0.1811, "num_input_tokens_seen": 4442264, "step": 49340 }, { "epoch": 12.823544698544698, "grad_norm": 11.946523666381836, "learning_rate": 1.717801261361685e-05, "loss": 0.1735, "num_input_tokens_seen": 4442712, "step": 49345 }, { "epoch": 12.824844074844075, "grad_norm": 0.47102126479148865, "learning_rate": 1.717262787064719e-05, "loss": 0.2394, "num_input_tokens_seen": 4443208, "step": 49350 }, { "epoch": 12.826143451143452, "grad_norm": 4.749190330505371, "learning_rate": 1.7167243530246767e-05, "loss": 0.2621, "num_input_tokens_seen": 4443704, "step": 49355 }, { "epoch": 12.827442827442827, "grad_norm": 1.6803677082061768, "learning_rate": 1.7161859592692518e-05, "loss": 0.4511, "num_input_tokens_seen": 4444120, "step": 49360 }, { "epoch": 12.828742203742204, "grad_norm": 13.595282554626465, "learning_rate": 1.715647605826134e-05, "loss": 0.3586, "num_input_tokens_seen": 4444552, "step": 49365 }, { "epoch": 12.83004158004158, "grad_norm": 7.947813987731934, "learning_rate": 1.715109292723012e-05, "loss": 0.4046, "num_input_tokens_seen": 4445016, "step": 49370 }, { "epoch": 12.831340956340956, "grad_norm": 6.2769083976745605, "learning_rate": 1.71457101998757e-05, "loss": 0.1636, "num_input_tokens_seen": 4445496, "step": 49375 }, { "epoch": 12.832640332640333, "grad_norm": 9.528694152832031, "learning_rate": 1.7140327876474938e-05, "loss": 0.318, "num_input_tokens_seen": 4445944, "step": 49380 }, { "epoch": 12.83393970893971, "grad_norm": 1.3958297967910767, "learning_rate": 1.713494595730465e-05, "loss": 0.2212, "num_input_tokens_seen": 4446424, "step": 49385 }, { "epoch": 12.835239085239085, "grad_norm": 0.13352757692337036, "learning_rate": 1.712956444264161e-05, "loss": 0.1988, "num_input_tokens_seen": 4446872, "step": 49390 }, { "epoch": 12.836538461538462, "grad_norm": 7.395781993865967, "learning_rate": 1.712418333276262e-05, "loss": 0.2532, "num_input_tokens_seen": 4447352, "step": 49395 }, { "epoch": 12.837837837837839, "grad_norm": 1.5325299501419067, "learning_rate": 1.7118802627944428e-05, "loss": 0.1037, "num_input_tokens_seen": 4447816, "step": 49400 }, { "epoch": 12.839137214137214, "grad_norm": 1.3372547626495361, "learning_rate": 1.7113422328463773e-05, "loss": 0.2101, "num_input_tokens_seen": 4448248, "step": 49405 }, { "epoch": 12.84043659043659, "grad_norm": 6.901198863983154, "learning_rate": 1.7108042434597356e-05, "loss": 0.1563, "num_input_tokens_seen": 4448664, "step": 49410 }, { "epoch": 12.841735966735968, "grad_norm": 0.15234997868537903, "learning_rate": 1.7102662946621885e-05, "loss": 0.048, "num_input_tokens_seen": 4449096, "step": 49415 }, { "epoch": 12.843035343035343, "grad_norm": 8.643488883972168, "learning_rate": 1.709728386481402e-05, "loss": 0.3625, "num_input_tokens_seen": 4449576, "step": 49420 }, { "epoch": 12.84433471933472, "grad_norm": 12.12930965423584, "learning_rate": 1.7091905189450423e-05, "loss": 0.7354, "num_input_tokens_seen": 4450056, "step": 49425 }, { "epoch": 12.845634095634095, "grad_norm": 9.472944259643555, "learning_rate": 1.7086526920807712e-05, "loss": 0.6769, "num_input_tokens_seen": 4450536, "step": 49430 }, { "epoch": 12.846933471933472, "grad_norm": 5.772449970245361, "learning_rate": 1.708114905916251e-05, "loss": 0.158, "num_input_tokens_seen": 4450984, "step": 49435 }, { "epoch": 12.848232848232849, "grad_norm": 9.586028099060059, "learning_rate": 1.7075771604791395e-05, "loss": 0.2179, "num_input_tokens_seen": 4451448, "step": 49440 }, { "epoch": 12.849532224532224, "grad_norm": 2.337754726409912, "learning_rate": 1.707039455797095e-05, "loss": 0.2448, "num_input_tokens_seen": 4451912, "step": 49445 }, { "epoch": 12.8508316008316, "grad_norm": 2.7280359268188477, "learning_rate": 1.70650179189777e-05, "loss": 0.1136, "num_input_tokens_seen": 4452424, "step": 49450 }, { "epoch": 12.852130977130978, "grad_norm": 1.9713765382766724, "learning_rate": 1.705964168808818e-05, "loss": 0.2412, "num_input_tokens_seen": 4452872, "step": 49455 }, { "epoch": 12.853430353430353, "grad_norm": 4.439254283905029, "learning_rate": 1.7054265865578904e-05, "loss": 0.1094, "num_input_tokens_seen": 4453336, "step": 49460 }, { "epoch": 12.85472972972973, "grad_norm": 2.32246994972229, "learning_rate": 1.704889045172634e-05, "loss": 0.1487, "num_input_tokens_seen": 4453768, "step": 49465 }, { "epoch": 12.856029106029107, "grad_norm": 9.033575057983398, "learning_rate": 1.7043515446806964e-05, "loss": 0.5882, "num_input_tokens_seen": 4454248, "step": 49470 }, { "epoch": 12.857328482328482, "grad_norm": 7.34734582901001, "learning_rate": 1.7038140851097206e-05, "loss": 0.4091, "num_input_tokens_seen": 4454712, "step": 49475 }, { "epoch": 12.858627858627859, "grad_norm": 5.038184642791748, "learning_rate": 1.7032766664873502e-05, "loss": 0.225, "num_input_tokens_seen": 4455176, "step": 49480 }, { "epoch": 12.859927234927234, "grad_norm": 0.6648630499839783, "learning_rate": 1.7027392888412235e-05, "loss": 0.344, "num_input_tokens_seen": 4455640, "step": 49485 }, { "epoch": 12.861226611226611, "grad_norm": 4.389416217803955, "learning_rate": 1.7022019521989793e-05, "loss": 0.2566, "num_input_tokens_seen": 4456104, "step": 49490 }, { "epoch": 12.862525987525988, "grad_norm": 5.989532470703125, "learning_rate": 1.7016646565882527e-05, "loss": 0.2701, "num_input_tokens_seen": 4456584, "step": 49495 }, { "epoch": 12.863825363825363, "grad_norm": 1.8904441595077515, "learning_rate": 1.701127402036679e-05, "loss": 0.4488, "num_input_tokens_seen": 4457048, "step": 49500 }, { "epoch": 12.86512474012474, "grad_norm": 6.869001388549805, "learning_rate": 1.700590188571887e-05, "loss": 0.2761, "num_input_tokens_seen": 4457512, "step": 49505 }, { "epoch": 12.866424116424117, "grad_norm": 0.8264189958572388, "learning_rate": 1.7000530162215083e-05, "loss": 0.2199, "num_input_tokens_seen": 4457960, "step": 49510 }, { "epoch": 12.867723492723492, "grad_norm": 1.0273158550262451, "learning_rate": 1.6995158850131697e-05, "loss": 0.1824, "num_input_tokens_seen": 4458440, "step": 49515 }, { "epoch": 12.869022869022869, "grad_norm": 3.070939540863037, "learning_rate": 1.6989787949744954e-05, "loss": 0.1684, "num_input_tokens_seen": 4458904, "step": 49520 }, { "epoch": 12.870322245322246, "grad_norm": 16.89935874938965, "learning_rate": 1.6984417461331097e-05, "loss": 0.564, "num_input_tokens_seen": 4459352, "step": 49525 }, { "epoch": 12.871621621621621, "grad_norm": 6.443000793457031, "learning_rate": 1.6979047385166325e-05, "loss": 0.381, "num_input_tokens_seen": 4459800, "step": 49530 }, { "epoch": 12.872920997920998, "grad_norm": 0.8843661546707153, "learning_rate": 1.6973677721526836e-05, "loss": 0.2188, "num_input_tokens_seen": 4460264, "step": 49535 }, { "epoch": 12.874220374220375, "grad_norm": 6.912937164306641, "learning_rate": 1.696830847068879e-05, "loss": 0.3345, "num_input_tokens_seen": 4460712, "step": 49540 }, { "epoch": 12.87551975051975, "grad_norm": 5.055303573608398, "learning_rate": 1.6962939632928334e-05, "loss": 0.4342, "num_input_tokens_seen": 4461144, "step": 49545 }, { "epoch": 12.876819126819127, "grad_norm": 8.037264823913574, "learning_rate": 1.695757120852159e-05, "loss": 0.2669, "num_input_tokens_seen": 4461544, "step": 49550 }, { "epoch": 12.878118503118504, "grad_norm": 9.248993873596191, "learning_rate": 1.6952203197744675e-05, "loss": 0.4085, "num_input_tokens_seen": 4462008, "step": 49555 }, { "epoch": 12.879417879417879, "grad_norm": 5.173120498657227, "learning_rate": 1.694683560087365e-05, "loss": 0.22, "num_input_tokens_seen": 4462488, "step": 49560 }, { "epoch": 12.880717255717256, "grad_norm": 4.542372703552246, "learning_rate": 1.6941468418184597e-05, "loss": 0.1906, "num_input_tokens_seen": 4462968, "step": 49565 }, { "epoch": 12.882016632016633, "grad_norm": 0.7510212063789368, "learning_rate": 1.6936101649953535e-05, "loss": 0.0973, "num_input_tokens_seen": 4463400, "step": 49570 }, { "epoch": 12.883316008316008, "grad_norm": 1.443077564239502, "learning_rate": 1.6930735296456496e-05, "loss": 0.2113, "num_input_tokens_seen": 4463864, "step": 49575 }, { "epoch": 12.884615384615385, "grad_norm": 0.49541202187538147, "learning_rate": 1.6925369357969477e-05, "loss": 0.3103, "num_input_tokens_seen": 4464312, "step": 49580 }, { "epoch": 12.88591476091476, "grad_norm": 5.42878532409668, "learning_rate": 1.6920003834768438e-05, "loss": 0.4844, "num_input_tokens_seen": 4464744, "step": 49585 }, { "epoch": 12.887214137214137, "grad_norm": 0.7389110326766968, "learning_rate": 1.691463872712935e-05, "loss": 0.1304, "num_input_tokens_seen": 4465208, "step": 49590 }, { "epoch": 12.888513513513514, "grad_norm": 7.820306301116943, "learning_rate": 1.6909274035328138e-05, "loss": 0.1395, "num_input_tokens_seen": 4465672, "step": 49595 }, { "epoch": 12.88981288981289, "grad_norm": 5.454453945159912, "learning_rate": 1.6903909759640716e-05, "loss": 0.3382, "num_input_tokens_seen": 4466152, "step": 49600 }, { "epoch": 12.891112266112266, "grad_norm": 1.723259687423706, "learning_rate": 1.6898545900342972e-05, "loss": 0.0523, "num_input_tokens_seen": 4466584, "step": 49605 }, { "epoch": 12.892411642411643, "grad_norm": 0.1845674365758896, "learning_rate": 1.6893182457710774e-05, "loss": 0.1922, "num_input_tokens_seen": 4467032, "step": 49610 }, { "epoch": 12.893711018711018, "grad_norm": 3.8383946418762207, "learning_rate": 1.6887819432019966e-05, "loss": 0.1175, "num_input_tokens_seen": 4467464, "step": 49615 }, { "epoch": 12.895010395010395, "grad_norm": 6.979602336883545, "learning_rate": 1.688245682354639e-05, "loss": 0.2086, "num_input_tokens_seen": 4467976, "step": 49620 }, { "epoch": 12.896309771309772, "grad_norm": 0.272944837808609, "learning_rate": 1.687709463256582e-05, "loss": 0.2886, "num_input_tokens_seen": 4468408, "step": 49625 }, { "epoch": 12.897609147609147, "grad_norm": 1.0430351495742798, "learning_rate": 1.6871732859354065e-05, "loss": 0.4101, "num_input_tokens_seen": 4468888, "step": 49630 }, { "epoch": 12.898908523908524, "grad_norm": 3.846534252166748, "learning_rate": 1.6866371504186874e-05, "loss": 0.3033, "num_input_tokens_seen": 4469304, "step": 49635 }, { "epoch": 12.9002079002079, "grad_norm": 2.941453218460083, "learning_rate": 1.686101056734e-05, "loss": 0.1879, "num_input_tokens_seen": 4469736, "step": 49640 }, { "epoch": 12.901507276507276, "grad_norm": 5.441102027893066, "learning_rate": 1.6855650049089146e-05, "loss": 0.5584, "num_input_tokens_seen": 4470232, "step": 49645 }, { "epoch": 12.902806652806653, "grad_norm": 3.399322748184204, "learning_rate": 1.6850289949710003e-05, "loss": 0.2693, "num_input_tokens_seen": 4470760, "step": 49650 }, { "epoch": 12.904106029106028, "grad_norm": 10.261659622192383, "learning_rate": 1.6844930269478274e-05, "loss": 0.2382, "num_input_tokens_seen": 4471192, "step": 49655 }, { "epoch": 12.905405405405405, "grad_norm": 0.670413076877594, "learning_rate": 1.683957100866958e-05, "loss": 0.1073, "num_input_tokens_seen": 4471656, "step": 49660 }, { "epoch": 12.906704781704782, "grad_norm": 1.4915659427642822, "learning_rate": 1.6834212167559575e-05, "loss": 0.1343, "num_input_tokens_seen": 4472072, "step": 49665 }, { "epoch": 12.908004158004157, "grad_norm": 7.125630855560303, "learning_rate": 1.6828853746423857e-05, "loss": 0.3212, "num_input_tokens_seen": 4472520, "step": 49670 }, { "epoch": 12.909303534303534, "grad_norm": 5.20585298538208, "learning_rate": 1.6823495745538033e-05, "loss": 0.2429, "num_input_tokens_seen": 4473016, "step": 49675 }, { "epoch": 12.910602910602911, "grad_norm": 2.5471901893615723, "learning_rate": 1.681813816517764e-05, "loss": 0.1613, "num_input_tokens_seen": 4473416, "step": 49680 }, { "epoch": 12.911902286902286, "grad_norm": 0.9986160397529602, "learning_rate": 1.6812781005618248e-05, "loss": 0.1594, "num_input_tokens_seen": 4473848, "step": 49685 }, { "epoch": 12.913201663201663, "grad_norm": 9.94287395477295, "learning_rate": 1.6807424267135374e-05, "loss": 0.2402, "num_input_tokens_seen": 4474312, "step": 49690 }, { "epoch": 12.91450103950104, "grad_norm": 3.488182306289673, "learning_rate": 1.6802067950004523e-05, "loss": 0.438, "num_input_tokens_seen": 4474728, "step": 49695 }, { "epoch": 12.915800415800415, "grad_norm": 6.761353015899658, "learning_rate": 1.6796712054501167e-05, "loss": 0.3756, "num_input_tokens_seen": 4475160, "step": 49700 }, { "epoch": 12.917099792099792, "grad_norm": 1.364758014678955, "learning_rate": 1.6791356580900775e-05, "loss": 0.2285, "num_input_tokens_seen": 4475624, "step": 49705 }, { "epoch": 12.91839916839917, "grad_norm": 5.008049011230469, "learning_rate": 1.6786001529478782e-05, "loss": 0.1731, "num_input_tokens_seen": 4476056, "step": 49710 }, { "epoch": 12.919698544698544, "grad_norm": 0.8964444398880005, "learning_rate": 1.678064690051059e-05, "loss": 0.2111, "num_input_tokens_seen": 4476536, "step": 49715 }, { "epoch": 12.920997920997921, "grad_norm": 8.969376564025879, "learning_rate": 1.677529269427161e-05, "loss": 0.3998, "num_input_tokens_seen": 4477000, "step": 49720 }, { "epoch": 12.922297297297296, "grad_norm": 1.8185125589370728, "learning_rate": 1.6769938911037204e-05, "loss": 0.3234, "num_input_tokens_seen": 4477432, "step": 49725 }, { "epoch": 12.923596673596673, "grad_norm": 0.35743558406829834, "learning_rate": 1.676458555108273e-05, "loss": 0.2852, "num_input_tokens_seen": 4477896, "step": 49730 }, { "epoch": 12.92489604989605, "grad_norm": 1.5846081972122192, "learning_rate": 1.6759232614683507e-05, "loss": 0.1275, "num_input_tokens_seen": 4478360, "step": 49735 }, { "epoch": 12.926195426195425, "grad_norm": 0.3118283748626709, "learning_rate": 1.675388010211485e-05, "loss": 0.1294, "num_input_tokens_seen": 4478808, "step": 49740 }, { "epoch": 12.927494802494802, "grad_norm": 5.512463092803955, "learning_rate": 1.674852801365203e-05, "loss": 0.1888, "num_input_tokens_seen": 4479256, "step": 49745 }, { "epoch": 12.92879417879418, "grad_norm": 0.30315715074539185, "learning_rate": 1.674317634957034e-05, "loss": 0.3335, "num_input_tokens_seen": 4479720, "step": 49750 }, { "epoch": 12.930093555093555, "grad_norm": 5.229447841644287, "learning_rate": 1.6737825110144982e-05, "loss": 0.4592, "num_input_tokens_seen": 4480200, "step": 49755 }, { "epoch": 12.931392931392931, "grad_norm": 8.575263977050781, "learning_rate": 1.6732474295651207e-05, "loss": 0.2348, "num_input_tokens_seen": 4480696, "step": 49760 }, { "epoch": 12.932692307692308, "grad_norm": 3.4971039295196533, "learning_rate": 1.6727123906364194e-05, "loss": 0.0993, "num_input_tokens_seen": 4481144, "step": 49765 }, { "epoch": 12.933991683991684, "grad_norm": 3.9319827556610107, "learning_rate": 1.6721773942559137e-05, "loss": 0.2342, "num_input_tokens_seen": 4481608, "step": 49770 }, { "epoch": 12.93529106029106, "grad_norm": 0.4017549753189087, "learning_rate": 1.671642440451117e-05, "loss": 0.1243, "num_input_tokens_seen": 4482024, "step": 49775 }, { "epoch": 12.936590436590437, "grad_norm": 4.346681118011475, "learning_rate": 1.6711075292495427e-05, "loss": 0.2943, "num_input_tokens_seen": 4482456, "step": 49780 }, { "epoch": 12.937889812889813, "grad_norm": 8.579809188842773, "learning_rate": 1.6705726606787037e-05, "loss": 0.1313, "num_input_tokens_seen": 4482904, "step": 49785 }, { "epoch": 12.93918918918919, "grad_norm": 2.2787859439849854, "learning_rate": 1.670037834766106e-05, "loss": 0.1514, "num_input_tokens_seen": 4483320, "step": 49790 }, { "epoch": 12.940488565488565, "grad_norm": 11.449796676635742, "learning_rate": 1.6695030515392586e-05, "loss": 0.2288, "num_input_tokens_seen": 4483800, "step": 49795 }, { "epoch": 12.941787941787942, "grad_norm": 0.3915660083293915, "learning_rate": 1.6689683110256646e-05, "loss": 0.2971, "num_input_tokens_seen": 4484280, "step": 49800 }, { "epoch": 12.943087318087318, "grad_norm": 10.2799654006958, "learning_rate": 1.6684336132528273e-05, "loss": 0.262, "num_input_tokens_seen": 4484728, "step": 49805 }, { "epoch": 12.944386694386694, "grad_norm": 0.28290459513664246, "learning_rate": 1.6678989582482448e-05, "loss": 0.2803, "num_input_tokens_seen": 4485160, "step": 49810 }, { "epoch": 12.94568607068607, "grad_norm": 8.149762153625488, "learning_rate": 1.6673643460394174e-05, "loss": 0.2428, "num_input_tokens_seen": 4485592, "step": 49815 }, { "epoch": 12.946985446985448, "grad_norm": 8.935312271118164, "learning_rate": 1.666829776653839e-05, "loss": 0.3753, "num_input_tokens_seen": 4486040, "step": 49820 }, { "epoch": 12.948284823284823, "grad_norm": 0.5459998250007629, "learning_rate": 1.6662952501190033e-05, "loss": 0.0888, "num_input_tokens_seen": 4486520, "step": 49825 }, { "epoch": 12.9495841995842, "grad_norm": 0.20554114878177643, "learning_rate": 1.6657607664624013e-05, "loss": 0.0509, "num_input_tokens_seen": 4486936, "step": 49830 }, { "epoch": 12.950883575883577, "grad_norm": 0.5172227621078491, "learning_rate": 1.6652263257115237e-05, "loss": 0.2373, "num_input_tokens_seen": 4487400, "step": 49835 }, { "epoch": 12.952182952182952, "grad_norm": 6.507544040679932, "learning_rate": 1.6646919278938555e-05, "loss": 0.199, "num_input_tokens_seen": 4487848, "step": 49840 }, { "epoch": 12.953482328482329, "grad_norm": 6.0079450607299805, "learning_rate": 1.664157573036881e-05, "loss": 0.3573, "num_input_tokens_seen": 4488312, "step": 49845 }, { "epoch": 12.954781704781706, "grad_norm": 2.5054831504821777, "learning_rate": 1.6636232611680848e-05, "loss": 0.1743, "num_input_tokens_seen": 4488776, "step": 49850 }, { "epoch": 12.95608108108108, "grad_norm": 0.9052993059158325, "learning_rate": 1.6630889923149446e-05, "loss": 0.1647, "num_input_tokens_seen": 4489208, "step": 49855 }, { "epoch": 12.957380457380458, "grad_norm": 7.33677339553833, "learning_rate": 1.6625547665049396e-05, "loss": 0.392, "num_input_tokens_seen": 4489624, "step": 49860 }, { "epoch": 12.958679833679835, "grad_norm": 8.072644233703613, "learning_rate": 1.662020583765545e-05, "loss": 0.4296, "num_input_tokens_seen": 4490088, "step": 49865 }, { "epoch": 12.95997920997921, "grad_norm": 9.415263175964355, "learning_rate": 1.6614864441242356e-05, "loss": 0.4787, "num_input_tokens_seen": 4490504, "step": 49870 }, { "epoch": 12.961278586278587, "grad_norm": 3.227524995803833, "learning_rate": 1.660952347608481e-05, "loss": 0.151, "num_input_tokens_seen": 4491000, "step": 49875 }, { "epoch": 12.962577962577962, "grad_norm": 9.99165153503418, "learning_rate": 1.6604182942457518e-05, "loss": 0.5963, "num_input_tokens_seen": 4491448, "step": 49880 }, { "epoch": 12.963877338877339, "grad_norm": 6.956396102905273, "learning_rate": 1.659884284063513e-05, "loss": 0.1344, "num_input_tokens_seen": 4491848, "step": 49885 }, { "epoch": 12.965176715176716, "grad_norm": 0.37333202362060547, "learning_rate": 1.659350317089232e-05, "loss": 0.1275, "num_input_tokens_seen": 4492248, "step": 49890 }, { "epoch": 12.96647609147609, "grad_norm": 0.7901602983474731, "learning_rate": 1.6588163933503685e-05, "loss": 0.2726, "num_input_tokens_seen": 4492664, "step": 49895 }, { "epoch": 12.967775467775468, "grad_norm": 1.096535086631775, "learning_rate": 1.6582825128743846e-05, "loss": 0.2278, "num_input_tokens_seen": 4493144, "step": 49900 }, { "epoch": 12.969074844074845, "grad_norm": 0.21602913737297058, "learning_rate": 1.6577486756887374e-05, "loss": 0.3431, "num_input_tokens_seen": 4493640, "step": 49905 }, { "epoch": 12.97037422037422, "grad_norm": 4.3491597175598145, "learning_rate": 1.6572148818208824e-05, "loss": 0.4234, "num_input_tokens_seen": 4494088, "step": 49910 }, { "epoch": 12.971673596673597, "grad_norm": 3.174525260925293, "learning_rate": 1.656681131298274e-05, "loss": 0.2439, "num_input_tokens_seen": 4494536, "step": 49915 }, { "epoch": 12.972972972972974, "grad_norm": 8.660642623901367, "learning_rate": 1.6561474241483624e-05, "loss": 0.1014, "num_input_tokens_seen": 4495016, "step": 49920 }, { "epoch": 12.974272349272349, "grad_norm": 4.029695510864258, "learning_rate": 1.6556137603985984e-05, "loss": 0.2218, "num_input_tokens_seen": 4495448, "step": 49925 }, { "epoch": 12.975571725571726, "grad_norm": 0.31516149640083313, "learning_rate": 1.6550801400764264e-05, "loss": 0.1343, "num_input_tokens_seen": 4495944, "step": 49930 }, { "epoch": 12.976871101871101, "grad_norm": 9.750588417053223, "learning_rate": 1.6545465632092933e-05, "loss": 0.1629, "num_input_tokens_seen": 4496424, "step": 49935 }, { "epoch": 12.978170478170478, "grad_norm": 0.06254038214683533, "learning_rate": 1.65401302982464e-05, "loss": 0.3848, "num_input_tokens_seen": 4496840, "step": 49940 }, { "epoch": 12.979469854469855, "grad_norm": 9.230995178222656, "learning_rate": 1.653479539949908e-05, "loss": 0.3836, "num_input_tokens_seen": 4497304, "step": 49945 }, { "epoch": 12.98076923076923, "grad_norm": 1.0109527111053467, "learning_rate": 1.6529460936125335e-05, "loss": 0.3188, "num_input_tokens_seen": 4497736, "step": 49950 }, { "epoch": 12.982068607068607, "grad_norm": 10.362585067749023, "learning_rate": 1.6524126908399533e-05, "loss": 0.3967, "num_input_tokens_seen": 4498200, "step": 49955 }, { "epoch": 12.983367983367984, "grad_norm": 0.25377923250198364, "learning_rate": 1.6518793316596002e-05, "loss": 0.1885, "num_input_tokens_seen": 4498664, "step": 49960 }, { "epoch": 12.984667359667359, "grad_norm": 6.0022501945495605, "learning_rate": 1.6513460160989065e-05, "loss": 0.1171, "num_input_tokens_seen": 4499144, "step": 49965 }, { "epoch": 12.985966735966736, "grad_norm": 9.860214233398438, "learning_rate": 1.6508127441852998e-05, "loss": 0.1797, "num_input_tokens_seen": 4499576, "step": 49970 }, { "epoch": 12.987266112266113, "grad_norm": 6.702459335327148, "learning_rate": 1.6502795159462074e-05, "loss": 0.2773, "num_input_tokens_seen": 4500024, "step": 49975 }, { "epoch": 12.988565488565488, "grad_norm": 10.352200508117676, "learning_rate": 1.649746331409054e-05, "loss": 0.4163, "num_input_tokens_seen": 4500488, "step": 49980 }, { "epoch": 12.989864864864865, "grad_norm": 1.3161778450012207, "learning_rate": 1.649213190601261e-05, "loss": 0.0843, "num_input_tokens_seen": 4500952, "step": 49985 }, { "epoch": 12.991164241164242, "grad_norm": 0.11718102544546127, "learning_rate": 1.648680093550249e-05, "loss": 0.0282, "num_input_tokens_seen": 4501416, "step": 49990 }, { "epoch": 12.992463617463617, "grad_norm": 4.332514762878418, "learning_rate": 1.6481470402834347e-05, "loss": 0.0776, "num_input_tokens_seen": 4501896, "step": 49995 }, { "epoch": 12.993762993762994, "grad_norm": 7.967604160308838, "learning_rate": 1.6476140308282357e-05, "loss": 0.3479, "num_input_tokens_seen": 4502392, "step": 50000 }, { "epoch": 12.99506237006237, "grad_norm": 8.258179664611816, "learning_rate": 1.6470810652120626e-05, "loss": 0.1368, "num_input_tokens_seen": 4502824, "step": 50005 }, { "epoch": 12.996361746361746, "grad_norm": 0.13351427018642426, "learning_rate": 1.6465481434623283e-05, "loss": 0.4048, "num_input_tokens_seen": 4503272, "step": 50010 }, { "epoch": 12.997661122661123, "grad_norm": 7.332115650177002, "learning_rate": 1.64601526560644e-05, "loss": 0.3055, "num_input_tokens_seen": 4503704, "step": 50015 }, { "epoch": 12.9989604989605, "grad_norm": 2.965646743774414, "learning_rate": 1.645482431671806e-05, "loss": 0.1104, "num_input_tokens_seen": 4504136, "step": 50020 }, { "epoch": 13.0, "eval_loss": 0.32643255591392517, "eval_runtime": 13.1561, "eval_samples_per_second": 65.065, "eval_steps_per_second": 32.532, "num_input_tokens_seen": 4504416, "step": 50024 }, { "epoch": 13.000259875259875, "grad_norm": 0.31236299872398376, "learning_rate": 1.6449496416858284e-05, "loss": 0.0781, "num_input_tokens_seen": 4504512, "step": 50025 }, { "epoch": 13.001559251559252, "grad_norm": 0.9216028451919556, "learning_rate": 1.6444168956759103e-05, "loss": 0.1596, "num_input_tokens_seen": 4504992, "step": 50030 }, { "epoch": 13.002858627858627, "grad_norm": 7.828392028808594, "learning_rate": 1.6438841936694517e-05, "loss": 0.3072, "num_input_tokens_seen": 4505440, "step": 50035 }, { "epoch": 13.004158004158004, "grad_norm": 0.39203307032585144, "learning_rate": 1.6433515356938477e-05, "loss": 0.1223, "num_input_tokens_seen": 4505920, "step": 50040 }, { "epoch": 13.005457380457381, "grad_norm": 1.9718152284622192, "learning_rate": 1.642818921776496e-05, "loss": 0.1815, "num_input_tokens_seen": 4506368, "step": 50045 }, { "epoch": 13.006756756756756, "grad_norm": 7.654115676879883, "learning_rate": 1.642286351944788e-05, "loss": 0.4132, "num_input_tokens_seen": 4506784, "step": 50050 }, { "epoch": 13.008056133056133, "grad_norm": 0.17432178556919098, "learning_rate": 1.641753826226116e-05, "loss": 0.1602, "num_input_tokens_seen": 4507216, "step": 50055 }, { "epoch": 13.00935550935551, "grad_norm": 0.4378105103969574, "learning_rate": 1.6412213446478653e-05, "loss": 0.2854, "num_input_tokens_seen": 4507680, "step": 50060 }, { "epoch": 13.010654885654885, "grad_norm": 3.0394535064697266, "learning_rate": 1.640688907237425e-05, "loss": 0.2979, "num_input_tokens_seen": 4508144, "step": 50065 }, { "epoch": 13.011954261954262, "grad_norm": 3.4205572605133057, "learning_rate": 1.6401565140221768e-05, "loss": 0.1113, "num_input_tokens_seen": 4508608, "step": 50070 }, { "epoch": 13.013253638253639, "grad_norm": 8.71650218963623, "learning_rate": 1.639624165029503e-05, "loss": 0.1639, "num_input_tokens_seen": 4509040, "step": 50075 }, { "epoch": 13.014553014553014, "grad_norm": 8.280720710754395, "learning_rate": 1.6390918602867828e-05, "loss": 0.2391, "num_input_tokens_seen": 4509456, "step": 50080 }, { "epoch": 13.015852390852391, "grad_norm": 2.0143191814422607, "learning_rate": 1.6385595998213936e-05, "loss": 0.3673, "num_input_tokens_seen": 4509872, "step": 50085 }, { "epoch": 13.017151767151768, "grad_norm": 3.533086061477661, "learning_rate": 1.6380273836607092e-05, "loss": 0.3338, "num_input_tokens_seen": 4510320, "step": 50090 }, { "epoch": 13.018451143451143, "grad_norm": 1.6867204904556274, "learning_rate": 1.6374952118321023e-05, "loss": 0.0796, "num_input_tokens_seen": 4510768, "step": 50095 }, { "epoch": 13.01975051975052, "grad_norm": 0.37459808588027954, "learning_rate": 1.6369630843629442e-05, "loss": 0.2352, "num_input_tokens_seen": 4511216, "step": 50100 }, { "epoch": 13.021049896049895, "grad_norm": 9.434207916259766, "learning_rate": 1.6364310012806e-05, "loss": 0.2014, "num_input_tokens_seen": 4511696, "step": 50105 }, { "epoch": 13.022349272349272, "grad_norm": 7.606536865234375, "learning_rate": 1.6358989626124376e-05, "loss": 0.3567, "num_input_tokens_seen": 4512144, "step": 50110 }, { "epoch": 13.02364864864865, "grad_norm": 0.6547182202339172, "learning_rate": 1.635366968385819e-05, "loss": 0.2822, "num_input_tokens_seen": 4512592, "step": 50115 }, { "epoch": 13.024948024948024, "grad_norm": 0.32281801104545593, "learning_rate": 1.6348350186281065e-05, "loss": 0.0854, "num_input_tokens_seen": 4513008, "step": 50120 }, { "epoch": 13.026247401247401, "grad_norm": 2.9872586727142334, "learning_rate": 1.634303113366657e-05, "loss": 0.3008, "num_input_tokens_seen": 4513472, "step": 50125 }, { "epoch": 13.027546777546778, "grad_norm": 13.053401947021484, "learning_rate": 1.6337712526288286e-05, "loss": 0.1566, "num_input_tokens_seen": 4513904, "step": 50130 }, { "epoch": 13.028846153846153, "grad_norm": 0.012797528877854347, "learning_rate": 1.6332394364419737e-05, "loss": 0.0306, "num_input_tokens_seen": 4514368, "step": 50135 }, { "epoch": 13.03014553014553, "grad_norm": 0.4365704357624054, "learning_rate": 1.6327076648334464e-05, "loss": 0.1023, "num_input_tokens_seen": 4514800, "step": 50140 }, { "epoch": 13.031444906444907, "grad_norm": 12.3575439453125, "learning_rate": 1.632175937830594e-05, "loss": 0.8298, "num_input_tokens_seen": 4515248, "step": 50145 }, { "epoch": 13.032744282744282, "grad_norm": 2.027107000350952, "learning_rate": 1.6316442554607646e-05, "loss": 0.2816, "num_input_tokens_seen": 4515712, "step": 50150 }, { "epoch": 13.03404365904366, "grad_norm": 2.4887876510620117, "learning_rate": 1.6311126177513032e-05, "loss": 0.0586, "num_input_tokens_seen": 4516144, "step": 50155 }, { "epoch": 13.035343035343036, "grad_norm": 0.6079332232475281, "learning_rate": 1.6305810247295534e-05, "loss": 0.1344, "num_input_tokens_seen": 4516560, "step": 50160 }, { "epoch": 13.036642411642411, "grad_norm": 10.001148223876953, "learning_rate": 1.6300494764228536e-05, "loss": 0.3909, "num_input_tokens_seen": 4516976, "step": 50165 }, { "epoch": 13.037941787941788, "grad_norm": 0.9265061616897583, "learning_rate": 1.6295179728585424e-05, "loss": 0.1804, "num_input_tokens_seen": 4517440, "step": 50170 }, { "epoch": 13.039241164241163, "grad_norm": 8.501337051391602, "learning_rate": 1.628986514063957e-05, "loss": 0.4007, "num_input_tokens_seen": 4517936, "step": 50175 }, { "epoch": 13.04054054054054, "grad_norm": 2.0847349166870117, "learning_rate": 1.628455100066429e-05, "loss": 0.1496, "num_input_tokens_seen": 4518368, "step": 50180 }, { "epoch": 13.041839916839917, "grad_norm": 0.2344919741153717, "learning_rate": 1.6279237308932905e-05, "loss": 0.2177, "num_input_tokens_seen": 4518816, "step": 50185 }, { "epoch": 13.043139293139292, "grad_norm": 7.196371555328369, "learning_rate": 1.6273924065718697e-05, "loss": 0.1764, "num_input_tokens_seen": 4519264, "step": 50190 }, { "epoch": 13.04443866943867, "grad_norm": 0.8557904958724976, "learning_rate": 1.6268611271294945e-05, "loss": 0.1719, "num_input_tokens_seen": 4519728, "step": 50195 }, { "epoch": 13.045738045738046, "grad_norm": 5.778846263885498, "learning_rate": 1.626329892593487e-05, "loss": 0.4136, "num_input_tokens_seen": 4520176, "step": 50200 }, { "epoch": 13.047037422037421, "grad_norm": 6.623461723327637, "learning_rate": 1.625798702991171e-05, "loss": 0.2877, "num_input_tokens_seen": 4520608, "step": 50205 }, { "epoch": 13.048336798336798, "grad_norm": 6.559925079345703, "learning_rate": 1.6252675583498645e-05, "loss": 0.1497, "num_input_tokens_seen": 4521040, "step": 50210 }, { "epoch": 13.049636174636175, "grad_norm": 1.3298671245574951, "learning_rate": 1.624736458696887e-05, "loss": 0.1489, "num_input_tokens_seen": 4521504, "step": 50215 }, { "epoch": 13.05093555093555, "grad_norm": 0.9977386593818665, "learning_rate": 1.6242054040595507e-05, "loss": 0.0466, "num_input_tokens_seen": 4521952, "step": 50220 }, { "epoch": 13.052234927234927, "grad_norm": 3.8866615295410156, "learning_rate": 1.6236743944651703e-05, "loss": 0.327, "num_input_tokens_seen": 4522384, "step": 50225 }, { "epoch": 13.053534303534304, "grad_norm": 0.18837526440620422, "learning_rate": 1.623143429941056e-05, "loss": 0.1233, "num_input_tokens_seen": 4522832, "step": 50230 }, { "epoch": 13.05483367983368, "grad_norm": 5.6459574699401855, "learning_rate": 1.622612510514514e-05, "loss": 0.3149, "num_input_tokens_seen": 4523264, "step": 50235 }, { "epoch": 13.056133056133056, "grad_norm": 1.7541546821594238, "learning_rate": 1.6220816362128522e-05, "loss": 0.5105, "num_input_tokens_seen": 4523728, "step": 50240 }, { "epoch": 13.057432432432432, "grad_norm": 10.335630416870117, "learning_rate": 1.621550807063372e-05, "loss": 0.1988, "num_input_tokens_seen": 4524160, "step": 50245 }, { "epoch": 13.058731808731808, "grad_norm": 1.7221622467041016, "learning_rate": 1.6210200230933768e-05, "loss": 0.1976, "num_input_tokens_seen": 4524656, "step": 50250 }, { "epoch": 13.060031185031185, "grad_norm": 2.210357427597046, "learning_rate": 1.6204892843301628e-05, "loss": 0.0957, "num_input_tokens_seen": 4525120, "step": 50255 }, { "epoch": 13.06133056133056, "grad_norm": 0.11291876435279846, "learning_rate": 1.6199585908010286e-05, "loss": 0.1285, "num_input_tokens_seen": 4525584, "step": 50260 }, { "epoch": 13.062629937629938, "grad_norm": 0.02972283773124218, "learning_rate": 1.6194279425332662e-05, "loss": 0.0145, "num_input_tokens_seen": 4526016, "step": 50265 }, { "epoch": 13.063929313929314, "grad_norm": 10.951330184936523, "learning_rate": 1.6188973395541696e-05, "loss": 0.192, "num_input_tokens_seen": 4526464, "step": 50270 }, { "epoch": 13.06522869022869, "grad_norm": 0.6829896569252014, "learning_rate": 1.6183667818910263e-05, "loss": 0.0566, "num_input_tokens_seen": 4526912, "step": 50275 }, { "epoch": 13.066528066528067, "grad_norm": 0.05586879327893257, "learning_rate": 1.6178362695711243e-05, "loss": 0.2305, "num_input_tokens_seen": 4527360, "step": 50280 }, { "epoch": 13.067827442827443, "grad_norm": 0.5814473628997803, "learning_rate": 1.617305802621748e-05, "loss": 0.056, "num_input_tokens_seen": 4527808, "step": 50285 }, { "epoch": 13.069126819126819, "grad_norm": 13.158428192138672, "learning_rate": 1.6167753810701806e-05, "loss": 0.3611, "num_input_tokens_seen": 4528288, "step": 50290 }, { "epoch": 13.070426195426196, "grad_norm": 8.56193733215332, "learning_rate": 1.6162450049437017e-05, "loss": 0.4196, "num_input_tokens_seen": 4528736, "step": 50295 }, { "epoch": 13.071725571725572, "grad_norm": 1.590022087097168, "learning_rate": 1.6157146742695884e-05, "loss": 0.1952, "num_input_tokens_seen": 4529216, "step": 50300 }, { "epoch": 13.073024948024948, "grad_norm": 0.39942964911460876, "learning_rate": 1.615184389075117e-05, "loss": 0.2346, "num_input_tokens_seen": 4529680, "step": 50305 }, { "epoch": 13.074324324324325, "grad_norm": 0.42034295201301575, "learning_rate": 1.6146541493875595e-05, "loss": 0.4459, "num_input_tokens_seen": 4530112, "step": 50310 }, { "epoch": 13.075623700623701, "grad_norm": 7.536062240600586, "learning_rate": 1.6141239552341885e-05, "loss": 0.261, "num_input_tokens_seen": 4530544, "step": 50315 }, { "epoch": 13.076923076923077, "grad_norm": 11.563011169433594, "learning_rate": 1.6135938066422707e-05, "loss": 0.3945, "num_input_tokens_seen": 4531072, "step": 50320 }, { "epoch": 13.078222453222454, "grad_norm": 11.728708267211914, "learning_rate": 1.6130637036390726e-05, "loss": 0.2433, "num_input_tokens_seen": 4531504, "step": 50325 }, { "epoch": 13.079521829521829, "grad_norm": 2.372699499130249, "learning_rate": 1.612533646251858e-05, "loss": 0.2903, "num_input_tokens_seen": 4531936, "step": 50330 }, { "epoch": 13.080821205821206, "grad_norm": 1.318151593208313, "learning_rate": 1.6120036345078895e-05, "loss": 0.0298, "num_input_tokens_seen": 4532416, "step": 50335 }, { "epoch": 13.082120582120583, "grad_norm": 7.91462516784668, "learning_rate": 1.6114736684344236e-05, "loss": 0.1793, "num_input_tokens_seen": 4532880, "step": 50340 }, { "epoch": 13.083419958419958, "grad_norm": 0.15706053376197815, "learning_rate": 1.610943748058719e-05, "loss": 0.0388, "num_input_tokens_seen": 4533328, "step": 50345 }, { "epoch": 13.084719334719335, "grad_norm": 0.8566750884056091, "learning_rate": 1.6104138734080285e-05, "loss": 0.1137, "num_input_tokens_seen": 4533808, "step": 50350 }, { "epoch": 13.086018711018712, "grad_norm": 6.567866802215576, "learning_rate": 1.609884044509606e-05, "loss": 0.3012, "num_input_tokens_seen": 4534272, "step": 50355 }, { "epoch": 13.087318087318087, "grad_norm": 2.3857932090759277, "learning_rate": 1.6093542613906992e-05, "loss": 0.1294, "num_input_tokens_seen": 4534704, "step": 50360 }, { "epoch": 13.088617463617464, "grad_norm": 14.904794692993164, "learning_rate": 1.608824524078556e-05, "loss": 0.1819, "num_input_tokens_seen": 4535152, "step": 50365 }, { "epoch": 13.08991683991684, "grad_norm": 1.7235223054885864, "learning_rate": 1.608294832600422e-05, "loss": 0.2073, "num_input_tokens_seen": 4535632, "step": 50370 }, { "epoch": 13.091216216216216, "grad_norm": 24.684099197387695, "learning_rate": 1.6077651869835385e-05, "loss": 0.2911, "num_input_tokens_seen": 4536080, "step": 50375 }, { "epoch": 13.092515592515593, "grad_norm": 0.6740115880966187, "learning_rate": 1.6072355872551467e-05, "loss": 0.1354, "num_input_tokens_seen": 4536544, "step": 50380 }, { "epoch": 13.09381496881497, "grad_norm": 9.43239688873291, "learning_rate": 1.6067060334424835e-05, "loss": 0.412, "num_input_tokens_seen": 4536944, "step": 50385 }, { "epoch": 13.095114345114345, "grad_norm": 0.7794360518455505, "learning_rate": 1.6061765255727863e-05, "loss": 0.2279, "num_input_tokens_seen": 4537408, "step": 50390 }, { "epoch": 13.096413721413722, "grad_norm": 0.2856278121471405, "learning_rate": 1.605647063673285e-05, "loss": 0.1949, "num_input_tokens_seen": 4537856, "step": 50395 }, { "epoch": 13.097713097713097, "grad_norm": 17.029769897460938, "learning_rate": 1.6051176477712136e-05, "loss": 0.4186, "num_input_tokens_seen": 4538288, "step": 50400 }, { "epoch": 13.099012474012474, "grad_norm": 0.17813187837600708, "learning_rate": 1.604588277893798e-05, "loss": 0.0919, "num_input_tokens_seen": 4538720, "step": 50405 }, { "epoch": 13.10031185031185, "grad_norm": 3.316356897354126, "learning_rate": 1.604058954068266e-05, "loss": 0.0909, "num_input_tokens_seen": 4539168, "step": 50410 }, { "epoch": 13.101611226611226, "grad_norm": 0.07563039660453796, "learning_rate": 1.6035296763218398e-05, "loss": 0.1388, "num_input_tokens_seen": 4539616, "step": 50415 }, { "epoch": 13.102910602910603, "grad_norm": 31.242895126342773, "learning_rate": 1.6030004446817417e-05, "loss": 0.396, "num_input_tokens_seen": 4540080, "step": 50420 }, { "epoch": 13.10420997920998, "grad_norm": 1.5622968673706055, "learning_rate": 1.6024712591751907e-05, "loss": 0.2687, "num_input_tokens_seen": 4540528, "step": 50425 }, { "epoch": 13.105509355509355, "grad_norm": 0.18992143869400024, "learning_rate": 1.601942119829402e-05, "loss": 0.0785, "num_input_tokens_seen": 4540976, "step": 50430 }, { "epoch": 13.106808731808732, "grad_norm": 2.2094027996063232, "learning_rate": 1.6014130266715908e-05, "loss": 0.1894, "num_input_tokens_seen": 4541440, "step": 50435 }, { "epoch": 13.108108108108109, "grad_norm": 16.560741424560547, "learning_rate": 1.6008839797289682e-05, "loss": 0.4979, "num_input_tokens_seen": 4541872, "step": 50440 }, { "epoch": 13.109407484407484, "grad_norm": 5.605659008026123, "learning_rate": 1.6003549790287446e-05, "loss": 0.037, "num_input_tokens_seen": 4542368, "step": 50445 }, { "epoch": 13.11070686070686, "grad_norm": 10.896997451782227, "learning_rate": 1.599826024598126e-05, "loss": 0.0501, "num_input_tokens_seen": 4542816, "step": 50450 }, { "epoch": 13.112006237006238, "grad_norm": 0.2668958306312561, "learning_rate": 1.599297116464318e-05, "loss": 0.2267, "num_input_tokens_seen": 4543264, "step": 50455 }, { "epoch": 13.113305613305613, "grad_norm": 17.083740234375, "learning_rate": 1.5987682546545214e-05, "loss": 0.29, "num_input_tokens_seen": 4543712, "step": 50460 }, { "epoch": 13.11460498960499, "grad_norm": 14.12203598022461, "learning_rate": 1.5982394391959382e-05, "loss": 0.2061, "num_input_tokens_seen": 4544192, "step": 50465 }, { "epoch": 13.115904365904365, "grad_norm": 6.304064750671387, "learning_rate": 1.5977106701157634e-05, "loss": 0.521, "num_input_tokens_seen": 4544624, "step": 50470 }, { "epoch": 13.117203742203742, "grad_norm": 0.37966546416282654, "learning_rate": 1.5971819474411943e-05, "loss": 0.1504, "num_input_tokens_seen": 4545072, "step": 50475 }, { "epoch": 13.118503118503119, "grad_norm": 6.376133441925049, "learning_rate": 1.596653271199422e-05, "loss": 0.2518, "num_input_tokens_seen": 4545520, "step": 50480 }, { "epoch": 13.119802494802494, "grad_norm": 0.7709886431694031, "learning_rate": 1.5961246414176385e-05, "loss": 0.1023, "num_input_tokens_seen": 4545968, "step": 50485 }, { "epoch": 13.121101871101871, "grad_norm": 10.067302703857422, "learning_rate": 1.5955960581230297e-05, "loss": 0.3901, "num_input_tokens_seen": 4546432, "step": 50490 }, { "epoch": 13.122401247401248, "grad_norm": 0.4467734694480896, "learning_rate": 1.5950675213427828e-05, "loss": 0.2744, "num_input_tokens_seen": 4546880, "step": 50495 }, { "epoch": 13.123700623700623, "grad_norm": 2.7159252166748047, "learning_rate": 1.594539031104081e-05, "loss": 0.1076, "num_input_tokens_seen": 4547376, "step": 50500 }, { "epoch": 13.125, "grad_norm": 1.2814366817474365, "learning_rate": 1.594010587434103e-05, "loss": 0.0325, "num_input_tokens_seen": 4547840, "step": 50505 }, { "epoch": 13.126299376299377, "grad_norm": 0.021856123581528664, "learning_rate": 1.5934821903600294e-05, "loss": 0.4014, "num_input_tokens_seen": 4548304, "step": 50510 }, { "epoch": 13.127598752598752, "grad_norm": 8.976420402526855, "learning_rate": 1.5929538399090345e-05, "loss": 0.2681, "num_input_tokens_seen": 4548752, "step": 50515 }, { "epoch": 13.128898128898129, "grad_norm": 0.12095943838357925, "learning_rate": 1.5924255361082942e-05, "loss": 0.2412, "num_input_tokens_seen": 4549184, "step": 50520 }, { "epoch": 13.130197505197506, "grad_norm": 1.5067790746688843, "learning_rate": 1.5918972789849764e-05, "loss": 0.2653, "num_input_tokens_seen": 4549664, "step": 50525 }, { "epoch": 13.131496881496881, "grad_norm": 6.799607753753662, "learning_rate": 1.5913690685662528e-05, "loss": 0.3321, "num_input_tokens_seen": 4550112, "step": 50530 }, { "epoch": 13.132796257796258, "grad_norm": 1.2109065055847168, "learning_rate": 1.590840904879288e-05, "loss": 0.2166, "num_input_tokens_seen": 4550544, "step": 50535 }, { "epoch": 13.134095634095635, "grad_norm": 9.154622077941895, "learning_rate": 1.5903127879512474e-05, "loss": 0.4896, "num_input_tokens_seen": 4550992, "step": 50540 }, { "epoch": 13.13539501039501, "grad_norm": 6.5427937507629395, "learning_rate": 1.58978471780929e-05, "loss": 0.1256, "num_input_tokens_seen": 4551424, "step": 50545 }, { "epoch": 13.136694386694387, "grad_norm": 10.093605995178223, "learning_rate": 1.5892566944805785e-05, "loss": 0.1147, "num_input_tokens_seen": 4551856, "step": 50550 }, { "epoch": 13.137993762993762, "grad_norm": 9.232498168945312, "learning_rate": 1.5887287179922674e-05, "loss": 0.5835, "num_input_tokens_seen": 4552288, "step": 50555 }, { "epoch": 13.13929313929314, "grad_norm": 0.43328455090522766, "learning_rate": 1.5882007883715112e-05, "loss": 0.019, "num_input_tokens_seen": 4552720, "step": 50560 }, { "epoch": 13.140592515592516, "grad_norm": 8.582321166992188, "learning_rate": 1.5876729056454627e-05, "loss": 0.4875, "num_input_tokens_seen": 4553200, "step": 50565 }, { "epoch": 13.141891891891891, "grad_norm": 0.1477499008178711, "learning_rate": 1.5871450698412697e-05, "loss": 0.1753, "num_input_tokens_seen": 4553632, "step": 50570 }, { "epoch": 13.143191268191268, "grad_norm": 0.442026823759079, "learning_rate": 1.5866172809860812e-05, "loss": 0.2257, "num_input_tokens_seen": 4554096, "step": 50575 }, { "epoch": 13.144490644490645, "grad_norm": 1.227684736251831, "learning_rate": 1.5860895391070408e-05, "loss": 0.0444, "num_input_tokens_seen": 4554608, "step": 50580 }, { "epoch": 13.14579002079002, "grad_norm": 1.0268679857254028, "learning_rate": 1.5855618442312918e-05, "loss": 0.1844, "num_input_tokens_seen": 4555088, "step": 50585 }, { "epoch": 13.147089397089397, "grad_norm": 1.8040636777877808, "learning_rate": 1.585034196385972e-05, "loss": 0.0928, "num_input_tokens_seen": 4555536, "step": 50590 }, { "epoch": 13.148388773388774, "grad_norm": 10.8704195022583, "learning_rate": 1.5845065955982214e-05, "loss": 0.3974, "num_input_tokens_seen": 4555936, "step": 50595 }, { "epoch": 13.14968814968815, "grad_norm": 7.780367374420166, "learning_rate": 1.5839790418951728e-05, "loss": 0.0851, "num_input_tokens_seen": 4556400, "step": 50600 }, { "epoch": 13.150987525987526, "grad_norm": 5.564435958862305, "learning_rate": 1.583451535303961e-05, "loss": 0.3172, "num_input_tokens_seen": 4556848, "step": 50605 }, { "epoch": 13.152286902286903, "grad_norm": 0.6205477714538574, "learning_rate": 1.5829240758517134e-05, "loss": 0.162, "num_input_tokens_seen": 4557280, "step": 50610 }, { "epoch": 13.153586278586278, "grad_norm": 0.3988841474056244, "learning_rate": 1.58239666356556e-05, "loss": 0.3747, "num_input_tokens_seen": 4557712, "step": 50615 }, { "epoch": 13.154885654885655, "grad_norm": 0.06609806418418884, "learning_rate": 1.5818692984726253e-05, "loss": 0.0611, "num_input_tokens_seen": 4558192, "step": 50620 }, { "epoch": 13.15618503118503, "grad_norm": 2.6547224521636963, "learning_rate": 1.581341980600033e-05, "loss": 0.1301, "num_input_tokens_seen": 4558640, "step": 50625 }, { "epoch": 13.157484407484407, "grad_norm": 1.5909918546676636, "learning_rate": 1.5808147099749023e-05, "loss": 0.4029, "num_input_tokens_seen": 4559056, "step": 50630 }, { "epoch": 13.158783783783784, "grad_norm": 0.7695451974868774, "learning_rate": 1.5802874866243515e-05, "loss": 0.1569, "num_input_tokens_seen": 4559504, "step": 50635 }, { "epoch": 13.16008316008316, "grad_norm": 6.090847969055176, "learning_rate": 1.579760310575497e-05, "loss": 0.4113, "num_input_tokens_seen": 4559920, "step": 50640 }, { "epoch": 13.161382536382536, "grad_norm": 0.19046573340892792, "learning_rate": 1.5792331818554513e-05, "loss": 0.1188, "num_input_tokens_seen": 4560352, "step": 50645 }, { "epoch": 13.162681912681913, "grad_norm": 1.562293529510498, "learning_rate": 1.5787061004913252e-05, "loss": 0.2121, "num_input_tokens_seen": 4560768, "step": 50650 }, { "epoch": 13.163981288981288, "grad_norm": 13.000516891479492, "learning_rate": 1.5781790665102265e-05, "loss": 0.1834, "num_input_tokens_seen": 4561168, "step": 50655 }, { "epoch": 13.165280665280665, "grad_norm": 1.8413461446762085, "learning_rate": 1.577652079939263e-05, "loss": 0.1375, "num_input_tokens_seen": 4561632, "step": 50660 }, { "epoch": 13.166580041580042, "grad_norm": 7.962892055511475, "learning_rate": 1.5771251408055353e-05, "loss": 0.1291, "num_input_tokens_seen": 4562080, "step": 50665 }, { "epoch": 13.167879417879417, "grad_norm": 18.23539161682129, "learning_rate": 1.576598249136147e-05, "loss": 0.3002, "num_input_tokens_seen": 4562528, "step": 50670 }, { "epoch": 13.169178794178794, "grad_norm": 19.05824851989746, "learning_rate": 1.576071404958195e-05, "loss": 0.3514, "num_input_tokens_seen": 4562992, "step": 50675 }, { "epoch": 13.170478170478171, "grad_norm": 0.9589827656745911, "learning_rate": 1.5755446082987762e-05, "loss": 0.1559, "num_input_tokens_seen": 4563440, "step": 50680 }, { "epoch": 13.171777546777546, "grad_norm": 0.4583655297756195, "learning_rate": 1.5750178591849835e-05, "loss": 0.2288, "num_input_tokens_seen": 4563840, "step": 50685 }, { "epoch": 13.173076923076923, "grad_norm": 0.9494274854660034, "learning_rate": 1.574491157643909e-05, "loss": 0.2211, "num_input_tokens_seen": 4564288, "step": 50690 }, { "epoch": 13.174376299376299, "grad_norm": 1.1746753454208374, "learning_rate": 1.5739645037026416e-05, "loss": 0.153, "num_input_tokens_seen": 4564752, "step": 50695 }, { "epoch": 13.175675675675675, "grad_norm": 7.597523212432861, "learning_rate": 1.5734378973882656e-05, "loss": 0.206, "num_input_tokens_seen": 4565168, "step": 50700 }, { "epoch": 13.176975051975052, "grad_norm": 3.7294225692749023, "learning_rate": 1.5729113387278673e-05, "loss": 0.1635, "num_input_tokens_seen": 4565616, "step": 50705 }, { "epoch": 13.178274428274428, "grad_norm": 0.024805568158626556, "learning_rate": 1.5723848277485264e-05, "loss": 0.2749, "num_input_tokens_seen": 4566080, "step": 50710 }, { "epoch": 13.179573804573804, "grad_norm": 0.07020555436611176, "learning_rate": 1.571858364477324e-05, "loss": 0.3011, "num_input_tokens_seen": 4566512, "step": 50715 }, { "epoch": 13.180873180873181, "grad_norm": 2.832932949066162, "learning_rate": 1.571331948941334e-05, "loss": 0.1431, "num_input_tokens_seen": 4566912, "step": 50720 }, { "epoch": 13.182172557172557, "grad_norm": 1.7334147691726685, "learning_rate": 1.570805581167632e-05, "loss": 0.2295, "num_input_tokens_seen": 4567376, "step": 50725 }, { "epoch": 13.183471933471933, "grad_norm": 2.1229872703552246, "learning_rate": 1.570279261183289e-05, "loss": 0.0894, "num_input_tokens_seen": 4567856, "step": 50730 }, { "epoch": 13.18477130977131, "grad_norm": 5.334033489227295, "learning_rate": 1.5697529890153754e-05, "loss": 0.2933, "num_input_tokens_seen": 4568320, "step": 50735 }, { "epoch": 13.186070686070686, "grad_norm": 6.776532173156738, "learning_rate": 1.569226764690956e-05, "loss": 0.4052, "num_input_tokens_seen": 4568800, "step": 50740 }, { "epoch": 13.187370062370062, "grad_norm": 1.7201480865478516, "learning_rate": 1.568700588237096e-05, "loss": 0.4864, "num_input_tokens_seen": 4569216, "step": 50745 }, { "epoch": 13.18866943866944, "grad_norm": 9.838550567626953, "learning_rate": 1.568174459680857e-05, "loss": 0.5318, "num_input_tokens_seen": 4569680, "step": 50750 }, { "epoch": 13.189968814968815, "grad_norm": 7.448657512664795, "learning_rate": 1.567648379049299e-05, "loss": 0.1869, "num_input_tokens_seen": 4570128, "step": 50755 }, { "epoch": 13.191268191268192, "grad_norm": 4.459167003631592, "learning_rate": 1.567122346369478e-05, "loss": 0.2383, "num_input_tokens_seen": 4570544, "step": 50760 }, { "epoch": 13.192567567567568, "grad_norm": 17.84700584411621, "learning_rate": 1.5665963616684476e-05, "loss": 0.2078, "num_input_tokens_seen": 4570960, "step": 50765 }, { "epoch": 13.193866943866944, "grad_norm": 0.2010030299425125, "learning_rate": 1.5660704249732623e-05, "loss": 0.2518, "num_input_tokens_seen": 4571424, "step": 50770 }, { "epoch": 13.19516632016632, "grad_norm": 18.304214477539062, "learning_rate": 1.5655445363109684e-05, "loss": 0.2269, "num_input_tokens_seen": 4571888, "step": 50775 }, { "epoch": 13.196465696465696, "grad_norm": 6.075757026672363, "learning_rate": 1.565018695708615e-05, "loss": 0.1593, "num_input_tokens_seen": 4572352, "step": 50780 }, { "epoch": 13.197765072765073, "grad_norm": 10.644291877746582, "learning_rate": 1.5644929031932454e-05, "loss": 0.2746, "num_input_tokens_seen": 4572816, "step": 50785 }, { "epoch": 13.19906444906445, "grad_norm": 0.07134204357862473, "learning_rate": 1.5639671587919032e-05, "loss": 0.1967, "num_input_tokens_seen": 4573280, "step": 50790 }, { "epoch": 13.200363825363825, "grad_norm": 0.7082000970840454, "learning_rate": 1.5634414625316258e-05, "loss": 0.1461, "num_input_tokens_seen": 4573744, "step": 50795 }, { "epoch": 13.201663201663202, "grad_norm": 4.028634071350098, "learning_rate": 1.5629158144394524e-05, "loss": 0.4776, "num_input_tokens_seen": 4574208, "step": 50800 }, { "epoch": 13.202962577962579, "grad_norm": 0.08897937089204788, "learning_rate": 1.5623902145424153e-05, "loss": 0.2641, "num_input_tokens_seen": 4574656, "step": 50805 }, { "epoch": 13.204261954261954, "grad_norm": 0.07602857798337936, "learning_rate": 1.5618646628675486e-05, "loss": 0.1269, "num_input_tokens_seen": 4575088, "step": 50810 }, { "epoch": 13.20556133056133, "grad_norm": 12.85807991027832, "learning_rate": 1.561339159441881e-05, "loss": 0.2051, "num_input_tokens_seen": 4575504, "step": 50815 }, { "epoch": 13.206860706860708, "grad_norm": 12.815521240234375, "learning_rate": 1.5608137042924403e-05, "loss": 0.2457, "num_input_tokens_seen": 4575936, "step": 50820 }, { "epoch": 13.208160083160083, "grad_norm": 13.435583114624023, "learning_rate": 1.5602882974462505e-05, "loss": 0.1979, "num_input_tokens_seen": 4576416, "step": 50825 }, { "epoch": 13.20945945945946, "grad_norm": 4.669650554656982, "learning_rate": 1.559762938930333e-05, "loss": 0.0723, "num_input_tokens_seen": 4576880, "step": 50830 }, { "epoch": 13.210758835758837, "grad_norm": 15.917222023010254, "learning_rate": 1.55923762877171e-05, "loss": 0.2317, "num_input_tokens_seen": 4577328, "step": 50835 }, { "epoch": 13.212058212058212, "grad_norm": 15.589842796325684, "learning_rate": 1.558712366997396e-05, "loss": 0.1822, "num_input_tokens_seen": 4577808, "step": 50840 }, { "epoch": 13.213357588357589, "grad_norm": 6.165384292602539, "learning_rate": 1.5581871536344072e-05, "loss": 0.1413, "num_input_tokens_seen": 4578224, "step": 50845 }, { "epoch": 13.214656964656964, "grad_norm": 0.07975506037473679, "learning_rate": 1.5576619887097553e-05, "loss": 0.0484, "num_input_tokens_seen": 4578688, "step": 50850 }, { "epoch": 13.21595634095634, "grad_norm": 3.8793981075286865, "learning_rate": 1.557136872250451e-05, "loss": 0.4423, "num_input_tokens_seen": 4579168, "step": 50855 }, { "epoch": 13.217255717255718, "grad_norm": 3.3743674755096436, "learning_rate": 1.5566118042834994e-05, "loss": 0.0763, "num_input_tokens_seen": 4579648, "step": 50860 }, { "epoch": 13.218555093555093, "grad_norm": 14.454080581665039, "learning_rate": 1.5560867848359077e-05, "loss": 0.5561, "num_input_tokens_seen": 4580096, "step": 50865 }, { "epoch": 13.21985446985447, "grad_norm": 1.648974895477295, "learning_rate": 1.5555618139346766e-05, "loss": 0.4998, "num_input_tokens_seen": 4580528, "step": 50870 }, { "epoch": 13.221153846153847, "grad_norm": 0.33475208282470703, "learning_rate": 1.555036891606807e-05, "loss": 0.231, "num_input_tokens_seen": 4580976, "step": 50875 }, { "epoch": 13.222453222453222, "grad_norm": 10.24898624420166, "learning_rate": 1.5545120178792944e-05, "loss": 0.1922, "num_input_tokens_seen": 4581408, "step": 50880 }, { "epoch": 13.223752598752599, "grad_norm": 0.9214943647384644, "learning_rate": 1.5539871927791356e-05, "loss": 0.0532, "num_input_tokens_seen": 4581872, "step": 50885 }, { "epoch": 13.225051975051976, "grad_norm": 7.204742431640625, "learning_rate": 1.5534624163333215e-05, "loss": 0.2764, "num_input_tokens_seen": 4582288, "step": 50890 }, { "epoch": 13.22635135135135, "grad_norm": 7.37635612487793, "learning_rate": 1.5529376885688422e-05, "loss": 0.172, "num_input_tokens_seen": 4582720, "step": 50895 }, { "epoch": 13.227650727650728, "grad_norm": 0.9136555194854736, "learning_rate": 1.5524130095126853e-05, "loss": 0.3443, "num_input_tokens_seen": 4583184, "step": 50900 }, { "epoch": 13.228950103950105, "grad_norm": 0.2870629131793976, "learning_rate": 1.5518883791918345e-05, "loss": 0.3116, "num_input_tokens_seen": 4583616, "step": 50905 }, { "epoch": 13.23024948024948, "grad_norm": 8.332466125488281, "learning_rate": 1.551363797633274e-05, "loss": 0.4443, "num_input_tokens_seen": 4584080, "step": 50910 }, { "epoch": 13.231548856548857, "grad_norm": 1.4977352619171143, "learning_rate": 1.5508392648639813e-05, "loss": 0.127, "num_input_tokens_seen": 4584528, "step": 50915 }, { "epoch": 13.232848232848234, "grad_norm": 1.2689088582992554, "learning_rate": 1.550314780910935e-05, "loss": 0.1286, "num_input_tokens_seen": 4584960, "step": 50920 }, { "epoch": 13.234147609147609, "grad_norm": 7.805394649505615, "learning_rate": 1.5497903458011092e-05, "loss": 0.165, "num_input_tokens_seen": 4585392, "step": 50925 }, { "epoch": 13.235446985446986, "grad_norm": 2.567761182785034, "learning_rate": 1.5492659595614774e-05, "loss": 0.3821, "num_input_tokens_seen": 4585856, "step": 50930 }, { "epoch": 13.236746361746361, "grad_norm": 0.2124943882226944, "learning_rate": 1.5487416222190068e-05, "loss": 0.1698, "num_input_tokens_seen": 4586288, "step": 50935 }, { "epoch": 13.238045738045738, "grad_norm": 0.6377378106117249, "learning_rate": 1.5482173338006666e-05, "loss": 0.1206, "num_input_tokens_seen": 4586736, "step": 50940 }, { "epoch": 13.239345114345115, "grad_norm": 0.3973632752895355, "learning_rate": 1.547693094333421e-05, "loss": 0.2834, "num_input_tokens_seen": 4587168, "step": 50945 }, { "epoch": 13.24064449064449, "grad_norm": 3.293482780456543, "learning_rate": 1.5471689038442326e-05, "loss": 0.3759, "num_input_tokens_seen": 4587568, "step": 50950 }, { "epoch": 13.241943866943867, "grad_norm": 1.8419286012649536, "learning_rate": 1.54664476236006e-05, "loss": 0.3957, "num_input_tokens_seen": 4588016, "step": 50955 }, { "epoch": 13.243243243243244, "grad_norm": 3.6672022342681885, "learning_rate": 1.5461206699078602e-05, "loss": 0.6387, "num_input_tokens_seen": 4588464, "step": 50960 }, { "epoch": 13.244542619542619, "grad_norm": 20.766637802124023, "learning_rate": 1.545596626514589e-05, "loss": 0.2268, "num_input_tokens_seen": 4588928, "step": 50965 }, { "epoch": 13.245841995841996, "grad_norm": 12.369688034057617, "learning_rate": 1.545072632207197e-05, "loss": 0.4034, "num_input_tokens_seen": 4589376, "step": 50970 }, { "epoch": 13.247141372141373, "grad_norm": 0.011829931288957596, "learning_rate": 1.5445486870126353e-05, "loss": 0.2222, "num_input_tokens_seen": 4589808, "step": 50975 }, { "epoch": 13.248440748440748, "grad_norm": 6.568887710571289, "learning_rate": 1.5440247909578494e-05, "loss": 0.1383, "num_input_tokens_seen": 4590272, "step": 50980 }, { "epoch": 13.249740124740125, "grad_norm": 18.45484733581543, "learning_rate": 1.543500944069785e-05, "loss": 0.2784, "num_input_tokens_seen": 4590736, "step": 50985 }, { "epoch": 13.2510395010395, "grad_norm": 14.300810813903809, "learning_rate": 1.542977146375383e-05, "loss": 0.2188, "num_input_tokens_seen": 4591200, "step": 50990 }, { "epoch": 13.252338877338877, "grad_norm": 10.232975959777832, "learning_rate": 1.5424533979015837e-05, "loss": 0.1915, "num_input_tokens_seen": 4591680, "step": 50995 }, { "epoch": 13.253638253638254, "grad_norm": 0.05547016113996506, "learning_rate": 1.5419296986753233e-05, "loss": 0.0162, "num_input_tokens_seen": 4592128, "step": 51000 }, { "epoch": 13.25493762993763, "grad_norm": 7.752046585083008, "learning_rate": 1.541406048723537e-05, "loss": 0.3253, "num_input_tokens_seen": 4592544, "step": 51005 }, { "epoch": 13.256237006237006, "grad_norm": 0.24160513281822205, "learning_rate": 1.5408824480731557e-05, "loss": 0.792, "num_input_tokens_seen": 4592976, "step": 51010 }, { "epoch": 13.257536382536383, "grad_norm": 13.394769668579102, "learning_rate": 1.5403588967511092e-05, "loss": 0.1486, "num_input_tokens_seen": 4593456, "step": 51015 }, { "epoch": 13.258835758835758, "grad_norm": 16.986751556396484, "learning_rate": 1.5398353947843247e-05, "loss": 0.7177, "num_input_tokens_seen": 4593872, "step": 51020 }, { "epoch": 13.260135135135135, "grad_norm": 5.164892673492432, "learning_rate": 1.539311942199725e-05, "loss": 0.0969, "num_input_tokens_seen": 4594320, "step": 51025 }, { "epoch": 13.261434511434512, "grad_norm": 4.187222480773926, "learning_rate": 1.538788539024233e-05, "loss": 0.2712, "num_input_tokens_seen": 4594784, "step": 51030 }, { "epoch": 13.262733887733887, "grad_norm": 8.100483894348145, "learning_rate": 1.538265185284767e-05, "loss": 0.1559, "num_input_tokens_seen": 4595232, "step": 51035 }, { "epoch": 13.264033264033264, "grad_norm": 0.7228415012359619, "learning_rate": 1.537741881008245e-05, "loss": 0.1234, "num_input_tokens_seen": 4595664, "step": 51040 }, { "epoch": 13.265332640332641, "grad_norm": 0.19288624823093414, "learning_rate": 1.5372186262215783e-05, "loss": 0.2703, "num_input_tokens_seen": 4596112, "step": 51045 }, { "epoch": 13.266632016632016, "grad_norm": 16.989206314086914, "learning_rate": 1.536695420951682e-05, "loss": 0.4208, "num_input_tokens_seen": 4596560, "step": 51050 }, { "epoch": 13.267931392931393, "grad_norm": 0.5939382910728455, "learning_rate": 1.5361722652254617e-05, "loss": 0.0779, "num_input_tokens_seen": 4597008, "step": 51055 }, { "epoch": 13.26923076923077, "grad_norm": 0.4244478642940521, "learning_rate": 1.5356491590698263e-05, "loss": 0.2226, "num_input_tokens_seen": 4597424, "step": 51060 }, { "epoch": 13.270530145530145, "grad_norm": 1.8480803966522217, "learning_rate": 1.535126102511678e-05, "loss": 0.0298, "num_input_tokens_seen": 4597856, "step": 51065 }, { "epoch": 13.271829521829522, "grad_norm": 0.7347455024719238, "learning_rate": 1.5346030955779195e-05, "loss": 0.3623, "num_input_tokens_seen": 4598320, "step": 51070 }, { "epoch": 13.273128898128897, "grad_norm": 12.65583610534668, "learning_rate": 1.534080138295448e-05, "loss": 0.2495, "num_input_tokens_seen": 4598736, "step": 51075 }, { "epoch": 13.274428274428274, "grad_norm": 0.3325984477996826, "learning_rate": 1.533557230691161e-05, "loss": 0.0868, "num_input_tokens_seen": 4599200, "step": 51080 }, { "epoch": 13.275727650727651, "grad_norm": 11.328140258789062, "learning_rate": 1.533034372791952e-05, "loss": 0.1944, "num_input_tokens_seen": 4599648, "step": 51085 }, { "epoch": 13.277027027027026, "grad_norm": 2.6261281967163086, "learning_rate": 1.532511564624711e-05, "loss": 0.1838, "num_input_tokens_seen": 4600128, "step": 51090 }, { "epoch": 13.278326403326403, "grad_norm": 13.645764350891113, "learning_rate": 1.5319888062163273e-05, "loss": 0.1052, "num_input_tokens_seen": 4600576, "step": 51095 }, { "epoch": 13.27962577962578, "grad_norm": 9.691994667053223, "learning_rate": 1.531466097593687e-05, "loss": 0.0991, "num_input_tokens_seen": 4601056, "step": 51100 }, { "epoch": 13.280925155925155, "grad_norm": 1.347412347793579, "learning_rate": 1.5309434387836735e-05, "loss": 0.0855, "num_input_tokens_seen": 4601504, "step": 51105 }, { "epoch": 13.282224532224532, "grad_norm": 3.1203088760375977, "learning_rate": 1.5304208298131667e-05, "loss": 0.1979, "num_input_tokens_seen": 4602000, "step": 51110 }, { "epoch": 13.28352390852391, "grad_norm": 3.9353153705596924, "learning_rate": 1.5298982707090464e-05, "loss": 0.3315, "num_input_tokens_seen": 4602464, "step": 51115 }, { "epoch": 13.284823284823284, "grad_norm": 18.40215492248535, "learning_rate": 1.529375761498187e-05, "loss": 0.3938, "num_input_tokens_seen": 4602912, "step": 51120 }, { "epoch": 13.286122661122661, "grad_norm": 1.8753173351287842, "learning_rate": 1.528853302207463e-05, "loss": 0.2281, "num_input_tokens_seen": 4603344, "step": 51125 }, { "epoch": 13.287422037422038, "grad_norm": 11.40394115447998, "learning_rate": 1.528330892863743e-05, "loss": 0.2351, "num_input_tokens_seen": 4603776, "step": 51130 }, { "epoch": 13.288721413721413, "grad_norm": 0.04786805436015129, "learning_rate": 1.527808533493897e-05, "loss": 0.0199, "num_input_tokens_seen": 4604256, "step": 51135 }, { "epoch": 13.29002079002079, "grad_norm": 7.087464332580566, "learning_rate": 1.5272862241247892e-05, "loss": 0.1972, "num_input_tokens_seen": 4604688, "step": 51140 }, { "epoch": 13.291320166320165, "grad_norm": 0.4723958969116211, "learning_rate": 1.5267639647832837e-05, "loss": 0.0512, "num_input_tokens_seen": 4605152, "step": 51145 }, { "epoch": 13.292619542619542, "grad_norm": 1.801965594291687, "learning_rate": 1.5262417554962395e-05, "loss": 0.0944, "num_input_tokens_seen": 4605584, "step": 51150 }, { "epoch": 13.29391891891892, "grad_norm": 13.216724395751953, "learning_rate": 1.5257195962905147e-05, "loss": 0.3618, "num_input_tokens_seen": 4606016, "step": 51155 }, { "epoch": 13.295218295218294, "grad_norm": 6.126416206359863, "learning_rate": 1.525197487192965e-05, "loss": 0.3123, "num_input_tokens_seen": 4606416, "step": 51160 }, { "epoch": 13.296517671517671, "grad_norm": 0.04809979721903801, "learning_rate": 1.5246754282304418e-05, "loss": 0.0323, "num_input_tokens_seen": 4606832, "step": 51165 }, { "epoch": 13.297817047817048, "grad_norm": 0.8297757506370544, "learning_rate": 1.5241534194297963e-05, "loss": 0.0619, "num_input_tokens_seen": 4607296, "step": 51170 }, { "epoch": 13.299116424116423, "grad_norm": 22.113231658935547, "learning_rate": 1.523631460817875e-05, "loss": 0.4179, "num_input_tokens_seen": 4607728, "step": 51175 }, { "epoch": 13.3004158004158, "grad_norm": 0.8466702699661255, "learning_rate": 1.5231095524215244e-05, "loss": 0.045, "num_input_tokens_seen": 4608224, "step": 51180 }, { "epoch": 13.301715176715177, "grad_norm": 0.19065731763839722, "learning_rate": 1.5225876942675842e-05, "loss": 0.3287, "num_input_tokens_seen": 4608624, "step": 51185 }, { "epoch": 13.303014553014552, "grad_norm": 17.12173080444336, "learning_rate": 1.522065886382896e-05, "loss": 0.3399, "num_input_tokens_seen": 4609072, "step": 51190 }, { "epoch": 13.30431392931393, "grad_norm": 14.427522659301758, "learning_rate": 1.5215441287942956e-05, "loss": 0.2559, "num_input_tokens_seen": 4609488, "step": 51195 }, { "epoch": 13.305613305613306, "grad_norm": 10.458005905151367, "learning_rate": 1.5210224215286195e-05, "loss": 0.0946, "num_input_tokens_seen": 4609920, "step": 51200 }, { "epoch": 13.306912681912682, "grad_norm": 9.514111518859863, "learning_rate": 1.520500764612697e-05, "loss": 0.2427, "num_input_tokens_seen": 4610352, "step": 51205 }, { "epoch": 13.308212058212058, "grad_norm": 0.09881501644849777, "learning_rate": 1.5199791580733593e-05, "loss": 0.107, "num_input_tokens_seen": 4610816, "step": 51210 }, { "epoch": 13.309511434511435, "grad_norm": 0.6687602400779724, "learning_rate": 1.519457601937433e-05, "loss": 0.0574, "num_input_tokens_seen": 4611280, "step": 51215 }, { "epoch": 13.31081081081081, "grad_norm": 18.0159912109375, "learning_rate": 1.5189360962317409e-05, "loss": 0.5958, "num_input_tokens_seen": 4611744, "step": 51220 }, { "epoch": 13.312110187110187, "grad_norm": 0.7919661998748779, "learning_rate": 1.5184146409831057e-05, "loss": 0.2434, "num_input_tokens_seen": 4612176, "step": 51225 }, { "epoch": 13.313409563409563, "grad_norm": 10.610100746154785, "learning_rate": 1.5178932362183457e-05, "loss": 0.3849, "num_input_tokens_seen": 4612624, "step": 51230 }, { "epoch": 13.31470893970894, "grad_norm": 10.948246955871582, "learning_rate": 1.5173718819642783e-05, "loss": 0.1114, "num_input_tokens_seen": 4613072, "step": 51235 }, { "epoch": 13.316008316008316, "grad_norm": 0.2398175150156021, "learning_rate": 1.5168505782477155e-05, "loss": 0.0624, "num_input_tokens_seen": 4613472, "step": 51240 }, { "epoch": 13.317307692307692, "grad_norm": 1.4369823932647705, "learning_rate": 1.5163293250954702e-05, "loss": 0.1575, "num_input_tokens_seen": 4613936, "step": 51245 }, { "epoch": 13.318607068607069, "grad_norm": 22.582162857055664, "learning_rate": 1.5158081225343496e-05, "loss": 0.579, "num_input_tokens_seen": 4614416, "step": 51250 }, { "epoch": 13.319906444906445, "grad_norm": 10.98555850982666, "learning_rate": 1.5152869705911616e-05, "loss": 0.2383, "num_input_tokens_seen": 4614864, "step": 51255 }, { "epoch": 13.32120582120582, "grad_norm": 0.35545840859413147, "learning_rate": 1.5147658692927067e-05, "loss": 0.3811, "num_input_tokens_seen": 4615344, "step": 51260 }, { "epoch": 13.322505197505198, "grad_norm": 0.09593168646097183, "learning_rate": 1.5142448186657878e-05, "loss": 0.1072, "num_input_tokens_seen": 4615808, "step": 51265 }, { "epoch": 13.323804573804575, "grad_norm": 8.06457233428955, "learning_rate": 1.5137238187372021e-05, "loss": 0.3376, "num_input_tokens_seen": 4616224, "step": 51270 }, { "epoch": 13.32510395010395, "grad_norm": 3.8543174266815186, "learning_rate": 1.5132028695337461e-05, "loss": 0.0488, "num_input_tokens_seen": 4616656, "step": 51275 }, { "epoch": 13.326403326403327, "grad_norm": 0.7255871891975403, "learning_rate": 1.5126819710822115e-05, "loss": 0.3014, "num_input_tokens_seen": 4617136, "step": 51280 }, { "epoch": 13.327702702702704, "grad_norm": 2.341778039932251, "learning_rate": 1.512161123409389e-05, "loss": 0.219, "num_input_tokens_seen": 4617584, "step": 51285 }, { "epoch": 13.329002079002079, "grad_norm": 14.218147277832031, "learning_rate": 1.511640326542067e-05, "loss": 0.1035, "num_input_tokens_seen": 4618016, "step": 51290 }, { "epoch": 13.330301455301456, "grad_norm": 8.169182777404785, "learning_rate": 1.5111195805070288e-05, "loss": 0.1875, "num_input_tokens_seen": 4618448, "step": 51295 }, { "epoch": 13.33160083160083, "grad_norm": 11.584661483764648, "learning_rate": 1.5105988853310596e-05, "loss": 0.1553, "num_input_tokens_seen": 4618912, "step": 51300 }, { "epoch": 13.332900207900208, "grad_norm": 10.590978622436523, "learning_rate": 1.5100782410409367e-05, "loss": 0.2036, "num_input_tokens_seen": 4619344, "step": 51305 }, { "epoch": 13.334199584199585, "grad_norm": 10.291388511657715, "learning_rate": 1.5095576476634388e-05, "loss": 0.2569, "num_input_tokens_seen": 4619760, "step": 51310 }, { "epoch": 13.33549896049896, "grad_norm": 15.90317440032959, "learning_rate": 1.5090371052253394e-05, "loss": 0.3013, "num_input_tokens_seen": 4620192, "step": 51315 }, { "epoch": 13.336798336798337, "grad_norm": 10.66425895690918, "learning_rate": 1.5085166137534123e-05, "loss": 0.1932, "num_input_tokens_seen": 4620672, "step": 51320 }, { "epoch": 13.338097713097714, "grad_norm": 6.953003883361816, "learning_rate": 1.507996173274425e-05, "loss": 0.2923, "num_input_tokens_seen": 4621152, "step": 51325 }, { "epoch": 13.339397089397089, "grad_norm": 27.60170555114746, "learning_rate": 1.5074757838151452e-05, "loss": 0.3195, "num_input_tokens_seen": 4621632, "step": 51330 }, { "epoch": 13.340696465696466, "grad_norm": 16.936912536621094, "learning_rate": 1.5069554454023366e-05, "loss": 0.1382, "num_input_tokens_seen": 4622048, "step": 51335 }, { "epoch": 13.341995841995843, "grad_norm": 18.704235076904297, "learning_rate": 1.5064351580627618e-05, "loss": 0.6462, "num_input_tokens_seen": 4622528, "step": 51340 }, { "epoch": 13.343295218295218, "grad_norm": 0.12518873810768127, "learning_rate": 1.505914921823178e-05, "loss": 0.006, "num_input_tokens_seen": 4622992, "step": 51345 }, { "epoch": 13.344594594594595, "grad_norm": 10.834986686706543, "learning_rate": 1.5053947367103422e-05, "loss": 0.1142, "num_input_tokens_seen": 4623424, "step": 51350 }, { "epoch": 13.345893970893972, "grad_norm": 1.6413851976394653, "learning_rate": 1.5048746027510085e-05, "loss": 0.6663, "num_input_tokens_seen": 4623872, "step": 51355 }, { "epoch": 13.347193347193347, "grad_norm": 6.528299808502197, "learning_rate": 1.5043545199719272e-05, "loss": 0.0919, "num_input_tokens_seen": 4624304, "step": 51360 }, { "epoch": 13.348492723492724, "grad_norm": 4.09229850769043, "learning_rate": 1.503834488399847e-05, "loss": 0.0559, "num_input_tokens_seen": 4624752, "step": 51365 }, { "epoch": 13.3497920997921, "grad_norm": 12.646029472351074, "learning_rate": 1.5033145080615129e-05, "loss": 0.2964, "num_input_tokens_seen": 4625200, "step": 51370 }, { "epoch": 13.351091476091476, "grad_norm": 0.18146266043186188, "learning_rate": 1.50279457898367e-05, "loss": 0.1023, "num_input_tokens_seen": 4625632, "step": 51375 }, { "epoch": 13.352390852390853, "grad_norm": 11.133997917175293, "learning_rate": 1.5022747011930564e-05, "loss": 0.2523, "num_input_tokens_seen": 4626112, "step": 51380 }, { "epoch": 13.353690228690228, "grad_norm": 4.095142364501953, "learning_rate": 1.5017548747164111e-05, "loss": 0.2276, "num_input_tokens_seen": 4626560, "step": 51385 }, { "epoch": 13.354989604989605, "grad_norm": 1.5457550287246704, "learning_rate": 1.5012350995804686e-05, "loss": 0.1064, "num_input_tokens_seen": 4627072, "step": 51390 }, { "epoch": 13.356288981288982, "grad_norm": 1.3221118450164795, "learning_rate": 1.500715375811963e-05, "loss": 0.1008, "num_input_tokens_seen": 4627504, "step": 51395 }, { "epoch": 13.357588357588357, "grad_norm": 11.363072395324707, "learning_rate": 1.5001957034376221e-05, "loss": 0.1843, "num_input_tokens_seen": 4627952, "step": 51400 }, { "epoch": 13.358887733887734, "grad_norm": 0.7031444907188416, "learning_rate": 1.4996760824841747e-05, "loss": 0.1857, "num_input_tokens_seen": 4628400, "step": 51405 }, { "epoch": 13.36018711018711, "grad_norm": 0.03303466737270355, "learning_rate": 1.4991565129783452e-05, "loss": 0.0289, "num_input_tokens_seen": 4628880, "step": 51410 }, { "epoch": 13.361486486486486, "grad_norm": 10.830013275146484, "learning_rate": 1.4986369949468543e-05, "loss": 0.196, "num_input_tokens_seen": 4629392, "step": 51415 }, { "epoch": 13.362785862785863, "grad_norm": 17.186161041259766, "learning_rate": 1.4981175284164226e-05, "loss": 0.534, "num_input_tokens_seen": 4629856, "step": 51420 }, { "epoch": 13.36408523908524, "grad_norm": 0.032711468636989594, "learning_rate": 1.4975981134137659e-05, "loss": 0.2251, "num_input_tokens_seen": 4630256, "step": 51425 }, { "epoch": 13.365384615384615, "grad_norm": 0.2531772553920746, "learning_rate": 1.4970787499655998e-05, "loss": 0.1082, "num_input_tokens_seen": 4630720, "step": 51430 }, { "epoch": 13.366683991683992, "grad_norm": 20.048423767089844, "learning_rate": 1.4965594380986334e-05, "loss": 0.1975, "num_input_tokens_seen": 4631216, "step": 51435 }, { "epoch": 13.367983367983369, "grad_norm": 8.957959175109863, "learning_rate": 1.4960401778395771e-05, "loss": 0.1285, "num_input_tokens_seen": 4631664, "step": 51440 }, { "epoch": 13.369282744282744, "grad_norm": 28.188396453857422, "learning_rate": 1.4955209692151358e-05, "loss": 0.375, "num_input_tokens_seen": 4632144, "step": 51445 }, { "epoch": 13.370582120582121, "grad_norm": 3.773064136505127, "learning_rate": 1.4950018122520148e-05, "loss": 0.6265, "num_input_tokens_seen": 4632624, "step": 51450 }, { "epoch": 13.371881496881496, "grad_norm": 0.13382072746753693, "learning_rate": 1.4944827069769123e-05, "loss": 0.3007, "num_input_tokens_seen": 4633056, "step": 51455 }, { "epoch": 13.373180873180873, "grad_norm": 2.0938127040863037, "learning_rate": 1.493963653416528e-05, "loss": 0.1152, "num_input_tokens_seen": 4633488, "step": 51460 }, { "epoch": 13.37448024948025, "grad_norm": 0.9447256922721863, "learning_rate": 1.4934446515975568e-05, "loss": 0.0278, "num_input_tokens_seen": 4633984, "step": 51465 }, { "epoch": 13.375779625779625, "grad_norm": 15.754940032958984, "learning_rate": 1.4929257015466923e-05, "loss": 0.36, "num_input_tokens_seen": 4634416, "step": 51470 }, { "epoch": 13.377079002079002, "grad_norm": 11.151870727539062, "learning_rate": 1.4924068032906235e-05, "loss": 0.3015, "num_input_tokens_seen": 4634896, "step": 51475 }, { "epoch": 13.378378378378379, "grad_norm": 0.3650597631931305, "learning_rate": 1.491887956856038e-05, "loss": 0.2634, "num_input_tokens_seen": 4635344, "step": 51480 }, { "epoch": 13.379677754677754, "grad_norm": 0.01158864889293909, "learning_rate": 1.4913691622696213e-05, "loss": 0.0023, "num_input_tokens_seen": 4635760, "step": 51485 }, { "epoch": 13.380977130977131, "grad_norm": 11.385269165039062, "learning_rate": 1.4908504195580542e-05, "loss": 0.15, "num_input_tokens_seen": 4636192, "step": 51490 }, { "epoch": 13.382276507276508, "grad_norm": 0.16642197966575623, "learning_rate": 1.4903317287480175e-05, "loss": 0.324, "num_input_tokens_seen": 4636672, "step": 51495 }, { "epoch": 13.383575883575883, "grad_norm": 10.807201385498047, "learning_rate": 1.4898130898661871e-05, "loss": 0.6629, "num_input_tokens_seen": 4637152, "step": 51500 }, { "epoch": 13.38487525987526, "grad_norm": 8.792073249816895, "learning_rate": 1.489294502939238e-05, "loss": 0.441, "num_input_tokens_seen": 4637584, "step": 51505 }, { "epoch": 13.386174636174637, "grad_norm": 0.021754290908575058, "learning_rate": 1.4887759679938403e-05, "loss": 0.233, "num_input_tokens_seen": 4638016, "step": 51510 }, { "epoch": 13.387474012474012, "grad_norm": 3.167782783508301, "learning_rate": 1.488257485056664e-05, "loss": 0.2857, "num_input_tokens_seen": 4638448, "step": 51515 }, { "epoch": 13.388773388773389, "grad_norm": 4.417818069458008, "learning_rate": 1.487739054154374e-05, "loss": 0.2346, "num_input_tokens_seen": 4638880, "step": 51520 }, { "epoch": 13.390072765072764, "grad_norm": 8.605734825134277, "learning_rate": 1.4872206753136353e-05, "loss": 0.0918, "num_input_tokens_seen": 4639360, "step": 51525 }, { "epoch": 13.391372141372141, "grad_norm": 1.738023281097412, "learning_rate": 1.4867023485611064e-05, "loss": 0.3609, "num_input_tokens_seen": 4639808, "step": 51530 }, { "epoch": 13.392671517671518, "grad_norm": 11.892717361450195, "learning_rate": 1.4861840739234478e-05, "loss": 0.5288, "num_input_tokens_seen": 4640240, "step": 51535 }, { "epoch": 13.393970893970893, "grad_norm": 2.1063907146453857, "learning_rate": 1.4856658514273142e-05, "loss": 0.3217, "num_input_tokens_seen": 4640688, "step": 51540 }, { "epoch": 13.39527027027027, "grad_norm": 7.244545936584473, "learning_rate": 1.4851476810993558e-05, "loss": 0.0512, "num_input_tokens_seen": 4641136, "step": 51545 }, { "epoch": 13.396569646569647, "grad_norm": 7.293501853942871, "learning_rate": 1.484629562966226e-05, "loss": 0.4004, "num_input_tokens_seen": 4641568, "step": 51550 }, { "epoch": 13.397869022869022, "grad_norm": 6.5488481521606445, "learning_rate": 1.48411149705457e-05, "loss": 0.5209, "num_input_tokens_seen": 4642048, "step": 51555 }, { "epoch": 13.3991683991684, "grad_norm": 3.549051523208618, "learning_rate": 1.4835934833910331e-05, "loss": 0.547, "num_input_tokens_seen": 4642480, "step": 51560 }, { "epoch": 13.400467775467776, "grad_norm": 2.7014544010162354, "learning_rate": 1.4830755220022572e-05, "loss": 0.1058, "num_input_tokens_seen": 4642976, "step": 51565 }, { "epoch": 13.401767151767151, "grad_norm": 5.483489513397217, "learning_rate": 1.4825576129148825e-05, "loss": 0.3081, "num_input_tokens_seen": 4643408, "step": 51570 }, { "epoch": 13.403066528066528, "grad_norm": 0.7458204627037048, "learning_rate": 1.4820397561555432e-05, "loss": 0.0954, "num_input_tokens_seen": 4643808, "step": 51575 }, { "epoch": 13.404365904365905, "grad_norm": 14.406339645385742, "learning_rate": 1.4815219517508756e-05, "loss": 0.1487, "num_input_tokens_seen": 4644256, "step": 51580 }, { "epoch": 13.40566528066528, "grad_norm": 3.547703504562378, "learning_rate": 1.4810041997275092e-05, "loss": 0.038, "num_input_tokens_seen": 4644688, "step": 51585 }, { "epoch": 13.406964656964657, "grad_norm": 3.565126895904541, "learning_rate": 1.4804865001120744e-05, "loss": 0.034, "num_input_tokens_seen": 4645136, "step": 51590 }, { "epoch": 13.408264033264032, "grad_norm": 12.017145156860352, "learning_rate": 1.4799688529311945e-05, "loss": 0.3423, "num_input_tokens_seen": 4645584, "step": 51595 }, { "epoch": 13.40956340956341, "grad_norm": 0.8668038845062256, "learning_rate": 1.4794512582114941e-05, "loss": 0.1971, "num_input_tokens_seen": 4646032, "step": 51600 }, { "epoch": 13.410862785862786, "grad_norm": 12.173751831054688, "learning_rate": 1.478933715979594e-05, "loss": 0.1782, "num_input_tokens_seen": 4646496, "step": 51605 }, { "epoch": 13.412162162162161, "grad_norm": 0.5971055626869202, "learning_rate": 1.4784162262621104e-05, "loss": 0.0851, "num_input_tokens_seen": 4646944, "step": 51610 }, { "epoch": 13.413461538461538, "grad_norm": 13.146400451660156, "learning_rate": 1.4778987890856594e-05, "loss": 0.5631, "num_input_tokens_seen": 4647376, "step": 51615 }, { "epoch": 13.414760914760915, "grad_norm": 3.992612600326538, "learning_rate": 1.4773814044768528e-05, "loss": 0.0933, "num_input_tokens_seen": 4647840, "step": 51620 }, { "epoch": 13.41606029106029, "grad_norm": 0.38692134618759155, "learning_rate": 1.4768640724623012e-05, "loss": 0.2693, "num_input_tokens_seen": 4648256, "step": 51625 }, { "epoch": 13.417359667359667, "grad_norm": 1.330942988395691, "learning_rate": 1.4763467930686097e-05, "loss": 0.106, "num_input_tokens_seen": 4648704, "step": 51630 }, { "epoch": 13.418659043659044, "grad_norm": 0.23136132955551147, "learning_rate": 1.475829566322384e-05, "loss": 0.0093, "num_input_tokens_seen": 4649120, "step": 51635 }, { "epoch": 13.41995841995842, "grad_norm": 0.028422243893146515, "learning_rate": 1.4753123922502244e-05, "loss": 0.0748, "num_input_tokens_seen": 4649584, "step": 51640 }, { "epoch": 13.421257796257796, "grad_norm": 0.4197741448879242, "learning_rate": 1.4747952708787316e-05, "loss": 0.2947, "num_input_tokens_seen": 4650032, "step": 51645 }, { "epoch": 13.422557172557173, "grad_norm": 0.04629802703857422, "learning_rate": 1.474278202234499e-05, "loss": 0.0064, "num_input_tokens_seen": 4650480, "step": 51650 }, { "epoch": 13.423856548856548, "grad_norm": 6.020907402038574, "learning_rate": 1.4737611863441217e-05, "loss": 0.2075, "num_input_tokens_seen": 4650928, "step": 51655 }, { "epoch": 13.425155925155925, "grad_norm": 1.130645990371704, "learning_rate": 1.4732442232341894e-05, "loss": 0.1969, "num_input_tokens_seen": 4651376, "step": 51660 }, { "epoch": 13.426455301455302, "grad_norm": 20.998111724853516, "learning_rate": 1.4727273129312918e-05, "loss": 0.2501, "num_input_tokens_seen": 4651840, "step": 51665 }, { "epoch": 13.427754677754677, "grad_norm": 12.040847778320312, "learning_rate": 1.472210455462012e-05, "loss": 0.0428, "num_input_tokens_seen": 4652304, "step": 51670 }, { "epoch": 13.429054054054054, "grad_norm": 11.145678520202637, "learning_rate": 1.4716936508529328e-05, "loss": 0.3002, "num_input_tokens_seen": 4652736, "step": 51675 }, { "epoch": 13.43035343035343, "grad_norm": 6.340982437133789, "learning_rate": 1.4711768991306358e-05, "loss": 0.1022, "num_input_tokens_seen": 4653216, "step": 51680 }, { "epoch": 13.431652806652806, "grad_norm": 9.673460006713867, "learning_rate": 1.4706602003216951e-05, "loss": 0.2541, "num_input_tokens_seen": 4653648, "step": 51685 }, { "epoch": 13.432952182952183, "grad_norm": 11.264019012451172, "learning_rate": 1.470143554452687e-05, "loss": 0.0995, "num_input_tokens_seen": 4654112, "step": 51690 }, { "epoch": 13.434251559251559, "grad_norm": 5.8560991287231445, "learning_rate": 1.4696269615501826e-05, "loss": 0.1515, "num_input_tokens_seen": 4654576, "step": 51695 }, { "epoch": 13.435550935550935, "grad_norm": 1.654495358467102, "learning_rate": 1.4691104216407518e-05, "loss": 0.3433, "num_input_tokens_seen": 4654992, "step": 51700 }, { "epoch": 13.436850311850312, "grad_norm": 11.511163711547852, "learning_rate": 1.4685939347509586e-05, "loss": 0.108, "num_input_tokens_seen": 4655424, "step": 51705 }, { "epoch": 13.438149688149688, "grad_norm": 12.599431037902832, "learning_rate": 1.4680775009073678e-05, "loss": 0.2646, "num_input_tokens_seen": 4655888, "step": 51710 }, { "epoch": 13.439449064449065, "grad_norm": 2.1337954998016357, "learning_rate": 1.4675611201365397e-05, "loss": 0.2885, "num_input_tokens_seen": 4656336, "step": 51715 }, { "epoch": 13.440748440748441, "grad_norm": 0.09879158437252045, "learning_rate": 1.4670447924650335e-05, "loss": 0.0837, "num_input_tokens_seen": 4656816, "step": 51720 }, { "epoch": 13.442047817047817, "grad_norm": 0.18158292770385742, "learning_rate": 1.4665285179194022e-05, "loss": 0.1141, "num_input_tokens_seen": 4657216, "step": 51725 }, { "epoch": 13.443347193347194, "grad_norm": 0.16400957107543945, "learning_rate": 1.4660122965262e-05, "loss": 0.3083, "num_input_tokens_seen": 4657648, "step": 51730 }, { "epoch": 13.44464656964657, "grad_norm": 2.7533040046691895, "learning_rate": 1.4654961283119768e-05, "loss": 0.3184, "num_input_tokens_seen": 4658112, "step": 51735 }, { "epoch": 13.445945945945946, "grad_norm": 0.5315435528755188, "learning_rate": 1.4649800133032775e-05, "loss": 0.4025, "num_input_tokens_seen": 4658576, "step": 51740 }, { "epoch": 13.447245322245323, "grad_norm": 10.782182693481445, "learning_rate": 1.4644639515266483e-05, "loss": 0.5485, "num_input_tokens_seen": 4658992, "step": 51745 }, { "epoch": 13.448544698544698, "grad_norm": 0.6511209607124329, "learning_rate": 1.4639479430086304e-05, "loss": 0.3563, "num_input_tokens_seen": 4659456, "step": 51750 }, { "epoch": 13.449844074844075, "grad_norm": 15.655969619750977, "learning_rate": 1.463431987775763e-05, "loss": 0.3413, "num_input_tokens_seen": 4659904, "step": 51755 }, { "epoch": 13.451143451143452, "grad_norm": 3.087568521499634, "learning_rate": 1.462916085854581e-05, "loss": 0.3059, "num_input_tokens_seen": 4660384, "step": 51760 }, { "epoch": 13.452442827442827, "grad_norm": 12.085271835327148, "learning_rate": 1.4624002372716184e-05, "loss": 0.1734, "num_input_tokens_seen": 4660816, "step": 51765 }, { "epoch": 13.453742203742204, "grad_norm": 0.17516560852527618, "learning_rate": 1.4618844420534055e-05, "loss": 0.0736, "num_input_tokens_seen": 4661264, "step": 51770 }, { "epoch": 13.45504158004158, "grad_norm": 0.06211988255381584, "learning_rate": 1.4613687002264713e-05, "loss": 0.1503, "num_input_tokens_seen": 4661728, "step": 51775 }, { "epoch": 13.456340956340956, "grad_norm": 1.4104524850845337, "learning_rate": 1.4608530118173388e-05, "loss": 0.3571, "num_input_tokens_seen": 4662192, "step": 51780 }, { "epoch": 13.457640332640333, "grad_norm": 0.7257766723632812, "learning_rate": 1.460337376852533e-05, "loss": 0.2015, "num_input_tokens_seen": 4662624, "step": 51785 }, { "epoch": 13.45893970893971, "grad_norm": 7.750036239624023, "learning_rate": 1.4598217953585711e-05, "loss": 0.0459, "num_input_tokens_seen": 4663072, "step": 51790 }, { "epoch": 13.460239085239085, "grad_norm": 0.5465359687805176, "learning_rate": 1.4593062673619711e-05, "loss": 0.248, "num_input_tokens_seen": 4663536, "step": 51795 }, { "epoch": 13.461538461538462, "grad_norm": 12.8635835647583, "learning_rate": 1.458790792889248e-05, "loss": 0.6093, "num_input_tokens_seen": 4663968, "step": 51800 }, { "epoch": 13.462837837837839, "grad_norm": 1.101487398147583, "learning_rate": 1.4582753719669106e-05, "loss": 0.1197, "num_input_tokens_seen": 4664416, "step": 51805 }, { "epoch": 13.464137214137214, "grad_norm": 0.17945905029773712, "learning_rate": 1.4577600046214701e-05, "loss": 0.0624, "num_input_tokens_seen": 4664864, "step": 51810 }, { "epoch": 13.46543659043659, "grad_norm": 6.066755771636963, "learning_rate": 1.45724469087943e-05, "loss": 0.2538, "num_input_tokens_seen": 4665312, "step": 51815 }, { "epoch": 13.466735966735968, "grad_norm": 2.599257230758667, "learning_rate": 1.4567294307672947e-05, "loss": 0.0291, "num_input_tokens_seen": 4665744, "step": 51820 }, { "epoch": 13.468035343035343, "grad_norm": 4.098026752471924, "learning_rate": 1.4562142243115644e-05, "loss": 0.3481, "num_input_tokens_seen": 4666208, "step": 51825 }, { "epoch": 13.46933471933472, "grad_norm": 0.03807071968913078, "learning_rate": 1.4556990715387375e-05, "loss": 0.3779, "num_input_tokens_seen": 4666720, "step": 51830 }, { "epoch": 13.470634095634095, "grad_norm": 0.0680432915687561, "learning_rate": 1.4551839724753074e-05, "loss": 0.0486, "num_input_tokens_seen": 4667136, "step": 51835 }, { "epoch": 13.471933471933472, "grad_norm": 0.5941276550292969, "learning_rate": 1.4546689271477674e-05, "loss": 0.0886, "num_input_tokens_seen": 4667584, "step": 51840 }, { "epoch": 13.473232848232849, "grad_norm": 17.845928192138672, "learning_rate": 1.454153935582605e-05, "loss": 0.3235, "num_input_tokens_seen": 4668048, "step": 51845 }, { "epoch": 13.474532224532224, "grad_norm": 13.951032638549805, "learning_rate": 1.4536389978063086e-05, "loss": 0.3998, "num_input_tokens_seen": 4668480, "step": 51850 }, { "epoch": 13.4758316008316, "grad_norm": 2.3743934631347656, "learning_rate": 1.4531241138453605e-05, "loss": 0.1365, "num_input_tokens_seen": 4668944, "step": 51855 }, { "epoch": 13.477130977130978, "grad_norm": 15.819555282592773, "learning_rate": 1.4526092837262423e-05, "loss": 0.1972, "num_input_tokens_seen": 4669424, "step": 51860 }, { "epoch": 13.478430353430353, "grad_norm": 9.107892990112305, "learning_rate": 1.4520945074754327e-05, "loss": 0.361, "num_input_tokens_seen": 4669856, "step": 51865 }, { "epoch": 13.47972972972973, "grad_norm": 1.5353152751922607, "learning_rate": 1.4515797851194064e-05, "loss": 0.1599, "num_input_tokens_seen": 4670288, "step": 51870 }, { "epoch": 13.481029106029107, "grad_norm": 13.932329177856445, "learning_rate": 1.4510651166846367e-05, "loss": 0.4796, "num_input_tokens_seen": 4670768, "step": 51875 }, { "epoch": 13.482328482328482, "grad_norm": 12.869174003601074, "learning_rate": 1.4505505021975923e-05, "loss": 0.226, "num_input_tokens_seen": 4671216, "step": 51880 }, { "epoch": 13.483627858627859, "grad_norm": 0.073799729347229, "learning_rate": 1.450035941684742e-05, "loss": 0.0281, "num_input_tokens_seen": 4671648, "step": 51885 }, { "epoch": 13.484927234927236, "grad_norm": 11.330926895141602, "learning_rate": 1.4495214351725483e-05, "loss": 0.4182, "num_input_tokens_seen": 4672096, "step": 51890 }, { "epoch": 13.486226611226611, "grad_norm": 22.525291442871094, "learning_rate": 1.4490069826874736e-05, "loss": 0.3074, "num_input_tokens_seen": 4672528, "step": 51895 }, { "epoch": 13.487525987525988, "grad_norm": 6.935915470123291, "learning_rate": 1.448492584255977e-05, "loss": 0.3015, "num_input_tokens_seen": 4672976, "step": 51900 }, { "epoch": 13.488825363825363, "grad_norm": 3.688265085220337, "learning_rate": 1.4479782399045152e-05, "loss": 0.1365, "num_input_tokens_seen": 4673424, "step": 51905 }, { "epoch": 13.49012474012474, "grad_norm": 3.1147422790527344, "learning_rate": 1.4474639496595397e-05, "loss": 0.3317, "num_input_tokens_seen": 4673872, "step": 51910 }, { "epoch": 13.491424116424117, "grad_norm": 0.0699084922671318, "learning_rate": 1.4469497135475025e-05, "loss": 0.1371, "num_input_tokens_seen": 4674320, "step": 51915 }, { "epoch": 13.492723492723492, "grad_norm": 1.9498943090438843, "learning_rate": 1.4464355315948497e-05, "loss": 0.2714, "num_input_tokens_seen": 4674752, "step": 51920 }, { "epoch": 13.494022869022869, "grad_norm": 26.915115356445312, "learning_rate": 1.4459214038280277e-05, "loss": 0.2953, "num_input_tokens_seen": 4675248, "step": 51925 }, { "epoch": 13.495322245322246, "grad_norm": 9.102317810058594, "learning_rate": 1.4454073302734772e-05, "loss": 0.1507, "num_input_tokens_seen": 4675728, "step": 51930 }, { "epoch": 13.496621621621621, "grad_norm": 1.0062721967697144, "learning_rate": 1.4448933109576378e-05, "loss": 0.1224, "num_input_tokens_seen": 4676176, "step": 51935 }, { "epoch": 13.497920997920998, "grad_norm": 9.689056396484375, "learning_rate": 1.4443793459069477e-05, "loss": 0.1911, "num_input_tokens_seen": 4676640, "step": 51940 }, { "epoch": 13.499220374220375, "grad_norm": 0.7125979065895081, "learning_rate": 1.4438654351478382e-05, "loss": 0.1017, "num_input_tokens_seen": 4677136, "step": 51945 }, { "epoch": 13.50051975051975, "grad_norm": 14.564637184143066, "learning_rate": 1.4433515787067425e-05, "loss": 0.4344, "num_input_tokens_seen": 4677584, "step": 51950 }, { "epoch": 13.501819126819127, "grad_norm": 1.4460382461547852, "learning_rate": 1.4428377766100861e-05, "loss": 0.115, "num_input_tokens_seen": 4678048, "step": 51955 }, { "epoch": 13.503118503118504, "grad_norm": 0.4609355628490448, "learning_rate": 1.442324028884297e-05, "loss": 0.1012, "num_input_tokens_seen": 4678496, "step": 51960 }, { "epoch": 13.504417879417879, "grad_norm": 1.0218390226364136, "learning_rate": 1.4418103355557955e-05, "loss": 0.1289, "num_input_tokens_seen": 4678928, "step": 51965 }, { "epoch": 13.505717255717256, "grad_norm": 9.881322860717773, "learning_rate": 1.4412966966510027e-05, "loss": 0.2685, "num_input_tokens_seen": 4679392, "step": 51970 }, { "epoch": 13.507016632016633, "grad_norm": 26.76300048828125, "learning_rate": 1.4407831121963347e-05, "loss": 0.2074, "num_input_tokens_seen": 4679856, "step": 51975 }, { "epoch": 13.508316008316008, "grad_norm": 7.661182880401611, "learning_rate": 1.4402695822182071e-05, "loss": 0.1541, "num_input_tokens_seen": 4680304, "step": 51980 }, { "epoch": 13.509615384615385, "grad_norm": 1.3745547533035278, "learning_rate": 1.4397561067430298e-05, "loss": 0.1021, "num_input_tokens_seen": 4680752, "step": 51985 }, { "epoch": 13.51091476091476, "grad_norm": 17.369976043701172, "learning_rate": 1.4392426857972124e-05, "loss": 0.249, "num_input_tokens_seen": 4681232, "step": 51990 }, { "epoch": 13.512214137214137, "grad_norm": 11.656582832336426, "learning_rate": 1.4387293194071599e-05, "loss": 0.2883, "num_input_tokens_seen": 4681728, "step": 51995 }, { "epoch": 13.513513513513514, "grad_norm": 11.119285583496094, "learning_rate": 1.4382160075992748e-05, "loss": 0.0983, "num_input_tokens_seen": 4682176, "step": 52000 }, { "epoch": 13.51481288981289, "grad_norm": 0.9252532720565796, "learning_rate": 1.4377027503999574e-05, "loss": 0.292, "num_input_tokens_seen": 4682592, "step": 52005 }, { "epoch": 13.516112266112266, "grad_norm": 0.8778140544891357, "learning_rate": 1.4371895478356057e-05, "loss": 0.0882, "num_input_tokens_seen": 4683072, "step": 52010 }, { "epoch": 13.517411642411643, "grad_norm": 0.22268755733966827, "learning_rate": 1.4366763999326149e-05, "loss": 0.041, "num_input_tokens_seen": 4683504, "step": 52015 }, { "epoch": 13.518711018711018, "grad_norm": 6.11570405960083, "learning_rate": 1.4361633067173743e-05, "loss": 0.2542, "num_input_tokens_seen": 4683904, "step": 52020 }, { "epoch": 13.520010395010395, "grad_norm": 16.350475311279297, "learning_rate": 1.4356502682162754e-05, "loss": 0.3802, "num_input_tokens_seen": 4684384, "step": 52025 }, { "epoch": 13.521309771309772, "grad_norm": 5.733465671539307, "learning_rate": 1.435137284455702e-05, "loss": 0.2326, "num_input_tokens_seen": 4684848, "step": 52030 }, { "epoch": 13.522609147609147, "grad_norm": 2.1379408836364746, "learning_rate": 1.4346243554620383e-05, "loss": 0.08, "num_input_tokens_seen": 4685296, "step": 52035 }, { "epoch": 13.523908523908524, "grad_norm": 0.8858630657196045, "learning_rate": 1.4341114812616649e-05, "loss": 0.028, "num_input_tokens_seen": 4685760, "step": 52040 }, { "epoch": 13.5252079002079, "grad_norm": 11.257421493530273, "learning_rate": 1.4335986618809604e-05, "loss": 0.2709, "num_input_tokens_seen": 4686208, "step": 52045 }, { "epoch": 13.526507276507276, "grad_norm": 0.03487391769886017, "learning_rate": 1.4330858973462974e-05, "loss": 0.0689, "num_input_tokens_seen": 4686656, "step": 52050 }, { "epoch": 13.527806652806653, "grad_norm": 0.8227458000183105, "learning_rate": 1.4325731876840498e-05, "loss": 0.1031, "num_input_tokens_seen": 4687104, "step": 52055 }, { "epoch": 13.529106029106028, "grad_norm": 0.617276668548584, "learning_rate": 1.432060532920586e-05, "loss": 0.0172, "num_input_tokens_seen": 4687600, "step": 52060 }, { "epoch": 13.530405405405405, "grad_norm": 11.863106727600098, "learning_rate": 1.4315479330822712e-05, "loss": 0.3378, "num_input_tokens_seen": 4688048, "step": 52065 }, { "epoch": 13.531704781704782, "grad_norm": 10.398253440856934, "learning_rate": 1.4310353881954702e-05, "loss": 0.4019, "num_input_tokens_seen": 4688544, "step": 52070 }, { "epoch": 13.533004158004157, "grad_norm": 0.6719375252723694, "learning_rate": 1.4305228982865432e-05, "loss": 0.1748, "num_input_tokens_seen": 4689024, "step": 52075 }, { "epoch": 13.534303534303534, "grad_norm": 0.9652160406112671, "learning_rate": 1.4300104633818493e-05, "loss": 0.104, "num_input_tokens_seen": 4689440, "step": 52080 }, { "epoch": 13.535602910602911, "grad_norm": 0.36803871393203735, "learning_rate": 1.4294980835077416e-05, "loss": 0.2011, "num_input_tokens_seen": 4689888, "step": 52085 }, { "epoch": 13.536902286902286, "grad_norm": 10.53017807006836, "learning_rate": 1.4289857586905739e-05, "loss": 0.1942, "num_input_tokens_seen": 4690352, "step": 52090 }, { "epoch": 13.538201663201663, "grad_norm": 5.118711948394775, "learning_rate": 1.4284734889566939e-05, "loss": 0.0293, "num_input_tokens_seen": 4690784, "step": 52095 }, { "epoch": 13.53950103950104, "grad_norm": 2.0861265659332275, "learning_rate": 1.4279612743324499e-05, "loss": 0.5492, "num_input_tokens_seen": 4691232, "step": 52100 }, { "epoch": 13.540800415800415, "grad_norm": 2.2208824157714844, "learning_rate": 1.4274491148441844e-05, "loss": 0.3078, "num_input_tokens_seen": 4691680, "step": 52105 }, { "epoch": 13.542099792099792, "grad_norm": 0.7453904747962952, "learning_rate": 1.4269370105182378e-05, "loss": 0.3269, "num_input_tokens_seen": 4692144, "step": 52110 }, { "epoch": 13.54339916839917, "grad_norm": 0.3913913369178772, "learning_rate": 1.4264249613809493e-05, "loss": 0.1768, "num_input_tokens_seen": 4692624, "step": 52115 }, { "epoch": 13.544698544698544, "grad_norm": 29.297500610351562, "learning_rate": 1.4259129674586546e-05, "loss": 0.3085, "num_input_tokens_seen": 4693120, "step": 52120 }, { "epoch": 13.545997920997921, "grad_norm": 1.2973730564117432, "learning_rate": 1.4254010287776854e-05, "loss": 0.0298, "num_input_tokens_seen": 4693568, "step": 52125 }, { "epoch": 13.547297297297296, "grad_norm": 21.980031967163086, "learning_rate": 1.4248891453643698e-05, "loss": 0.2415, "num_input_tokens_seen": 4694016, "step": 52130 }, { "epoch": 13.548596673596673, "grad_norm": 5.673442363739014, "learning_rate": 1.4243773172450364e-05, "loss": 0.1028, "num_input_tokens_seen": 4694480, "step": 52135 }, { "epoch": 13.54989604989605, "grad_norm": 1.3661772012710571, "learning_rate": 1.4238655444460073e-05, "loss": 0.1936, "num_input_tokens_seen": 4694912, "step": 52140 }, { "epoch": 13.551195426195425, "grad_norm": 18.963214874267578, "learning_rate": 1.4233538269936042e-05, "loss": 0.2248, "num_input_tokens_seen": 4695360, "step": 52145 }, { "epoch": 13.552494802494802, "grad_norm": 0.8256154656410217, "learning_rate": 1.4228421649141455e-05, "loss": 0.0475, "num_input_tokens_seen": 4695824, "step": 52150 }, { "epoch": 13.55379417879418, "grad_norm": 18.703866958618164, "learning_rate": 1.4223305582339475e-05, "loss": 0.398, "num_input_tokens_seen": 4696288, "step": 52155 }, { "epoch": 13.555093555093555, "grad_norm": 0.05907633900642395, "learning_rate": 1.4218190069793202e-05, "loss": 0.3504, "num_input_tokens_seen": 4696704, "step": 52160 }, { "epoch": 13.556392931392931, "grad_norm": 5.231831073760986, "learning_rate": 1.4213075111765756e-05, "loss": 0.4796, "num_input_tokens_seen": 4697120, "step": 52165 }, { "epoch": 13.557692307692308, "grad_norm": 10.571781158447266, "learning_rate": 1.420796070852018e-05, "loss": 0.3889, "num_input_tokens_seen": 4697536, "step": 52170 }, { "epoch": 13.558991683991684, "grad_norm": 3.8823153972625732, "learning_rate": 1.4202846860319538e-05, "loss": 0.105, "num_input_tokens_seen": 4697968, "step": 52175 }, { "epoch": 13.56029106029106, "grad_norm": 9.018603324890137, "learning_rate": 1.4197733567426816e-05, "loss": 0.3319, "num_input_tokens_seen": 4698400, "step": 52180 }, { "epoch": 13.561590436590437, "grad_norm": 27.434661865234375, "learning_rate": 1.4192620830105007e-05, "loss": 0.1936, "num_input_tokens_seen": 4698896, "step": 52185 }, { "epoch": 13.562889812889813, "grad_norm": 0.4706110656261444, "learning_rate": 1.4187508648617064e-05, "loss": 0.1301, "num_input_tokens_seen": 4699344, "step": 52190 }, { "epoch": 13.56418918918919, "grad_norm": 2.0785207748413086, "learning_rate": 1.4182397023225921e-05, "loss": 0.3496, "num_input_tokens_seen": 4699792, "step": 52195 }, { "epoch": 13.565488565488565, "grad_norm": 10.371018409729004, "learning_rate": 1.4177285954194463e-05, "loss": 0.3623, "num_input_tokens_seen": 4700224, "step": 52200 }, { "epoch": 13.566787941787942, "grad_norm": 0.18357640504837036, "learning_rate": 1.417217544178555e-05, "loss": 0.0282, "num_input_tokens_seen": 4700672, "step": 52205 }, { "epoch": 13.568087318087318, "grad_norm": 0.11361929029226303, "learning_rate": 1.4167065486262038e-05, "loss": 0.3879, "num_input_tokens_seen": 4701104, "step": 52210 }, { "epoch": 13.569386694386694, "grad_norm": 17.051286697387695, "learning_rate": 1.4161956087886718e-05, "loss": 0.3464, "num_input_tokens_seen": 4701552, "step": 52215 }, { "epoch": 13.57068607068607, "grad_norm": 0.00900342594832182, "learning_rate": 1.4156847246922383e-05, "loss": 0.3209, "num_input_tokens_seen": 4702000, "step": 52220 }, { "epoch": 13.571985446985448, "grad_norm": 20.2995662689209, "learning_rate": 1.415173896363178e-05, "loss": 0.5573, "num_input_tokens_seen": 4702464, "step": 52225 }, { "epoch": 13.573284823284823, "grad_norm": 7.900815010070801, "learning_rate": 1.4146631238277647e-05, "loss": 0.2046, "num_input_tokens_seen": 4702896, "step": 52230 }, { "epoch": 13.5745841995842, "grad_norm": 7.82557487487793, "learning_rate": 1.4141524071122659e-05, "loss": 0.2368, "num_input_tokens_seen": 4703344, "step": 52235 }, { "epoch": 13.575883575883577, "grad_norm": 1.4122170209884644, "learning_rate": 1.4136417462429502e-05, "loss": 0.1112, "num_input_tokens_seen": 4703792, "step": 52240 }, { "epoch": 13.577182952182952, "grad_norm": 9.185344696044922, "learning_rate": 1.4131311412460796e-05, "loss": 0.0794, "num_input_tokens_seen": 4704224, "step": 52245 }, { "epoch": 13.578482328482329, "grad_norm": 0.3775964379310608, "learning_rate": 1.4126205921479167e-05, "loss": 0.0691, "num_input_tokens_seen": 4704736, "step": 52250 }, { "epoch": 13.579781704781706, "grad_norm": 0.29870736598968506, "learning_rate": 1.4121100989747166e-05, "loss": 0.1111, "num_input_tokens_seen": 4705200, "step": 52255 }, { "epoch": 13.58108108108108, "grad_norm": 14.133787155151367, "learning_rate": 1.4115996617527382e-05, "loss": 0.2667, "num_input_tokens_seen": 4705632, "step": 52260 }, { "epoch": 13.582380457380458, "grad_norm": 0.04007752984762192, "learning_rate": 1.4110892805082323e-05, "loss": 0.2878, "num_input_tokens_seen": 4706112, "step": 52265 }, { "epoch": 13.583679833679835, "grad_norm": 11.789910316467285, "learning_rate": 1.410578955267447e-05, "loss": 0.3143, "num_input_tokens_seen": 4706560, "step": 52270 }, { "epoch": 13.58497920997921, "grad_norm": 14.296757698059082, "learning_rate": 1.4100686860566308e-05, "loss": 0.5778, "num_input_tokens_seen": 4707024, "step": 52275 }, { "epoch": 13.586278586278587, "grad_norm": 17.563505172729492, "learning_rate": 1.4095584729020255e-05, "loss": 0.5414, "num_input_tokens_seen": 4707440, "step": 52280 }, { "epoch": 13.587577962577962, "grad_norm": 11.494728088378906, "learning_rate": 1.4090483158298728e-05, "loss": 0.2072, "num_input_tokens_seen": 4707904, "step": 52285 }, { "epoch": 13.588877338877339, "grad_norm": 1.3184947967529297, "learning_rate": 1.4085382148664106e-05, "loss": 0.1624, "num_input_tokens_seen": 4708336, "step": 52290 }, { "epoch": 13.590176715176716, "grad_norm": 0.18374831974506378, "learning_rate": 1.4080281700378745e-05, "loss": 0.1714, "num_input_tokens_seen": 4708784, "step": 52295 }, { "epoch": 13.59147609147609, "grad_norm": 19.09047508239746, "learning_rate": 1.4075181813704952e-05, "loss": 0.4505, "num_input_tokens_seen": 4709216, "step": 52300 }, { "epoch": 13.592775467775468, "grad_norm": 0.618793785572052, "learning_rate": 1.4070082488905034e-05, "loss": 0.1073, "num_input_tokens_seen": 4709696, "step": 52305 }, { "epoch": 13.594074844074845, "grad_norm": 12.485742568969727, "learning_rate": 1.4064983726241235e-05, "loss": 0.4099, "num_input_tokens_seen": 4710224, "step": 52310 }, { "epoch": 13.59537422037422, "grad_norm": 11.083415031433105, "learning_rate": 1.4059885525975813e-05, "loss": 0.1759, "num_input_tokens_seen": 4710688, "step": 52315 }, { "epoch": 13.596673596673597, "grad_norm": 14.50151538848877, "learning_rate": 1.405478788837095e-05, "loss": 0.487, "num_input_tokens_seen": 4711136, "step": 52320 }, { "epoch": 13.597972972972974, "grad_norm": 3.901331901550293, "learning_rate": 1.4049690813688831e-05, "loss": 0.3036, "num_input_tokens_seen": 4711584, "step": 52325 }, { "epoch": 13.599272349272349, "grad_norm": 3.5254697799682617, "learning_rate": 1.4044594302191616e-05, "loss": 0.3657, "num_input_tokens_seen": 4712048, "step": 52330 }, { "epoch": 13.600571725571726, "grad_norm": 0.6145187616348267, "learning_rate": 1.4039498354141407e-05, "loss": 0.209, "num_input_tokens_seen": 4712528, "step": 52335 }, { "epoch": 13.601871101871101, "grad_norm": 7.016651153564453, "learning_rate": 1.4034402969800303e-05, "loss": 0.0766, "num_input_tokens_seen": 4712960, "step": 52340 }, { "epoch": 13.603170478170478, "grad_norm": 1.3318936824798584, "learning_rate": 1.4029308149430356e-05, "loss": 0.1336, "num_input_tokens_seen": 4713424, "step": 52345 }, { "epoch": 13.604469854469855, "grad_norm": 11.646378517150879, "learning_rate": 1.4024213893293612e-05, "loss": 0.1547, "num_input_tokens_seen": 4713888, "step": 52350 }, { "epoch": 13.60576923076923, "grad_norm": 0.11259698122739792, "learning_rate": 1.4019120201652054e-05, "loss": 0.089, "num_input_tokens_seen": 4714368, "step": 52355 }, { "epoch": 13.607068607068607, "grad_norm": 0.362079381942749, "learning_rate": 1.4014027074767663e-05, "loss": 0.1837, "num_input_tokens_seen": 4714864, "step": 52360 }, { "epoch": 13.608367983367984, "grad_norm": 13.564332962036133, "learning_rate": 1.4008934512902389e-05, "loss": 0.3233, "num_input_tokens_seen": 4715328, "step": 52365 }, { "epoch": 13.609667359667359, "grad_norm": 5.207244873046875, "learning_rate": 1.400384251631815e-05, "loss": 0.4945, "num_input_tokens_seen": 4715792, "step": 52370 }, { "epoch": 13.610966735966736, "grad_norm": 0.7649351358413696, "learning_rate": 1.399875108527682e-05, "loss": 0.3292, "num_input_tokens_seen": 4716224, "step": 52375 }, { "epoch": 13.612266112266113, "grad_norm": 0.14234884083271027, "learning_rate": 1.3993660220040273e-05, "loss": 0.3987, "num_input_tokens_seen": 4716720, "step": 52380 }, { "epoch": 13.613565488565488, "grad_norm": 7.064451694488525, "learning_rate": 1.3988569920870314e-05, "loss": 0.2599, "num_input_tokens_seen": 4717168, "step": 52385 }, { "epoch": 13.614864864864865, "grad_norm": 0.903941810131073, "learning_rate": 1.3983480188028764e-05, "loss": 0.2408, "num_input_tokens_seen": 4717600, "step": 52390 }, { "epoch": 13.616164241164242, "grad_norm": 1.6342605352401733, "learning_rate": 1.3978391021777377e-05, "loss": 0.4348, "num_input_tokens_seen": 4718048, "step": 52395 }, { "epoch": 13.617463617463617, "grad_norm": 13.575972557067871, "learning_rate": 1.3973302422377898e-05, "loss": 0.1477, "num_input_tokens_seen": 4718464, "step": 52400 }, { "epoch": 13.618762993762994, "grad_norm": 9.225168228149414, "learning_rate": 1.396821439009205e-05, "loss": 0.1354, "num_input_tokens_seen": 4718928, "step": 52405 }, { "epoch": 13.62006237006237, "grad_norm": 11.408629417419434, "learning_rate": 1.3963126925181497e-05, "loss": 0.2567, "num_input_tokens_seen": 4719440, "step": 52410 }, { "epoch": 13.621361746361746, "grad_norm": 18.650020599365234, "learning_rate": 1.3958040027907911e-05, "loss": 0.2374, "num_input_tokens_seen": 4719920, "step": 52415 }, { "epoch": 13.622661122661123, "grad_norm": 0.24485759437084198, "learning_rate": 1.3952953698532898e-05, "loss": 0.1415, "num_input_tokens_seen": 4720368, "step": 52420 }, { "epoch": 13.6239604989605, "grad_norm": 11.721344947814941, "learning_rate": 1.3947867937318068e-05, "loss": 0.2254, "num_input_tokens_seen": 4720784, "step": 52425 }, { "epoch": 13.625259875259875, "grad_norm": 4.481156349182129, "learning_rate": 1.3942782744524973e-05, "loss": 0.4285, "num_input_tokens_seen": 4721264, "step": 52430 }, { "epoch": 13.626559251559252, "grad_norm": 2.6301844120025635, "learning_rate": 1.3937698120415154e-05, "loss": 0.1315, "num_input_tokens_seen": 4721696, "step": 52435 }, { "epoch": 13.627858627858627, "grad_norm": 1.0730705261230469, "learning_rate": 1.3932614065250122e-05, "loss": 0.1476, "num_input_tokens_seen": 4722144, "step": 52440 }, { "epoch": 13.629158004158004, "grad_norm": 19.807775497436523, "learning_rate": 1.3927530579291359e-05, "loss": 0.4416, "num_input_tokens_seen": 4722592, "step": 52445 }, { "epoch": 13.630457380457381, "grad_norm": 11.688297271728516, "learning_rate": 1.3922447662800297e-05, "loss": 0.4025, "num_input_tokens_seen": 4723024, "step": 52450 }, { "epoch": 13.631756756756756, "grad_norm": 1.7374552488327026, "learning_rate": 1.391736531603838e-05, "loss": 0.1329, "num_input_tokens_seen": 4723504, "step": 52455 }, { "epoch": 13.633056133056133, "grad_norm": 4.238446235656738, "learning_rate": 1.3912283539266983e-05, "loss": 0.2923, "num_input_tokens_seen": 4723984, "step": 52460 }, { "epoch": 13.63435550935551, "grad_norm": 1.8551549911499023, "learning_rate": 1.3907202332747454e-05, "loss": 0.2046, "num_input_tokens_seen": 4724448, "step": 52465 }, { "epoch": 13.635654885654885, "grad_norm": 10.067001342773438, "learning_rate": 1.3902121696741141e-05, "loss": 0.341, "num_input_tokens_seen": 4724912, "step": 52470 }, { "epoch": 13.636954261954262, "grad_norm": 0.13569821417331696, "learning_rate": 1.3897041631509342e-05, "loss": 0.1678, "num_input_tokens_seen": 4725360, "step": 52475 }, { "epoch": 13.638253638253639, "grad_norm": 11.362197875976562, "learning_rate": 1.389196213731334e-05, "loss": 0.3291, "num_input_tokens_seen": 4725808, "step": 52480 }, { "epoch": 13.639553014553014, "grad_norm": 11.171104431152344, "learning_rate": 1.3886883214414359e-05, "loss": 0.3449, "num_input_tokens_seen": 4726272, "step": 52485 }, { "epoch": 13.640852390852391, "grad_norm": 1.6482131481170654, "learning_rate": 1.3881804863073631e-05, "loss": 0.1719, "num_input_tokens_seen": 4726736, "step": 52490 }, { "epoch": 13.642151767151766, "grad_norm": 0.3879511058330536, "learning_rate": 1.387672708355232e-05, "loss": 0.0353, "num_input_tokens_seen": 4727216, "step": 52495 }, { "epoch": 13.643451143451143, "grad_norm": 0.6336351633071899, "learning_rate": 1.3871649876111609e-05, "loss": 0.117, "num_input_tokens_seen": 4727680, "step": 52500 }, { "epoch": 13.64475051975052, "grad_norm": 1.2736868858337402, "learning_rate": 1.3866573241012581e-05, "loss": 0.1662, "num_input_tokens_seen": 4728128, "step": 52505 }, { "epoch": 13.646049896049895, "grad_norm": 8.023920059204102, "learning_rate": 1.386149717851638e-05, "loss": 0.2545, "num_input_tokens_seen": 4728576, "step": 52510 }, { "epoch": 13.647349272349272, "grad_norm": 2.5655555725097656, "learning_rate": 1.3856421688884047e-05, "loss": 0.0936, "num_input_tokens_seen": 4729040, "step": 52515 }, { "epoch": 13.64864864864865, "grad_norm": 12.899923324584961, "learning_rate": 1.3851346772376627e-05, "loss": 0.3405, "num_input_tokens_seen": 4729520, "step": 52520 }, { "epoch": 13.649948024948024, "grad_norm": 10.40674877166748, "learning_rate": 1.384627242925513e-05, "loss": 0.1305, "num_input_tokens_seen": 4729984, "step": 52525 }, { "epoch": 13.651247401247401, "grad_norm": 11.447039604187012, "learning_rate": 1.3841198659780514e-05, "loss": 0.558, "num_input_tokens_seen": 4730432, "step": 52530 }, { "epoch": 13.652546777546778, "grad_norm": 11.175622940063477, "learning_rate": 1.3836125464213745e-05, "loss": 0.1444, "num_input_tokens_seen": 4730848, "step": 52535 }, { "epoch": 13.653846153846153, "grad_norm": 0.6577621698379517, "learning_rate": 1.3831052842815742e-05, "loss": 0.2088, "num_input_tokens_seen": 4731264, "step": 52540 }, { "epoch": 13.65514553014553, "grad_norm": 10.233386039733887, "learning_rate": 1.3825980795847402e-05, "loss": 0.2269, "num_input_tokens_seen": 4731744, "step": 52545 }, { "epoch": 13.656444906444907, "grad_norm": 8.348469734191895, "learning_rate": 1.3820909323569564e-05, "loss": 0.2981, "num_input_tokens_seen": 4732192, "step": 52550 }, { "epoch": 13.657744282744282, "grad_norm": 0.11561869829893112, "learning_rate": 1.3815838426243083e-05, "loss": 0.0802, "num_input_tokens_seen": 4732640, "step": 52555 }, { "epoch": 13.65904365904366, "grad_norm": 6.54907751083374, "learning_rate": 1.3810768104128736e-05, "loss": 0.3483, "num_input_tokens_seen": 4733072, "step": 52560 }, { "epoch": 13.660343035343036, "grad_norm": 0.3293496072292328, "learning_rate": 1.3805698357487318e-05, "loss": 0.3613, "num_input_tokens_seen": 4733504, "step": 52565 }, { "epoch": 13.661642411642411, "grad_norm": 0.703010618686676, "learning_rate": 1.380062918657955e-05, "loss": 0.065, "num_input_tokens_seen": 4734000, "step": 52570 }, { "epoch": 13.662941787941788, "grad_norm": 2.4975547790527344, "learning_rate": 1.3795560591666152e-05, "loss": 0.0866, "num_input_tokens_seen": 4734464, "step": 52575 }, { "epoch": 13.664241164241163, "grad_norm": 2.663520574569702, "learning_rate": 1.3790492573007807e-05, "loss": 0.0697, "num_input_tokens_seen": 4734928, "step": 52580 }, { "epoch": 13.66554054054054, "grad_norm": 9.005982398986816, "learning_rate": 1.3785425130865181e-05, "loss": 0.0716, "num_input_tokens_seen": 4735360, "step": 52585 }, { "epoch": 13.666839916839917, "grad_norm": 13.394957542419434, "learning_rate": 1.3780358265498889e-05, "loss": 0.2885, "num_input_tokens_seen": 4735824, "step": 52590 }, { "epoch": 13.668139293139292, "grad_norm": 4.832545757293701, "learning_rate": 1.3775291977169507e-05, "loss": 0.2912, "num_input_tokens_seen": 4736272, "step": 52595 }, { "epoch": 13.66943866943867, "grad_norm": 4.24161958694458, "learning_rate": 1.3770226266137625e-05, "loss": 0.0633, "num_input_tokens_seen": 4736736, "step": 52600 }, { "epoch": 13.670738045738046, "grad_norm": 0.4066423177719116, "learning_rate": 1.3765161132663757e-05, "loss": 0.0358, "num_input_tokens_seen": 4737152, "step": 52605 }, { "epoch": 13.672037422037421, "grad_norm": 21.02167320251465, "learning_rate": 1.3760096577008413e-05, "loss": 0.3144, "num_input_tokens_seen": 4737632, "step": 52610 }, { "epoch": 13.673336798336798, "grad_norm": 3.501842737197876, "learning_rate": 1.3755032599432075e-05, "loss": 0.1472, "num_input_tokens_seen": 4738112, "step": 52615 }, { "epoch": 13.674636174636175, "grad_norm": 21.155122756958008, "learning_rate": 1.3749969200195196e-05, "loss": 0.3118, "num_input_tokens_seen": 4738560, "step": 52620 }, { "epoch": 13.67593555093555, "grad_norm": 1.0801136493682861, "learning_rate": 1.3744906379558165e-05, "loss": 0.1504, "num_input_tokens_seen": 4739040, "step": 52625 }, { "epoch": 13.677234927234927, "grad_norm": 11.637313842773438, "learning_rate": 1.3739844137781399e-05, "loss": 0.0678, "num_input_tokens_seen": 4739520, "step": 52630 }, { "epoch": 13.678534303534304, "grad_norm": 15.72216510772705, "learning_rate": 1.3734782475125224e-05, "loss": 0.2065, "num_input_tokens_seen": 4739952, "step": 52635 }, { "epoch": 13.67983367983368, "grad_norm": 0.0867815911769867, "learning_rate": 1.3729721391849992e-05, "loss": 0.1602, "num_input_tokens_seen": 4740416, "step": 52640 }, { "epoch": 13.681133056133056, "grad_norm": 3.776313304901123, "learning_rate": 1.3724660888215975e-05, "loss": 0.0759, "num_input_tokens_seen": 4740864, "step": 52645 }, { "epoch": 13.682432432432432, "grad_norm": 13.312806129455566, "learning_rate": 1.3719600964483455e-05, "loss": 0.1655, "num_input_tokens_seen": 4741328, "step": 52650 }, { "epoch": 13.683731808731808, "grad_norm": 10.672804832458496, "learning_rate": 1.3714541620912675e-05, "loss": 0.6833, "num_input_tokens_seen": 4741776, "step": 52655 }, { "epoch": 13.685031185031185, "grad_norm": 0.6709992289543152, "learning_rate": 1.3709482857763822e-05, "loss": 0.1342, "num_input_tokens_seen": 4742208, "step": 52660 }, { "epoch": 13.68633056133056, "grad_norm": 0.14191190898418427, "learning_rate": 1.3704424675297095e-05, "loss": 0.316, "num_input_tokens_seen": 4742640, "step": 52665 }, { "epoch": 13.687629937629938, "grad_norm": 2.307485342025757, "learning_rate": 1.3699367073772617e-05, "loss": 0.0692, "num_input_tokens_seen": 4743136, "step": 52670 }, { "epoch": 13.688929313929314, "grad_norm": 2.0278074741363525, "learning_rate": 1.3694310053450531e-05, "loss": 0.2243, "num_input_tokens_seen": 4743600, "step": 52675 }, { "epoch": 13.69022869022869, "grad_norm": 2.880845546722412, "learning_rate": 1.3689253614590902e-05, "loss": 0.127, "num_input_tokens_seen": 4744048, "step": 52680 }, { "epoch": 13.691528066528067, "grad_norm": 1.168555736541748, "learning_rate": 1.3684197757453796e-05, "loss": 0.0868, "num_input_tokens_seen": 4744496, "step": 52685 }, { "epoch": 13.692827442827443, "grad_norm": 0.0388190932571888, "learning_rate": 1.367914248229924e-05, "loss": 0.3478, "num_input_tokens_seen": 4744960, "step": 52690 }, { "epoch": 13.694126819126819, "grad_norm": 5.072676658630371, "learning_rate": 1.3674087789387247e-05, "loss": 0.0175, "num_input_tokens_seen": 4745440, "step": 52695 }, { "epoch": 13.695426195426196, "grad_norm": 7.467373371124268, "learning_rate": 1.3669033678977756e-05, "loss": 0.3063, "num_input_tokens_seen": 4745920, "step": 52700 }, { "epoch": 13.696725571725572, "grad_norm": 0.07602746039628983, "learning_rate": 1.3663980151330732e-05, "loss": 0.1307, "num_input_tokens_seen": 4746352, "step": 52705 }, { "epoch": 13.698024948024948, "grad_norm": 1.599044919013977, "learning_rate": 1.3658927206706063e-05, "loss": 0.2233, "num_input_tokens_seen": 4746816, "step": 52710 }, { "epoch": 13.699324324324325, "grad_norm": 18.06871223449707, "learning_rate": 1.365387484536364e-05, "loss": 0.4055, "num_input_tokens_seen": 4747232, "step": 52715 }, { "epoch": 13.700623700623701, "grad_norm": 28.009258270263672, "learning_rate": 1.3648823067563297e-05, "loss": 0.2956, "num_input_tokens_seen": 4747680, "step": 52720 }, { "epoch": 13.701923076923077, "grad_norm": 3.0399420261383057, "learning_rate": 1.3643771873564859e-05, "loss": 0.5475, "num_input_tokens_seen": 4748160, "step": 52725 }, { "epoch": 13.703222453222454, "grad_norm": 26.64459800720215, "learning_rate": 1.3638721263628123e-05, "loss": 0.5592, "num_input_tokens_seen": 4748624, "step": 52730 }, { "epoch": 13.704521829521829, "grad_norm": 9.676706314086914, "learning_rate": 1.3633671238012835e-05, "loss": 0.399, "num_input_tokens_seen": 4749072, "step": 52735 }, { "epoch": 13.705821205821206, "grad_norm": 16.009218215942383, "learning_rate": 1.362862179697873e-05, "loss": 0.4388, "num_input_tokens_seen": 4749552, "step": 52740 }, { "epoch": 13.707120582120583, "grad_norm": 16.547821044921875, "learning_rate": 1.3623572940785493e-05, "loss": 0.1655, "num_input_tokens_seen": 4749968, "step": 52745 }, { "epoch": 13.708419958419958, "grad_norm": 16.886371612548828, "learning_rate": 1.3618524669692811e-05, "loss": 0.1066, "num_input_tokens_seen": 4750464, "step": 52750 }, { "epoch": 13.709719334719335, "grad_norm": 11.125360488891602, "learning_rate": 1.3613476983960287e-05, "loss": 0.0838, "num_input_tokens_seen": 4750912, "step": 52755 }, { "epoch": 13.711018711018712, "grad_norm": 0.7221141457557678, "learning_rate": 1.3608429883847573e-05, "loss": 0.3492, "num_input_tokens_seen": 4751344, "step": 52760 }, { "epoch": 13.712318087318087, "grad_norm": 0.800791323184967, "learning_rate": 1.360338336961422e-05, "loss": 0.3782, "num_input_tokens_seen": 4751776, "step": 52765 }, { "epoch": 13.713617463617464, "grad_norm": 0.23532360792160034, "learning_rate": 1.3598337441519784e-05, "loss": 0.2872, "num_input_tokens_seen": 4752224, "step": 52770 }, { "epoch": 13.71491683991684, "grad_norm": 2.33847975730896, "learning_rate": 1.359329209982377e-05, "loss": 0.3021, "num_input_tokens_seen": 4752672, "step": 52775 }, { "epoch": 13.716216216216216, "grad_norm": 0.9643897414207458, "learning_rate": 1.3588247344785681e-05, "loss": 0.6732, "num_input_tokens_seen": 4753120, "step": 52780 }, { "epoch": 13.717515592515593, "grad_norm": 5.432588577270508, "learning_rate": 1.3583203176664961e-05, "loss": 0.0995, "num_input_tokens_seen": 4753568, "step": 52785 }, { "epoch": 13.71881496881497, "grad_norm": 1.174800992012024, "learning_rate": 1.3578159595721034e-05, "loss": 0.038, "num_input_tokens_seen": 4754016, "step": 52790 }, { "epoch": 13.720114345114345, "grad_norm": 17.099870681762695, "learning_rate": 1.3573116602213315e-05, "loss": 0.5797, "num_input_tokens_seen": 4754512, "step": 52795 }, { "epoch": 13.721413721413722, "grad_norm": 1.501495361328125, "learning_rate": 1.3568074196401149e-05, "loss": 0.2954, "num_input_tokens_seen": 4754944, "step": 52800 }, { "epoch": 13.722713097713097, "grad_norm": 4.925976276397705, "learning_rate": 1.356303237854389e-05, "loss": 0.1001, "num_input_tokens_seen": 4755392, "step": 52805 }, { "epoch": 13.724012474012474, "grad_norm": 0.29801005125045776, "learning_rate": 1.3557991148900825e-05, "loss": 0.1912, "num_input_tokens_seen": 4755840, "step": 52810 }, { "epoch": 13.72531185031185, "grad_norm": 12.733633995056152, "learning_rate": 1.355295050773125e-05, "loss": 0.5899, "num_input_tokens_seen": 4756256, "step": 52815 }, { "epoch": 13.726611226611226, "grad_norm": 0.664030134677887, "learning_rate": 1.3547910455294382e-05, "loss": 0.1593, "num_input_tokens_seen": 4756704, "step": 52820 }, { "epoch": 13.727910602910603, "grad_norm": 7.049683570861816, "learning_rate": 1.354287099184946e-05, "loss": 0.0418, "num_input_tokens_seen": 4757200, "step": 52825 }, { "epoch": 13.72920997920998, "grad_norm": 11.950615882873535, "learning_rate": 1.3537832117655655e-05, "loss": 0.0818, "num_input_tokens_seen": 4757664, "step": 52830 }, { "epoch": 13.730509355509355, "grad_norm": 6.875884532928467, "learning_rate": 1.3532793832972137e-05, "loss": 0.227, "num_input_tokens_seen": 4758128, "step": 52835 }, { "epoch": 13.731808731808732, "grad_norm": 2.4900684356689453, "learning_rate": 1.3527756138058013e-05, "loss": 0.2482, "num_input_tokens_seen": 4758560, "step": 52840 }, { "epoch": 13.733108108108109, "grad_norm": 0.34845224022865295, "learning_rate": 1.352271903317239e-05, "loss": 0.0531, "num_input_tokens_seen": 4759008, "step": 52845 }, { "epoch": 13.734407484407484, "grad_norm": 13.81051254272461, "learning_rate": 1.3517682518574326e-05, "loss": 0.4497, "num_input_tokens_seen": 4759440, "step": 52850 }, { "epoch": 13.73570686070686, "grad_norm": 8.410978317260742, "learning_rate": 1.351264659452284e-05, "loss": 0.7613, "num_input_tokens_seen": 4759872, "step": 52855 }, { "epoch": 13.737006237006238, "grad_norm": 0.0533515103161335, "learning_rate": 1.3507611261276948e-05, "loss": 0.0116, "num_input_tokens_seen": 4760304, "step": 52860 }, { "epoch": 13.738305613305613, "grad_norm": 0.19183999300003052, "learning_rate": 1.350257651909562e-05, "loss": 0.1403, "num_input_tokens_seen": 4760752, "step": 52865 }, { "epoch": 13.73960498960499, "grad_norm": 5.391443252563477, "learning_rate": 1.3497542368237808e-05, "loss": 0.1918, "num_input_tokens_seen": 4761184, "step": 52870 }, { "epoch": 13.740904365904367, "grad_norm": 15.402195930480957, "learning_rate": 1.3492508808962401e-05, "loss": 0.1914, "num_input_tokens_seen": 4761632, "step": 52875 }, { "epoch": 13.742203742203742, "grad_norm": 0.3192811906337738, "learning_rate": 1.3487475841528301e-05, "loss": 0.2162, "num_input_tokens_seen": 4762048, "step": 52880 }, { "epoch": 13.743503118503119, "grad_norm": 3.995706558227539, "learning_rate": 1.3482443466194345e-05, "loss": 0.1636, "num_input_tokens_seen": 4762480, "step": 52885 }, { "epoch": 13.744802494802494, "grad_norm": 13.829010009765625, "learning_rate": 1.3477411683219365e-05, "loss": 0.6725, "num_input_tokens_seen": 4762896, "step": 52890 }, { "epoch": 13.746101871101871, "grad_norm": 9.2711181640625, "learning_rate": 1.347238049286213e-05, "loss": 0.0698, "num_input_tokens_seen": 4763360, "step": 52895 }, { "epoch": 13.747401247401248, "grad_norm": 7.2813401222229, "learning_rate": 1.3467349895381415e-05, "loss": 0.0784, "num_input_tokens_seen": 4763808, "step": 52900 }, { "epoch": 13.748700623700623, "grad_norm": 0.07978258281946182, "learning_rate": 1.346231989103594e-05, "loss": 0.1639, "num_input_tokens_seen": 4764272, "step": 52905 }, { "epoch": 13.75, "grad_norm": 0.3487256169319153, "learning_rate": 1.3457290480084422e-05, "loss": 0.1933, "num_input_tokens_seen": 4764672, "step": 52910 }, { "epoch": 13.751299376299377, "grad_norm": 2.3329429626464844, "learning_rate": 1.3452261662785515e-05, "loss": 0.3303, "num_input_tokens_seen": 4765120, "step": 52915 }, { "epoch": 13.752598752598752, "grad_norm": 13.157050132751465, "learning_rate": 1.344723343939784e-05, "loss": 0.1245, "num_input_tokens_seen": 4765648, "step": 52920 }, { "epoch": 13.753898128898129, "grad_norm": 5.681169509887695, "learning_rate": 1.3442205810180031e-05, "loss": 0.4923, "num_input_tokens_seen": 4766064, "step": 52925 }, { "epoch": 13.755197505197506, "grad_norm": 0.47516268491744995, "learning_rate": 1.3437178775390644e-05, "loss": 0.2452, "num_input_tokens_seen": 4766512, "step": 52930 }, { "epoch": 13.756496881496881, "grad_norm": 5.104580402374268, "learning_rate": 1.3432152335288228e-05, "loss": 0.0419, "num_input_tokens_seen": 4766944, "step": 52935 }, { "epoch": 13.757796257796258, "grad_norm": 6.517757415771484, "learning_rate": 1.3427126490131304e-05, "loss": 0.4242, "num_input_tokens_seen": 4767376, "step": 52940 }, { "epoch": 13.759095634095633, "grad_norm": 9.277844429016113, "learning_rate": 1.3422101240178365e-05, "loss": 0.4456, "num_input_tokens_seen": 4767792, "step": 52945 }, { "epoch": 13.76039501039501, "grad_norm": 1.039742112159729, "learning_rate": 1.3417076585687838e-05, "loss": 0.3953, "num_input_tokens_seen": 4768240, "step": 52950 }, { "epoch": 13.761694386694387, "grad_norm": 9.000893592834473, "learning_rate": 1.3412052526918173e-05, "loss": 0.4187, "num_input_tokens_seen": 4768704, "step": 52955 }, { "epoch": 13.762993762993762, "grad_norm": 5.573294639587402, "learning_rate": 1.3407029064127734e-05, "loss": 0.4239, "num_input_tokens_seen": 4769152, "step": 52960 }, { "epoch": 13.76429313929314, "grad_norm": 0.14120151102542877, "learning_rate": 1.3402006197574915e-05, "loss": 0.2471, "num_input_tokens_seen": 4769616, "step": 52965 }, { "epoch": 13.765592515592516, "grad_norm": 2.2017698287963867, "learning_rate": 1.339698392751802e-05, "loss": 0.1871, "num_input_tokens_seen": 4770096, "step": 52970 }, { "epoch": 13.766891891891891, "grad_norm": 9.919014930725098, "learning_rate": 1.3391962254215356e-05, "loss": 0.1812, "num_input_tokens_seen": 4770576, "step": 52975 }, { "epoch": 13.768191268191268, "grad_norm": 7.767270565032959, "learning_rate": 1.3386941177925206e-05, "loss": 0.1864, "num_input_tokens_seen": 4771008, "step": 52980 }, { "epoch": 13.769490644490645, "grad_norm": 4.787631511688232, "learning_rate": 1.3381920698905787e-05, "loss": 0.2268, "num_input_tokens_seen": 4771408, "step": 52985 }, { "epoch": 13.77079002079002, "grad_norm": 1.1015464067459106, "learning_rate": 1.337690081741533e-05, "loss": 0.054, "num_input_tokens_seen": 4771872, "step": 52990 }, { "epoch": 13.772089397089397, "grad_norm": 2.470757246017456, "learning_rate": 1.3371881533711992e-05, "loss": 0.2657, "num_input_tokens_seen": 4772336, "step": 52995 }, { "epoch": 13.773388773388774, "grad_norm": 0.5118758678436279, "learning_rate": 1.3366862848053935e-05, "loss": 0.2351, "num_input_tokens_seen": 4772784, "step": 53000 }, { "epoch": 13.77468814968815, "grad_norm": 9.529434204101562, "learning_rate": 1.3361844760699251e-05, "loss": 0.3764, "num_input_tokens_seen": 4773232, "step": 53005 }, { "epoch": 13.775987525987526, "grad_norm": 3.4979240894317627, "learning_rate": 1.3356827271906061e-05, "loss": 0.2117, "num_input_tokens_seen": 4773680, "step": 53010 }, { "epoch": 13.777286902286903, "grad_norm": 1.0556690692901611, "learning_rate": 1.335181038193239e-05, "loss": 0.0882, "num_input_tokens_seen": 4774144, "step": 53015 }, { "epoch": 13.778586278586278, "grad_norm": 2.6431689262390137, "learning_rate": 1.3346794091036283e-05, "loss": 0.2034, "num_input_tokens_seen": 4774608, "step": 53020 }, { "epoch": 13.779885654885655, "grad_norm": 0.9806501269340515, "learning_rate": 1.3341778399475713e-05, "loss": 0.0998, "num_input_tokens_seen": 4775056, "step": 53025 }, { "epoch": 13.78118503118503, "grad_norm": 0.3829987943172455, "learning_rate": 1.3336763307508653e-05, "loss": 0.2692, "num_input_tokens_seen": 4775488, "step": 53030 }, { "epoch": 13.782484407484407, "grad_norm": 0.5980957746505737, "learning_rate": 1.333174881539303e-05, "loss": 0.1502, "num_input_tokens_seen": 4775904, "step": 53035 }, { "epoch": 13.783783783783784, "grad_norm": 0.25631242990493774, "learning_rate": 1.332673492338674e-05, "loss": 0.3718, "num_input_tokens_seen": 4776336, "step": 53040 }, { "epoch": 13.78508316008316, "grad_norm": 8.711069107055664, "learning_rate": 1.332172163174767e-05, "loss": 0.2189, "num_input_tokens_seen": 4776768, "step": 53045 }, { "epoch": 13.786382536382536, "grad_norm": 3.859797239303589, "learning_rate": 1.3316708940733635e-05, "loss": 0.1136, "num_input_tokens_seen": 4777200, "step": 53050 }, { "epoch": 13.787681912681913, "grad_norm": 17.418012619018555, "learning_rate": 1.3311696850602465e-05, "loss": 0.1923, "num_input_tokens_seen": 4777664, "step": 53055 }, { "epoch": 13.788981288981288, "grad_norm": 0.042353592813014984, "learning_rate": 1.3306685361611914e-05, "loss": 0.1385, "num_input_tokens_seen": 4778080, "step": 53060 }, { "epoch": 13.790280665280665, "grad_norm": 12.728282928466797, "learning_rate": 1.3301674474019748e-05, "loss": 0.3431, "num_input_tokens_seen": 4778528, "step": 53065 }, { "epoch": 13.791580041580042, "grad_norm": 0.05492416396737099, "learning_rate": 1.3296664188083663e-05, "loss": 0.318, "num_input_tokens_seen": 4778960, "step": 53070 }, { "epoch": 13.792879417879417, "grad_norm": 0.5426123738288879, "learning_rate": 1.3291654504061354e-05, "loss": 0.0813, "num_input_tokens_seen": 4779392, "step": 53075 }, { "epoch": 13.794178794178794, "grad_norm": 0.6597748398780823, "learning_rate": 1.328664542221047e-05, "loss": 0.2626, "num_input_tokens_seen": 4779824, "step": 53080 }, { "epoch": 13.795478170478171, "grad_norm": 0.23312285542488098, "learning_rate": 1.3281636942788647e-05, "loss": 0.2085, "num_input_tokens_seen": 4780288, "step": 53085 }, { "epoch": 13.796777546777546, "grad_norm": 4.438170433044434, "learning_rate": 1.3276629066053454e-05, "loss": 0.1205, "num_input_tokens_seen": 4780736, "step": 53090 }, { "epoch": 13.798076923076923, "grad_norm": 12.934732437133789, "learning_rate": 1.327162179226247e-05, "loss": 0.2069, "num_input_tokens_seen": 4781216, "step": 53095 }, { "epoch": 13.799376299376299, "grad_norm": 7.927661895751953, "learning_rate": 1.3266615121673203e-05, "loss": 0.5716, "num_input_tokens_seen": 4781680, "step": 53100 }, { "epoch": 13.800675675675675, "grad_norm": 8.205816268920898, "learning_rate": 1.3261609054543179e-05, "loss": 0.0288, "num_input_tokens_seen": 4782128, "step": 53105 }, { "epoch": 13.801975051975052, "grad_norm": 0.04504163935780525, "learning_rate": 1.3256603591129833e-05, "loss": 0.1943, "num_input_tokens_seen": 4782592, "step": 53110 }, { "epoch": 13.803274428274428, "grad_norm": 2.6133501529693604, "learning_rate": 1.3251598731690617e-05, "loss": 0.3645, "num_input_tokens_seen": 4783024, "step": 53115 }, { "epoch": 13.804573804573804, "grad_norm": 4.340958595275879, "learning_rate": 1.3246594476482946e-05, "loss": 0.097, "num_input_tokens_seen": 4783456, "step": 53120 }, { "epoch": 13.805873180873181, "grad_norm": 0.7687946557998657, "learning_rate": 1.3241590825764172e-05, "loss": 0.1336, "num_input_tokens_seen": 4783872, "step": 53125 }, { "epoch": 13.807172557172557, "grad_norm": 2.5974698066711426, "learning_rate": 1.323658777979166e-05, "loss": 0.3155, "num_input_tokens_seen": 4784320, "step": 53130 }, { "epoch": 13.808471933471933, "grad_norm": 1.9650324583053589, "learning_rate": 1.3231585338822697e-05, "loss": 0.1721, "num_input_tokens_seen": 4784752, "step": 53135 }, { "epoch": 13.80977130977131, "grad_norm": 1.7322444915771484, "learning_rate": 1.3226583503114587e-05, "loss": 0.1086, "num_input_tokens_seen": 4785168, "step": 53140 }, { "epoch": 13.811070686070686, "grad_norm": 0.6657150983810425, "learning_rate": 1.3221582272924557e-05, "loss": 0.0145, "num_input_tokens_seen": 4785600, "step": 53145 }, { "epoch": 13.812370062370062, "grad_norm": 0.33412620425224304, "learning_rate": 1.3216581648509837e-05, "loss": 0.2077, "num_input_tokens_seen": 4786016, "step": 53150 }, { "epoch": 13.81366943866944, "grad_norm": 0.2840758264064789, "learning_rate": 1.3211581630127615e-05, "loss": 0.2828, "num_input_tokens_seen": 4786480, "step": 53155 }, { "epoch": 13.814968814968815, "grad_norm": 0.477679580450058, "learning_rate": 1.3206582218035051e-05, "loss": 0.0074, "num_input_tokens_seen": 4786960, "step": 53160 }, { "epoch": 13.816268191268192, "grad_norm": 6.024200916290283, "learning_rate": 1.320158341248926e-05, "loss": 0.3723, "num_input_tokens_seen": 4787376, "step": 53165 }, { "epoch": 13.817567567567568, "grad_norm": 20.757293701171875, "learning_rate": 1.3196585213747342e-05, "loss": 0.2391, "num_input_tokens_seen": 4787856, "step": 53170 }, { "epoch": 13.818866943866944, "grad_norm": 0.5730659365653992, "learning_rate": 1.3191587622066359e-05, "loss": 0.09, "num_input_tokens_seen": 4788272, "step": 53175 }, { "epoch": 13.82016632016632, "grad_norm": 7.942440032958984, "learning_rate": 1.3186590637703328e-05, "loss": 0.0611, "num_input_tokens_seen": 4788720, "step": 53180 }, { "epoch": 13.821465696465696, "grad_norm": 0.22512440383434296, "learning_rate": 1.3181594260915262e-05, "loss": 0.3498, "num_input_tokens_seen": 4789152, "step": 53185 }, { "epoch": 13.822765072765073, "grad_norm": 0.29358088970184326, "learning_rate": 1.3176598491959124e-05, "loss": 0.0274, "num_input_tokens_seen": 4789568, "step": 53190 }, { "epoch": 13.82406444906445, "grad_norm": 0.10076837241649628, "learning_rate": 1.3171603331091865e-05, "loss": 0.1183, "num_input_tokens_seen": 4790000, "step": 53195 }, { "epoch": 13.825363825363825, "grad_norm": 0.17356735467910767, "learning_rate": 1.3166608778570371e-05, "loss": 0.0884, "num_input_tokens_seen": 4790480, "step": 53200 }, { "epoch": 13.826663201663202, "grad_norm": 5.721500396728516, "learning_rate": 1.3161614834651537e-05, "loss": 0.4252, "num_input_tokens_seen": 4790960, "step": 53205 }, { "epoch": 13.827962577962579, "grad_norm": 0.11696165800094604, "learning_rate": 1.3156621499592182e-05, "loss": 0.3525, "num_input_tokens_seen": 4791408, "step": 53210 }, { "epoch": 13.829261954261954, "grad_norm": 10.594573974609375, "learning_rate": 1.3151628773649141e-05, "loss": 0.1404, "num_input_tokens_seen": 4791856, "step": 53215 }, { "epoch": 13.83056133056133, "grad_norm": 9.951388359069824, "learning_rate": 1.3146636657079175e-05, "loss": 0.2195, "num_input_tokens_seen": 4792272, "step": 53220 }, { "epoch": 13.831860706860708, "grad_norm": 1.5779085159301758, "learning_rate": 1.3141645150139046e-05, "loss": 0.2205, "num_input_tokens_seen": 4792736, "step": 53225 }, { "epoch": 13.833160083160083, "grad_norm": 15.662858963012695, "learning_rate": 1.3136654253085467e-05, "loss": 0.4068, "num_input_tokens_seen": 4793200, "step": 53230 }, { "epoch": 13.83445945945946, "grad_norm": 0.6115345358848572, "learning_rate": 1.3131663966175137e-05, "loss": 0.3749, "num_input_tokens_seen": 4793664, "step": 53235 }, { "epoch": 13.835758835758837, "grad_norm": 5.5614190101623535, "learning_rate": 1.31266742896647e-05, "loss": 0.1778, "num_input_tokens_seen": 4794128, "step": 53240 }, { "epoch": 13.837058212058212, "grad_norm": 0.2563339173793793, "learning_rate": 1.312168522381077e-05, "loss": 0.2801, "num_input_tokens_seen": 4794608, "step": 53245 }, { "epoch": 13.838357588357589, "grad_norm": 11.642390251159668, "learning_rate": 1.3116696768869963e-05, "loss": 0.2448, "num_input_tokens_seen": 4795072, "step": 53250 }, { "epoch": 13.839656964656964, "grad_norm": 9.61794376373291, "learning_rate": 1.3111708925098806e-05, "loss": 0.4083, "num_input_tokens_seen": 4795520, "step": 53255 }, { "epoch": 13.84095634095634, "grad_norm": 2.4422621726989746, "learning_rate": 1.310672169275387e-05, "loss": 0.2641, "num_input_tokens_seen": 4796000, "step": 53260 }, { "epoch": 13.842255717255718, "grad_norm": 12.218127250671387, "learning_rate": 1.3101735072091622e-05, "loss": 0.4384, "num_input_tokens_seen": 4796432, "step": 53265 }, { "epoch": 13.843555093555093, "grad_norm": 9.869086265563965, "learning_rate": 1.3096749063368553e-05, "loss": 0.1143, "num_input_tokens_seen": 4796880, "step": 53270 }, { "epoch": 13.84485446985447, "grad_norm": 0.3528210520744324, "learning_rate": 1.309176366684107e-05, "loss": 0.1104, "num_input_tokens_seen": 4797360, "step": 53275 }, { "epoch": 13.846153846153847, "grad_norm": 2.735196352005005, "learning_rate": 1.3086778882765604e-05, "loss": 0.1528, "num_input_tokens_seen": 4797792, "step": 53280 }, { "epoch": 13.847453222453222, "grad_norm": 5.475113868713379, "learning_rate": 1.3081794711398504e-05, "loss": 0.2211, "num_input_tokens_seen": 4798256, "step": 53285 }, { "epoch": 13.848752598752599, "grad_norm": 7.042942047119141, "learning_rate": 1.307681115299612e-05, "loss": 0.23, "num_input_tokens_seen": 4798704, "step": 53290 }, { "epoch": 13.850051975051976, "grad_norm": 2.0671281814575195, "learning_rate": 1.3071828207814766e-05, "loss": 0.2919, "num_input_tokens_seen": 4799136, "step": 53295 }, { "epoch": 13.85135135135135, "grad_norm": 0.3213714361190796, "learning_rate": 1.3066845876110722e-05, "loss": 0.1776, "num_input_tokens_seen": 4799584, "step": 53300 }, { "epoch": 13.852650727650728, "grad_norm": 13.674277305603027, "learning_rate": 1.3061864158140232e-05, "loss": 0.2083, "num_input_tokens_seen": 4800000, "step": 53305 }, { "epoch": 13.853950103950105, "grad_norm": 0.094607874751091, "learning_rate": 1.3056883054159497e-05, "loss": 0.0964, "num_input_tokens_seen": 4800416, "step": 53310 }, { "epoch": 13.85524948024948, "grad_norm": 0.25441399216651917, "learning_rate": 1.3051902564424717e-05, "loss": 0.4256, "num_input_tokens_seen": 4800848, "step": 53315 }, { "epoch": 13.856548856548857, "grad_norm": 5.156585693359375, "learning_rate": 1.304692268919203e-05, "loss": 0.3455, "num_input_tokens_seen": 4801296, "step": 53320 }, { "epoch": 13.857848232848234, "grad_norm": 0.23073658347129822, "learning_rate": 1.304194342871756e-05, "loss": 0.3109, "num_input_tokens_seen": 4801744, "step": 53325 }, { "epoch": 13.859147609147609, "grad_norm": 0.15583539009094238, "learning_rate": 1.30369647832574e-05, "loss": 0.4947, "num_input_tokens_seen": 4802176, "step": 53330 }, { "epoch": 13.860446985446986, "grad_norm": 0.3052133619785309, "learning_rate": 1.303198675306761e-05, "loss": 0.1245, "num_input_tokens_seen": 4802640, "step": 53335 }, { "epoch": 13.861746361746361, "grad_norm": 0.03366424888372421, "learning_rate": 1.3027009338404198e-05, "loss": 0.525, "num_input_tokens_seen": 4803056, "step": 53340 }, { "epoch": 13.863045738045738, "grad_norm": 10.12376594543457, "learning_rate": 1.3022032539523176e-05, "loss": 0.2122, "num_input_tokens_seen": 4803488, "step": 53345 }, { "epoch": 13.864345114345115, "grad_norm": 8.734222412109375, "learning_rate": 1.3017056356680493e-05, "loss": 0.6402, "num_input_tokens_seen": 4803952, "step": 53350 }, { "epoch": 13.86564449064449, "grad_norm": 6.386616230010986, "learning_rate": 1.3012080790132092e-05, "loss": 0.3664, "num_input_tokens_seen": 4804384, "step": 53355 }, { "epoch": 13.866943866943867, "grad_norm": 9.866192817687988, "learning_rate": 1.3007105840133849e-05, "loss": 0.1251, "num_input_tokens_seen": 4804832, "step": 53360 }, { "epoch": 13.868243243243244, "grad_norm": 2.5010201930999756, "learning_rate": 1.3002131506941645e-05, "loss": 0.2818, "num_input_tokens_seen": 4805296, "step": 53365 }, { "epoch": 13.869542619542619, "grad_norm": 0.6231958270072937, "learning_rate": 1.2997157790811324e-05, "loss": 0.2371, "num_input_tokens_seen": 4805744, "step": 53370 }, { "epoch": 13.870841995841996, "grad_norm": 5.443539142608643, "learning_rate": 1.2992184691998666e-05, "loss": 0.1616, "num_input_tokens_seen": 4806208, "step": 53375 }, { "epoch": 13.872141372141373, "grad_norm": 8.844234466552734, "learning_rate": 1.298721221075947e-05, "loss": 0.3213, "num_input_tokens_seen": 4806656, "step": 53380 }, { "epoch": 13.873440748440748, "grad_norm": 0.9527819156646729, "learning_rate": 1.2982240347349445e-05, "loss": 0.0789, "num_input_tokens_seen": 4807120, "step": 53385 }, { "epoch": 13.874740124740125, "grad_norm": 7.379827499389648, "learning_rate": 1.2977269102024326e-05, "loss": 0.114, "num_input_tokens_seen": 4807600, "step": 53390 }, { "epoch": 13.8760395010395, "grad_norm": 0.15729202330112457, "learning_rate": 1.2972298475039766e-05, "loss": 0.2054, "num_input_tokens_seen": 4808080, "step": 53395 }, { "epoch": 13.877338877338877, "grad_norm": 0.11899679154157639, "learning_rate": 1.2967328466651418e-05, "loss": 0.0506, "num_input_tokens_seen": 4808512, "step": 53400 }, { "epoch": 13.878638253638254, "grad_norm": 6.551629543304443, "learning_rate": 1.29623590771149e-05, "loss": 0.2114, "num_input_tokens_seen": 4808960, "step": 53405 }, { "epoch": 13.87993762993763, "grad_norm": 0.7789214849472046, "learning_rate": 1.2957390306685797e-05, "loss": 0.2353, "num_input_tokens_seen": 4809360, "step": 53410 }, { "epoch": 13.881237006237006, "grad_norm": 0.2646240293979645, "learning_rate": 1.2952422155619637e-05, "loss": 0.1494, "num_input_tokens_seen": 4809824, "step": 53415 }, { "epoch": 13.882536382536383, "grad_norm": 0.1459793597459793, "learning_rate": 1.2947454624171962e-05, "loss": 0.0174, "num_input_tokens_seen": 4810304, "step": 53420 }, { "epoch": 13.883835758835758, "grad_norm": 10.908282279968262, "learning_rate": 1.2942487712598234e-05, "loss": 0.1689, "num_input_tokens_seen": 4810736, "step": 53425 }, { "epoch": 13.885135135135135, "grad_norm": 1.4134371280670166, "learning_rate": 1.2937521421153926e-05, "loss": 0.0168, "num_input_tokens_seen": 4811216, "step": 53430 }, { "epoch": 13.886434511434512, "grad_norm": 1.3231751918792725, "learning_rate": 1.2932555750094435e-05, "loss": 0.0376, "num_input_tokens_seen": 4811664, "step": 53435 }, { "epoch": 13.887733887733887, "grad_norm": 5.618932723999023, "learning_rate": 1.2927590699675168e-05, "loss": 0.2831, "num_input_tokens_seen": 4812144, "step": 53440 }, { "epoch": 13.889033264033264, "grad_norm": 3.5512733459472656, "learning_rate": 1.2922626270151485e-05, "loss": 0.1566, "num_input_tokens_seen": 4812576, "step": 53445 }, { "epoch": 13.890332640332641, "grad_norm": 10.543766021728516, "learning_rate": 1.2917662461778698e-05, "loss": 0.4319, "num_input_tokens_seen": 4813056, "step": 53450 }, { "epoch": 13.891632016632016, "grad_norm": 8.800945281982422, "learning_rate": 1.2912699274812118e-05, "loss": 0.2167, "num_input_tokens_seen": 4813520, "step": 53455 }, { "epoch": 13.892931392931393, "grad_norm": 6.025529384613037, "learning_rate": 1.2907736709506984e-05, "loss": 0.1225, "num_input_tokens_seen": 4813936, "step": 53460 }, { "epoch": 13.89423076923077, "grad_norm": 18.33930778503418, "learning_rate": 1.2902774766118548e-05, "loss": 0.424, "num_input_tokens_seen": 4814384, "step": 53465 }, { "epoch": 13.895530145530145, "grad_norm": 7.747756481170654, "learning_rate": 1.2897813444901985e-05, "loss": 0.2002, "num_input_tokens_seen": 4814848, "step": 53470 }, { "epoch": 13.896829521829522, "grad_norm": 0.1415410339832306, "learning_rate": 1.2892852746112477e-05, "loss": 0.2375, "num_input_tokens_seen": 4815344, "step": 53475 }, { "epoch": 13.898128898128899, "grad_norm": 3.76212739944458, "learning_rate": 1.2887892670005147e-05, "loss": 0.1237, "num_input_tokens_seen": 4815808, "step": 53480 }, { "epoch": 13.899428274428274, "grad_norm": 1.8170301914215088, "learning_rate": 1.2882933216835117e-05, "loss": 0.0654, "num_input_tokens_seen": 4816272, "step": 53485 }, { "epoch": 13.900727650727651, "grad_norm": 9.259392738342285, "learning_rate": 1.2877974386857428e-05, "loss": 0.2549, "num_input_tokens_seen": 4816704, "step": 53490 }, { "epoch": 13.902027027027026, "grad_norm": 16.812881469726562, "learning_rate": 1.287301618032714e-05, "loss": 0.2526, "num_input_tokens_seen": 4817168, "step": 53495 }, { "epoch": 13.903326403326403, "grad_norm": 0.5904555320739746, "learning_rate": 1.2868058597499255e-05, "loss": 0.3014, "num_input_tokens_seen": 4817616, "step": 53500 }, { "epoch": 13.90462577962578, "grad_norm": 16.411956787109375, "learning_rate": 1.2863101638628717e-05, "loss": 0.4029, "num_input_tokens_seen": 4818064, "step": 53505 }, { "epoch": 13.905925155925155, "grad_norm": 17.234210968017578, "learning_rate": 1.2858145303970512e-05, "loss": 0.1952, "num_input_tokens_seen": 4818496, "step": 53510 }, { "epoch": 13.907224532224532, "grad_norm": 1.2503485679626465, "learning_rate": 1.285318959377952e-05, "loss": 0.245, "num_input_tokens_seen": 4818944, "step": 53515 }, { "epoch": 13.90852390852391, "grad_norm": 0.6113728284835815, "learning_rate": 1.2848234508310631e-05, "loss": 0.2798, "num_input_tokens_seen": 4819424, "step": 53520 }, { "epoch": 13.909823284823284, "grad_norm": 4.942399501800537, "learning_rate": 1.2843280047818674e-05, "loss": 0.1865, "num_input_tokens_seen": 4819888, "step": 53525 }, { "epoch": 13.911122661122661, "grad_norm": 2.9651193618774414, "learning_rate": 1.2838326212558485e-05, "loss": 0.3713, "num_input_tokens_seen": 4820384, "step": 53530 }, { "epoch": 13.912422037422038, "grad_norm": 0.751743733882904, "learning_rate": 1.2833373002784822e-05, "loss": 0.1395, "num_input_tokens_seen": 4820848, "step": 53535 }, { "epoch": 13.913721413721413, "grad_norm": 1.8974041938781738, "learning_rate": 1.282842041875244e-05, "loss": 0.3247, "num_input_tokens_seen": 4821280, "step": 53540 }, { "epoch": 13.91502079002079, "grad_norm": 11.280304908752441, "learning_rate": 1.2823468460716063e-05, "loss": 0.1146, "num_input_tokens_seen": 4821728, "step": 53545 }, { "epoch": 13.916320166320165, "grad_norm": 0.0291227288544178, "learning_rate": 1.2818517128930374e-05, "loss": 0.3454, "num_input_tokens_seen": 4822208, "step": 53550 }, { "epoch": 13.917619542619542, "grad_norm": 0.7815704345703125, "learning_rate": 1.2813566423650014e-05, "loss": 0.1375, "num_input_tokens_seen": 4822656, "step": 53555 }, { "epoch": 13.91891891891892, "grad_norm": 0.9731854200363159, "learning_rate": 1.280861634512962e-05, "loss": 0.3578, "num_input_tokens_seen": 4823072, "step": 53560 }, { "epoch": 13.920218295218294, "grad_norm": 6.488871097564697, "learning_rate": 1.2803666893623767e-05, "loss": 0.425, "num_input_tokens_seen": 4823504, "step": 53565 }, { "epoch": 13.921517671517671, "grad_norm": 0.029798777773976326, "learning_rate": 1.2798718069387005e-05, "loss": 0.2723, "num_input_tokens_seen": 4823968, "step": 53570 }, { "epoch": 13.922817047817048, "grad_norm": 0.17447465658187866, "learning_rate": 1.279376987267386e-05, "loss": 0.3169, "num_input_tokens_seen": 4824416, "step": 53575 }, { "epoch": 13.924116424116423, "grad_norm": 13.688570976257324, "learning_rate": 1.2788822303738829e-05, "loss": 0.3768, "num_input_tokens_seen": 4824896, "step": 53580 }, { "epoch": 13.9254158004158, "grad_norm": 6.688417911529541, "learning_rate": 1.2783875362836373e-05, "loss": 0.2192, "num_input_tokens_seen": 4825360, "step": 53585 }, { "epoch": 13.926715176715177, "grad_norm": 10.2160005569458, "learning_rate": 1.2778929050220905e-05, "loss": 0.5854, "num_input_tokens_seen": 4825824, "step": 53590 }, { "epoch": 13.928014553014552, "grad_norm": 1.3154045343399048, "learning_rate": 1.2773983366146832e-05, "loss": 0.2095, "num_input_tokens_seen": 4826240, "step": 53595 }, { "epoch": 13.92931392931393, "grad_norm": 0.9864652156829834, "learning_rate": 1.2769038310868503e-05, "loss": 0.0802, "num_input_tokens_seen": 4826704, "step": 53600 }, { "epoch": 13.930613305613306, "grad_norm": 7.381170749664307, "learning_rate": 1.2764093884640261e-05, "loss": 0.2067, "num_input_tokens_seen": 4827136, "step": 53605 }, { "epoch": 13.931912681912682, "grad_norm": 1.918634295463562, "learning_rate": 1.2759150087716384e-05, "loss": 0.1587, "num_input_tokens_seen": 4827616, "step": 53610 }, { "epoch": 13.933212058212058, "grad_norm": 0.2319008857011795, "learning_rate": 1.2754206920351147e-05, "loss": 0.2918, "num_input_tokens_seen": 4828080, "step": 53615 }, { "epoch": 13.934511434511435, "grad_norm": 1.5677694082260132, "learning_rate": 1.2749264382798782e-05, "loss": 0.2461, "num_input_tokens_seen": 4828512, "step": 53620 }, { "epoch": 13.93581081081081, "grad_norm": 2.630768299102783, "learning_rate": 1.2744322475313497e-05, "loss": 0.2353, "num_input_tokens_seen": 4828960, "step": 53625 }, { "epoch": 13.937110187110187, "grad_norm": 1.4961894750595093, "learning_rate": 1.2739381198149453e-05, "loss": 0.3194, "num_input_tokens_seen": 4829472, "step": 53630 }, { "epoch": 13.938409563409563, "grad_norm": 13.196043968200684, "learning_rate": 1.2734440551560766e-05, "loss": 0.3524, "num_input_tokens_seen": 4829920, "step": 53635 }, { "epoch": 13.93970893970894, "grad_norm": 9.114178657531738, "learning_rate": 1.2729500535801569e-05, "loss": 0.221, "num_input_tokens_seen": 4830352, "step": 53640 }, { "epoch": 13.941008316008316, "grad_norm": 9.596476554870605, "learning_rate": 1.2724561151125902e-05, "loss": 0.1734, "num_input_tokens_seen": 4830768, "step": 53645 }, { "epoch": 13.942307692307692, "grad_norm": 8.374959945678711, "learning_rate": 1.271962239778782e-05, "loss": 0.3678, "num_input_tokens_seen": 4831216, "step": 53650 }, { "epoch": 13.943607068607069, "grad_norm": 0.4300598204135895, "learning_rate": 1.2714684276041322e-05, "loss": 0.1702, "num_input_tokens_seen": 4831648, "step": 53655 }, { "epoch": 13.944906444906445, "grad_norm": 0.25540199875831604, "learning_rate": 1.2709746786140398e-05, "loss": 0.2588, "num_input_tokens_seen": 4832080, "step": 53660 }, { "epoch": 13.94620582120582, "grad_norm": 0.24555878341197968, "learning_rate": 1.2704809928338956e-05, "loss": 0.1726, "num_input_tokens_seen": 4832576, "step": 53665 }, { "epoch": 13.947505197505198, "grad_norm": 1.0161482095718384, "learning_rate": 1.2699873702890936e-05, "loss": 0.225, "num_input_tokens_seen": 4833008, "step": 53670 }, { "epoch": 13.948804573804575, "grad_norm": 10.723180770874023, "learning_rate": 1.2694938110050181e-05, "loss": 0.2133, "num_input_tokens_seen": 4833440, "step": 53675 }, { "epoch": 13.95010395010395, "grad_norm": 2.296191692352295, "learning_rate": 1.2690003150070562e-05, "loss": 0.0997, "num_input_tokens_seen": 4833856, "step": 53680 }, { "epoch": 13.951403326403327, "grad_norm": 0.0964231789112091, "learning_rate": 1.2685068823205867e-05, "loss": 0.0485, "num_input_tokens_seen": 4834336, "step": 53685 }, { "epoch": 13.952702702702704, "grad_norm": 10.340241432189941, "learning_rate": 1.268013512970988e-05, "loss": 0.1882, "num_input_tokens_seen": 4834784, "step": 53690 }, { "epoch": 13.954002079002079, "grad_norm": 11.883986473083496, "learning_rate": 1.2675202069836361e-05, "loss": 0.2542, "num_input_tokens_seen": 4835216, "step": 53695 }, { "epoch": 13.955301455301456, "grad_norm": 2.308993101119995, "learning_rate": 1.2670269643838994e-05, "loss": 0.1034, "num_input_tokens_seen": 4835680, "step": 53700 }, { "epoch": 13.95660083160083, "grad_norm": 3.916356325149536, "learning_rate": 1.2665337851971487e-05, "loss": 0.1561, "num_input_tokens_seen": 4836112, "step": 53705 }, { "epoch": 13.957900207900208, "grad_norm": 11.02954387664795, "learning_rate": 1.2660406694487461e-05, "loss": 0.2781, "num_input_tokens_seen": 4836576, "step": 53710 }, { "epoch": 13.959199584199585, "grad_norm": 10.413544654846191, "learning_rate": 1.2655476171640551e-05, "loss": 0.3758, "num_input_tokens_seen": 4837008, "step": 53715 }, { "epoch": 13.96049896049896, "grad_norm": 0.19102388620376587, "learning_rate": 1.2650546283684322e-05, "loss": 0.2313, "num_input_tokens_seen": 4837472, "step": 53720 }, { "epoch": 13.961798336798337, "grad_norm": 0.03616499900817871, "learning_rate": 1.2645617030872328e-05, "loss": 0.3444, "num_input_tokens_seen": 4837888, "step": 53725 }, { "epoch": 13.963097713097714, "grad_norm": 1.8386187553405762, "learning_rate": 1.264068841345809e-05, "loss": 0.1373, "num_input_tokens_seen": 4838352, "step": 53730 }, { "epoch": 13.964397089397089, "grad_norm": 5.102797031402588, "learning_rate": 1.2635760431695098e-05, "loss": 0.2464, "num_input_tokens_seen": 4838784, "step": 53735 }, { "epoch": 13.965696465696466, "grad_norm": 0.7665557265281677, "learning_rate": 1.263083308583678e-05, "loss": 0.0433, "num_input_tokens_seen": 4839200, "step": 53740 }, { "epoch": 13.966995841995843, "grad_norm": 0.4889848530292511, "learning_rate": 1.2625906376136581e-05, "loss": 0.3958, "num_input_tokens_seen": 4839632, "step": 53745 }, { "epoch": 13.968295218295218, "grad_norm": 0.3020705580711365, "learning_rate": 1.2620980302847865e-05, "loss": 0.1842, "num_input_tokens_seen": 4840064, "step": 53750 }, { "epoch": 13.969594594594595, "grad_norm": 0.5222442150115967, "learning_rate": 1.2616054866223992e-05, "loss": 0.1005, "num_input_tokens_seen": 4840512, "step": 53755 }, { "epoch": 13.970893970893972, "grad_norm": 5.187674045562744, "learning_rate": 1.2611130066518293e-05, "loss": 0.1667, "num_input_tokens_seen": 4840976, "step": 53760 }, { "epoch": 13.972193347193347, "grad_norm": 7.677196979522705, "learning_rate": 1.2606205903984034e-05, "loss": 0.1512, "num_input_tokens_seen": 4841424, "step": 53765 }, { "epoch": 13.973492723492724, "grad_norm": 1.6007717847824097, "learning_rate": 1.260128237887449e-05, "loss": 0.0242, "num_input_tokens_seen": 4841856, "step": 53770 }, { "epoch": 13.9747920997921, "grad_norm": 9.288082122802734, "learning_rate": 1.2596359491442866e-05, "loss": 0.4485, "num_input_tokens_seen": 4842368, "step": 53775 }, { "epoch": 13.976091476091476, "grad_norm": 12.805609703063965, "learning_rate": 1.2591437241942367e-05, "loss": 0.2662, "num_input_tokens_seen": 4842784, "step": 53780 }, { "epoch": 13.977390852390853, "grad_norm": 5.158845901489258, "learning_rate": 1.2586515630626127e-05, "loss": 0.1712, "num_input_tokens_seen": 4843248, "step": 53785 }, { "epoch": 13.978690228690228, "grad_norm": 0.9079203009605408, "learning_rate": 1.2581594657747286e-05, "loss": 0.2249, "num_input_tokens_seen": 4843696, "step": 53790 }, { "epoch": 13.979989604989605, "grad_norm": 4.706315040588379, "learning_rate": 1.2576674323558929e-05, "loss": 0.2211, "num_input_tokens_seen": 4844144, "step": 53795 }, { "epoch": 13.981288981288982, "grad_norm": 14.335803031921387, "learning_rate": 1.2571754628314122e-05, "loss": 0.3639, "num_input_tokens_seen": 4844576, "step": 53800 }, { "epoch": 13.982588357588357, "grad_norm": 0.16965734958648682, "learning_rate": 1.2566835572265878e-05, "loss": 0.2579, "num_input_tokens_seen": 4845040, "step": 53805 }, { "epoch": 13.983887733887734, "grad_norm": 17.269460678100586, "learning_rate": 1.2561917155667199e-05, "loss": 0.6379, "num_input_tokens_seen": 4845472, "step": 53810 }, { "epoch": 13.98518711018711, "grad_norm": 9.784969329833984, "learning_rate": 1.255699937877103e-05, "loss": 0.5323, "num_input_tokens_seen": 4845888, "step": 53815 }, { "epoch": 13.986486486486486, "grad_norm": 0.42703312635421753, "learning_rate": 1.2552082241830316e-05, "loss": 0.0547, "num_input_tokens_seen": 4846320, "step": 53820 }, { "epoch": 13.987785862785863, "grad_norm": 0.9035311341285706, "learning_rate": 1.2547165745097928e-05, "loss": 0.3538, "num_input_tokens_seen": 4846816, "step": 53825 }, { "epoch": 13.98908523908524, "grad_norm": 3.669931173324585, "learning_rate": 1.2542249888826741e-05, "loss": 0.0431, "num_input_tokens_seen": 4847216, "step": 53830 }, { "epoch": 13.990384615384615, "grad_norm": 1.2863720655441284, "learning_rate": 1.2537334673269585e-05, "loss": 0.0813, "num_input_tokens_seen": 4847648, "step": 53835 }, { "epoch": 13.991683991683992, "grad_norm": 12.843267440795898, "learning_rate": 1.2532420098679237e-05, "loss": 0.2704, "num_input_tokens_seen": 4848096, "step": 53840 }, { "epoch": 13.992983367983367, "grad_norm": 2.1280136108398438, "learning_rate": 1.2527506165308481e-05, "loss": 0.4442, "num_input_tokens_seen": 4848528, "step": 53845 }, { "epoch": 13.994282744282744, "grad_norm": 1.632760763168335, "learning_rate": 1.2522592873410022e-05, "loss": 0.5379, "num_input_tokens_seen": 4848992, "step": 53850 }, { "epoch": 13.995582120582121, "grad_norm": 11.225719451904297, "learning_rate": 1.2517680223236578e-05, "loss": 0.5045, "num_input_tokens_seen": 4849392, "step": 53855 }, { "epoch": 13.996881496881496, "grad_norm": 7.937734127044678, "learning_rate": 1.2512768215040793e-05, "loss": 0.438, "num_input_tokens_seen": 4849840, "step": 53860 }, { "epoch": 13.998180873180873, "grad_norm": 0.3242778182029724, "learning_rate": 1.25078568490753e-05, "loss": 0.3572, "num_input_tokens_seen": 4850272, "step": 53865 }, { "epoch": 13.99948024948025, "grad_norm": 0.7632684707641602, "learning_rate": 1.2502946125592698e-05, "loss": 0.1854, "num_input_tokens_seen": 4850752, "step": 53870 }, { "epoch": 14.0, "eval_loss": 0.38002052903175354, "eval_runtime": 13.2065, "eval_samples_per_second": 64.817, "eval_steps_per_second": 32.408, "num_input_tokens_seen": 4850888, "step": 53872 }, { "epoch": 14.000779625779625, "grad_norm": 16.84703254699707, "learning_rate": 1.2498036044845563e-05, "loss": 0.3353, "num_input_tokens_seen": 4851176, "step": 53875 }, { "epoch": 14.002079002079002, "grad_norm": 0.3920392692089081, "learning_rate": 1.2493126607086403e-05, "loss": 0.0467, "num_input_tokens_seen": 4851576, "step": 53880 }, { "epoch": 14.003378378378379, "grad_norm": 0.3032093644142151, "learning_rate": 1.2488217812567732e-05, "loss": 0.1147, "num_input_tokens_seen": 4852024, "step": 53885 }, { "epoch": 14.004677754677754, "grad_norm": 0.1412956863641739, "learning_rate": 1.2483309661542001e-05, "loss": 0.3185, "num_input_tokens_seen": 4852520, "step": 53890 }, { "epoch": 14.005977130977131, "grad_norm": 3.5100901126861572, "learning_rate": 1.2478402154261653e-05, "loss": 0.128, "num_input_tokens_seen": 4852968, "step": 53895 }, { "epoch": 14.007276507276508, "grad_norm": 12.240508079528809, "learning_rate": 1.2473495290979073e-05, "loss": 0.1943, "num_input_tokens_seen": 4853384, "step": 53900 }, { "epoch": 14.008575883575883, "grad_norm": 5.187325477600098, "learning_rate": 1.2468589071946632e-05, "loss": 0.1906, "num_input_tokens_seen": 4853848, "step": 53905 }, { "epoch": 14.00987525987526, "grad_norm": 0.271566778421402, "learning_rate": 1.2463683497416672e-05, "loss": 0.1348, "num_input_tokens_seen": 4854296, "step": 53910 }, { "epoch": 14.011174636174637, "grad_norm": 17.25682258605957, "learning_rate": 1.245877856764147e-05, "loss": 0.2186, "num_input_tokens_seen": 4854712, "step": 53915 }, { "epoch": 14.012474012474012, "grad_norm": 7.995852947235107, "learning_rate": 1.2453874282873316e-05, "loss": 0.2042, "num_input_tokens_seen": 4855160, "step": 53920 }, { "epoch": 14.013773388773389, "grad_norm": 0.5127782225608826, "learning_rate": 1.2448970643364416e-05, "loss": 0.1079, "num_input_tokens_seen": 4855640, "step": 53925 }, { "epoch": 14.015072765072764, "grad_norm": 5.4026360511779785, "learning_rate": 1.2444067649366992e-05, "loss": 0.272, "num_input_tokens_seen": 4856088, "step": 53930 }, { "epoch": 14.016372141372141, "grad_norm": 0.4016958177089691, "learning_rate": 1.243916530113319e-05, "loss": 0.0849, "num_input_tokens_seen": 4856536, "step": 53935 }, { "epoch": 14.017671517671518, "grad_norm": 0.17622551321983337, "learning_rate": 1.2434263598915153e-05, "loss": 0.2413, "num_input_tokens_seen": 4856968, "step": 53940 }, { "epoch": 14.018970893970893, "grad_norm": 0.608180046081543, "learning_rate": 1.2429362542964976e-05, "loss": 0.1245, "num_input_tokens_seen": 4857400, "step": 53945 }, { "epoch": 14.02027027027027, "grad_norm": 0.32890546321868896, "learning_rate": 1.242446213353474e-05, "loss": 0.1912, "num_input_tokens_seen": 4857848, "step": 53950 }, { "epoch": 14.021569646569647, "grad_norm": 0.04947935789823532, "learning_rate": 1.2419562370876456e-05, "loss": 0.156, "num_input_tokens_seen": 4858312, "step": 53955 }, { "epoch": 14.022869022869022, "grad_norm": 1.434708595275879, "learning_rate": 1.241466325524214e-05, "loss": 0.2786, "num_input_tokens_seen": 4858760, "step": 53960 }, { "epoch": 14.0241683991684, "grad_norm": 1.6946367025375366, "learning_rate": 1.2409764786883754e-05, "loss": 0.0379, "num_input_tokens_seen": 4859208, "step": 53965 }, { "epoch": 14.025467775467776, "grad_norm": 0.09860727190971375, "learning_rate": 1.2404866966053218e-05, "loss": 0.3601, "num_input_tokens_seen": 4859640, "step": 53970 }, { "epoch": 14.026767151767151, "grad_norm": 13.133456230163574, "learning_rate": 1.2399969793002445e-05, "loss": 0.117, "num_input_tokens_seen": 4860136, "step": 53975 }, { "epoch": 14.028066528066528, "grad_norm": 0.252862811088562, "learning_rate": 1.2395073267983295e-05, "loss": 0.1646, "num_input_tokens_seen": 4860536, "step": 53980 }, { "epoch": 14.029365904365905, "grad_norm": 0.04104495793581009, "learning_rate": 1.2390177391247614e-05, "loss": 0.0751, "num_input_tokens_seen": 4861000, "step": 53985 }, { "epoch": 14.03066528066528, "grad_norm": 29.953697204589844, "learning_rate": 1.2385282163047182e-05, "loss": 0.3616, "num_input_tokens_seen": 4861464, "step": 53990 }, { "epoch": 14.031964656964657, "grad_norm": 0.05687200650572777, "learning_rate": 1.2380387583633787e-05, "loss": 0.0934, "num_input_tokens_seen": 4861912, "step": 53995 }, { "epoch": 14.033264033264032, "grad_norm": 0.389658659696579, "learning_rate": 1.237549365325914e-05, "loss": 0.0813, "num_input_tokens_seen": 4862376, "step": 54000 }, { "epoch": 14.03456340956341, "grad_norm": 0.091789111495018, "learning_rate": 1.2370600372174949e-05, "loss": 0.2254, "num_input_tokens_seen": 4862824, "step": 54005 }, { "epoch": 14.035862785862786, "grad_norm": 15.301214218139648, "learning_rate": 1.2365707740632882e-05, "loss": 0.1288, "num_input_tokens_seen": 4863288, "step": 54010 }, { "epoch": 14.037162162162161, "grad_norm": 1.335709810256958, "learning_rate": 1.2360815758884583e-05, "loss": 0.2121, "num_input_tokens_seen": 4863720, "step": 54015 }, { "epoch": 14.038461538461538, "grad_norm": 5.755972385406494, "learning_rate": 1.2355924427181628e-05, "loss": 0.3636, "num_input_tokens_seen": 4864184, "step": 54020 }, { "epoch": 14.039760914760915, "grad_norm": 1.7846146821975708, "learning_rate": 1.2351033745775606e-05, "loss": 0.1657, "num_input_tokens_seen": 4864680, "step": 54025 }, { "epoch": 14.04106029106029, "grad_norm": 15.03701114654541, "learning_rate": 1.2346143714918038e-05, "loss": 0.1428, "num_input_tokens_seen": 4865208, "step": 54030 }, { "epoch": 14.042359667359667, "grad_norm": 4.956134796142578, "learning_rate": 1.234125433486041e-05, "loss": 0.3363, "num_input_tokens_seen": 4865608, "step": 54035 }, { "epoch": 14.043659043659044, "grad_norm": 1.3525049686431885, "learning_rate": 1.2336365605854205e-05, "loss": 0.2118, "num_input_tokens_seen": 4866040, "step": 54040 }, { "epoch": 14.04495841995842, "grad_norm": 32.49235916137695, "learning_rate": 1.2331477528150851e-05, "loss": 0.2227, "num_input_tokens_seen": 4866456, "step": 54045 }, { "epoch": 14.046257796257796, "grad_norm": 13.714117050170898, "learning_rate": 1.2326590102001753e-05, "loss": 0.1541, "num_input_tokens_seen": 4866920, "step": 54050 }, { "epoch": 14.047557172557173, "grad_norm": 14.32756233215332, "learning_rate": 1.232170332765826e-05, "loss": 0.3082, "num_input_tokens_seen": 4867368, "step": 54055 }, { "epoch": 14.048856548856548, "grad_norm": 22.260700225830078, "learning_rate": 1.2316817205371722e-05, "loss": 0.0639, "num_input_tokens_seen": 4867816, "step": 54060 }, { "epoch": 14.050155925155925, "grad_norm": 0.21281127631664276, "learning_rate": 1.2311931735393417e-05, "loss": 0.0308, "num_input_tokens_seen": 4868264, "step": 54065 }, { "epoch": 14.051455301455302, "grad_norm": 14.484063148498535, "learning_rate": 1.2307046917974632e-05, "loss": 0.1428, "num_input_tokens_seen": 4868680, "step": 54070 }, { "epoch": 14.052754677754677, "grad_norm": 1.5885674953460693, "learning_rate": 1.2302162753366575e-05, "loss": 0.3188, "num_input_tokens_seen": 4869112, "step": 54075 }, { "epoch": 14.054054054054054, "grad_norm": 0.013826809823513031, "learning_rate": 1.2297279241820455e-05, "loss": 0.2976, "num_input_tokens_seen": 4869560, "step": 54080 }, { "epoch": 14.05535343035343, "grad_norm": 0.04746311530470848, "learning_rate": 1.2292396383587434e-05, "loss": 0.002, "num_input_tokens_seen": 4870008, "step": 54085 }, { "epoch": 14.056652806652806, "grad_norm": 0.5594767928123474, "learning_rate": 1.2287514178918651e-05, "loss": 0.0775, "num_input_tokens_seen": 4870424, "step": 54090 }, { "epoch": 14.057952182952183, "grad_norm": 1.4027200937271118, "learning_rate": 1.2282632628065197e-05, "loss": 0.0047, "num_input_tokens_seen": 4870872, "step": 54095 }, { "epoch": 14.059251559251559, "grad_norm": 0.7620280981063843, "learning_rate": 1.2277751731278123e-05, "loss": 0.4385, "num_input_tokens_seen": 4871352, "step": 54100 }, { "epoch": 14.060550935550935, "grad_norm": 0.9343066811561584, "learning_rate": 1.2272871488808473e-05, "loss": 0.3143, "num_input_tokens_seen": 4871800, "step": 54105 }, { "epoch": 14.061850311850312, "grad_norm": 11.395575523376465, "learning_rate": 1.226799190090723e-05, "loss": 0.3432, "num_input_tokens_seen": 4872264, "step": 54110 }, { "epoch": 14.063149688149688, "grad_norm": 7.287204742431641, "learning_rate": 1.2263112967825364e-05, "loss": 0.1115, "num_input_tokens_seen": 4872680, "step": 54115 }, { "epoch": 14.064449064449065, "grad_norm": 22.23944664001465, "learning_rate": 1.22582346898138e-05, "loss": 0.0722, "num_input_tokens_seen": 4873160, "step": 54120 }, { "epoch": 14.065748440748441, "grad_norm": 0.17138153314590454, "learning_rate": 1.2253357067123444e-05, "loss": 0.1309, "num_input_tokens_seen": 4873576, "step": 54125 }, { "epoch": 14.067047817047817, "grad_norm": 7.405505657196045, "learning_rate": 1.2248480100005138e-05, "loss": 0.3798, "num_input_tokens_seen": 4874040, "step": 54130 }, { "epoch": 14.068347193347194, "grad_norm": 19.838972091674805, "learning_rate": 1.2243603788709725e-05, "loss": 0.5191, "num_input_tokens_seen": 4874472, "step": 54135 }, { "epoch": 14.06964656964657, "grad_norm": 6.162128448486328, "learning_rate": 1.2238728133487986e-05, "loss": 0.2301, "num_input_tokens_seen": 4874920, "step": 54140 }, { "epoch": 14.070945945945946, "grad_norm": 17.89653968811035, "learning_rate": 1.2233853134590697e-05, "loss": 0.0979, "num_input_tokens_seen": 4875384, "step": 54145 }, { "epoch": 14.072245322245323, "grad_norm": 1.5046840906143188, "learning_rate": 1.2228978792268561e-05, "loss": 0.0917, "num_input_tokens_seen": 4875800, "step": 54150 }, { "epoch": 14.073544698544698, "grad_norm": 16.443296432495117, "learning_rate": 1.2224105106772282e-05, "loss": 0.373, "num_input_tokens_seen": 4876248, "step": 54155 }, { "epoch": 14.074844074844075, "grad_norm": 10.117279052734375, "learning_rate": 1.221923207835253e-05, "loss": 0.1949, "num_input_tokens_seen": 4876712, "step": 54160 }, { "epoch": 14.076143451143452, "grad_norm": 1.0617334842681885, "learning_rate": 1.221435970725991e-05, "loss": 0.1749, "num_input_tokens_seen": 4877160, "step": 54165 }, { "epoch": 14.077442827442827, "grad_norm": 0.6579989194869995, "learning_rate": 1.2209487993745025e-05, "loss": 0.0194, "num_input_tokens_seen": 4877560, "step": 54170 }, { "epoch": 14.078742203742204, "grad_norm": 8.317298889160156, "learning_rate": 1.2204616938058422e-05, "loss": 0.3709, "num_input_tokens_seen": 4877976, "step": 54175 }, { "epoch": 14.08004158004158, "grad_norm": 0.016742710024118423, "learning_rate": 1.2199746540450641e-05, "loss": 0.0126, "num_input_tokens_seen": 4878408, "step": 54180 }, { "epoch": 14.081340956340956, "grad_norm": 19.98126983642578, "learning_rate": 1.2194876801172148e-05, "loss": 0.4351, "num_input_tokens_seen": 4878824, "step": 54185 }, { "epoch": 14.082640332640333, "grad_norm": 11.788148880004883, "learning_rate": 1.2190007720473409e-05, "loss": 0.3283, "num_input_tokens_seen": 4879288, "step": 54190 }, { "epoch": 14.08393970893971, "grad_norm": 0.7803711891174316, "learning_rate": 1.218513929860485e-05, "loss": 0.1149, "num_input_tokens_seen": 4879736, "step": 54195 }, { "epoch": 14.085239085239085, "grad_norm": 0.631798267364502, "learning_rate": 1.2180271535816862e-05, "loss": 0.3026, "num_input_tokens_seen": 4880184, "step": 54200 }, { "epoch": 14.086538461538462, "grad_norm": 0.8825920820236206, "learning_rate": 1.2175404432359785e-05, "loss": 0.3486, "num_input_tokens_seen": 4880616, "step": 54205 }, { "epoch": 14.087837837837839, "grad_norm": 0.6088595390319824, "learning_rate": 1.2170537988483954e-05, "loss": 0.1371, "num_input_tokens_seen": 4881064, "step": 54210 }, { "epoch": 14.089137214137214, "grad_norm": 11.608604431152344, "learning_rate": 1.2165672204439638e-05, "loss": 0.3393, "num_input_tokens_seen": 4881512, "step": 54215 }, { "epoch": 14.09043659043659, "grad_norm": 4.652196407318115, "learning_rate": 1.2160807080477108e-05, "loss": 0.2001, "num_input_tokens_seen": 4881976, "step": 54220 }, { "epoch": 14.091735966735968, "grad_norm": 7.448561668395996, "learning_rate": 1.215594261684656e-05, "loss": 0.2905, "num_input_tokens_seen": 4882424, "step": 54225 }, { "epoch": 14.093035343035343, "grad_norm": 2.826775550842285, "learning_rate": 1.2151078813798194e-05, "loss": 0.0686, "num_input_tokens_seen": 4882856, "step": 54230 }, { "epoch": 14.09433471933472, "grad_norm": 5.0432329177856445, "learning_rate": 1.2146215671582161e-05, "loss": 0.2296, "num_input_tokens_seen": 4883304, "step": 54235 }, { "epoch": 14.095634095634095, "grad_norm": 15.233043670654297, "learning_rate": 1.2141353190448565e-05, "loss": 0.1054, "num_input_tokens_seen": 4883736, "step": 54240 }, { "epoch": 14.096933471933472, "grad_norm": 0.14742827415466309, "learning_rate": 1.2136491370647505e-05, "loss": 0.0091, "num_input_tokens_seen": 4884216, "step": 54245 }, { "epoch": 14.098232848232849, "grad_norm": 0.6462780237197876, "learning_rate": 1.2131630212429007e-05, "loss": 0.0374, "num_input_tokens_seen": 4884664, "step": 54250 }, { "epoch": 14.099532224532224, "grad_norm": 0.052723195403814316, "learning_rate": 1.2126769716043099e-05, "loss": 0.064, "num_input_tokens_seen": 4885080, "step": 54255 }, { "epoch": 14.1008316008316, "grad_norm": 10.920740127563477, "learning_rate": 1.2121909881739758e-05, "loss": 0.3594, "num_input_tokens_seen": 4885512, "step": 54260 }, { "epoch": 14.102130977130978, "grad_norm": 3.149793863296509, "learning_rate": 1.211705070976894e-05, "loss": 0.0451, "num_input_tokens_seen": 4885976, "step": 54265 }, { "epoch": 14.103430353430353, "grad_norm": 22.679040908813477, "learning_rate": 1.211219220038054e-05, "loss": 0.421, "num_input_tokens_seen": 4886472, "step": 54270 }, { "epoch": 14.10472972972973, "grad_norm": 11.691275596618652, "learning_rate": 1.210733435382445e-05, "loss": 0.0599, "num_input_tokens_seen": 4886904, "step": 54275 }, { "epoch": 14.106029106029107, "grad_norm": 0.2902768850326538, "learning_rate": 1.2102477170350501e-05, "loss": 0.3187, "num_input_tokens_seen": 4887368, "step": 54280 }, { "epoch": 14.107328482328482, "grad_norm": 3.501587390899658, "learning_rate": 1.2097620650208519e-05, "loss": 0.1591, "num_input_tokens_seen": 4887880, "step": 54285 }, { "epoch": 14.108627858627859, "grad_norm": 0.25041019916534424, "learning_rate": 1.2092764793648259e-05, "loss": 0.1652, "num_input_tokens_seen": 4888328, "step": 54290 }, { "epoch": 14.109927234927236, "grad_norm": 17.073421478271484, "learning_rate": 1.208790960091947e-05, "loss": 0.247, "num_input_tokens_seen": 4888792, "step": 54295 }, { "epoch": 14.111226611226611, "grad_norm": 0.5374945402145386, "learning_rate": 1.2083055072271876e-05, "loss": 0.1345, "num_input_tokens_seen": 4889256, "step": 54300 }, { "epoch": 14.112525987525988, "grad_norm": 16.157127380371094, "learning_rate": 1.2078201207955123e-05, "loss": 0.2319, "num_input_tokens_seen": 4889688, "step": 54305 }, { "epoch": 14.113825363825363, "grad_norm": 7.895064353942871, "learning_rate": 1.2073348008218874e-05, "loss": 0.0726, "num_input_tokens_seen": 4890168, "step": 54310 }, { "epoch": 14.11512474012474, "grad_norm": 7.247234344482422, "learning_rate": 1.2068495473312713e-05, "loss": 0.1601, "num_input_tokens_seen": 4890616, "step": 54315 }, { "epoch": 14.116424116424117, "grad_norm": 0.43227270245552063, "learning_rate": 1.206364360348623e-05, "loss": 0.1792, "num_input_tokens_seen": 4891096, "step": 54320 }, { "epoch": 14.117723492723492, "grad_norm": 0.911734402179718, "learning_rate": 1.205879239898894e-05, "loss": 0.0847, "num_input_tokens_seen": 4891544, "step": 54325 }, { "epoch": 14.119022869022869, "grad_norm": 1.3171813488006592, "learning_rate": 1.2053941860070359e-05, "loss": 0.1422, "num_input_tokens_seen": 4891976, "step": 54330 }, { "epoch": 14.120322245322246, "grad_norm": 0.21752218902111053, "learning_rate": 1.2049091986979951e-05, "loss": 0.2331, "num_input_tokens_seen": 4892408, "step": 54335 }, { "epoch": 14.121621621621621, "grad_norm": 16.11522674560547, "learning_rate": 1.2044242779967163e-05, "loss": 0.3091, "num_input_tokens_seen": 4892856, "step": 54340 }, { "epoch": 14.122920997920998, "grad_norm": 0.058774497359991074, "learning_rate": 1.203939423928137e-05, "loss": 0.154, "num_input_tokens_seen": 4893320, "step": 54345 }, { "epoch": 14.124220374220375, "grad_norm": 11.208722114562988, "learning_rate": 1.2034546365171959e-05, "loss": 0.0866, "num_input_tokens_seen": 4893784, "step": 54350 }, { "epoch": 14.12551975051975, "grad_norm": 1.9727084636688232, "learning_rate": 1.2029699157888253e-05, "loss": 0.2029, "num_input_tokens_seen": 4894232, "step": 54355 }, { "epoch": 14.126819126819127, "grad_norm": 1.1401371955871582, "learning_rate": 1.2024852617679538e-05, "loss": 0.2178, "num_input_tokens_seen": 4894712, "step": 54360 }, { "epoch": 14.128118503118504, "grad_norm": 16.590984344482422, "learning_rate": 1.2020006744795084e-05, "loss": 0.0716, "num_input_tokens_seen": 4895192, "step": 54365 }, { "epoch": 14.129417879417879, "grad_norm": 0.6829769015312195, "learning_rate": 1.2015161539484118e-05, "loss": 0.0689, "num_input_tokens_seen": 4895672, "step": 54370 }, { "epoch": 14.130717255717256, "grad_norm": 10.720647811889648, "learning_rate": 1.2010317001995846e-05, "loss": 0.1683, "num_input_tokens_seen": 4896152, "step": 54375 }, { "epoch": 14.132016632016631, "grad_norm": 0.37928053736686707, "learning_rate": 1.200547313257941e-05, "loss": 0.3482, "num_input_tokens_seen": 4896584, "step": 54380 }, { "epoch": 14.133316008316008, "grad_norm": 0.19245950877666473, "learning_rate": 1.2000629931483947e-05, "loss": 0.0137, "num_input_tokens_seen": 4896984, "step": 54385 }, { "epoch": 14.134615384615385, "grad_norm": 9.852219581604004, "learning_rate": 1.1995787398958536e-05, "loss": 0.1266, "num_input_tokens_seen": 4897400, "step": 54390 }, { "epoch": 14.13591476091476, "grad_norm": 0.06254692375659943, "learning_rate": 1.199094553525225e-05, "loss": 0.0311, "num_input_tokens_seen": 4897848, "step": 54395 }, { "epoch": 14.137214137214137, "grad_norm": 7.045491695404053, "learning_rate": 1.1986104340614086e-05, "loss": 0.0999, "num_input_tokens_seen": 4898280, "step": 54400 }, { "epoch": 14.138513513513514, "grad_norm": 2.0828566551208496, "learning_rate": 1.1981263815293044e-05, "loss": 0.3163, "num_input_tokens_seen": 4898728, "step": 54405 }, { "epoch": 14.13981288981289, "grad_norm": 0.9886488914489746, "learning_rate": 1.1976423959538083e-05, "loss": 0.0146, "num_input_tokens_seen": 4899192, "step": 54410 }, { "epoch": 14.141112266112266, "grad_norm": 3.7264463901519775, "learning_rate": 1.1971584773598122e-05, "loss": 0.3132, "num_input_tokens_seen": 4899624, "step": 54415 }, { "epoch": 14.142411642411643, "grad_norm": 0.22403401136398315, "learning_rate": 1.1966746257722042e-05, "loss": 0.1545, "num_input_tokens_seen": 4900072, "step": 54420 }, { "epoch": 14.143711018711018, "grad_norm": 3.826603651046753, "learning_rate": 1.1961908412158678e-05, "loss": 0.3527, "num_input_tokens_seen": 4900520, "step": 54425 }, { "epoch": 14.145010395010395, "grad_norm": 21.355621337890625, "learning_rate": 1.1957071237156867e-05, "loss": 0.1277, "num_input_tokens_seen": 4901000, "step": 54430 }, { "epoch": 14.146309771309772, "grad_norm": 0.8136791586875916, "learning_rate": 1.195223473296537e-05, "loss": 0.1707, "num_input_tokens_seen": 4901432, "step": 54435 }, { "epoch": 14.147609147609147, "grad_norm": 15.848416328430176, "learning_rate": 1.1947398899832942e-05, "loss": 0.2763, "num_input_tokens_seen": 4901880, "step": 54440 }, { "epoch": 14.148908523908524, "grad_norm": 0.2488149255514145, "learning_rate": 1.1942563738008298e-05, "loss": 0.0382, "num_input_tokens_seen": 4902312, "step": 54445 }, { "epoch": 14.1502079002079, "grad_norm": 18.85959815979004, "learning_rate": 1.1937729247740117e-05, "loss": 0.5015, "num_input_tokens_seen": 4902744, "step": 54450 }, { "epoch": 14.151507276507276, "grad_norm": 0.10004095733165741, "learning_rate": 1.193289542927703e-05, "loss": 0.192, "num_input_tokens_seen": 4903192, "step": 54455 }, { "epoch": 14.152806652806653, "grad_norm": 6.367443084716797, "learning_rate": 1.192806228286766e-05, "loss": 0.0833, "num_input_tokens_seen": 4903624, "step": 54460 }, { "epoch": 14.154106029106028, "grad_norm": 21.14436912536621, "learning_rate": 1.1923229808760564e-05, "loss": 0.1691, "num_input_tokens_seen": 4904104, "step": 54465 }, { "epoch": 14.155405405405405, "grad_norm": 4.184819221496582, "learning_rate": 1.1918398007204295e-05, "loss": 0.2275, "num_input_tokens_seen": 4904584, "step": 54470 }, { "epoch": 14.156704781704782, "grad_norm": 1.3449947834014893, "learning_rate": 1.1913566878447332e-05, "loss": 0.1759, "num_input_tokens_seen": 4905016, "step": 54475 }, { "epoch": 14.158004158004157, "grad_norm": 1.9313918352127075, "learning_rate": 1.1908736422738184e-05, "loss": 0.4742, "num_input_tokens_seen": 4905496, "step": 54480 }, { "epoch": 14.159303534303534, "grad_norm": 4.622630596160889, "learning_rate": 1.1903906640325262e-05, "loss": 0.1621, "num_input_tokens_seen": 4905912, "step": 54485 }, { "epoch": 14.160602910602911, "grad_norm": 1.4910166263580322, "learning_rate": 1.1899077531456965e-05, "loss": 0.1456, "num_input_tokens_seen": 4906344, "step": 54490 }, { "epoch": 14.161902286902286, "grad_norm": 0.2645978331565857, "learning_rate": 1.1894249096381668e-05, "loss": 0.173, "num_input_tokens_seen": 4906792, "step": 54495 }, { "epoch": 14.163201663201663, "grad_norm": 0.19690553843975067, "learning_rate": 1.1889421335347692e-05, "loss": 0.1825, "num_input_tokens_seen": 4907256, "step": 54500 }, { "epoch": 14.16450103950104, "grad_norm": 2.050800085067749, "learning_rate": 1.1884594248603336e-05, "loss": 0.2729, "num_input_tokens_seen": 4907736, "step": 54505 }, { "epoch": 14.165800415800415, "grad_norm": 9.694396018981934, "learning_rate": 1.1879767836396865e-05, "loss": 0.2532, "num_input_tokens_seen": 4908200, "step": 54510 }, { "epoch": 14.167099792099792, "grad_norm": 0.011702132411301136, "learning_rate": 1.1874942098976515e-05, "loss": 0.3775, "num_input_tokens_seen": 4908616, "step": 54515 }, { "epoch": 14.16839916839917, "grad_norm": 2.900662422180176, "learning_rate": 1.1870117036590458e-05, "loss": 0.0945, "num_input_tokens_seen": 4909064, "step": 54520 }, { "epoch": 14.169698544698544, "grad_norm": 17.351093292236328, "learning_rate": 1.1865292649486872e-05, "loss": 0.1807, "num_input_tokens_seen": 4909496, "step": 54525 }, { "epoch": 14.170997920997921, "grad_norm": 0.1418464630842209, "learning_rate": 1.1860468937913863e-05, "loss": 0.0055, "num_input_tokens_seen": 4909960, "step": 54530 }, { "epoch": 14.172297297297296, "grad_norm": 20.29608154296875, "learning_rate": 1.1855645902119534e-05, "loss": 0.1734, "num_input_tokens_seen": 4910408, "step": 54535 }, { "epoch": 14.173596673596673, "grad_norm": 0.11805178225040436, "learning_rate": 1.1850823542351921e-05, "loss": 0.1748, "num_input_tokens_seen": 4910840, "step": 54540 }, { "epoch": 14.17489604989605, "grad_norm": 16.402706146240234, "learning_rate": 1.1846001858859054e-05, "loss": 0.2456, "num_input_tokens_seen": 4911336, "step": 54545 }, { "epoch": 14.176195426195425, "grad_norm": 1.8960208892822266, "learning_rate": 1.1841180851888922e-05, "loss": 0.2503, "num_input_tokens_seen": 4911768, "step": 54550 }, { "epoch": 14.177494802494802, "grad_norm": 10.21685791015625, "learning_rate": 1.1836360521689457e-05, "loss": 0.0746, "num_input_tokens_seen": 4912200, "step": 54555 }, { "epoch": 14.17879417879418, "grad_norm": 6.840574741363525, "learning_rate": 1.1831540868508595e-05, "loss": 0.2983, "num_input_tokens_seen": 4912648, "step": 54560 }, { "epoch": 14.180093555093555, "grad_norm": 0.078790083527565, "learning_rate": 1.1826721892594193e-05, "loss": 0.0044, "num_input_tokens_seen": 4913112, "step": 54565 }, { "epoch": 14.181392931392931, "grad_norm": 2.5206007957458496, "learning_rate": 1.1821903594194115e-05, "loss": 0.7243, "num_input_tokens_seen": 4913528, "step": 54570 }, { "epoch": 14.182692307692308, "grad_norm": 20.698101043701172, "learning_rate": 1.1817085973556152e-05, "loss": 0.1812, "num_input_tokens_seen": 4913976, "step": 54575 }, { "epoch": 14.183991683991684, "grad_norm": 2.6952884197235107, "learning_rate": 1.181226903092809e-05, "loss": 0.2728, "num_input_tokens_seen": 4914408, "step": 54580 }, { "epoch": 14.18529106029106, "grad_norm": 7.304747581481934, "learning_rate": 1.1807452766557667e-05, "loss": 0.0398, "num_input_tokens_seen": 4914888, "step": 54585 }, { "epoch": 14.186590436590437, "grad_norm": 3.516153335571289, "learning_rate": 1.18026371806926e-05, "loss": 0.1489, "num_input_tokens_seen": 4915368, "step": 54590 }, { "epoch": 14.187889812889813, "grad_norm": 19.88918113708496, "learning_rate": 1.1797822273580537e-05, "loss": 0.2344, "num_input_tokens_seen": 4915848, "step": 54595 }, { "epoch": 14.18918918918919, "grad_norm": 0.43531039357185364, "learning_rate": 1.179300804546913e-05, "loss": 0.0106, "num_input_tokens_seen": 4916280, "step": 54600 }, { "epoch": 14.190488565488565, "grad_norm": 7.491873264312744, "learning_rate": 1.178819449660597e-05, "loss": 0.4316, "num_input_tokens_seen": 4916696, "step": 54605 }, { "epoch": 14.191787941787942, "grad_norm": 0.18441982567310333, "learning_rate": 1.1783381627238632e-05, "loss": 0.0835, "num_input_tokens_seen": 4917144, "step": 54610 }, { "epoch": 14.193087318087318, "grad_norm": 0.10104060173034668, "learning_rate": 1.1778569437614634e-05, "loss": 0.1009, "num_input_tokens_seen": 4917624, "step": 54615 }, { "epoch": 14.194386694386694, "grad_norm": 0.7011296153068542, "learning_rate": 1.1773757927981473e-05, "loss": 0.0134, "num_input_tokens_seen": 4918040, "step": 54620 }, { "epoch": 14.19568607068607, "grad_norm": 9.431678771972656, "learning_rate": 1.1768947098586628e-05, "loss": 0.4459, "num_input_tokens_seen": 4918488, "step": 54625 }, { "epoch": 14.196985446985448, "grad_norm": 0.3581819534301758, "learning_rate": 1.17641369496775e-05, "loss": 0.078, "num_input_tokens_seen": 4918984, "step": 54630 }, { "epoch": 14.198284823284823, "grad_norm": 1.1907353401184082, "learning_rate": 1.17593274815015e-05, "loss": 0.0706, "num_input_tokens_seen": 4919416, "step": 54635 }, { "epoch": 14.1995841995842, "grad_norm": 13.703784942626953, "learning_rate": 1.1754518694305966e-05, "loss": 0.2187, "num_input_tokens_seen": 4919832, "step": 54640 }, { "epoch": 14.200883575883577, "grad_norm": 19.832042694091797, "learning_rate": 1.1749710588338236e-05, "loss": 0.1561, "num_input_tokens_seen": 4920296, "step": 54645 }, { "epoch": 14.202182952182952, "grad_norm": 0.4237421154975891, "learning_rate": 1.1744903163845577e-05, "loss": 0.3276, "num_input_tokens_seen": 4920728, "step": 54650 }, { "epoch": 14.203482328482329, "grad_norm": 0.05198889225721359, "learning_rate": 1.1740096421075247e-05, "loss": 0.0545, "num_input_tokens_seen": 4921192, "step": 54655 }, { "epoch": 14.204781704781706, "grad_norm": 0.10678669810295105, "learning_rate": 1.1735290360274465e-05, "loss": 0.3165, "num_input_tokens_seen": 4921656, "step": 54660 }, { "epoch": 14.20608108108108, "grad_norm": 11.559745788574219, "learning_rate": 1.1730484981690418e-05, "loss": 0.0789, "num_input_tokens_seen": 4922152, "step": 54665 }, { "epoch": 14.207380457380458, "grad_norm": 0.27749836444854736, "learning_rate": 1.1725680285570233e-05, "loss": 0.3325, "num_input_tokens_seen": 4922632, "step": 54670 }, { "epoch": 14.208679833679835, "grad_norm": 10.459149360656738, "learning_rate": 1.1720876272161041e-05, "loss": 0.0926, "num_input_tokens_seen": 4923032, "step": 54675 }, { "epoch": 14.20997920997921, "grad_norm": 27.64877700805664, "learning_rate": 1.1716072941709905e-05, "loss": 0.2985, "num_input_tokens_seen": 4923496, "step": 54680 }, { "epoch": 14.211278586278587, "grad_norm": 4.0280537605285645, "learning_rate": 1.1711270294463856e-05, "loss": 0.2845, "num_input_tokens_seen": 4923960, "step": 54685 }, { "epoch": 14.212577962577962, "grad_norm": 25.331769943237305, "learning_rate": 1.1706468330669907e-05, "loss": 0.1599, "num_input_tokens_seen": 4924456, "step": 54690 }, { "epoch": 14.213877338877339, "grad_norm": 2.04325008392334, "learning_rate": 1.170166705057503e-05, "loss": 0.2869, "num_input_tokens_seen": 4924888, "step": 54695 }, { "epoch": 14.215176715176716, "grad_norm": 0.16376550495624542, "learning_rate": 1.1696866454426166e-05, "loss": 0.5704, "num_input_tokens_seen": 4925320, "step": 54700 }, { "epoch": 14.21647609147609, "grad_norm": 19.700754165649414, "learning_rate": 1.1692066542470201e-05, "loss": 0.1949, "num_input_tokens_seen": 4925816, "step": 54705 }, { "epoch": 14.217775467775468, "grad_norm": 0.17972111701965332, "learning_rate": 1.1687267314954011e-05, "loss": 0.0762, "num_input_tokens_seen": 4926248, "step": 54710 }, { "epoch": 14.219074844074845, "grad_norm": 0.34122610092163086, "learning_rate": 1.168246877212441e-05, "loss": 0.1632, "num_input_tokens_seen": 4926680, "step": 54715 }, { "epoch": 14.22037422037422, "grad_norm": 1.4320653676986694, "learning_rate": 1.1677670914228211e-05, "loss": 0.1252, "num_input_tokens_seen": 4927128, "step": 54720 }, { "epoch": 14.221673596673597, "grad_norm": 0.4629940688610077, "learning_rate": 1.1672873741512142e-05, "loss": 0.0562, "num_input_tokens_seen": 4927592, "step": 54725 }, { "epoch": 14.222972972972974, "grad_norm": 0.7310236096382141, "learning_rate": 1.1668077254222964e-05, "loss": 0.3424, "num_input_tokens_seen": 4928008, "step": 54730 }, { "epoch": 14.224272349272349, "grad_norm": 1.5394541025161743, "learning_rate": 1.166328145260734e-05, "loss": 0.1506, "num_input_tokens_seen": 4928456, "step": 54735 }, { "epoch": 14.225571725571726, "grad_norm": 0.11138750612735748, "learning_rate": 1.1658486336911936e-05, "loss": 0.2104, "num_input_tokens_seen": 4928904, "step": 54740 }, { "epoch": 14.226871101871103, "grad_norm": 0.8597388863563538, "learning_rate": 1.1653691907383362e-05, "loss": 0.2187, "num_input_tokens_seen": 4929352, "step": 54745 }, { "epoch": 14.228170478170478, "grad_norm": 4.219477653503418, "learning_rate": 1.1648898164268195e-05, "loss": 0.29, "num_input_tokens_seen": 4929832, "step": 54750 }, { "epoch": 14.229469854469855, "grad_norm": 7.033961772918701, "learning_rate": 1.1644105107812986e-05, "loss": 0.3011, "num_input_tokens_seen": 4930296, "step": 54755 }, { "epoch": 14.23076923076923, "grad_norm": 0.023063570261001587, "learning_rate": 1.1639312738264249e-05, "loss": 0.2057, "num_input_tokens_seen": 4930728, "step": 54760 }, { "epoch": 14.232068607068607, "grad_norm": 18.087711334228516, "learning_rate": 1.163452105586847e-05, "loss": 0.3294, "num_input_tokens_seen": 4931176, "step": 54765 }, { "epoch": 14.233367983367984, "grad_norm": 19.284841537475586, "learning_rate": 1.1629730060872069e-05, "loss": 0.3236, "num_input_tokens_seen": 4931592, "step": 54770 }, { "epoch": 14.234667359667359, "grad_norm": 24.622175216674805, "learning_rate": 1.1624939753521468e-05, "loss": 0.207, "num_input_tokens_seen": 4932088, "step": 54775 }, { "epoch": 14.235966735966736, "grad_norm": 0.14796678721904755, "learning_rate": 1.1620150134063026e-05, "loss": 0.1053, "num_input_tokens_seen": 4932552, "step": 54780 }, { "epoch": 14.237266112266113, "grad_norm": 1.4262726306915283, "learning_rate": 1.1615361202743088e-05, "loss": 0.0152, "num_input_tokens_seen": 4933000, "step": 54785 }, { "epoch": 14.238565488565488, "grad_norm": 1.2265206575393677, "learning_rate": 1.161057295980794e-05, "loss": 0.5301, "num_input_tokens_seen": 4933448, "step": 54790 }, { "epoch": 14.239864864864865, "grad_norm": 31.10529327392578, "learning_rate": 1.1605785405503854e-05, "loss": 0.3307, "num_input_tokens_seen": 4933912, "step": 54795 }, { "epoch": 14.241164241164242, "grad_norm": 2.82212495803833, "learning_rate": 1.160099854007706e-05, "loss": 0.1542, "num_input_tokens_seen": 4934376, "step": 54800 }, { "epoch": 14.242463617463617, "grad_norm": 0.12470610439777374, "learning_rate": 1.159621236377376e-05, "loss": 0.1316, "num_input_tokens_seen": 4934824, "step": 54805 }, { "epoch": 14.243762993762994, "grad_norm": 0.3818508982658386, "learning_rate": 1.1591426876840098e-05, "loss": 0.2234, "num_input_tokens_seen": 4935272, "step": 54810 }, { "epoch": 14.24506237006237, "grad_norm": 13.630096435546875, "learning_rate": 1.1586642079522189e-05, "loss": 0.1276, "num_input_tokens_seen": 4935752, "step": 54815 }, { "epoch": 14.246361746361746, "grad_norm": 2.3309326171875, "learning_rate": 1.1581857972066141e-05, "loss": 0.0753, "num_input_tokens_seen": 4936248, "step": 54820 }, { "epoch": 14.247661122661123, "grad_norm": 0.27945753931999207, "learning_rate": 1.1577074554717984e-05, "loss": 0.1669, "num_input_tokens_seen": 4936712, "step": 54825 }, { "epoch": 14.248960498960498, "grad_norm": 1.8595305681228638, "learning_rate": 1.1572291827723743e-05, "loss": 0.1252, "num_input_tokens_seen": 4937160, "step": 54830 }, { "epoch": 14.250259875259875, "grad_norm": 8.987370491027832, "learning_rate": 1.1567509791329401e-05, "loss": 0.0419, "num_input_tokens_seen": 4937608, "step": 54835 }, { "epoch": 14.251559251559252, "grad_norm": 0.8082084655761719, "learning_rate": 1.156272844578091e-05, "loss": 0.1525, "num_input_tokens_seen": 4938072, "step": 54840 }, { "epoch": 14.252858627858627, "grad_norm": 5.210679054260254, "learning_rate": 1.1557947791324161e-05, "loss": 0.0556, "num_input_tokens_seen": 4938520, "step": 54845 }, { "epoch": 14.254158004158004, "grad_norm": 13.74821949005127, "learning_rate": 1.1553167828205044e-05, "loss": 0.241, "num_input_tokens_seen": 4939000, "step": 54850 }, { "epoch": 14.255457380457381, "grad_norm": 14.452102661132812, "learning_rate": 1.1548388556669382e-05, "loss": 0.2495, "num_input_tokens_seen": 4939448, "step": 54855 }, { "epoch": 14.256756756756756, "grad_norm": 18.473491668701172, "learning_rate": 1.1543609976962994e-05, "loss": 0.1978, "num_input_tokens_seen": 4939896, "step": 54860 }, { "epoch": 14.258056133056133, "grad_norm": 2.362481117248535, "learning_rate": 1.1538832089331628e-05, "loss": 0.2739, "num_input_tokens_seen": 4940344, "step": 54865 }, { "epoch": 14.25935550935551, "grad_norm": 0.4969541132450104, "learning_rate": 1.1534054894021027e-05, "loss": 0.2601, "num_input_tokens_seen": 4940776, "step": 54870 }, { "epoch": 14.260654885654885, "grad_norm": 0.4234413504600525, "learning_rate": 1.1529278391276893e-05, "loss": 0.0485, "num_input_tokens_seen": 4941192, "step": 54875 }, { "epoch": 14.261954261954262, "grad_norm": 0.07024436444044113, "learning_rate": 1.152450258134487e-05, "loss": 0.1894, "num_input_tokens_seen": 4941624, "step": 54880 }, { "epoch": 14.263253638253639, "grad_norm": 0.23685947060585022, "learning_rate": 1.1519727464470601e-05, "loss": 0.0925, "num_input_tokens_seen": 4942072, "step": 54885 }, { "epoch": 14.264553014553014, "grad_norm": 4.586449146270752, "learning_rate": 1.1514953040899657e-05, "loss": 0.0347, "num_input_tokens_seen": 4942520, "step": 54890 }, { "epoch": 14.265852390852391, "grad_norm": 0.04534332826733589, "learning_rate": 1.1510179310877603e-05, "loss": 0.3623, "num_input_tokens_seen": 4942984, "step": 54895 }, { "epoch": 14.267151767151766, "grad_norm": 15.115876197814941, "learning_rate": 1.1505406274649947e-05, "loss": 0.2599, "num_input_tokens_seen": 4943432, "step": 54900 }, { "epoch": 14.268451143451143, "grad_norm": 0.9386492371559143, "learning_rate": 1.1500633932462175e-05, "loss": 0.0215, "num_input_tokens_seen": 4943880, "step": 54905 }, { "epoch": 14.26975051975052, "grad_norm": 17.752187728881836, "learning_rate": 1.1495862284559733e-05, "loss": 0.1273, "num_input_tokens_seen": 4944328, "step": 54910 }, { "epoch": 14.271049896049895, "grad_norm": 1.177998423576355, "learning_rate": 1.1491091331188046e-05, "loss": 0.0784, "num_input_tokens_seen": 4944776, "step": 54915 }, { "epoch": 14.272349272349272, "grad_norm": 16.556562423706055, "learning_rate": 1.1486321072592463e-05, "loss": 0.2035, "num_input_tokens_seen": 4945224, "step": 54920 }, { "epoch": 14.27364864864865, "grad_norm": 0.07857800275087357, "learning_rate": 1.1481551509018346e-05, "loss": 0.39, "num_input_tokens_seen": 4945672, "step": 54925 }, { "epoch": 14.274948024948024, "grad_norm": 1.0564029216766357, "learning_rate": 1.1476782640710975e-05, "loss": 0.3978, "num_input_tokens_seen": 4946088, "step": 54930 }, { "epoch": 14.276247401247401, "grad_norm": 1.8131816387176514, "learning_rate": 1.1472014467915643e-05, "loss": 0.4211, "num_input_tokens_seen": 4946552, "step": 54935 }, { "epoch": 14.277546777546778, "grad_norm": 10.347414016723633, "learning_rate": 1.1467246990877558e-05, "loss": 0.0947, "num_input_tokens_seen": 4946984, "step": 54940 }, { "epoch": 14.278846153846153, "grad_norm": 18.338584899902344, "learning_rate": 1.1462480209841928e-05, "loss": 0.1477, "num_input_tokens_seen": 4947496, "step": 54945 }, { "epoch": 14.28014553014553, "grad_norm": 4.439962387084961, "learning_rate": 1.145771412505392e-05, "loss": 0.4159, "num_input_tokens_seen": 4947944, "step": 54950 }, { "epoch": 14.281444906444907, "grad_norm": 5.021059513092041, "learning_rate": 1.1452948736758642e-05, "loss": 0.1696, "num_input_tokens_seen": 4948408, "step": 54955 }, { "epoch": 14.282744282744282, "grad_norm": 25.655969619750977, "learning_rate": 1.14481840452012e-05, "loss": 0.1276, "num_input_tokens_seen": 4948856, "step": 54960 }, { "epoch": 14.28404365904366, "grad_norm": 15.488547325134277, "learning_rate": 1.1443420050626625e-05, "loss": 0.3439, "num_input_tokens_seen": 4949288, "step": 54965 }, { "epoch": 14.285343035343036, "grad_norm": 26.125600814819336, "learning_rate": 1.1438656753279958e-05, "loss": 0.2791, "num_input_tokens_seen": 4949768, "step": 54970 }, { "epoch": 14.286642411642411, "grad_norm": 19.520597457885742, "learning_rate": 1.143389415340615e-05, "loss": 0.2949, "num_input_tokens_seen": 4950200, "step": 54975 }, { "epoch": 14.287941787941788, "grad_norm": 0.13069145381450653, "learning_rate": 1.142913225125018e-05, "loss": 0.0291, "num_input_tokens_seen": 4950664, "step": 54980 }, { "epoch": 14.289241164241163, "grad_norm": 6.583937168121338, "learning_rate": 1.1424371047056933e-05, "loss": 0.044, "num_input_tokens_seen": 4951096, "step": 54985 }, { "epoch": 14.29054054054054, "grad_norm": 0.015030386857688427, "learning_rate": 1.1419610541071301e-05, "loss": 0.0805, "num_input_tokens_seen": 4951528, "step": 54990 }, { "epoch": 14.291839916839917, "grad_norm": 6.478636264801025, "learning_rate": 1.1414850733538101e-05, "loss": 0.1336, "num_input_tokens_seen": 4951960, "step": 54995 }, { "epoch": 14.293139293139292, "grad_norm": 32.1744499206543, "learning_rate": 1.1410091624702155e-05, "loss": 0.2438, "num_input_tokens_seen": 4952424, "step": 55000 }, { "epoch": 14.29443866943867, "grad_norm": 2.7234973907470703, "learning_rate": 1.1405333214808209e-05, "loss": 0.2543, "num_input_tokens_seen": 4952840, "step": 55005 }, { "epoch": 14.295738045738046, "grad_norm": 6.883953094482422, "learning_rate": 1.1400575504101e-05, "loss": 0.0309, "num_input_tokens_seen": 4953288, "step": 55010 }, { "epoch": 14.297037422037421, "grad_norm": 1.0355323553085327, "learning_rate": 1.139581849282523e-05, "loss": 0.4379, "num_input_tokens_seen": 4953704, "step": 55015 }, { "epoch": 14.298336798336798, "grad_norm": 23.46756935119629, "learning_rate": 1.1391062181225545e-05, "loss": 0.2113, "num_input_tokens_seen": 4954200, "step": 55020 }, { "epoch": 14.299636174636175, "grad_norm": 0.09810932725667953, "learning_rate": 1.138630656954658e-05, "loss": 0.108, "num_input_tokens_seen": 4954664, "step": 55025 }, { "epoch": 14.30093555093555, "grad_norm": 0.48227694630622864, "learning_rate": 1.1381551658032899e-05, "loss": 0.1815, "num_input_tokens_seen": 4955080, "step": 55030 }, { "epoch": 14.302234927234927, "grad_norm": 0.25848686695098877, "learning_rate": 1.1376797446929077e-05, "loss": 0.0256, "num_input_tokens_seen": 4955528, "step": 55035 }, { "epoch": 14.303534303534304, "grad_norm": 0.4525302052497864, "learning_rate": 1.1372043936479606e-05, "loss": 0.1845, "num_input_tokens_seen": 4955960, "step": 55040 }, { "epoch": 14.30483367983368, "grad_norm": 0.23775024712085724, "learning_rate": 1.1367291126928976e-05, "loss": 0.1956, "num_input_tokens_seen": 4956424, "step": 55045 }, { "epoch": 14.306133056133056, "grad_norm": 1.657219409942627, "learning_rate": 1.136253901852162e-05, "loss": 0.0178, "num_input_tokens_seen": 4956840, "step": 55050 }, { "epoch": 14.307432432432432, "grad_norm": 0.12182256579399109, "learning_rate": 1.1357787611501961e-05, "loss": 0.1978, "num_input_tokens_seen": 4957272, "step": 55055 }, { "epoch": 14.308731808731808, "grad_norm": 5.5421342849731445, "learning_rate": 1.1353036906114347e-05, "loss": 0.3809, "num_input_tokens_seen": 4957720, "step": 55060 }, { "epoch": 14.310031185031185, "grad_norm": 13.265715599060059, "learning_rate": 1.1348286902603131e-05, "loss": 0.3269, "num_input_tokens_seen": 4958152, "step": 55065 }, { "epoch": 14.31133056133056, "grad_norm": 21.69022560119629, "learning_rate": 1.1343537601212597e-05, "loss": 0.1651, "num_input_tokens_seen": 4958584, "step": 55070 }, { "epoch": 14.312629937629938, "grad_norm": 2.121697425842285, "learning_rate": 1.1338789002187002e-05, "loss": 0.0814, "num_input_tokens_seen": 4959048, "step": 55075 }, { "epoch": 14.313929313929314, "grad_norm": 12.93210220336914, "learning_rate": 1.1334041105770579e-05, "loss": 0.3224, "num_input_tokens_seen": 4959480, "step": 55080 }, { "epoch": 14.31522869022869, "grad_norm": 8.1306791305542, "learning_rate": 1.1329293912207517e-05, "loss": 0.1216, "num_input_tokens_seen": 4959928, "step": 55085 }, { "epoch": 14.316528066528067, "grad_norm": 0.23805704712867737, "learning_rate": 1.1324547421741973e-05, "loss": 0.0708, "num_input_tokens_seen": 4960408, "step": 55090 }, { "epoch": 14.317827442827443, "grad_norm": 2.0542845726013184, "learning_rate": 1.1319801634618052e-05, "loss": 0.0768, "num_input_tokens_seen": 4960840, "step": 55095 }, { "epoch": 14.319126819126819, "grad_norm": 2.8287298679351807, "learning_rate": 1.131505655107985e-05, "loss": 0.1543, "num_input_tokens_seen": 4961256, "step": 55100 }, { "epoch": 14.320426195426196, "grad_norm": 2.4351117610931396, "learning_rate": 1.1310312171371393e-05, "loss": 0.1519, "num_input_tokens_seen": 4961688, "step": 55105 }, { "epoch": 14.321725571725572, "grad_norm": 0.5784457921981812, "learning_rate": 1.1305568495736705e-05, "loss": 0.127, "num_input_tokens_seen": 4962152, "step": 55110 }, { "epoch": 14.323024948024948, "grad_norm": 0.03802168741822243, "learning_rate": 1.1300825524419741e-05, "loss": 0.0613, "num_input_tokens_seen": 4962600, "step": 55115 }, { "epoch": 14.324324324324325, "grad_norm": 2.3929331302642822, "learning_rate": 1.129608325766445e-05, "loss": 0.0473, "num_input_tokens_seen": 4963016, "step": 55120 }, { "epoch": 14.325623700623701, "grad_norm": 8.5264310836792, "learning_rate": 1.1291341695714725e-05, "loss": 0.1422, "num_input_tokens_seen": 4963432, "step": 55125 }, { "epoch": 14.326923076923077, "grad_norm": 0.041836097836494446, "learning_rate": 1.128660083881444e-05, "loss": 0.0015, "num_input_tokens_seen": 4963864, "step": 55130 }, { "epoch": 14.328222453222454, "grad_norm": 0.12073870003223419, "learning_rate": 1.1281860687207414e-05, "loss": 0.0503, "num_input_tokens_seen": 4964328, "step": 55135 }, { "epoch": 14.329521829521829, "grad_norm": 20.474666595458984, "learning_rate": 1.1277121241137429e-05, "loss": 0.241, "num_input_tokens_seen": 4964776, "step": 55140 }, { "epoch": 14.330821205821206, "grad_norm": 0.6930012106895447, "learning_rate": 1.1272382500848256e-05, "loss": 0.1562, "num_input_tokens_seen": 4965224, "step": 55145 }, { "epoch": 14.332120582120583, "grad_norm": 0.06553448736667633, "learning_rate": 1.1267644466583593e-05, "loss": 0.032, "num_input_tokens_seen": 4965656, "step": 55150 }, { "epoch": 14.333419958419958, "grad_norm": 0.07025589793920517, "learning_rate": 1.1262907138587134e-05, "loss": 0.1622, "num_input_tokens_seen": 4966088, "step": 55155 }, { "epoch": 14.334719334719335, "grad_norm": 3.6947412490844727, "learning_rate": 1.1258170517102523e-05, "loss": 0.1592, "num_input_tokens_seen": 4966552, "step": 55160 }, { "epoch": 14.336018711018712, "grad_norm": 29.465791702270508, "learning_rate": 1.1253434602373379e-05, "loss": 0.4322, "num_input_tokens_seen": 4967032, "step": 55165 }, { "epoch": 14.337318087318087, "grad_norm": 15.578601837158203, "learning_rate": 1.1248699394643255e-05, "loss": 0.2444, "num_input_tokens_seen": 4967496, "step": 55170 }, { "epoch": 14.338617463617464, "grad_norm": 1.4837335348129272, "learning_rate": 1.1243964894155707e-05, "loss": 0.2453, "num_input_tokens_seen": 4967928, "step": 55175 }, { "epoch": 14.33991683991684, "grad_norm": 1.7381714582443237, "learning_rate": 1.1239231101154213e-05, "loss": 0.3627, "num_input_tokens_seen": 4968424, "step": 55180 }, { "epoch": 14.341216216216216, "grad_norm": 28.622652053833008, "learning_rate": 1.1234498015882261e-05, "loss": 0.3512, "num_input_tokens_seen": 4968872, "step": 55185 }, { "epoch": 14.342515592515593, "grad_norm": 0.26574334502220154, "learning_rate": 1.1229765638583253e-05, "loss": 0.0748, "num_input_tokens_seen": 4969304, "step": 55190 }, { "epoch": 14.34381496881497, "grad_norm": 2.1504926681518555, "learning_rate": 1.1225033969500595e-05, "loss": 0.1231, "num_input_tokens_seen": 4969720, "step": 55195 }, { "epoch": 14.345114345114345, "grad_norm": 0.27462300658226013, "learning_rate": 1.1220303008877647e-05, "loss": 0.1625, "num_input_tokens_seen": 4970184, "step": 55200 }, { "epoch": 14.346413721413722, "grad_norm": 5.479010105133057, "learning_rate": 1.121557275695771e-05, "loss": 0.039, "num_input_tokens_seen": 4970600, "step": 55205 }, { "epoch": 14.347713097713097, "grad_norm": 4.715871334075928, "learning_rate": 1.1210843213984082e-05, "loss": 0.0419, "num_input_tokens_seen": 4971048, "step": 55210 }, { "epoch": 14.349012474012474, "grad_norm": 0.840610682964325, "learning_rate": 1.1206114380199989e-05, "loss": 0.2134, "num_input_tokens_seen": 4971496, "step": 55215 }, { "epoch": 14.35031185031185, "grad_norm": 14.92360782623291, "learning_rate": 1.120138625584866e-05, "loss": 0.0575, "num_input_tokens_seen": 4971960, "step": 55220 }, { "epoch": 14.351611226611226, "grad_norm": 4.87674617767334, "learning_rate": 1.119665884117324e-05, "loss": 0.2924, "num_input_tokens_seen": 4972392, "step": 55225 }, { "epoch": 14.352910602910603, "grad_norm": 14.063222885131836, "learning_rate": 1.1191932136416899e-05, "loss": 0.4533, "num_input_tokens_seen": 4972824, "step": 55230 }, { "epoch": 14.35420997920998, "grad_norm": 1.2673120498657227, "learning_rate": 1.1187206141822709e-05, "loss": 0.4946, "num_input_tokens_seen": 4973272, "step": 55235 }, { "epoch": 14.355509355509355, "grad_norm": 19.557043075561523, "learning_rate": 1.1182480857633751e-05, "loss": 0.1285, "num_input_tokens_seen": 4973720, "step": 55240 }, { "epoch": 14.356808731808732, "grad_norm": 9.105474472045898, "learning_rate": 1.1177756284093032e-05, "loss": 0.2292, "num_input_tokens_seen": 4974168, "step": 55245 }, { "epoch": 14.358108108108109, "grad_norm": 0.3263797461986542, "learning_rate": 1.1173032421443563e-05, "loss": 0.3128, "num_input_tokens_seen": 4974584, "step": 55250 }, { "epoch": 14.359407484407484, "grad_norm": 14.751261711120605, "learning_rate": 1.1168309269928276e-05, "loss": 0.4611, "num_input_tokens_seen": 4975048, "step": 55255 }, { "epoch": 14.36070686070686, "grad_norm": 1.6141859292984009, "learning_rate": 1.1163586829790093e-05, "loss": 0.0431, "num_input_tokens_seen": 4975512, "step": 55260 }, { "epoch": 14.362006237006238, "grad_norm": 0.9244036674499512, "learning_rate": 1.1158865101271906e-05, "loss": 0.0238, "num_input_tokens_seen": 4975976, "step": 55265 }, { "epoch": 14.363305613305613, "grad_norm": 0.7587863802909851, "learning_rate": 1.115414408461654e-05, "loss": 0.1331, "num_input_tokens_seen": 4976424, "step": 55270 }, { "epoch": 14.36460498960499, "grad_norm": 6.788325786590576, "learning_rate": 1.1149423780066822e-05, "loss": 0.4267, "num_input_tokens_seen": 4976888, "step": 55275 }, { "epoch": 14.365904365904367, "grad_norm": 16.570201873779297, "learning_rate": 1.1144704187865499e-05, "loss": 0.0857, "num_input_tokens_seen": 4977320, "step": 55280 }, { "epoch": 14.367203742203742, "grad_norm": 0.032714616507291794, "learning_rate": 1.1139985308255324e-05, "loss": 0.0111, "num_input_tokens_seen": 4977800, "step": 55285 }, { "epoch": 14.368503118503119, "grad_norm": 18.654449462890625, "learning_rate": 1.1135267141478974e-05, "loss": 0.1091, "num_input_tokens_seen": 4978232, "step": 55290 }, { "epoch": 14.369802494802494, "grad_norm": 0.05503879860043526, "learning_rate": 1.1130549687779124e-05, "loss": 0.0786, "num_input_tokens_seen": 4978648, "step": 55295 }, { "epoch": 14.371101871101871, "grad_norm": 0.791581392288208, "learning_rate": 1.112583294739839e-05, "loss": 0.0096, "num_input_tokens_seen": 4979064, "step": 55300 }, { "epoch": 14.372401247401248, "grad_norm": 0.03210517764091492, "learning_rate": 1.112111692057937e-05, "loss": 0.5474, "num_input_tokens_seen": 4979512, "step": 55305 }, { "epoch": 14.373700623700623, "grad_norm": 20.437055587768555, "learning_rate": 1.1116401607564594e-05, "loss": 0.1518, "num_input_tokens_seen": 4979976, "step": 55310 }, { "epoch": 14.375, "grad_norm": 0.5085955858230591, "learning_rate": 1.1111687008596597e-05, "loss": 0.294, "num_input_tokens_seen": 4980424, "step": 55315 }, { "epoch": 14.376299376299377, "grad_norm": 10.230269432067871, "learning_rate": 1.1106973123917835e-05, "loss": 0.3171, "num_input_tokens_seen": 4980872, "step": 55320 }, { "epoch": 14.377598752598752, "grad_norm": 12.443428993225098, "learning_rate": 1.1102259953770766e-05, "loss": 0.2647, "num_input_tokens_seen": 4981320, "step": 55325 }, { "epoch": 14.378898128898129, "grad_norm": 0.015623491257429123, "learning_rate": 1.1097547498397776e-05, "loss": 0.1358, "num_input_tokens_seen": 4981768, "step": 55330 }, { "epoch": 14.380197505197506, "grad_norm": 0.4628157317638397, "learning_rate": 1.1092835758041237e-05, "loss": 0.0567, "num_input_tokens_seen": 4982216, "step": 55335 }, { "epoch": 14.381496881496881, "grad_norm": 17.257381439208984, "learning_rate": 1.108812473294349e-05, "loss": 0.3454, "num_input_tokens_seen": 4982664, "step": 55340 }, { "epoch": 14.382796257796258, "grad_norm": 0.015549598261713982, "learning_rate": 1.1083414423346807e-05, "loss": 0.0383, "num_input_tokens_seen": 4983128, "step": 55345 }, { "epoch": 14.384095634095633, "grad_norm": 13.984769821166992, "learning_rate": 1.1078704829493467e-05, "loss": 0.1641, "num_input_tokens_seen": 4983544, "step": 55350 }, { "epoch": 14.38539501039501, "grad_norm": 0.4273552894592285, "learning_rate": 1.1073995951625666e-05, "loss": 0.0304, "num_input_tokens_seen": 4984056, "step": 55355 }, { "epoch": 14.386694386694387, "grad_norm": 32.00065612792969, "learning_rate": 1.1069287789985606e-05, "loss": 0.2913, "num_input_tokens_seen": 4984504, "step": 55360 }, { "epoch": 14.387993762993762, "grad_norm": 9.656477928161621, "learning_rate": 1.106458034481541e-05, "loss": 0.1154, "num_input_tokens_seen": 4984920, "step": 55365 }, { "epoch": 14.38929313929314, "grad_norm": 0.14656999707221985, "learning_rate": 1.1059873616357202e-05, "loss": 0.0724, "num_input_tokens_seen": 4985384, "step": 55370 }, { "epoch": 14.390592515592516, "grad_norm": 12.152290344238281, "learning_rate": 1.1055167604853051e-05, "loss": 0.2418, "num_input_tokens_seen": 4985800, "step": 55375 }, { "epoch": 14.391891891891891, "grad_norm": 17.237333297729492, "learning_rate": 1.1050462310544999e-05, "loss": 0.1878, "num_input_tokens_seen": 4986248, "step": 55380 }, { "epoch": 14.393191268191268, "grad_norm": 0.08092469722032547, "learning_rate": 1.1045757733675027e-05, "loss": 0.2274, "num_input_tokens_seen": 4986696, "step": 55385 }, { "epoch": 14.394490644490645, "grad_norm": 0.005226528737694025, "learning_rate": 1.1041053874485115e-05, "loss": 0.4848, "num_input_tokens_seen": 4987160, "step": 55390 }, { "epoch": 14.39579002079002, "grad_norm": 24.316123962402344, "learning_rate": 1.1036350733217177e-05, "loss": 0.3148, "num_input_tokens_seen": 4987592, "step": 55395 }, { "epoch": 14.397089397089397, "grad_norm": 16.27909278869629, "learning_rate": 1.1031648310113091e-05, "loss": 0.1364, "num_input_tokens_seen": 4988040, "step": 55400 }, { "epoch": 14.398388773388774, "grad_norm": 9.362265586853027, "learning_rate": 1.1026946605414717e-05, "loss": 0.1558, "num_input_tokens_seen": 4988488, "step": 55405 }, { "epoch": 14.39968814968815, "grad_norm": 23.32602310180664, "learning_rate": 1.1022245619363866e-05, "loss": 0.1221, "num_input_tokens_seen": 4988936, "step": 55410 }, { "epoch": 14.400987525987526, "grad_norm": 7.837785720825195, "learning_rate": 1.1017545352202328e-05, "loss": 0.2673, "num_input_tokens_seen": 4989384, "step": 55415 }, { "epoch": 14.402286902286903, "grad_norm": 12.34321117401123, "learning_rate": 1.1012845804171815e-05, "loss": 0.6079, "num_input_tokens_seen": 4989832, "step": 55420 }, { "epoch": 14.403586278586278, "grad_norm": 2.6061060428619385, "learning_rate": 1.1008146975514059e-05, "loss": 0.0182, "num_input_tokens_seen": 4990296, "step": 55425 }, { "epoch": 14.404885654885655, "grad_norm": 4.250555038452148, "learning_rate": 1.1003448866470698e-05, "loss": 0.414, "num_input_tokens_seen": 4990728, "step": 55430 }, { "epoch": 14.40618503118503, "grad_norm": 0.14896175265312195, "learning_rate": 1.099875147728338e-05, "loss": 0.0835, "num_input_tokens_seen": 4991176, "step": 55435 }, { "epoch": 14.407484407484407, "grad_norm": 24.402183532714844, "learning_rate": 1.0994054808193686e-05, "loss": 0.0813, "num_input_tokens_seen": 4991592, "step": 55440 }, { "epoch": 14.408783783783784, "grad_norm": 18.48971176147461, "learning_rate": 1.0989358859443166e-05, "loss": 0.3687, "num_input_tokens_seen": 4992072, "step": 55445 }, { "epoch": 14.41008316008316, "grad_norm": 0.9013104438781738, "learning_rate": 1.0984663631273348e-05, "loss": 0.217, "num_input_tokens_seen": 4992488, "step": 55450 }, { "epoch": 14.411382536382536, "grad_norm": 6.9810285568237305, "learning_rate": 1.0979969123925718e-05, "loss": 0.0434, "num_input_tokens_seen": 4992968, "step": 55455 }, { "epoch": 14.412681912681913, "grad_norm": 19.842140197753906, "learning_rate": 1.0975275337641708e-05, "loss": 0.2729, "num_input_tokens_seen": 4993432, "step": 55460 }, { "epoch": 14.413981288981288, "grad_norm": 1.2682479619979858, "learning_rate": 1.0970582272662717e-05, "loss": 0.5102, "num_input_tokens_seen": 4994008, "step": 55465 }, { "epoch": 14.415280665280665, "grad_norm": 13.142613410949707, "learning_rate": 1.096588992923013e-05, "loss": 0.2532, "num_input_tokens_seen": 4994472, "step": 55470 }, { "epoch": 14.416580041580042, "grad_norm": 4.049235820770264, "learning_rate": 1.0961198307585252e-05, "loss": 0.4475, "num_input_tokens_seen": 4994920, "step": 55475 }, { "epoch": 14.417879417879417, "grad_norm": 0.27889421582221985, "learning_rate": 1.0956507407969416e-05, "loss": 0.211, "num_input_tokens_seen": 4995400, "step": 55480 }, { "epoch": 14.419178794178794, "grad_norm": 5.968803882598877, "learning_rate": 1.0951817230623852e-05, "loss": 0.0929, "num_input_tokens_seen": 4995864, "step": 55485 }, { "epoch": 14.420478170478171, "grad_norm": 0.2068275511264801, "learning_rate": 1.0947127775789795e-05, "loss": 0.1206, "num_input_tokens_seen": 4996312, "step": 55490 }, { "epoch": 14.421777546777546, "grad_norm": 2.370554208755493, "learning_rate": 1.0942439043708413e-05, "loss": 0.2718, "num_input_tokens_seen": 4996760, "step": 55495 }, { "epoch": 14.423076923076923, "grad_norm": 23.218324661254883, "learning_rate": 1.0937751034620869e-05, "loss": 0.1129, "num_input_tokens_seen": 4997224, "step": 55500 }, { "epoch": 14.424376299376299, "grad_norm": 10.139236450195312, "learning_rate": 1.0933063748768254e-05, "loss": 0.2952, "num_input_tokens_seen": 4997688, "step": 55505 }, { "epoch": 14.425675675675675, "grad_norm": 23.126285552978516, "learning_rate": 1.0928377186391647e-05, "loss": 0.4714, "num_input_tokens_seen": 4998120, "step": 55510 }, { "epoch": 14.426975051975052, "grad_norm": 0.059868671000003815, "learning_rate": 1.0923691347732087e-05, "loss": 0.0084, "num_input_tokens_seen": 4998584, "step": 55515 }, { "epoch": 14.428274428274428, "grad_norm": 0.18031898140907288, "learning_rate": 1.0919006233030576e-05, "loss": 0.2081, "num_input_tokens_seen": 4999048, "step": 55520 }, { "epoch": 14.429573804573804, "grad_norm": 0.1252857893705368, "learning_rate": 1.0914321842528057e-05, "loss": 0.0358, "num_input_tokens_seen": 4999496, "step": 55525 }, { "epoch": 14.430873180873181, "grad_norm": 6.771641731262207, "learning_rate": 1.0909638176465473e-05, "loss": 0.1483, "num_input_tokens_seen": 4999912, "step": 55530 }, { "epoch": 14.432172557172557, "grad_norm": 10.668607711791992, "learning_rate": 1.0904955235083696e-05, "loss": 0.3365, "num_input_tokens_seen": 5000328, "step": 55535 }, { "epoch": 14.433471933471933, "grad_norm": 0.42850908637046814, "learning_rate": 1.0900273018623569e-05, "loss": 0.1237, "num_input_tokens_seen": 5000760, "step": 55540 }, { "epoch": 14.43477130977131, "grad_norm": 14.08563232421875, "learning_rate": 1.0895591527325911e-05, "loss": 0.2263, "num_input_tokens_seen": 5001224, "step": 55545 }, { "epoch": 14.436070686070686, "grad_norm": 7.234191417694092, "learning_rate": 1.0890910761431491e-05, "loss": 0.0689, "num_input_tokens_seen": 5001688, "step": 55550 }, { "epoch": 14.437370062370062, "grad_norm": 0.16527295112609863, "learning_rate": 1.0886230721181063e-05, "loss": 0.2176, "num_input_tokens_seen": 5002104, "step": 55555 }, { "epoch": 14.43866943866944, "grad_norm": 39.781734466552734, "learning_rate": 1.0881551406815298e-05, "loss": 0.3273, "num_input_tokens_seen": 5002584, "step": 55560 }, { "epoch": 14.439968814968815, "grad_norm": 0.6306085586547852, "learning_rate": 1.0876872818574884e-05, "loss": 0.1421, "num_input_tokens_seen": 5003000, "step": 55565 }, { "epoch": 14.441268191268192, "grad_norm": 3.1078245639801025, "learning_rate": 1.0872194956700423e-05, "loss": 0.1529, "num_input_tokens_seen": 5003448, "step": 55570 }, { "epoch": 14.442567567567568, "grad_norm": 4.698678493499756, "learning_rate": 1.0867517821432524e-05, "loss": 0.0786, "num_input_tokens_seen": 5003880, "step": 55575 }, { "epoch": 14.443866943866944, "grad_norm": 2.5066516399383545, "learning_rate": 1.0862841413011713e-05, "loss": 0.1559, "num_input_tokens_seen": 5004328, "step": 55580 }, { "epoch": 14.44516632016632, "grad_norm": 15.76130485534668, "learning_rate": 1.0858165731678513e-05, "loss": 0.0967, "num_input_tokens_seen": 5004760, "step": 55585 }, { "epoch": 14.446465696465696, "grad_norm": 0.012831765227019787, "learning_rate": 1.08534907776734e-05, "loss": 0.1884, "num_input_tokens_seen": 5005240, "step": 55590 }, { "epoch": 14.447765072765073, "grad_norm": 35.42512130737305, "learning_rate": 1.084881655123682e-05, "loss": 0.1965, "num_input_tokens_seen": 5005672, "step": 55595 }, { "epoch": 14.44906444906445, "grad_norm": 19.857362747192383, "learning_rate": 1.0844143052609165e-05, "loss": 0.612, "num_input_tokens_seen": 5006104, "step": 55600 }, { "epoch": 14.450363825363825, "grad_norm": 0.10501223057508469, "learning_rate": 1.0839470282030787e-05, "loss": 0.2932, "num_input_tokens_seen": 5006520, "step": 55605 }, { "epoch": 14.451663201663202, "grad_norm": 6.020063877105713, "learning_rate": 1.0834798239742028e-05, "loss": 0.0482, "num_input_tokens_seen": 5006968, "step": 55610 }, { "epoch": 14.452962577962579, "grad_norm": 16.96111297607422, "learning_rate": 1.0830126925983158e-05, "loss": 0.1943, "num_input_tokens_seen": 5007416, "step": 55615 }, { "epoch": 14.454261954261954, "grad_norm": 16.31308364868164, "learning_rate": 1.082545634099444e-05, "loss": 0.1614, "num_input_tokens_seen": 5007848, "step": 55620 }, { "epoch": 14.45556133056133, "grad_norm": 0.9293377995491028, "learning_rate": 1.0820786485016082e-05, "loss": 0.1556, "num_input_tokens_seen": 5008312, "step": 55625 }, { "epoch": 14.456860706860708, "grad_norm": 18.033395767211914, "learning_rate": 1.0816117358288272e-05, "loss": 0.472, "num_input_tokens_seen": 5008792, "step": 55630 }, { "epoch": 14.458160083160083, "grad_norm": 7.228826522827148, "learning_rate": 1.0811448961051123e-05, "loss": 0.0923, "num_input_tokens_seen": 5009208, "step": 55635 }, { "epoch": 14.45945945945946, "grad_norm": 2.3408126831054688, "learning_rate": 1.0806781293544763e-05, "loss": 0.0187, "num_input_tokens_seen": 5009640, "step": 55640 }, { "epoch": 14.460758835758837, "grad_norm": 5.24503755569458, "learning_rate": 1.080211435600923e-05, "loss": 0.1256, "num_input_tokens_seen": 5010072, "step": 55645 }, { "epoch": 14.462058212058212, "grad_norm": 2.0779712200164795, "learning_rate": 1.0797448148684566e-05, "loss": 0.1538, "num_input_tokens_seen": 5010568, "step": 55650 }, { "epoch": 14.463357588357589, "grad_norm": 0.013149782083928585, "learning_rate": 1.0792782671810745e-05, "loss": 0.1975, "num_input_tokens_seen": 5011048, "step": 55655 }, { "epoch": 14.464656964656964, "grad_norm": 0.23238171637058258, "learning_rate": 1.0788117925627724e-05, "loss": 0.0198, "num_input_tokens_seen": 5011480, "step": 55660 }, { "epoch": 14.46595634095634, "grad_norm": 6.880216121673584, "learning_rate": 1.0783453910375424e-05, "loss": 0.1834, "num_input_tokens_seen": 5011928, "step": 55665 }, { "epoch": 14.467255717255718, "grad_norm": 11.685359001159668, "learning_rate": 1.07787906262937e-05, "loss": 0.3127, "num_input_tokens_seen": 5012360, "step": 55670 }, { "epoch": 14.468555093555093, "grad_norm": 0.533006489276886, "learning_rate": 1.077412807362241e-05, "loss": 0.2066, "num_input_tokens_seen": 5012792, "step": 55675 }, { "epoch": 14.46985446985447, "grad_norm": 16.56940460205078, "learning_rate": 1.0769466252601337e-05, "loss": 0.5146, "num_input_tokens_seen": 5013224, "step": 55680 }, { "epoch": 14.471153846153847, "grad_norm": 0.05780533701181412, "learning_rate": 1.076480516347026e-05, "loss": 0.6988, "num_input_tokens_seen": 5013704, "step": 55685 }, { "epoch": 14.472453222453222, "grad_norm": 10.71416187286377, "learning_rate": 1.076014480646888e-05, "loss": 0.1649, "num_input_tokens_seen": 5014136, "step": 55690 }, { "epoch": 14.473752598752599, "grad_norm": 0.30174410343170166, "learning_rate": 1.0755485181836897e-05, "loss": 0.0616, "num_input_tokens_seen": 5014568, "step": 55695 }, { "epoch": 14.475051975051976, "grad_norm": 7.347354888916016, "learning_rate": 1.0750826289813961e-05, "loss": 0.3179, "num_input_tokens_seen": 5015000, "step": 55700 }, { "epoch": 14.47635135135135, "grad_norm": 4.54085111618042, "learning_rate": 1.0746168130639689e-05, "loss": 0.2943, "num_input_tokens_seen": 5015464, "step": 55705 }, { "epoch": 14.477650727650728, "grad_norm": 1.7387168407440186, "learning_rate": 1.074151070455364e-05, "loss": 0.2997, "num_input_tokens_seen": 5015912, "step": 55710 }, { "epoch": 14.478950103950105, "grad_norm": 0.03135110065340996, "learning_rate": 1.0736854011795367e-05, "loss": 0.1018, "num_input_tokens_seen": 5016344, "step": 55715 }, { "epoch": 14.48024948024948, "grad_norm": 12.503750801086426, "learning_rate": 1.0732198052604347e-05, "loss": 0.237, "num_input_tokens_seen": 5016808, "step": 55720 }, { "epoch": 14.481548856548857, "grad_norm": 14.028278350830078, "learning_rate": 1.0727542827220056e-05, "loss": 0.1277, "num_input_tokens_seen": 5017256, "step": 55725 }, { "epoch": 14.482848232848234, "grad_norm": 1.1890645027160645, "learning_rate": 1.0722888335881917e-05, "loss": 0.5393, "num_input_tokens_seen": 5017704, "step": 55730 }, { "epoch": 14.484147609147609, "grad_norm": 17.60149383544922, "learning_rate": 1.0718234578829304e-05, "loss": 0.1758, "num_input_tokens_seen": 5018152, "step": 55735 }, { "epoch": 14.485446985446986, "grad_norm": 0.8629584312438965, "learning_rate": 1.0713581556301577e-05, "loss": 0.0448, "num_input_tokens_seen": 5018600, "step": 55740 }, { "epoch": 14.486746361746361, "grad_norm": 4.935962200164795, "learning_rate": 1.0708929268538034e-05, "loss": 0.1862, "num_input_tokens_seen": 5019080, "step": 55745 }, { "epoch": 14.488045738045738, "grad_norm": 1.8348019123077393, "learning_rate": 1.0704277715777957e-05, "loss": 0.2869, "num_input_tokens_seen": 5019560, "step": 55750 }, { "epoch": 14.489345114345115, "grad_norm": 10.12108325958252, "learning_rate": 1.0699626898260571e-05, "loss": 0.2389, "num_input_tokens_seen": 5020008, "step": 55755 }, { "epoch": 14.49064449064449, "grad_norm": 0.43729352951049805, "learning_rate": 1.0694976816225073e-05, "loss": 0.165, "num_input_tokens_seen": 5020440, "step": 55760 }, { "epoch": 14.491943866943867, "grad_norm": 3.4471418857574463, "learning_rate": 1.0690327469910622e-05, "loss": 0.1858, "num_input_tokens_seen": 5020920, "step": 55765 }, { "epoch": 14.493243243243244, "grad_norm": 8.670600891113281, "learning_rate": 1.0685678859556348e-05, "loss": 0.2386, "num_input_tokens_seen": 5021384, "step": 55770 }, { "epoch": 14.494542619542619, "grad_norm": 14.336621284484863, "learning_rate": 1.0681030985401319e-05, "loss": 0.1355, "num_input_tokens_seen": 5021800, "step": 55775 }, { "epoch": 14.495841995841996, "grad_norm": 0.2364189624786377, "learning_rate": 1.0676383847684595e-05, "loss": 0.0208, "num_input_tokens_seen": 5022280, "step": 55780 }, { "epoch": 14.497141372141373, "grad_norm": 0.17185820639133453, "learning_rate": 1.0671737446645163e-05, "loss": 0.031, "num_input_tokens_seen": 5022712, "step": 55785 }, { "epoch": 14.498440748440748, "grad_norm": 16.792449951171875, "learning_rate": 1.0667091782522015e-05, "loss": 0.1267, "num_input_tokens_seen": 5023128, "step": 55790 }, { "epoch": 14.499740124740125, "grad_norm": 0.16881516575813293, "learning_rate": 1.0662446855554059e-05, "loss": 0.1494, "num_input_tokens_seen": 5023592, "step": 55795 }, { "epoch": 14.5010395010395, "grad_norm": 14.153520584106445, "learning_rate": 1.0657802665980199e-05, "loss": 0.3456, "num_input_tokens_seen": 5024040, "step": 55800 }, { "epoch": 14.502338877338877, "grad_norm": 5.400756359100342, "learning_rate": 1.06531592140393e-05, "loss": 0.1242, "num_input_tokens_seen": 5024456, "step": 55805 }, { "epoch": 14.503638253638254, "grad_norm": 23.074583053588867, "learning_rate": 1.0648516499970159e-05, "loss": 0.4919, "num_input_tokens_seen": 5024952, "step": 55810 }, { "epoch": 14.50493762993763, "grad_norm": 2.7892136573791504, "learning_rate": 1.0643874524011573e-05, "loss": 0.0573, "num_input_tokens_seen": 5025448, "step": 55815 }, { "epoch": 14.506237006237006, "grad_norm": 14.491083145141602, "learning_rate": 1.063923328640227e-05, "loss": 0.2609, "num_input_tokens_seen": 5025912, "step": 55820 }, { "epoch": 14.507536382536383, "grad_norm": 3.5124666690826416, "learning_rate": 1.0634592787380965e-05, "loss": 0.029, "num_input_tokens_seen": 5026328, "step": 55825 }, { "epoch": 14.508835758835758, "grad_norm": 4.114378929138184, "learning_rate": 1.062995302718631e-05, "loss": 0.3407, "num_input_tokens_seen": 5026776, "step": 55830 }, { "epoch": 14.510135135135135, "grad_norm": 6.843684196472168, "learning_rate": 1.062531400605694e-05, "loss": 0.2351, "num_input_tokens_seen": 5027192, "step": 55835 }, { "epoch": 14.511434511434512, "grad_norm": 21.52175521850586, "learning_rate": 1.062067572423144e-05, "loss": 0.4485, "num_input_tokens_seen": 5027656, "step": 55840 }, { "epoch": 14.512733887733887, "grad_norm": 14.267661094665527, "learning_rate": 1.061603818194838e-05, "loss": 0.4607, "num_input_tokens_seen": 5028152, "step": 55845 }, { "epoch": 14.514033264033264, "grad_norm": 0.3736303150653839, "learning_rate": 1.0611401379446245e-05, "loss": 0.0057, "num_input_tokens_seen": 5028632, "step": 55850 }, { "epoch": 14.515332640332641, "grad_norm": 3.5017688274383545, "learning_rate": 1.0606765316963533e-05, "loss": 0.166, "num_input_tokens_seen": 5029032, "step": 55855 }, { "epoch": 14.516632016632016, "grad_norm": 0.022187281399965286, "learning_rate": 1.0602129994738672e-05, "loss": 0.2294, "num_input_tokens_seen": 5029512, "step": 55860 }, { "epoch": 14.517931392931393, "grad_norm": 0.38602110743522644, "learning_rate": 1.0597495413010053e-05, "loss": 0.104, "num_input_tokens_seen": 5029992, "step": 55865 }, { "epoch": 14.51923076923077, "grad_norm": 6.039670944213867, "learning_rate": 1.0592861572016042e-05, "loss": 0.5741, "num_input_tokens_seen": 5030472, "step": 55870 }, { "epoch": 14.520530145530145, "grad_norm": 0.15887552499771118, "learning_rate": 1.0588228471994966e-05, "loss": 0.2206, "num_input_tokens_seen": 5030888, "step": 55875 }, { "epoch": 14.521829521829522, "grad_norm": 21.142837524414062, "learning_rate": 1.0583596113185119e-05, "loss": 0.0834, "num_input_tokens_seen": 5031368, "step": 55880 }, { "epoch": 14.523128898128899, "grad_norm": 0.047587089240550995, "learning_rate": 1.0578964495824725e-05, "loss": 0.4676, "num_input_tokens_seen": 5031864, "step": 55885 }, { "epoch": 14.524428274428274, "grad_norm": 0.37582311034202576, "learning_rate": 1.0574333620152016e-05, "loss": 0.3727, "num_input_tokens_seen": 5032328, "step": 55890 }, { "epoch": 14.525727650727651, "grad_norm": 0.4410141408443451, "learning_rate": 1.0569703486405141e-05, "loss": 0.0385, "num_input_tokens_seen": 5032792, "step": 55895 }, { "epoch": 14.527027027027026, "grad_norm": 21.947269439697266, "learning_rate": 1.0565074094822252e-05, "loss": 0.3582, "num_input_tokens_seen": 5033240, "step": 55900 }, { "epoch": 14.528326403326403, "grad_norm": 7.6096110343933105, "learning_rate": 1.0560445445641423e-05, "loss": 0.2226, "num_input_tokens_seen": 5033656, "step": 55905 }, { "epoch": 14.52962577962578, "grad_norm": 3.7354319095611572, "learning_rate": 1.055581753910072e-05, "loss": 0.4209, "num_input_tokens_seen": 5034120, "step": 55910 }, { "epoch": 14.530925155925155, "grad_norm": 4.0666632652282715, "learning_rate": 1.0551190375438163e-05, "loss": 0.6899, "num_input_tokens_seen": 5034568, "step": 55915 }, { "epoch": 14.532224532224532, "grad_norm": 20.955154418945312, "learning_rate": 1.0546563954891736e-05, "loss": 0.344, "num_input_tokens_seen": 5035016, "step": 55920 }, { "epoch": 14.53352390852391, "grad_norm": 0.08869486302137375, "learning_rate": 1.0541938277699373e-05, "loss": 0.4807, "num_input_tokens_seen": 5035480, "step": 55925 }, { "epoch": 14.534823284823284, "grad_norm": 2.517174482345581, "learning_rate": 1.053731334409897e-05, "loss": 0.1715, "num_input_tokens_seen": 5035912, "step": 55930 }, { "epoch": 14.536122661122661, "grad_norm": 1.0658172369003296, "learning_rate": 1.0532689154328407e-05, "loss": 0.0143, "num_input_tokens_seen": 5036360, "step": 55935 }, { "epoch": 14.537422037422038, "grad_norm": 2.2208309173583984, "learning_rate": 1.0528065708625495e-05, "loss": 0.1324, "num_input_tokens_seen": 5036824, "step": 55940 }, { "epoch": 14.538721413721413, "grad_norm": 9.648567199707031, "learning_rate": 1.052344300722803e-05, "loss": 0.2786, "num_input_tokens_seen": 5037288, "step": 55945 }, { "epoch": 14.54002079002079, "grad_norm": 0.02175186760723591, "learning_rate": 1.0518821050373758e-05, "loss": 0.0081, "num_input_tokens_seen": 5037720, "step": 55950 }, { "epoch": 14.541320166320165, "grad_norm": 13.325606346130371, "learning_rate": 1.051419983830041e-05, "loss": 0.2565, "num_input_tokens_seen": 5038168, "step": 55955 }, { "epoch": 14.542619542619542, "grad_norm": 6.536252498626709, "learning_rate": 1.050957937124563e-05, "loss": 0.0573, "num_input_tokens_seen": 5038632, "step": 55960 }, { "epoch": 14.54391891891892, "grad_norm": 0.983636200428009, "learning_rate": 1.0504959649447077e-05, "loss": 0.1509, "num_input_tokens_seen": 5039064, "step": 55965 }, { "epoch": 14.545218295218294, "grad_norm": 0.12478818744421005, "learning_rate": 1.0500340673142331e-05, "loss": 0.7151, "num_input_tokens_seen": 5039528, "step": 55970 }, { "epoch": 14.546517671517671, "grad_norm": 0.1552596241235733, "learning_rate": 1.0495722442568954e-05, "loss": 0.1398, "num_input_tokens_seen": 5039992, "step": 55975 }, { "epoch": 14.547817047817048, "grad_norm": 18.02665138244629, "learning_rate": 1.0491104957964471e-05, "loss": 0.4608, "num_input_tokens_seen": 5040472, "step": 55980 }, { "epoch": 14.549116424116423, "grad_norm": 13.513374328613281, "learning_rate": 1.048648821956637e-05, "loss": 0.1565, "num_input_tokens_seen": 5040920, "step": 55985 }, { "epoch": 14.5504158004158, "grad_norm": 18.67021369934082, "learning_rate": 1.0481872227612085e-05, "loss": 0.1774, "num_input_tokens_seen": 5041384, "step": 55990 }, { "epoch": 14.551715176715177, "grad_norm": 0.710767924785614, "learning_rate": 1.0477256982339015e-05, "loss": 0.0062, "num_input_tokens_seen": 5041864, "step": 55995 }, { "epoch": 14.553014553014552, "grad_norm": 14.495352745056152, "learning_rate": 1.047264248398454e-05, "loss": 0.2707, "num_input_tokens_seen": 5042344, "step": 56000 }, { "epoch": 14.55431392931393, "grad_norm": 6.1088457107543945, "learning_rate": 1.0468028732785975e-05, "loss": 0.1264, "num_input_tokens_seen": 5042776, "step": 56005 }, { "epoch": 14.555613305613306, "grad_norm": 14.373013496398926, "learning_rate": 1.0463415728980617e-05, "loss": 0.3057, "num_input_tokens_seen": 5043256, "step": 56010 }, { "epoch": 14.556912681912682, "grad_norm": 19.372879028320312, "learning_rate": 1.0458803472805714e-05, "loss": 0.1803, "num_input_tokens_seen": 5043752, "step": 56015 }, { "epoch": 14.558212058212058, "grad_norm": 37.078468322753906, "learning_rate": 1.0454191964498491e-05, "loss": 0.4252, "num_input_tokens_seen": 5044232, "step": 56020 }, { "epoch": 14.559511434511435, "grad_norm": 19.307193756103516, "learning_rate": 1.0449581204296102e-05, "loss": 0.2892, "num_input_tokens_seen": 5044664, "step": 56025 }, { "epoch": 14.56081081081081, "grad_norm": 9.661954879760742, "learning_rate": 1.0444971192435704e-05, "loss": 0.314, "num_input_tokens_seen": 5045128, "step": 56030 }, { "epoch": 14.562110187110187, "grad_norm": 24.39744758605957, "learning_rate": 1.0440361929154375e-05, "loss": 0.1217, "num_input_tokens_seen": 5045640, "step": 56035 }, { "epoch": 14.563409563409563, "grad_norm": 23.358264923095703, "learning_rate": 1.043575341468919e-05, "loss": 0.3951, "num_input_tokens_seen": 5046136, "step": 56040 }, { "epoch": 14.56470893970894, "grad_norm": 1.2507833242416382, "learning_rate": 1.0431145649277152e-05, "loss": 0.1322, "num_input_tokens_seen": 5046584, "step": 56045 }, { "epoch": 14.566008316008316, "grad_norm": 0.15887580811977386, "learning_rate": 1.0426538633155253e-05, "loss": 0.2128, "num_input_tokens_seen": 5047064, "step": 56050 }, { "epoch": 14.567307692307692, "grad_norm": 23.573829650878906, "learning_rate": 1.0421932366560444e-05, "loss": 0.2824, "num_input_tokens_seen": 5047544, "step": 56055 }, { "epoch": 14.568607068607069, "grad_norm": 1.3326128721237183, "learning_rate": 1.0417326849729614e-05, "loss": 0.0328, "num_input_tokens_seen": 5048008, "step": 56060 }, { "epoch": 14.569906444906445, "grad_norm": 12.758810997009277, "learning_rate": 1.0412722082899644e-05, "loss": 0.1581, "num_input_tokens_seen": 5048488, "step": 56065 }, { "epoch": 14.57120582120582, "grad_norm": 0.43430858850479126, "learning_rate": 1.0408118066307346e-05, "loss": 0.0818, "num_input_tokens_seen": 5048952, "step": 56070 }, { "epoch": 14.572505197505198, "grad_norm": 3.776870012283325, "learning_rate": 1.0403514800189526e-05, "loss": 0.1948, "num_input_tokens_seen": 5049432, "step": 56075 }, { "epoch": 14.573804573804575, "grad_norm": 5.697384357452393, "learning_rate": 1.0398912284782918e-05, "loss": 0.0292, "num_input_tokens_seen": 5049864, "step": 56080 }, { "epoch": 14.57510395010395, "grad_norm": 2.5337328910827637, "learning_rate": 1.039431052032424e-05, "loss": 0.1627, "num_input_tokens_seen": 5050360, "step": 56085 }, { "epoch": 14.576403326403327, "grad_norm": 0.01959220878779888, "learning_rate": 1.0389709507050166e-05, "loss": 0.3826, "num_input_tokens_seen": 5050824, "step": 56090 }, { "epoch": 14.577702702702704, "grad_norm": 0.4722401201725006, "learning_rate": 1.0385109245197342e-05, "loss": 0.0568, "num_input_tokens_seen": 5051256, "step": 56095 }, { "epoch": 14.579002079002079, "grad_norm": 4.847707271575928, "learning_rate": 1.0380509735002341e-05, "loss": 0.4059, "num_input_tokens_seen": 5051688, "step": 56100 }, { "epoch": 14.580301455301456, "grad_norm": 13.177460670471191, "learning_rate": 1.0375910976701742e-05, "loss": 0.3131, "num_input_tokens_seen": 5052120, "step": 56105 }, { "epoch": 14.58160083160083, "grad_norm": 2.0057010650634766, "learning_rate": 1.0371312970532046e-05, "loss": 0.0385, "num_input_tokens_seen": 5052536, "step": 56110 }, { "epoch": 14.582900207900208, "grad_norm": 13.787325859069824, "learning_rate": 1.0366715716729747e-05, "loss": 0.1695, "num_input_tokens_seen": 5052968, "step": 56115 }, { "epoch": 14.584199584199585, "grad_norm": 12.472116470336914, "learning_rate": 1.0362119215531271e-05, "loss": 0.0811, "num_input_tokens_seen": 5053416, "step": 56120 }, { "epoch": 14.58549896049896, "grad_norm": 18.915035247802734, "learning_rate": 1.0357523467173031e-05, "loss": 0.2961, "num_input_tokens_seen": 5053896, "step": 56125 }, { "epoch": 14.586798336798337, "grad_norm": 7.159388542175293, "learning_rate": 1.0352928471891396e-05, "loss": 0.1364, "num_input_tokens_seen": 5054344, "step": 56130 }, { "epoch": 14.588097713097714, "grad_norm": 5.073250770568848, "learning_rate": 1.0348334229922677e-05, "loss": 0.2544, "num_input_tokens_seen": 5054824, "step": 56135 }, { "epoch": 14.589397089397089, "grad_norm": 15.511479377746582, "learning_rate": 1.0343740741503174e-05, "loss": 0.551, "num_input_tokens_seen": 5055304, "step": 56140 }, { "epoch": 14.590696465696466, "grad_norm": 5.258019924163818, "learning_rate": 1.033914800686912e-05, "loss": 0.2396, "num_input_tokens_seen": 5055784, "step": 56145 }, { "epoch": 14.591995841995843, "grad_norm": 4.753079891204834, "learning_rate": 1.033455602625674e-05, "loss": 0.0248, "num_input_tokens_seen": 5056216, "step": 56150 }, { "epoch": 14.593295218295218, "grad_norm": 0.8886027932167053, "learning_rate": 1.0329964799902187e-05, "loss": 0.2877, "num_input_tokens_seen": 5056680, "step": 56155 }, { "epoch": 14.594594594594595, "grad_norm": 1.382810115814209, "learning_rate": 1.0325374328041598e-05, "loss": 0.1791, "num_input_tokens_seen": 5057112, "step": 56160 }, { "epoch": 14.595893970893972, "grad_norm": 20.333282470703125, "learning_rate": 1.0320784610911069e-05, "loss": 0.3164, "num_input_tokens_seen": 5057544, "step": 56165 }, { "epoch": 14.597193347193347, "grad_norm": 0.4744909703731537, "learning_rate": 1.0316195648746662e-05, "loss": 0.0656, "num_input_tokens_seen": 5058024, "step": 56170 }, { "epoch": 14.598492723492724, "grad_norm": 0.05126458778977394, "learning_rate": 1.0311607441784374e-05, "loss": 0.196, "num_input_tokens_seen": 5058472, "step": 56175 }, { "epoch": 14.5997920997921, "grad_norm": 0.10842486470937729, "learning_rate": 1.0307019990260198e-05, "loss": 0.0341, "num_input_tokens_seen": 5058920, "step": 56180 }, { "epoch": 14.601091476091476, "grad_norm": 4.650196075439453, "learning_rate": 1.030243329441006e-05, "loss": 0.0457, "num_input_tokens_seen": 5059368, "step": 56185 }, { "epoch": 14.602390852390853, "grad_norm": 0.47121310234069824, "learning_rate": 1.0297847354469853e-05, "loss": 0.2158, "num_input_tokens_seen": 5059816, "step": 56190 }, { "epoch": 14.603690228690228, "grad_norm": 0.009846508502960205, "learning_rate": 1.0293262170675441e-05, "loss": 0.2533, "num_input_tokens_seen": 5060264, "step": 56195 }, { "epoch": 14.604989604989605, "grad_norm": 0.12517552077770233, "learning_rate": 1.028867774326265e-05, "loss": 0.6392, "num_input_tokens_seen": 5060680, "step": 56200 }, { "epoch": 14.606288981288982, "grad_norm": 10.212178230285645, "learning_rate": 1.0284094072467266e-05, "loss": 0.3625, "num_input_tokens_seen": 5061128, "step": 56205 }, { "epoch": 14.607588357588357, "grad_norm": 2.67195200920105, "learning_rate": 1.0279511158525015e-05, "loss": 0.4392, "num_input_tokens_seen": 5061560, "step": 56210 }, { "epoch": 14.608887733887734, "grad_norm": 0.18263301253318787, "learning_rate": 1.0274929001671618e-05, "loss": 0.1443, "num_input_tokens_seen": 5061976, "step": 56215 }, { "epoch": 14.61018711018711, "grad_norm": 9.710325241088867, "learning_rate": 1.0270347602142724e-05, "loss": 0.4014, "num_input_tokens_seen": 5062440, "step": 56220 }, { "epoch": 14.611486486486486, "grad_norm": 0.8512765169143677, "learning_rate": 1.0265766960173965e-05, "loss": 0.0718, "num_input_tokens_seen": 5062872, "step": 56225 }, { "epoch": 14.612785862785863, "grad_norm": 18.921289443969727, "learning_rate": 1.0261187076000928e-05, "loss": 0.1201, "num_input_tokens_seen": 5063352, "step": 56230 }, { "epoch": 14.61408523908524, "grad_norm": 0.22108244895935059, "learning_rate": 1.0256607949859171e-05, "loss": 0.2814, "num_input_tokens_seen": 5063816, "step": 56235 }, { "epoch": 14.615384615384615, "grad_norm": 11.632366180419922, "learning_rate": 1.0252029581984185e-05, "loss": 0.1734, "num_input_tokens_seen": 5064280, "step": 56240 }, { "epoch": 14.616683991683992, "grad_norm": 6.725160121917725, "learning_rate": 1.0247451972611457e-05, "loss": 0.1256, "num_input_tokens_seen": 5064760, "step": 56245 }, { "epoch": 14.617983367983367, "grad_norm": 1.0143142938613892, "learning_rate": 1.0242875121976412e-05, "loss": 0.1245, "num_input_tokens_seen": 5065176, "step": 56250 }, { "epoch": 14.619282744282744, "grad_norm": 1.0467039346694946, "learning_rate": 1.0238299030314427e-05, "loss": 0.2777, "num_input_tokens_seen": 5065624, "step": 56255 }, { "epoch": 14.620582120582121, "grad_norm": 0.04099159315228462, "learning_rate": 1.0233723697860867e-05, "loss": 0.5409, "num_input_tokens_seen": 5066088, "step": 56260 }, { "epoch": 14.621881496881496, "grad_norm": 24.069860458374023, "learning_rate": 1.0229149124851048e-05, "loss": 0.2883, "num_input_tokens_seen": 5066520, "step": 56265 }, { "epoch": 14.623180873180873, "grad_norm": 26.241607666015625, "learning_rate": 1.0224575311520248e-05, "loss": 0.2412, "num_input_tokens_seen": 5067000, "step": 56270 }, { "epoch": 14.62448024948025, "grad_norm": 1.2104085683822632, "learning_rate": 1.022000225810369e-05, "loss": 0.79, "num_input_tokens_seen": 5067448, "step": 56275 }, { "epoch": 14.625779625779625, "grad_norm": 4.898867130279541, "learning_rate": 1.0215429964836587e-05, "loss": 0.3056, "num_input_tokens_seen": 5067880, "step": 56280 }, { "epoch": 14.627079002079002, "grad_norm": 28.68398666381836, "learning_rate": 1.0210858431954076e-05, "loss": 0.2332, "num_input_tokens_seen": 5068360, "step": 56285 }, { "epoch": 14.628378378378379, "grad_norm": 5.052353382110596, "learning_rate": 1.0206287659691294e-05, "loss": 0.4399, "num_input_tokens_seen": 5068840, "step": 56290 }, { "epoch": 14.629677754677754, "grad_norm": 14.247724533081055, "learning_rate": 1.0201717648283308e-05, "loss": 0.3133, "num_input_tokens_seen": 5069256, "step": 56295 }, { "epoch": 14.630977130977131, "grad_norm": 9.050891876220703, "learning_rate": 1.019714839796516e-05, "loss": 0.0802, "num_input_tokens_seen": 5069704, "step": 56300 }, { "epoch": 14.632276507276508, "grad_norm": 0.47624409198760986, "learning_rate": 1.019257990897185e-05, "loss": 0.0229, "num_input_tokens_seen": 5070136, "step": 56305 }, { "epoch": 14.633575883575883, "grad_norm": 0.13481637835502625, "learning_rate": 1.0188012181538357e-05, "loss": 0.0801, "num_input_tokens_seen": 5070552, "step": 56310 }, { "epoch": 14.63487525987526, "grad_norm": 0.392116904258728, "learning_rate": 1.0183445215899584e-05, "loss": 0.0858, "num_input_tokens_seen": 5071000, "step": 56315 }, { "epoch": 14.636174636174637, "grad_norm": 5.2980241775512695, "learning_rate": 1.0178879012290415e-05, "loss": 0.2504, "num_input_tokens_seen": 5071480, "step": 56320 }, { "epoch": 14.637474012474012, "grad_norm": 16.890804290771484, "learning_rate": 1.017431357094571e-05, "loss": 0.4397, "num_input_tokens_seen": 5071976, "step": 56325 }, { "epoch": 14.638773388773389, "grad_norm": 0.5661875605583191, "learning_rate": 1.016974889210025e-05, "loss": 0.1834, "num_input_tokens_seen": 5072424, "step": 56330 }, { "epoch": 14.640072765072766, "grad_norm": 0.2140771597623825, "learning_rate": 1.0165184975988818e-05, "loss": 0.0802, "num_input_tokens_seen": 5072856, "step": 56335 }, { "epoch": 14.641372141372141, "grad_norm": 5.879693984985352, "learning_rate": 1.0160621822846133e-05, "loss": 0.1423, "num_input_tokens_seen": 5073304, "step": 56340 }, { "epoch": 14.642671517671518, "grad_norm": 3.8517532348632812, "learning_rate": 1.0156059432906898e-05, "loss": 0.1923, "num_input_tokens_seen": 5073736, "step": 56345 }, { "epoch": 14.643970893970893, "grad_norm": 0.33016976714134216, "learning_rate": 1.0151497806405741e-05, "loss": 0.0498, "num_input_tokens_seen": 5074184, "step": 56350 }, { "epoch": 14.64527027027027, "grad_norm": 1.2991379499435425, "learning_rate": 1.0146936943577284e-05, "loss": 0.1317, "num_input_tokens_seen": 5074632, "step": 56355 }, { "epoch": 14.646569646569647, "grad_norm": 31.08401870727539, "learning_rate": 1.0142376844656085e-05, "loss": 0.6031, "num_input_tokens_seen": 5075096, "step": 56360 }, { "epoch": 14.647869022869022, "grad_norm": 15.85882568359375, "learning_rate": 1.013781750987669e-05, "loss": 0.0935, "num_input_tokens_seen": 5075608, "step": 56365 }, { "epoch": 14.6491683991684, "grad_norm": 11.125116348266602, "learning_rate": 1.0133258939473573e-05, "loss": 0.0449, "num_input_tokens_seen": 5076056, "step": 56370 }, { "epoch": 14.650467775467776, "grad_norm": 20.096773147583008, "learning_rate": 1.012870113368119e-05, "loss": 0.1765, "num_input_tokens_seen": 5076504, "step": 56375 }, { "epoch": 14.651767151767151, "grad_norm": 0.08316165953874588, "learning_rate": 1.0124144092733967e-05, "loss": 0.0291, "num_input_tokens_seen": 5076920, "step": 56380 }, { "epoch": 14.653066528066528, "grad_norm": 0.7430049180984497, "learning_rate": 1.0119587816866258e-05, "loss": 0.4955, "num_input_tokens_seen": 5077352, "step": 56385 }, { "epoch": 14.654365904365905, "grad_norm": 13.61556339263916, "learning_rate": 1.0115032306312414e-05, "loss": 0.3668, "num_input_tokens_seen": 5077800, "step": 56390 }, { "epoch": 14.65566528066528, "grad_norm": 3.6438210010528564, "learning_rate": 1.0110477561306714e-05, "loss": 0.0197, "num_input_tokens_seen": 5078280, "step": 56395 }, { "epoch": 14.656964656964657, "grad_norm": 3.618605136871338, "learning_rate": 1.0105923582083424e-05, "loss": 0.4855, "num_input_tokens_seen": 5078792, "step": 56400 }, { "epoch": 14.658264033264032, "grad_norm": 14.725701332092285, "learning_rate": 1.010137036887675e-05, "loss": 0.1063, "num_input_tokens_seen": 5079272, "step": 56405 }, { "epoch": 14.65956340956341, "grad_norm": 15.90783977508545, "learning_rate": 1.0096817921920871e-05, "loss": 0.2078, "num_input_tokens_seen": 5079720, "step": 56410 }, { "epoch": 14.660862785862786, "grad_norm": 0.06039479747414589, "learning_rate": 1.0092266241449927e-05, "loss": 0.1141, "num_input_tokens_seen": 5080184, "step": 56415 }, { "epoch": 14.662162162162161, "grad_norm": 23.272491455078125, "learning_rate": 1.0087715327698022e-05, "loss": 0.2343, "num_input_tokens_seen": 5080664, "step": 56420 }, { "epoch": 14.663461538461538, "grad_norm": 0.1880236566066742, "learning_rate": 1.0083165180899196e-05, "loss": 0.0606, "num_input_tokens_seen": 5081128, "step": 56425 }, { "epoch": 14.664760914760915, "grad_norm": 0.32349899411201477, "learning_rate": 1.007861580128749e-05, "loss": 0.0972, "num_input_tokens_seen": 5081624, "step": 56430 }, { "epoch": 14.66606029106029, "grad_norm": 8.369048118591309, "learning_rate": 1.0074067189096855e-05, "loss": 0.3593, "num_input_tokens_seen": 5082040, "step": 56435 }, { "epoch": 14.667359667359667, "grad_norm": 5.513123035430908, "learning_rate": 1.0069519344561259e-05, "loss": 0.2447, "num_input_tokens_seen": 5082472, "step": 56440 }, { "epoch": 14.668659043659044, "grad_norm": 7.8287835121154785, "learning_rate": 1.0064972267914583e-05, "loss": 0.0606, "num_input_tokens_seen": 5082936, "step": 56445 }, { "epoch": 14.66995841995842, "grad_norm": 0.479192316532135, "learning_rate": 1.0060425959390688e-05, "loss": 0.0602, "num_input_tokens_seen": 5083384, "step": 56450 }, { "epoch": 14.671257796257796, "grad_norm": 0.8195523619651794, "learning_rate": 1.0055880419223413e-05, "loss": 0.0527, "num_input_tokens_seen": 5083832, "step": 56455 }, { "epoch": 14.672557172557173, "grad_norm": 0.3757670819759369, "learning_rate": 1.0051335647646515e-05, "loss": 0.0277, "num_input_tokens_seen": 5084264, "step": 56460 }, { "epoch": 14.673856548856548, "grad_norm": 2.149864673614502, "learning_rate": 1.0046791644893758e-05, "loss": 0.3496, "num_input_tokens_seen": 5084712, "step": 56465 }, { "epoch": 14.675155925155925, "grad_norm": 25.972171783447266, "learning_rate": 1.0042248411198824e-05, "loss": 0.1425, "num_input_tokens_seen": 5085144, "step": 56470 }, { "epoch": 14.676455301455302, "grad_norm": 15.736416816711426, "learning_rate": 1.0037705946795386e-05, "loss": 0.4932, "num_input_tokens_seen": 5085592, "step": 56475 }, { "epoch": 14.677754677754677, "grad_norm": 0.5181918740272522, "learning_rate": 1.0033164251917069e-05, "loss": 0.0346, "num_input_tokens_seen": 5086008, "step": 56480 }, { "epoch": 14.679054054054054, "grad_norm": 7.556282043457031, "learning_rate": 1.0028623326797465e-05, "loss": 0.3661, "num_input_tokens_seen": 5086472, "step": 56485 }, { "epoch": 14.68035343035343, "grad_norm": 0.10398872941732407, "learning_rate": 1.0024083171670095e-05, "loss": 0.3175, "num_input_tokens_seen": 5086952, "step": 56490 }, { "epoch": 14.681652806652806, "grad_norm": 0.24211983382701874, "learning_rate": 1.001954378676849e-05, "loss": 0.1338, "num_input_tokens_seen": 5087384, "step": 56495 }, { "epoch": 14.682952182952183, "grad_norm": 22.876190185546875, "learning_rate": 1.0015005172326092e-05, "loss": 0.1729, "num_input_tokens_seen": 5087832, "step": 56500 }, { "epoch": 14.684251559251559, "grad_norm": 20.22319221496582, "learning_rate": 1.0010467328576342e-05, "loss": 0.3185, "num_input_tokens_seen": 5088280, "step": 56505 }, { "epoch": 14.685550935550935, "grad_norm": 0.3115059733390808, "learning_rate": 1.0005930255752614e-05, "loss": 0.2067, "num_input_tokens_seen": 5088744, "step": 56510 }, { "epoch": 14.686850311850312, "grad_norm": 0.12184026092290878, "learning_rate": 1.0001393954088257e-05, "loss": 0.1591, "num_input_tokens_seen": 5089192, "step": 56515 }, { "epoch": 14.688149688149688, "grad_norm": 0.2958648204803467, "learning_rate": 9.99685842381659e-06, "loss": 0.0412, "num_input_tokens_seen": 5089624, "step": 56520 }, { "epoch": 14.689449064449065, "grad_norm": 6.127413749694824, "learning_rate": 9.99232366517086e-06, "loss": 0.1596, "num_input_tokens_seen": 5090072, "step": 56525 }, { "epoch": 14.690748440748441, "grad_norm": 6.458987712860107, "learning_rate": 9.987789678384313e-06, "loss": 0.1974, "num_input_tokens_seen": 5090536, "step": 56530 }, { "epoch": 14.692047817047817, "grad_norm": 13.147581100463867, "learning_rate": 9.98325646369012e-06, "loss": 0.1003, "num_input_tokens_seen": 5090984, "step": 56535 }, { "epoch": 14.693347193347194, "grad_norm": 0.020676208660006523, "learning_rate": 9.97872402132144e-06, "loss": 0.141, "num_input_tokens_seen": 5091400, "step": 56540 }, { "epoch": 14.69464656964657, "grad_norm": 0.04075825214385986, "learning_rate": 9.974192351511368e-06, "loss": 0.0333, "num_input_tokens_seen": 5091864, "step": 56545 }, { "epoch": 14.695945945945946, "grad_norm": 6.615090847015381, "learning_rate": 9.969661454492984e-06, "loss": 0.2326, "num_input_tokens_seen": 5092360, "step": 56550 }, { "epoch": 14.697245322245323, "grad_norm": 0.740119218826294, "learning_rate": 9.96513133049931e-06, "loss": 0.5713, "num_input_tokens_seen": 5092824, "step": 56555 }, { "epoch": 14.698544698544698, "grad_norm": 0.5610297918319702, "learning_rate": 9.960601979763347e-06, "loss": 0.3776, "num_input_tokens_seen": 5093288, "step": 56560 }, { "epoch": 14.699844074844075, "grad_norm": 3.143902540206909, "learning_rate": 9.956073402518026e-06, "loss": 0.157, "num_input_tokens_seen": 5093768, "step": 56565 }, { "epoch": 14.701143451143452, "grad_norm": 1.5679607391357422, "learning_rate": 9.951545598996273e-06, "loss": 0.047, "num_input_tokens_seen": 5094216, "step": 56570 }, { "epoch": 14.702442827442827, "grad_norm": 0.0953851267695427, "learning_rate": 9.947018569430947e-06, "loss": 0.4323, "num_input_tokens_seen": 5094680, "step": 56575 }, { "epoch": 14.703742203742204, "grad_norm": 13.866233825683594, "learning_rate": 9.942492314054872e-06, "loss": 0.5396, "num_input_tokens_seen": 5095128, "step": 56580 }, { "epoch": 14.70504158004158, "grad_norm": 1.3268071413040161, "learning_rate": 9.937966833100845e-06, "loss": 0.2913, "num_input_tokens_seen": 5095592, "step": 56585 }, { "epoch": 14.706340956340956, "grad_norm": 0.2063121199607849, "learning_rate": 9.933442126801615e-06, "loss": 0.1507, "num_input_tokens_seen": 5096072, "step": 56590 }, { "epoch": 14.707640332640333, "grad_norm": 0.03889560326933861, "learning_rate": 9.9289181953899e-06, "loss": 0.2953, "num_input_tokens_seen": 5096488, "step": 56595 }, { "epoch": 14.70893970893971, "grad_norm": 3.1768596172332764, "learning_rate": 9.924395039098355e-06, "loss": 0.5229, "num_input_tokens_seen": 5096952, "step": 56600 }, { "epoch": 14.710239085239085, "grad_norm": 0.057686105370521545, "learning_rate": 9.919872658159626e-06, "loss": 0.0461, "num_input_tokens_seen": 5097384, "step": 56605 }, { "epoch": 14.711538461538462, "grad_norm": 0.48536956310272217, "learning_rate": 9.915351052806288e-06, "loss": 0.0221, "num_input_tokens_seen": 5097848, "step": 56610 }, { "epoch": 14.712837837837839, "grad_norm": 8.252479553222656, "learning_rate": 9.910830223270906e-06, "loss": 0.175, "num_input_tokens_seen": 5098296, "step": 56615 }, { "epoch": 14.714137214137214, "grad_norm": 4.304741859436035, "learning_rate": 9.906310169785973e-06, "loss": 0.0898, "num_input_tokens_seen": 5098744, "step": 56620 }, { "epoch": 14.71543659043659, "grad_norm": 3.9049806594848633, "learning_rate": 9.901790892583974e-06, "loss": 0.1751, "num_input_tokens_seen": 5099176, "step": 56625 }, { "epoch": 14.716735966735968, "grad_norm": 1.7056100368499756, "learning_rate": 9.897272391897333e-06, "loss": 0.3654, "num_input_tokens_seen": 5099656, "step": 56630 }, { "epoch": 14.718035343035343, "grad_norm": 1.088831901550293, "learning_rate": 9.892754667958455e-06, "loss": 0.0992, "num_input_tokens_seen": 5100120, "step": 56635 }, { "epoch": 14.71933471933472, "grad_norm": 5.378208160400391, "learning_rate": 9.888237720999677e-06, "loss": 0.2204, "num_input_tokens_seen": 5100632, "step": 56640 }, { "epoch": 14.720634095634095, "grad_norm": 15.45062255859375, "learning_rate": 9.883721551253303e-06, "loss": 0.2338, "num_input_tokens_seen": 5101080, "step": 56645 }, { "epoch": 14.721933471933472, "grad_norm": 1.4111937284469604, "learning_rate": 9.879206158951623e-06, "loss": 0.2709, "num_input_tokens_seen": 5101528, "step": 56650 }, { "epoch": 14.723232848232849, "grad_norm": 0.40287110209465027, "learning_rate": 9.874691544326851e-06, "loss": 0.1897, "num_input_tokens_seen": 5101976, "step": 56655 }, { "epoch": 14.724532224532224, "grad_norm": 1.764137625694275, "learning_rate": 9.870177707611183e-06, "loss": 0.2669, "num_input_tokens_seen": 5102456, "step": 56660 }, { "epoch": 14.7258316008316, "grad_norm": 25.9010066986084, "learning_rate": 9.865664649036774e-06, "loss": 0.2851, "num_input_tokens_seen": 5102920, "step": 56665 }, { "epoch": 14.727130977130978, "grad_norm": 0.1489086151123047, "learning_rate": 9.861152368835742e-06, "loss": 0.5248, "num_input_tokens_seen": 5103384, "step": 56670 }, { "epoch": 14.728430353430353, "grad_norm": 12.73558521270752, "learning_rate": 9.856640867240139e-06, "loss": 0.2172, "num_input_tokens_seen": 5103832, "step": 56675 }, { "epoch": 14.72972972972973, "grad_norm": 20.84507942199707, "learning_rate": 9.852130144482017e-06, "loss": 0.4755, "num_input_tokens_seen": 5104248, "step": 56680 }, { "epoch": 14.731029106029107, "grad_norm": 0.06236911565065384, "learning_rate": 9.847620200793343e-06, "loss": 0.2965, "num_input_tokens_seen": 5104680, "step": 56685 }, { "epoch": 14.732328482328482, "grad_norm": 22.957401275634766, "learning_rate": 9.843111036406092e-06, "loss": 0.439, "num_input_tokens_seen": 5105160, "step": 56690 }, { "epoch": 14.733627858627859, "grad_norm": 14.196057319641113, "learning_rate": 9.838602651552146e-06, "loss": 0.363, "num_input_tokens_seen": 5105576, "step": 56695 }, { "epoch": 14.734927234927234, "grad_norm": 11.597579002380371, "learning_rate": 9.834095046463407e-06, "loss": 0.1218, "num_input_tokens_seen": 5105992, "step": 56700 }, { "epoch": 14.736226611226611, "grad_norm": 3.6412949562072754, "learning_rate": 9.829588221371694e-06, "loss": 0.1828, "num_input_tokens_seen": 5106456, "step": 56705 }, { "epoch": 14.737525987525988, "grad_norm": 3.6442813873291016, "learning_rate": 9.825082176508782e-06, "loss": 0.0612, "num_input_tokens_seen": 5106904, "step": 56710 }, { "epoch": 14.738825363825363, "grad_norm": 0.28352925181388855, "learning_rate": 9.820576912106446e-06, "loss": 0.0765, "num_input_tokens_seen": 5107336, "step": 56715 }, { "epoch": 14.74012474012474, "grad_norm": 0.1936127245426178, "learning_rate": 9.816072428396375e-06, "loss": 0.0217, "num_input_tokens_seen": 5107768, "step": 56720 }, { "epoch": 14.741424116424117, "grad_norm": 1.3074108362197876, "learning_rate": 9.811568725610246e-06, "loss": 0.44, "num_input_tokens_seen": 5108232, "step": 56725 }, { "epoch": 14.742723492723492, "grad_norm": 0.0075962599366903305, "learning_rate": 9.80706580397969e-06, "loss": 0.2378, "num_input_tokens_seen": 5108696, "step": 56730 }, { "epoch": 14.744022869022869, "grad_norm": 0.06807589530944824, "learning_rate": 9.802563663736305e-06, "loss": 0.1031, "num_input_tokens_seen": 5109176, "step": 56735 }, { "epoch": 14.745322245322246, "grad_norm": 12.782777786254883, "learning_rate": 9.798062305111625e-06, "loss": 0.2068, "num_input_tokens_seen": 5109640, "step": 56740 }, { "epoch": 14.746621621621621, "grad_norm": 6.294608116149902, "learning_rate": 9.793561728337176e-06, "loss": 0.2716, "num_input_tokens_seen": 5110072, "step": 56745 }, { "epoch": 14.747920997920998, "grad_norm": 1.4065494537353516, "learning_rate": 9.789061933644405e-06, "loss": 0.0837, "num_input_tokens_seen": 5110504, "step": 56750 }, { "epoch": 14.749220374220375, "grad_norm": 0.9668453931808472, "learning_rate": 9.784562921264767e-06, "loss": 0.206, "num_input_tokens_seen": 5110920, "step": 56755 }, { "epoch": 14.75051975051975, "grad_norm": 22.87360382080078, "learning_rate": 9.780064691429625e-06, "loss": 0.1526, "num_input_tokens_seen": 5111368, "step": 56760 }, { "epoch": 14.751819126819127, "grad_norm": 0.3239985704421997, "learning_rate": 9.77556724437034e-06, "loss": 0.0821, "num_input_tokens_seen": 5111800, "step": 56765 }, { "epoch": 14.753118503118504, "grad_norm": 19.637758255004883, "learning_rate": 9.771070580318225e-06, "loss": 0.4993, "num_input_tokens_seen": 5112216, "step": 56770 }, { "epoch": 14.754417879417879, "grad_norm": 0.2757495939731598, "learning_rate": 9.766574699504536e-06, "loss": 0.1857, "num_input_tokens_seen": 5112648, "step": 56775 }, { "epoch": 14.755717255717256, "grad_norm": 0.006793134845793247, "learning_rate": 9.762079602160515e-06, "loss": 0.1992, "num_input_tokens_seen": 5113128, "step": 56780 }, { "epoch": 14.757016632016633, "grad_norm": 27.110572814941406, "learning_rate": 9.757585288517328e-06, "loss": 0.2569, "num_input_tokens_seen": 5113624, "step": 56785 }, { "epoch": 14.758316008316008, "grad_norm": 0.010371401906013489, "learning_rate": 9.753091758806146e-06, "loss": 0.0655, "num_input_tokens_seen": 5114056, "step": 56790 }, { "epoch": 14.759615384615385, "grad_norm": 42.67673110961914, "learning_rate": 9.748599013258055e-06, "loss": 0.3811, "num_input_tokens_seen": 5114488, "step": 56795 }, { "epoch": 14.76091476091476, "grad_norm": 0.08899030089378357, "learning_rate": 9.744107052104128e-06, "loss": 0.3044, "num_input_tokens_seen": 5114904, "step": 56800 }, { "epoch": 14.762214137214137, "grad_norm": 3.7108166217803955, "learning_rate": 9.739615875575395e-06, "loss": 0.3865, "num_input_tokens_seen": 5115336, "step": 56805 }, { "epoch": 14.763513513513514, "grad_norm": 25.555164337158203, "learning_rate": 9.73512548390285e-06, "loss": 0.3577, "num_input_tokens_seen": 5115784, "step": 56810 }, { "epoch": 14.76481288981289, "grad_norm": 21.851110458374023, "learning_rate": 9.730635877317415e-06, "loss": 0.2436, "num_input_tokens_seen": 5116248, "step": 56815 }, { "epoch": 14.766112266112266, "grad_norm": 10.947218894958496, "learning_rate": 9.726147056050017e-06, "loss": 0.0493, "num_input_tokens_seen": 5116696, "step": 56820 }, { "epoch": 14.767411642411643, "grad_norm": 2.245922327041626, "learning_rate": 9.7216590203315e-06, "loss": 0.0068, "num_input_tokens_seen": 5117176, "step": 56825 }, { "epoch": 14.768711018711018, "grad_norm": 0.04798097163438797, "learning_rate": 9.717171770392711e-06, "loss": 0.2938, "num_input_tokens_seen": 5117624, "step": 56830 }, { "epoch": 14.770010395010395, "grad_norm": 0.04200595244765282, "learning_rate": 9.712685306464408e-06, "loss": 0.5362, "num_input_tokens_seen": 5118040, "step": 56835 }, { "epoch": 14.771309771309772, "grad_norm": 0.14736269414424896, "learning_rate": 9.708199628777351e-06, "loss": 0.4679, "num_input_tokens_seen": 5118472, "step": 56840 }, { "epoch": 14.772609147609147, "grad_norm": 2.7774081230163574, "learning_rate": 9.703714737562246e-06, "loss": 0.1284, "num_input_tokens_seen": 5118904, "step": 56845 }, { "epoch": 14.773908523908524, "grad_norm": 0.7873842716217041, "learning_rate": 9.699230633049742e-06, "loss": 0.1066, "num_input_tokens_seen": 5119336, "step": 56850 }, { "epoch": 14.7752079002079, "grad_norm": 1.2481271028518677, "learning_rate": 9.694747315470473e-06, "loss": 0.0086, "num_input_tokens_seen": 5119816, "step": 56855 }, { "epoch": 14.776507276507276, "grad_norm": 15.682963371276855, "learning_rate": 9.690264785055005e-06, "loss": 0.2941, "num_input_tokens_seen": 5120232, "step": 56860 }, { "epoch": 14.777806652806653, "grad_norm": 1.9836747646331787, "learning_rate": 9.6857830420339e-06, "loss": 0.1049, "num_input_tokens_seen": 5120680, "step": 56865 }, { "epoch": 14.779106029106028, "grad_norm": 20.2818603515625, "learning_rate": 9.681302086637634e-06, "loss": 0.2883, "num_input_tokens_seen": 5121112, "step": 56870 }, { "epoch": 14.780405405405405, "grad_norm": 4.008162975311279, "learning_rate": 9.676821919096682e-06, "loss": 0.1042, "num_input_tokens_seen": 5121544, "step": 56875 }, { "epoch": 14.781704781704782, "grad_norm": 6.064931869506836, "learning_rate": 9.67234253964146e-06, "loss": 0.2307, "num_input_tokens_seen": 5122008, "step": 56880 }, { "epoch": 14.783004158004157, "grad_norm": 9.548892974853516, "learning_rate": 9.667863948502356e-06, "loss": 0.2571, "num_input_tokens_seen": 5122408, "step": 56885 }, { "epoch": 14.784303534303534, "grad_norm": 7.9048638343811035, "learning_rate": 9.663386145909692e-06, "loss": 0.327, "num_input_tokens_seen": 5122856, "step": 56890 }, { "epoch": 14.785602910602911, "grad_norm": 0.046814192086458206, "learning_rate": 9.658909132093783e-06, "loss": 0.2943, "num_input_tokens_seen": 5123320, "step": 56895 }, { "epoch": 14.786902286902286, "grad_norm": 0.14867131412029266, "learning_rate": 9.654432907284874e-06, "loss": 0.0877, "num_input_tokens_seen": 5123768, "step": 56900 }, { "epoch": 14.788201663201663, "grad_norm": 6.566168785095215, "learning_rate": 9.649957471713175e-06, "loss": 0.2036, "num_input_tokens_seen": 5124216, "step": 56905 }, { "epoch": 14.78950103950104, "grad_norm": 0.02179950848221779, "learning_rate": 9.645482825608874e-06, "loss": 0.3053, "num_input_tokens_seen": 5124648, "step": 56910 }, { "epoch": 14.790800415800415, "grad_norm": 14.219778060913086, "learning_rate": 9.6410089692021e-06, "loss": 0.3598, "num_input_tokens_seen": 5125112, "step": 56915 }, { "epoch": 14.792099792099792, "grad_norm": 25.77894401550293, "learning_rate": 9.63653590272296e-06, "loss": 0.2448, "num_input_tokens_seen": 5125528, "step": 56920 }, { "epoch": 14.79339916839917, "grad_norm": 0.13641871511936188, "learning_rate": 9.63206362640149e-06, "loss": 0.2822, "num_input_tokens_seen": 5126008, "step": 56925 }, { "epoch": 14.794698544698544, "grad_norm": 17.847219467163086, "learning_rate": 9.627592140467726e-06, "loss": 0.6206, "num_input_tokens_seen": 5126424, "step": 56930 }, { "epoch": 14.795997920997921, "grad_norm": 7.478085041046143, "learning_rate": 9.623121445151615e-06, "loss": 0.2163, "num_input_tokens_seen": 5126904, "step": 56935 }, { "epoch": 14.797297297297296, "grad_norm": 3.0823662281036377, "learning_rate": 9.61865154068311e-06, "loss": 0.3183, "num_input_tokens_seen": 5127352, "step": 56940 }, { "epoch": 14.798596673596673, "grad_norm": 7.169048309326172, "learning_rate": 9.614182427292077e-06, "loss": 0.0772, "num_input_tokens_seen": 5127832, "step": 56945 }, { "epoch": 14.79989604989605, "grad_norm": 18.05116844177246, "learning_rate": 9.609714105208401e-06, "loss": 0.2595, "num_input_tokens_seen": 5128280, "step": 56950 }, { "epoch": 14.801195426195425, "grad_norm": 15.589167594909668, "learning_rate": 9.605246574661866e-06, "loss": 0.205, "num_input_tokens_seen": 5128696, "step": 56955 }, { "epoch": 14.802494802494802, "grad_norm": 0.007464530877768993, "learning_rate": 9.600779835882259e-06, "loss": 0.5041, "num_input_tokens_seen": 5129112, "step": 56960 }, { "epoch": 14.80379417879418, "grad_norm": 7.689942359924316, "learning_rate": 9.596313889099297e-06, "loss": 0.26, "num_input_tokens_seen": 5129576, "step": 56965 }, { "epoch": 14.805093555093555, "grad_norm": 18.97066307067871, "learning_rate": 9.591848734542665e-06, "loss": 0.3966, "num_input_tokens_seen": 5129976, "step": 56970 }, { "epoch": 14.806392931392931, "grad_norm": 16.88901710510254, "learning_rate": 9.587384372442019e-06, "loss": 0.257, "num_input_tokens_seen": 5130440, "step": 56975 }, { "epoch": 14.807692307692308, "grad_norm": 2.0266313552856445, "learning_rate": 9.582920803026959e-06, "loss": 0.2723, "num_input_tokens_seen": 5130904, "step": 56980 }, { "epoch": 14.808991683991684, "grad_norm": 3.0413434505462646, "learning_rate": 9.578458026527063e-06, "loss": 0.1515, "num_input_tokens_seen": 5131352, "step": 56985 }, { "epoch": 14.81029106029106, "grad_norm": 2.7327516078948975, "learning_rate": 9.573996043171837e-06, "loss": 0.2466, "num_input_tokens_seen": 5131784, "step": 56990 }, { "epoch": 14.811590436590437, "grad_norm": 3.1278584003448486, "learning_rate": 9.569534853190787e-06, "loss": 0.0974, "num_input_tokens_seen": 5132200, "step": 56995 }, { "epoch": 14.812889812889813, "grad_norm": 6.1430745124816895, "learning_rate": 9.565074456813334e-06, "loss": 0.1137, "num_input_tokens_seen": 5132680, "step": 57000 }, { "epoch": 14.81418918918919, "grad_norm": 19.05594825744629, "learning_rate": 9.560614854268899e-06, "loss": 0.1601, "num_input_tokens_seen": 5133144, "step": 57005 }, { "epoch": 14.815488565488565, "grad_norm": 0.5151695609092712, "learning_rate": 9.556156045786826e-06, "loss": 0.1585, "num_input_tokens_seen": 5133592, "step": 57010 }, { "epoch": 14.816787941787942, "grad_norm": 0.2543441653251648, "learning_rate": 9.551698031596445e-06, "loss": 0.0485, "num_input_tokens_seen": 5134024, "step": 57015 }, { "epoch": 14.818087318087318, "grad_norm": 1.3541315793991089, "learning_rate": 9.547240811927038e-06, "loss": 0.0636, "num_input_tokens_seen": 5134456, "step": 57020 }, { "epoch": 14.819386694386694, "grad_norm": 1.3408291339874268, "learning_rate": 9.54278438700785e-06, "loss": 0.0899, "num_input_tokens_seen": 5134904, "step": 57025 }, { "epoch": 14.82068607068607, "grad_norm": 0.14556673169136047, "learning_rate": 9.538328757068072e-06, "loss": 0.075, "num_input_tokens_seen": 5135352, "step": 57030 }, { "epoch": 14.821985446985448, "grad_norm": 0.3165734112262726, "learning_rate": 9.53387392233685e-06, "loss": 0.3818, "num_input_tokens_seen": 5135800, "step": 57035 }, { "epoch": 14.823284823284823, "grad_norm": 3.2954931259155273, "learning_rate": 9.529419883043319e-06, "loss": 0.1275, "num_input_tokens_seen": 5136264, "step": 57040 }, { "epoch": 14.8245841995842, "grad_norm": 1.599754810333252, "learning_rate": 9.52496663941654e-06, "loss": 0.2601, "num_input_tokens_seen": 5136696, "step": 57045 }, { "epoch": 14.825883575883577, "grad_norm": 6.283506393432617, "learning_rate": 9.520514191685556e-06, "loss": 0.5499, "num_input_tokens_seen": 5137128, "step": 57050 }, { "epoch": 14.827182952182952, "grad_norm": 1.635042428970337, "learning_rate": 9.516062540079357e-06, "loss": 0.1026, "num_input_tokens_seen": 5137560, "step": 57055 }, { "epoch": 14.828482328482329, "grad_norm": 1.016068935394287, "learning_rate": 9.511611684826904e-06, "loss": 0.3878, "num_input_tokens_seen": 5137976, "step": 57060 }, { "epoch": 14.829781704781706, "grad_norm": 0.21179808676242828, "learning_rate": 9.507161626157096e-06, "loss": 0.0371, "num_input_tokens_seen": 5138456, "step": 57065 }, { "epoch": 14.83108108108108, "grad_norm": 5.499870777130127, "learning_rate": 9.502712364298819e-06, "loss": 0.3668, "num_input_tokens_seen": 5138904, "step": 57070 }, { "epoch": 14.832380457380458, "grad_norm": 26.346534729003906, "learning_rate": 9.498263899480886e-06, "loss": 0.5623, "num_input_tokens_seen": 5139400, "step": 57075 }, { "epoch": 14.833679833679835, "grad_norm": 0.21143198013305664, "learning_rate": 9.493816231932101e-06, "loss": 0.1253, "num_input_tokens_seen": 5139864, "step": 57080 }, { "epoch": 14.83497920997921, "grad_norm": 20.97351837158203, "learning_rate": 9.489369361881196e-06, "loss": 0.365, "num_input_tokens_seen": 5140344, "step": 57085 }, { "epoch": 14.836278586278587, "grad_norm": 6.979052543640137, "learning_rate": 9.484923289556886e-06, "loss": 0.3826, "num_input_tokens_seen": 5140824, "step": 57090 }, { "epoch": 14.837577962577962, "grad_norm": 4.9932990074157715, "learning_rate": 9.480478015187846e-06, "loss": 0.2276, "num_input_tokens_seen": 5141336, "step": 57095 }, { "epoch": 14.838877338877339, "grad_norm": 21.6778564453125, "learning_rate": 9.476033539002683e-06, "loss": 0.2233, "num_input_tokens_seen": 5141752, "step": 57100 }, { "epoch": 14.840176715176716, "grad_norm": 0.35931533575057983, "learning_rate": 9.471589861229998e-06, "loss": 0.1674, "num_input_tokens_seen": 5142216, "step": 57105 }, { "epoch": 14.84147609147609, "grad_norm": 4.597370624542236, "learning_rate": 9.467146982098316e-06, "loss": 0.0891, "num_input_tokens_seen": 5142664, "step": 57110 }, { "epoch": 14.842775467775468, "grad_norm": 0.4413687288761139, "learning_rate": 9.462704901836156e-06, "loss": 0.0125, "num_input_tokens_seen": 5143128, "step": 57115 }, { "epoch": 14.844074844074845, "grad_norm": 2.504328966140747, "learning_rate": 9.458263620671965e-06, "loss": 0.086, "num_input_tokens_seen": 5143544, "step": 57120 }, { "epoch": 14.84537422037422, "grad_norm": 0.16145531833171844, "learning_rate": 9.453823138834162e-06, "loss": 0.0857, "num_input_tokens_seen": 5144008, "step": 57125 }, { "epoch": 14.846673596673597, "grad_norm": 10.475430488586426, "learning_rate": 9.449383456551133e-06, "loss": 0.306, "num_input_tokens_seen": 5144456, "step": 57130 }, { "epoch": 14.847972972972974, "grad_norm": 25.14293670654297, "learning_rate": 9.444944574051224e-06, "loss": 0.4236, "num_input_tokens_seen": 5144920, "step": 57135 }, { "epoch": 14.849272349272349, "grad_norm": 16.738689422607422, "learning_rate": 9.440506491562706e-06, "loss": 0.3757, "num_input_tokens_seen": 5145368, "step": 57140 }, { "epoch": 14.850571725571726, "grad_norm": 3.1449177265167236, "learning_rate": 9.436069209313858e-06, "loss": 0.099, "num_input_tokens_seen": 5145784, "step": 57145 }, { "epoch": 14.851871101871101, "grad_norm": 2.462392807006836, "learning_rate": 9.431632727532877e-06, "loss": 0.0274, "num_input_tokens_seen": 5146216, "step": 57150 }, { "epoch": 14.853170478170478, "grad_norm": 19.69864273071289, "learning_rate": 9.427197046447946e-06, "loss": 0.4167, "num_input_tokens_seen": 5146696, "step": 57155 }, { "epoch": 14.854469854469855, "grad_norm": 0.050268884748220444, "learning_rate": 9.422762166287189e-06, "loss": 0.0323, "num_input_tokens_seen": 5147096, "step": 57160 }, { "epoch": 14.85576923076923, "grad_norm": 2.3968346118927, "learning_rate": 9.418328087278694e-06, "loss": 0.1878, "num_input_tokens_seen": 5147528, "step": 57165 }, { "epoch": 14.857068607068607, "grad_norm": 8.58965015411377, "learning_rate": 9.413894809650529e-06, "loss": 0.1861, "num_input_tokens_seen": 5147944, "step": 57170 }, { "epoch": 14.858367983367984, "grad_norm": 26.26654052734375, "learning_rate": 9.409462333630675e-06, "loss": 0.4534, "num_input_tokens_seen": 5148408, "step": 57175 }, { "epoch": 14.859667359667359, "grad_norm": 0.33057740330696106, "learning_rate": 9.405030659447119e-06, "loss": 0.0924, "num_input_tokens_seen": 5148840, "step": 57180 }, { "epoch": 14.860966735966736, "grad_norm": 1.2630358934402466, "learning_rate": 9.400599787327773e-06, "loss": 0.2173, "num_input_tokens_seen": 5149272, "step": 57185 }, { "epoch": 14.862266112266113, "grad_norm": 2.114302158355713, "learning_rate": 9.396169717500534e-06, "loss": 0.5637, "num_input_tokens_seen": 5149752, "step": 57190 }, { "epoch": 14.863565488565488, "grad_norm": 8.32356071472168, "learning_rate": 9.39174045019322e-06, "loss": 0.3292, "num_input_tokens_seen": 5150200, "step": 57195 }, { "epoch": 14.864864864864865, "grad_norm": 1.9468834400177002, "learning_rate": 9.387311985633668e-06, "loss": 0.0735, "num_input_tokens_seen": 5150648, "step": 57200 }, { "epoch": 14.866164241164242, "grad_norm": 1.621924638748169, "learning_rate": 9.382884324049609e-06, "loss": 0.0904, "num_input_tokens_seen": 5151112, "step": 57205 }, { "epoch": 14.867463617463617, "grad_norm": 10.880793571472168, "learning_rate": 9.378457465668783e-06, "loss": 0.3344, "num_input_tokens_seen": 5151560, "step": 57210 }, { "epoch": 14.868762993762994, "grad_norm": 21.74994659423828, "learning_rate": 9.374031410718851e-06, "loss": 0.5246, "num_input_tokens_seen": 5152040, "step": 57215 }, { "epoch": 14.87006237006237, "grad_norm": 0.6023075580596924, "learning_rate": 9.369606159427461e-06, "loss": 0.092, "num_input_tokens_seen": 5152520, "step": 57220 }, { "epoch": 14.871361746361746, "grad_norm": 0.06927362084388733, "learning_rate": 9.365181712022195e-06, "loss": 0.0779, "num_input_tokens_seen": 5152968, "step": 57225 }, { "epoch": 14.872661122661123, "grad_norm": 12.733420372009277, "learning_rate": 9.360758068730614e-06, "loss": 0.343, "num_input_tokens_seen": 5153416, "step": 57230 }, { "epoch": 14.8739604989605, "grad_norm": 0.6257703304290771, "learning_rate": 9.356335229780238e-06, "loss": 0.0801, "num_input_tokens_seen": 5153864, "step": 57235 }, { "epoch": 14.875259875259875, "grad_norm": 0.17077727615833282, "learning_rate": 9.351913195398524e-06, "loss": 0.2187, "num_input_tokens_seen": 5154328, "step": 57240 }, { "epoch": 14.876559251559252, "grad_norm": 16.201635360717773, "learning_rate": 9.347491965812913e-06, "loss": 0.12, "num_input_tokens_seen": 5154776, "step": 57245 }, { "epoch": 14.877858627858627, "grad_norm": 9.640573501586914, "learning_rate": 9.343071541250781e-06, "loss": 0.1281, "num_input_tokens_seen": 5155256, "step": 57250 }, { "epoch": 14.879158004158004, "grad_norm": 1.7981394529342651, "learning_rate": 9.33865192193949e-06, "loss": 0.3211, "num_input_tokens_seen": 5155720, "step": 57255 }, { "epoch": 14.880457380457381, "grad_norm": 1.005592703819275, "learning_rate": 9.334233108106327e-06, "loss": 0.1087, "num_input_tokens_seen": 5156168, "step": 57260 }, { "epoch": 14.881756756756756, "grad_norm": 19.88365364074707, "learning_rate": 9.329815099978568e-06, "loss": 0.2749, "num_input_tokens_seen": 5156616, "step": 57265 }, { "epoch": 14.883056133056133, "grad_norm": 13.512011528015137, "learning_rate": 9.32539789778343e-06, "loss": 0.3566, "num_input_tokens_seen": 5157096, "step": 57270 }, { "epoch": 14.88435550935551, "grad_norm": 2.4195761680603027, "learning_rate": 9.320981501748107e-06, "loss": 0.2448, "num_input_tokens_seen": 5157560, "step": 57275 }, { "epoch": 14.885654885654885, "grad_norm": 0.05822933465242386, "learning_rate": 9.31656591209972e-06, "loss": 0.048, "num_input_tokens_seen": 5158024, "step": 57280 }, { "epoch": 14.886954261954262, "grad_norm": 0.09989950805902481, "learning_rate": 9.312151129065383e-06, "loss": 0.2819, "num_input_tokens_seen": 5158456, "step": 57285 }, { "epoch": 14.888253638253639, "grad_norm": 9.703391075134277, "learning_rate": 9.307737152872137e-06, "loss": 0.0667, "num_input_tokens_seen": 5158888, "step": 57290 }, { "epoch": 14.889553014553014, "grad_norm": 10.977214813232422, "learning_rate": 9.303323983747012e-06, "loss": 0.1203, "num_input_tokens_seen": 5159352, "step": 57295 }, { "epoch": 14.890852390852391, "grad_norm": 19.782331466674805, "learning_rate": 9.298911621916967e-06, "loss": 0.1441, "num_input_tokens_seen": 5159848, "step": 57300 }, { "epoch": 14.892151767151766, "grad_norm": 5.036123275756836, "learning_rate": 9.29450006760894e-06, "loss": 0.0863, "num_input_tokens_seen": 5160296, "step": 57305 }, { "epoch": 14.893451143451143, "grad_norm": 0.1031465083360672, "learning_rate": 9.290089321049833e-06, "loss": 0.19, "num_input_tokens_seen": 5160760, "step": 57310 }, { "epoch": 14.89475051975052, "grad_norm": 22.689830780029297, "learning_rate": 9.285679382466474e-06, "loss": 0.1426, "num_input_tokens_seen": 5161224, "step": 57315 }, { "epoch": 14.896049896049895, "grad_norm": 0.5002211332321167, "learning_rate": 9.281270252085692e-06, "loss": 0.2619, "num_input_tokens_seen": 5161672, "step": 57320 }, { "epoch": 14.897349272349272, "grad_norm": 0.37327131628990173, "learning_rate": 9.27686193013423e-06, "loss": 0.3689, "num_input_tokens_seen": 5162104, "step": 57325 }, { "epoch": 14.89864864864865, "grad_norm": 10.588480949401855, "learning_rate": 9.272454416838839e-06, "loss": 0.0451, "num_input_tokens_seen": 5162552, "step": 57330 }, { "epoch": 14.899948024948024, "grad_norm": 13.193671226501465, "learning_rate": 9.268047712426173e-06, "loss": 0.1291, "num_input_tokens_seen": 5163000, "step": 57335 }, { "epoch": 14.901247401247401, "grad_norm": 0.6072551608085632, "learning_rate": 9.263641817122887e-06, "loss": 0.2747, "num_input_tokens_seen": 5163432, "step": 57340 }, { "epoch": 14.902546777546778, "grad_norm": 17.510997772216797, "learning_rate": 9.259236731155582e-06, "loss": 0.3616, "num_input_tokens_seen": 5163896, "step": 57345 }, { "epoch": 14.903846153846153, "grad_norm": 6.876801490783691, "learning_rate": 9.254832454750823e-06, "loss": 0.1327, "num_input_tokens_seen": 5164312, "step": 57350 }, { "epoch": 14.90514553014553, "grad_norm": 1.3761143684387207, "learning_rate": 9.250428988135108e-06, "loss": 0.2214, "num_input_tokens_seen": 5164728, "step": 57355 }, { "epoch": 14.906444906444907, "grad_norm": 0.2886139750480652, "learning_rate": 9.246026331534927e-06, "loss": 0.1198, "num_input_tokens_seen": 5165144, "step": 57360 }, { "epoch": 14.907744282744282, "grad_norm": 0.07970839738845825, "learning_rate": 9.241624485176708e-06, "loss": 0.032, "num_input_tokens_seen": 5165576, "step": 57365 }, { "epoch": 14.90904365904366, "grad_norm": 0.15190786123275757, "learning_rate": 9.237223449286833e-06, "loss": 0.5921, "num_input_tokens_seen": 5165992, "step": 57370 }, { "epoch": 14.910343035343036, "grad_norm": 23.67940330505371, "learning_rate": 9.232823224091659e-06, "loss": 0.3006, "num_input_tokens_seen": 5166424, "step": 57375 }, { "epoch": 14.911642411642411, "grad_norm": 26.372251510620117, "learning_rate": 9.22842380981749e-06, "loss": 0.309, "num_input_tokens_seen": 5166872, "step": 57380 }, { "epoch": 14.912941787941788, "grad_norm": 15.001837730407715, "learning_rate": 9.224025206690608e-06, "loss": 0.313, "num_input_tokens_seen": 5167304, "step": 57385 }, { "epoch": 14.914241164241163, "grad_norm": 3.464773416519165, "learning_rate": 9.219627414937219e-06, "loss": 0.1987, "num_input_tokens_seen": 5167768, "step": 57390 }, { "epoch": 14.91554054054054, "grad_norm": 1.281553864479065, "learning_rate": 9.215230434783518e-06, "loss": 0.0622, "num_input_tokens_seen": 5168216, "step": 57395 }, { "epoch": 14.916839916839917, "grad_norm": 6.506917476654053, "learning_rate": 9.210834266455631e-06, "loss": 0.245, "num_input_tokens_seen": 5168648, "step": 57400 }, { "epoch": 14.918139293139292, "grad_norm": 2.276514768600464, "learning_rate": 9.206438910179676e-06, "loss": 0.1716, "num_input_tokens_seen": 5169080, "step": 57405 }, { "epoch": 14.91943866943867, "grad_norm": 27.014060974121094, "learning_rate": 9.202044366181692e-06, "loss": 0.2592, "num_input_tokens_seen": 5169512, "step": 57410 }, { "epoch": 14.920738045738046, "grad_norm": 6.778360366821289, "learning_rate": 9.197650634687701e-06, "loss": 0.0943, "num_input_tokens_seen": 5169960, "step": 57415 }, { "epoch": 14.922037422037421, "grad_norm": 0.3311109244823456, "learning_rate": 9.193257715923682e-06, "loss": 0.1865, "num_input_tokens_seen": 5170424, "step": 57420 }, { "epoch": 14.923336798336798, "grad_norm": 19.836124420166016, "learning_rate": 9.18886561011557e-06, "loss": 0.3776, "num_input_tokens_seen": 5170872, "step": 57425 }, { "epoch": 14.924636174636175, "grad_norm": 12.51794147491455, "learning_rate": 9.18447431748925e-06, "loss": 0.422, "num_input_tokens_seen": 5171304, "step": 57430 }, { "epoch": 14.92593555093555, "grad_norm": 31.56871795654297, "learning_rate": 9.180083838270561e-06, "loss": 0.2778, "num_input_tokens_seen": 5171736, "step": 57435 }, { "epoch": 14.927234927234927, "grad_norm": 7.635639667510986, "learning_rate": 9.175694172685328e-06, "loss": 0.3159, "num_input_tokens_seen": 5172200, "step": 57440 }, { "epoch": 14.928534303534304, "grad_norm": 25.281667709350586, "learning_rate": 9.17130532095929e-06, "loss": 0.1382, "num_input_tokens_seen": 5172664, "step": 57445 }, { "epoch": 14.92983367983368, "grad_norm": 1.598798155784607, "learning_rate": 9.1669172833182e-06, "loss": 0.1102, "num_input_tokens_seen": 5173112, "step": 57450 }, { "epoch": 14.931133056133056, "grad_norm": 0.14760959148406982, "learning_rate": 9.162530059987715e-06, "loss": 0.0976, "num_input_tokens_seen": 5173560, "step": 57455 }, { "epoch": 14.932432432432432, "grad_norm": 0.7743948698043823, "learning_rate": 9.158143651193492e-06, "loss": 0.0949, "num_input_tokens_seen": 5173976, "step": 57460 }, { "epoch": 14.933731808731808, "grad_norm": 7.846988677978516, "learning_rate": 9.153758057161116e-06, "loss": 0.0462, "num_input_tokens_seen": 5174456, "step": 57465 }, { "epoch": 14.935031185031185, "grad_norm": 0.7549946904182434, "learning_rate": 9.14937327811615e-06, "loss": 0.178, "num_input_tokens_seen": 5174856, "step": 57470 }, { "epoch": 14.93633056133056, "grad_norm": 3.641282796859741, "learning_rate": 9.144989314284097e-06, "loss": 0.341, "num_input_tokens_seen": 5175288, "step": 57475 }, { "epoch": 14.937629937629938, "grad_norm": 42.77212905883789, "learning_rate": 9.140606165890437e-06, "loss": 0.6557, "num_input_tokens_seen": 5175736, "step": 57480 }, { "epoch": 14.938929313929314, "grad_norm": 0.5757808089256287, "learning_rate": 9.136223833160596e-06, "loss": 0.2924, "num_input_tokens_seen": 5176168, "step": 57485 }, { "epoch": 14.94022869022869, "grad_norm": 2.3274612426757812, "learning_rate": 9.131842316319971e-06, "loss": 0.2292, "num_input_tokens_seen": 5176600, "step": 57490 }, { "epoch": 14.941528066528067, "grad_norm": 32.33821105957031, "learning_rate": 9.127461615593898e-06, "loss": 0.5003, "num_input_tokens_seen": 5177032, "step": 57495 }, { "epoch": 14.942827442827443, "grad_norm": 8.938603401184082, "learning_rate": 9.123081731207677e-06, "loss": 0.0269, "num_input_tokens_seen": 5177480, "step": 57500 }, { "epoch": 14.944126819126819, "grad_norm": 20.37810516357422, "learning_rate": 9.118702663386584e-06, "loss": 0.3543, "num_input_tokens_seen": 5177960, "step": 57505 }, { "epoch": 14.945426195426196, "grad_norm": 2.5768966674804688, "learning_rate": 9.114324412355821e-06, "loss": 0.124, "num_input_tokens_seen": 5178440, "step": 57510 }, { "epoch": 14.946725571725572, "grad_norm": 0.291107177734375, "learning_rate": 9.109946978340572e-06, "loss": 0.214, "num_input_tokens_seen": 5178920, "step": 57515 }, { "epoch": 14.948024948024948, "grad_norm": 0.0512557178735733, "learning_rate": 9.105570361565977e-06, "loss": 0.0828, "num_input_tokens_seen": 5179384, "step": 57520 }, { "epoch": 14.949324324324325, "grad_norm": 18.120031356811523, "learning_rate": 9.101194562257137e-06, "loss": 0.1667, "num_input_tokens_seen": 5179784, "step": 57525 }, { "epoch": 14.950623700623701, "grad_norm": 1.8225488662719727, "learning_rate": 9.096819580639082e-06, "loss": 0.2497, "num_input_tokens_seen": 5180216, "step": 57530 }, { "epoch": 14.951923076923077, "grad_norm": 0.28366756439208984, "learning_rate": 9.092445416936846e-06, "loss": 0.0036, "num_input_tokens_seen": 5180712, "step": 57535 }, { "epoch": 14.953222453222454, "grad_norm": 9.700287818908691, "learning_rate": 9.088072071375372e-06, "loss": 0.1558, "num_input_tokens_seen": 5181144, "step": 57540 }, { "epoch": 14.954521829521829, "grad_norm": 8.051910400390625, "learning_rate": 9.08369954417961e-06, "loss": 0.0943, "num_input_tokens_seen": 5181624, "step": 57545 }, { "epoch": 14.955821205821206, "grad_norm": 20.300460815429688, "learning_rate": 9.07932783557442e-06, "loss": 0.2462, "num_input_tokens_seen": 5182056, "step": 57550 }, { "epoch": 14.957120582120583, "grad_norm": 5.060522079467773, "learning_rate": 9.074956945784654e-06, "loss": 0.1258, "num_input_tokens_seen": 5182488, "step": 57555 }, { "epoch": 14.958419958419958, "grad_norm": 19.6124210357666, "learning_rate": 9.07058687503512e-06, "loss": 0.399, "num_input_tokens_seen": 5182936, "step": 57560 }, { "epoch": 14.959719334719335, "grad_norm": 0.1643369495868683, "learning_rate": 9.066217623550558e-06, "loss": 0.1074, "num_input_tokens_seen": 5183368, "step": 57565 }, { "epoch": 14.961018711018712, "grad_norm": 41.167110443115234, "learning_rate": 9.061849191555696e-06, "loss": 0.4162, "num_input_tokens_seen": 5183848, "step": 57570 }, { "epoch": 14.962318087318087, "grad_norm": 27.672964096069336, "learning_rate": 9.057481579275196e-06, "loss": 0.3631, "num_input_tokens_seen": 5184312, "step": 57575 }, { "epoch": 14.963617463617464, "grad_norm": 0.15928515791893005, "learning_rate": 9.0531147869337e-06, "loss": 0.3032, "num_input_tokens_seen": 5184728, "step": 57580 }, { "epoch": 14.96491683991684, "grad_norm": 0.09911072999238968, "learning_rate": 9.048748814755784e-06, "loss": 0.1397, "num_input_tokens_seen": 5185192, "step": 57585 }, { "epoch": 14.966216216216216, "grad_norm": 0.23650524020195007, "learning_rate": 9.044383662965997e-06, "loss": 0.1938, "num_input_tokens_seen": 5185640, "step": 57590 }, { "epoch": 14.967515592515593, "grad_norm": 3.954043388366699, "learning_rate": 9.040019331788848e-06, "loss": 0.1579, "num_input_tokens_seen": 5186120, "step": 57595 }, { "epoch": 14.96881496881497, "grad_norm": 0.02199517749249935, "learning_rate": 9.035655821448804e-06, "loss": 0.0682, "num_input_tokens_seen": 5186552, "step": 57600 }, { "epoch": 14.970114345114345, "grad_norm": 0.5529115200042725, "learning_rate": 9.031293132170271e-06, "loss": 0.0501, "num_input_tokens_seen": 5187016, "step": 57605 }, { "epoch": 14.971413721413722, "grad_norm": 0.5394166707992554, "learning_rate": 9.026931264177641e-06, "loss": 0.2572, "num_input_tokens_seen": 5187496, "step": 57610 }, { "epoch": 14.972713097713097, "grad_norm": 6.932975769042969, "learning_rate": 9.022570217695232e-06, "loss": 0.1268, "num_input_tokens_seen": 5187960, "step": 57615 }, { "epoch": 14.974012474012474, "grad_norm": 3.8650970458984375, "learning_rate": 9.018209992947355e-06, "loss": 0.2509, "num_input_tokens_seen": 5188456, "step": 57620 }, { "epoch": 14.97531185031185, "grad_norm": 20.80596923828125, "learning_rate": 9.013850590158241e-06, "loss": 0.1874, "num_input_tokens_seen": 5188888, "step": 57625 }, { "epoch": 14.976611226611226, "grad_norm": 16.769922256469727, "learning_rate": 9.00949200955211e-06, "loss": 0.3324, "num_input_tokens_seen": 5189336, "step": 57630 }, { "epoch": 14.977910602910603, "grad_norm": 0.5386015176773071, "learning_rate": 9.005134251353132e-06, "loss": 0.1809, "num_input_tokens_seen": 5189784, "step": 57635 }, { "epoch": 14.97920997920998, "grad_norm": 0.4487655758857727, "learning_rate": 9.000777315785417e-06, "loss": 0.0764, "num_input_tokens_seen": 5190232, "step": 57640 }, { "epoch": 14.980509355509355, "grad_norm": 8.594221115112305, "learning_rate": 8.996421203073062e-06, "loss": 0.2416, "num_input_tokens_seen": 5190680, "step": 57645 }, { "epoch": 14.981808731808732, "grad_norm": 4.687139987945557, "learning_rate": 8.992065913440092e-06, "loss": 0.3782, "num_input_tokens_seen": 5191160, "step": 57650 }, { "epoch": 14.983108108108109, "grad_norm": 1.1084578037261963, "learning_rate": 8.98771144711052e-06, "loss": 0.1997, "num_input_tokens_seen": 5191592, "step": 57655 }, { "epoch": 14.984407484407484, "grad_norm": 0.9912567138671875, "learning_rate": 8.983357804308282e-06, "loss": 0.2299, "num_input_tokens_seen": 5192024, "step": 57660 }, { "epoch": 14.98570686070686, "grad_norm": 30.612518310546875, "learning_rate": 8.979004985257294e-06, "loss": 0.297, "num_input_tokens_seen": 5192488, "step": 57665 }, { "epoch": 14.987006237006238, "grad_norm": 9.441980361938477, "learning_rate": 8.974652990181433e-06, "loss": 0.3161, "num_input_tokens_seen": 5192936, "step": 57670 }, { "epoch": 14.988305613305613, "grad_norm": 0.4271101653575897, "learning_rate": 8.970301819304533e-06, "loss": 0.2871, "num_input_tokens_seen": 5193400, "step": 57675 }, { "epoch": 14.98960498960499, "grad_norm": 0.01524240430444479, "learning_rate": 8.965951472850359e-06, "loss": 0.1583, "num_input_tokens_seen": 5193880, "step": 57680 }, { "epoch": 14.990904365904367, "grad_norm": 18.010751724243164, "learning_rate": 8.961601951042676e-06, "loss": 0.3764, "num_input_tokens_seen": 5194344, "step": 57685 }, { "epoch": 14.992203742203742, "grad_norm": 0.30705055594444275, "learning_rate": 8.957253254105166e-06, "loss": 0.085, "num_input_tokens_seen": 5194792, "step": 57690 }, { "epoch": 14.993503118503119, "grad_norm": 0.10589862614870071, "learning_rate": 8.95290538226148e-06, "loss": 0.0548, "num_input_tokens_seen": 5195240, "step": 57695 }, { "epoch": 14.994802494802494, "grad_norm": 0.7451165914535522, "learning_rate": 8.948558335735264e-06, "loss": 0.1236, "num_input_tokens_seen": 5195704, "step": 57700 }, { "epoch": 14.996101871101871, "grad_norm": 1.2010356187820435, "learning_rate": 8.944212114750058e-06, "loss": 0.5874, "num_input_tokens_seen": 5196152, "step": 57705 }, { "epoch": 14.997401247401248, "grad_norm": 0.18220627307891846, "learning_rate": 8.939866719529419e-06, "loss": 0.2108, "num_input_tokens_seen": 5196632, "step": 57710 }, { "epoch": 14.998700623700623, "grad_norm": 29.974655151367188, "learning_rate": 8.935522150296816e-06, "loss": 0.7135, "num_input_tokens_seen": 5197048, "step": 57715 }, { "epoch": 15.0, "grad_norm": 2.265982151031494, "learning_rate": 8.931178407275706e-06, "loss": 0.1511, "num_input_tokens_seen": 5197456, "step": 57720 }, { "epoch": 15.0, "eval_loss": 0.44215071201324463, "eval_runtime": 13.2129, "eval_samples_per_second": 64.785, "eval_steps_per_second": 32.393, "num_input_tokens_seen": 5197456, "step": 57720 }, { "epoch": 15.001299376299377, "grad_norm": 0.16588735580444336, "learning_rate": 8.926835490689481e-06, "loss": 0.0129, "num_input_tokens_seen": 5197920, "step": 57725 }, { "epoch": 15.002598752598752, "grad_norm": 0.6441186666488647, "learning_rate": 8.922493400761505e-06, "loss": 0.2508, "num_input_tokens_seen": 5198400, "step": 57730 }, { "epoch": 15.003898128898129, "grad_norm": 1.2934632301330566, "learning_rate": 8.9181521377151e-06, "loss": 0.13, "num_input_tokens_seen": 5198864, "step": 57735 }, { "epoch": 15.005197505197506, "grad_norm": 21.642818450927734, "learning_rate": 8.913811701773547e-06, "loss": 0.0789, "num_input_tokens_seen": 5199312, "step": 57740 }, { "epoch": 15.006496881496881, "grad_norm": 0.636342465877533, "learning_rate": 8.909472093160065e-06, "loss": 0.0676, "num_input_tokens_seen": 5199728, "step": 57745 }, { "epoch": 15.007796257796258, "grad_norm": 17.611312866210938, "learning_rate": 8.905133312097855e-06, "loss": 0.1472, "num_input_tokens_seen": 5200256, "step": 57750 }, { "epoch": 15.009095634095635, "grad_norm": 0.01541136484593153, "learning_rate": 8.900795358810062e-06, "loss": 0.0733, "num_input_tokens_seen": 5200720, "step": 57755 }, { "epoch": 15.01039501039501, "grad_norm": 1.2961068153381348, "learning_rate": 8.896458233519782e-06, "loss": 0.034, "num_input_tokens_seen": 5201168, "step": 57760 }, { "epoch": 15.011694386694387, "grad_norm": 16.48130226135254, "learning_rate": 8.892121936450085e-06, "loss": 0.1427, "num_input_tokens_seen": 5201584, "step": 57765 }, { "epoch": 15.012993762993762, "grad_norm": 9.193632125854492, "learning_rate": 8.88778646782399e-06, "loss": 0.037, "num_input_tokens_seen": 5202080, "step": 57770 }, { "epoch": 15.01429313929314, "grad_norm": 0.042922332882881165, "learning_rate": 8.883451827864481e-06, "loss": 0.1082, "num_input_tokens_seen": 5202496, "step": 57775 }, { "epoch": 15.015592515592516, "grad_norm": 21.192964553833008, "learning_rate": 8.87911801679448e-06, "loss": 0.3493, "num_input_tokens_seen": 5202960, "step": 57780 }, { "epoch": 15.016891891891891, "grad_norm": 2.1818952560424805, "learning_rate": 8.874785034836894e-06, "loss": 0.2364, "num_input_tokens_seen": 5203408, "step": 57785 }, { "epoch": 15.018191268191268, "grad_norm": 0.20907168090343475, "learning_rate": 8.870452882214555e-06, "loss": 0.0614, "num_input_tokens_seen": 5203920, "step": 57790 }, { "epoch": 15.019490644490645, "grad_norm": 16.64115333557129, "learning_rate": 8.866121559150286e-06, "loss": 0.0513, "num_input_tokens_seen": 5204384, "step": 57795 }, { "epoch": 15.02079002079002, "grad_norm": 0.0581446997821331, "learning_rate": 8.86179106586684e-06, "loss": 0.0064, "num_input_tokens_seen": 5204816, "step": 57800 }, { "epoch": 15.022089397089397, "grad_norm": 0.14339472353458405, "learning_rate": 8.85746140258694e-06, "loss": 0.062, "num_input_tokens_seen": 5205264, "step": 57805 }, { "epoch": 15.023388773388774, "grad_norm": 24.619766235351562, "learning_rate": 8.853132569533266e-06, "loss": 0.3645, "num_input_tokens_seen": 5205696, "step": 57810 }, { "epoch": 15.02468814968815, "grad_norm": 9.403768539428711, "learning_rate": 8.848804566928464e-06, "loss": 0.3993, "num_input_tokens_seen": 5206128, "step": 57815 }, { "epoch": 15.025987525987526, "grad_norm": 0.16271623969078064, "learning_rate": 8.844477394995118e-06, "loss": 0.1134, "num_input_tokens_seen": 5206624, "step": 57820 }, { "epoch": 15.027286902286903, "grad_norm": 5.868485450744629, "learning_rate": 8.840151053955773e-06, "loss": 0.0429, "num_input_tokens_seen": 5207088, "step": 57825 }, { "epoch": 15.028586278586278, "grad_norm": 0.3270731568336487, "learning_rate": 8.83582554403295e-06, "loss": 0.2407, "num_input_tokens_seen": 5207520, "step": 57830 }, { "epoch": 15.029885654885655, "grad_norm": 0.08044113963842392, "learning_rate": 8.831500865449097e-06, "loss": 0.2191, "num_input_tokens_seen": 5207936, "step": 57835 }, { "epoch": 15.03118503118503, "grad_norm": 0.06134435907006264, "learning_rate": 8.827177018426649e-06, "loss": 0.1575, "num_input_tokens_seen": 5208432, "step": 57840 }, { "epoch": 15.032484407484407, "grad_norm": 32.77627944946289, "learning_rate": 8.82285400318798e-06, "loss": 0.1273, "num_input_tokens_seen": 5208912, "step": 57845 }, { "epoch": 15.033783783783784, "grad_norm": 0.009811109863221645, "learning_rate": 8.818531819955442e-06, "loss": 0.2945, "num_input_tokens_seen": 5209360, "step": 57850 }, { "epoch": 15.03508316008316, "grad_norm": 19.20560073852539, "learning_rate": 8.814210468951306e-06, "loss": 0.0957, "num_input_tokens_seen": 5209808, "step": 57855 }, { "epoch": 15.036382536382536, "grad_norm": 15.929343223571777, "learning_rate": 8.809889950397843e-06, "loss": 0.1701, "num_input_tokens_seen": 5210208, "step": 57860 }, { "epoch": 15.037681912681913, "grad_norm": 0.20774275064468384, "learning_rate": 8.805570264517243e-06, "loss": 0.3419, "num_input_tokens_seen": 5210688, "step": 57865 }, { "epoch": 15.038981288981288, "grad_norm": 0.9647770524024963, "learning_rate": 8.801251411531692e-06, "loss": 0.0806, "num_input_tokens_seen": 5211136, "step": 57870 }, { "epoch": 15.040280665280665, "grad_norm": 0.07142774760723114, "learning_rate": 8.796933391663292e-06, "loss": 0.3244, "num_input_tokens_seen": 5211568, "step": 57875 }, { "epoch": 15.041580041580042, "grad_norm": 3.240356206893921, "learning_rate": 8.792616205134132e-06, "loss": 0.2663, "num_input_tokens_seen": 5212000, "step": 57880 }, { "epoch": 15.042879417879417, "grad_norm": 2.7820708751678467, "learning_rate": 8.788299852166257e-06, "loss": 0.3698, "num_input_tokens_seen": 5212448, "step": 57885 }, { "epoch": 15.044178794178794, "grad_norm": 11.97403335571289, "learning_rate": 8.783984332981649e-06, "loss": 0.5582, "num_input_tokens_seen": 5212896, "step": 57890 }, { "epoch": 15.045478170478171, "grad_norm": 0.16197548806667328, "learning_rate": 8.779669647802269e-06, "loss": 0.004, "num_input_tokens_seen": 5213328, "step": 57895 }, { "epoch": 15.046777546777546, "grad_norm": 0.1354312300682068, "learning_rate": 8.775355796850015e-06, "loss": 0.0268, "num_input_tokens_seen": 5213744, "step": 57900 }, { "epoch": 15.048076923076923, "grad_norm": 0.552845299243927, "learning_rate": 8.771042780346766e-06, "loss": 0.0696, "num_input_tokens_seen": 5214192, "step": 57905 }, { "epoch": 15.049376299376299, "grad_norm": 1.4591304063796997, "learning_rate": 8.766730598514328e-06, "loss": 0.0203, "num_input_tokens_seen": 5214640, "step": 57910 }, { "epoch": 15.050675675675675, "grad_norm": 0.0881907045841217, "learning_rate": 8.762419251574489e-06, "loss": 0.1381, "num_input_tokens_seen": 5215072, "step": 57915 }, { "epoch": 15.051975051975052, "grad_norm": 0.0557093471288681, "learning_rate": 8.758108739748986e-06, "loss": 0.2395, "num_input_tokens_seen": 5215536, "step": 57920 }, { "epoch": 15.053274428274428, "grad_norm": 25.769105911254883, "learning_rate": 8.753799063259522e-06, "loss": 0.3358, "num_input_tokens_seen": 5216000, "step": 57925 }, { "epoch": 15.054573804573804, "grad_norm": 0.44645121693611145, "learning_rate": 8.749490222327728e-06, "loss": 0.0516, "num_input_tokens_seen": 5216432, "step": 57930 }, { "epoch": 15.055873180873181, "grad_norm": 0.105668805539608, "learning_rate": 8.745182217175232e-06, "loss": 0.0823, "num_input_tokens_seen": 5216912, "step": 57935 }, { "epoch": 15.057172557172557, "grad_norm": 1.738508701324463, "learning_rate": 8.740875048023581e-06, "loss": 0.0226, "num_input_tokens_seen": 5217392, "step": 57940 }, { "epoch": 15.058471933471933, "grad_norm": 20.469524383544922, "learning_rate": 8.736568715094304e-06, "loss": 0.506, "num_input_tokens_seen": 5217904, "step": 57945 }, { "epoch": 15.05977130977131, "grad_norm": 0.059610530734062195, "learning_rate": 8.732263218608892e-06, "loss": 0.0559, "num_input_tokens_seen": 5218368, "step": 57950 }, { "epoch": 15.061070686070686, "grad_norm": 0.426898717880249, "learning_rate": 8.727958558788757e-06, "loss": 0.0365, "num_input_tokens_seen": 5218784, "step": 57955 }, { "epoch": 15.062370062370062, "grad_norm": 5.3287434577941895, "learning_rate": 8.723654735855316e-06, "loss": 0.0524, "num_input_tokens_seen": 5219184, "step": 57960 }, { "epoch": 15.06366943866944, "grad_norm": 4.090368747711182, "learning_rate": 8.719351750029896e-06, "loss": 0.0119, "num_input_tokens_seen": 5219600, "step": 57965 }, { "epoch": 15.064968814968815, "grad_norm": 13.364715576171875, "learning_rate": 8.715049601533825e-06, "loss": 0.3308, "num_input_tokens_seen": 5220080, "step": 57970 }, { "epoch": 15.066268191268192, "grad_norm": 15.935218811035156, "learning_rate": 8.71074829058835e-06, "loss": 0.0641, "num_input_tokens_seen": 5220480, "step": 57975 }, { "epoch": 15.067567567567568, "grad_norm": 1.6635318994522095, "learning_rate": 8.706447817414696e-06, "loss": 0.0258, "num_input_tokens_seen": 5220960, "step": 57980 }, { "epoch": 15.068866943866944, "grad_norm": 4.809566020965576, "learning_rate": 8.702148182234043e-06, "loss": 0.1456, "num_input_tokens_seen": 5221392, "step": 57985 }, { "epoch": 15.07016632016632, "grad_norm": 18.198793411254883, "learning_rate": 8.697849385267534e-06, "loss": 0.1872, "num_input_tokens_seen": 5221856, "step": 57990 }, { "epoch": 15.071465696465696, "grad_norm": 26.083532333374023, "learning_rate": 8.693551426736241e-06, "loss": 0.1447, "num_input_tokens_seen": 5222304, "step": 57995 }, { "epoch": 15.072765072765073, "grad_norm": 0.7885129451751709, "learning_rate": 8.68925430686123e-06, "loss": 0.12, "num_input_tokens_seen": 5222736, "step": 58000 }, { "epoch": 15.07406444906445, "grad_norm": 81.31997680664062, "learning_rate": 8.684958025863493e-06, "loss": 0.2272, "num_input_tokens_seen": 5223168, "step": 58005 }, { "epoch": 15.075363825363825, "grad_norm": 24.562740325927734, "learning_rate": 8.680662583964003e-06, "loss": 0.1566, "num_input_tokens_seen": 5223616, "step": 58010 }, { "epoch": 15.076663201663202, "grad_norm": 21.043073654174805, "learning_rate": 8.676367981383666e-06, "loss": 0.4895, "num_input_tokens_seen": 5224048, "step": 58015 }, { "epoch": 15.077962577962579, "grad_norm": 5.65708065032959, "learning_rate": 8.672074218343362e-06, "loss": 0.1516, "num_input_tokens_seen": 5224496, "step": 58020 }, { "epoch": 15.079261954261954, "grad_norm": 60.46070861816406, "learning_rate": 8.667781295063934e-06, "loss": 0.3732, "num_input_tokens_seen": 5224960, "step": 58025 }, { "epoch": 15.08056133056133, "grad_norm": 15.523286819458008, "learning_rate": 8.663489211766157e-06, "loss": 0.3605, "num_input_tokens_seen": 5225392, "step": 58030 }, { "epoch": 15.081860706860708, "grad_norm": 22.500261306762695, "learning_rate": 8.65919796867079e-06, "loss": 0.221, "num_input_tokens_seen": 5225792, "step": 58035 }, { "epoch": 15.083160083160083, "grad_norm": 0.25960826873779297, "learning_rate": 8.654907565998518e-06, "loss": 0.0031, "num_input_tokens_seen": 5226272, "step": 58040 }, { "epoch": 15.08445945945946, "grad_norm": 0.36420875787734985, "learning_rate": 8.65061800397002e-06, "loss": 0.0427, "num_input_tokens_seen": 5226752, "step": 58045 }, { "epoch": 15.085758835758837, "grad_norm": 0.5806337594985962, "learning_rate": 8.646329282805898e-06, "loss": 0.1977, "num_input_tokens_seen": 5227184, "step": 58050 }, { "epoch": 15.087058212058212, "grad_norm": 18.683258056640625, "learning_rate": 8.642041402726728e-06, "loss": 0.0448, "num_input_tokens_seen": 5227616, "step": 58055 }, { "epoch": 15.088357588357589, "grad_norm": 0.1951695829629898, "learning_rate": 8.637754363953044e-06, "loss": 0.0965, "num_input_tokens_seen": 5228080, "step": 58060 }, { "epoch": 15.089656964656964, "grad_norm": 1.2057292461395264, "learning_rate": 8.633468166705336e-06, "loss": 0.0271, "num_input_tokens_seen": 5228560, "step": 58065 }, { "epoch": 15.09095634095634, "grad_norm": 0.09039118140935898, "learning_rate": 8.629182811204034e-06, "loss": 0.0215, "num_input_tokens_seen": 5229056, "step": 58070 }, { "epoch": 15.092255717255718, "grad_norm": 3.903388261795044, "learning_rate": 8.624898297669557e-06, "loss": 0.096, "num_input_tokens_seen": 5229536, "step": 58075 }, { "epoch": 15.093555093555093, "grad_norm": 15.265917778015137, "learning_rate": 8.620614626322248e-06, "loss": 0.3286, "num_input_tokens_seen": 5229968, "step": 58080 }, { "epoch": 15.09485446985447, "grad_norm": 0.18660041689872742, "learning_rate": 8.616331797382416e-06, "loss": 0.0358, "num_input_tokens_seen": 5230432, "step": 58085 }, { "epoch": 15.096153846153847, "grad_norm": 0.6074392199516296, "learning_rate": 8.612049811070336e-06, "loss": 0.0156, "num_input_tokens_seen": 5230864, "step": 58090 }, { "epoch": 15.097453222453222, "grad_norm": 0.27501535415649414, "learning_rate": 8.607768667606236e-06, "loss": 0.2279, "num_input_tokens_seen": 5231280, "step": 58095 }, { "epoch": 15.098752598752599, "grad_norm": 9.362977981567383, "learning_rate": 8.603488367210308e-06, "loss": 0.2224, "num_input_tokens_seen": 5231712, "step": 58100 }, { "epoch": 15.100051975051976, "grad_norm": 0.598616361618042, "learning_rate": 8.599208910102677e-06, "loss": 0.0974, "num_input_tokens_seen": 5232160, "step": 58105 }, { "epoch": 15.10135135135135, "grad_norm": 1.5652239322662354, "learning_rate": 8.594930296503453e-06, "loss": 0.1672, "num_input_tokens_seen": 5232624, "step": 58110 }, { "epoch": 15.102650727650728, "grad_norm": 0.31226280331611633, "learning_rate": 8.590652526632675e-06, "loss": 0.0314, "num_input_tokens_seen": 5233056, "step": 58115 }, { "epoch": 15.103950103950105, "grad_norm": 0.04281095787882805, "learning_rate": 8.58637560071037e-06, "loss": 0.0633, "num_input_tokens_seen": 5233488, "step": 58120 }, { "epoch": 15.10524948024948, "grad_norm": 35.81391143798828, "learning_rate": 8.582099518956485e-06, "loss": 0.234, "num_input_tokens_seen": 5233920, "step": 58125 }, { "epoch": 15.106548856548857, "grad_norm": 1.0733028650283813, "learning_rate": 8.577824281590952e-06, "loss": 0.0973, "num_input_tokens_seen": 5234432, "step": 58130 }, { "epoch": 15.107848232848232, "grad_norm": 0.13190361857414246, "learning_rate": 8.573549888833651e-06, "loss": 0.0498, "num_input_tokens_seen": 5234912, "step": 58135 }, { "epoch": 15.109147609147609, "grad_norm": 0.1956435590982437, "learning_rate": 8.569276340904427e-06, "loss": 0.2158, "num_input_tokens_seen": 5235392, "step": 58140 }, { "epoch": 15.110446985446986, "grad_norm": 33.295413970947266, "learning_rate": 8.565003638023065e-06, "loss": 0.1449, "num_input_tokens_seen": 5235856, "step": 58145 }, { "epoch": 15.111746361746361, "grad_norm": 5.323548316955566, "learning_rate": 8.560731780409304e-06, "loss": 0.0838, "num_input_tokens_seen": 5236304, "step": 58150 }, { "epoch": 15.113045738045738, "grad_norm": 27.174976348876953, "learning_rate": 8.556460768282867e-06, "loss": 0.0919, "num_input_tokens_seen": 5236752, "step": 58155 }, { "epoch": 15.114345114345115, "grad_norm": 41.20539855957031, "learning_rate": 8.5521906018634e-06, "loss": 0.3617, "num_input_tokens_seen": 5237152, "step": 58160 }, { "epoch": 15.11564449064449, "grad_norm": 10.926445007324219, "learning_rate": 8.54792128137053e-06, "loss": 0.0921, "num_input_tokens_seen": 5237600, "step": 58165 }, { "epoch": 15.116943866943867, "grad_norm": 0.33262699842453003, "learning_rate": 8.543652807023833e-06, "loss": 0.0093, "num_input_tokens_seen": 5238048, "step": 58170 }, { "epoch": 15.118243243243244, "grad_norm": 0.4717158079147339, "learning_rate": 8.539385179042847e-06, "loss": 0.0398, "num_input_tokens_seen": 5238480, "step": 58175 }, { "epoch": 15.119542619542619, "grad_norm": 20.642427444458008, "learning_rate": 8.535118397647044e-06, "loss": 0.3144, "num_input_tokens_seen": 5238928, "step": 58180 }, { "epoch": 15.120841995841996, "grad_norm": 0.05050932243466377, "learning_rate": 8.530852463055889e-06, "loss": 0.0314, "num_input_tokens_seen": 5239344, "step": 58185 }, { "epoch": 15.122141372141373, "grad_norm": 0.019988980144262314, "learning_rate": 8.526587375488759e-06, "loss": 0.4338, "num_input_tokens_seen": 5239824, "step": 58190 }, { "epoch": 15.123440748440748, "grad_norm": 0.4052342474460602, "learning_rate": 8.522323135165028e-06, "loss": 0.0519, "num_input_tokens_seen": 5240320, "step": 58195 }, { "epoch": 15.124740124740125, "grad_norm": 0.676401674747467, "learning_rate": 8.518059742304005e-06, "loss": 0.3879, "num_input_tokens_seen": 5240800, "step": 58200 }, { "epoch": 15.126039501039502, "grad_norm": 0.021185478195548058, "learning_rate": 8.51379719712497e-06, "loss": 0.0725, "num_input_tokens_seen": 5241248, "step": 58205 }, { "epoch": 15.127338877338877, "grad_norm": 0.12073241919279099, "learning_rate": 8.50953549984714e-06, "loss": 0.2127, "num_input_tokens_seen": 5241696, "step": 58210 }, { "epoch": 15.128638253638254, "grad_norm": 0.10308785736560822, "learning_rate": 8.505274650689692e-06, "loss": 0.0616, "num_input_tokens_seen": 5242144, "step": 58215 }, { "epoch": 15.12993762993763, "grad_norm": 0.09455744922161102, "learning_rate": 8.501014649871785e-06, "loss": 0.2573, "num_input_tokens_seen": 5242608, "step": 58220 }, { "epoch": 15.131237006237006, "grad_norm": 8.968690872192383, "learning_rate": 8.496755497612492e-06, "loss": 0.1974, "num_input_tokens_seen": 5243040, "step": 58225 }, { "epoch": 15.132536382536383, "grad_norm": 12.547845840454102, "learning_rate": 8.492497194130877e-06, "loss": 0.0501, "num_input_tokens_seen": 5243504, "step": 58230 }, { "epoch": 15.133835758835758, "grad_norm": 22.316497802734375, "learning_rate": 8.488239739645947e-06, "loss": 0.4174, "num_input_tokens_seen": 5243952, "step": 58235 }, { "epoch": 15.135135135135135, "grad_norm": 0.41472378373146057, "learning_rate": 8.483983134376677e-06, "loss": 0.0613, "num_input_tokens_seen": 5244400, "step": 58240 }, { "epoch": 15.136434511434512, "grad_norm": 1.1077176332473755, "learning_rate": 8.479727378541972e-06, "loss": 0.0087, "num_input_tokens_seen": 5244864, "step": 58245 }, { "epoch": 15.137733887733887, "grad_norm": 1.1464555263519287, "learning_rate": 8.475472472360724e-06, "loss": 0.0412, "num_input_tokens_seen": 5245296, "step": 58250 }, { "epoch": 15.139033264033264, "grad_norm": 3.1096296310424805, "learning_rate": 8.471218416051752e-06, "loss": 0.0392, "num_input_tokens_seen": 5245744, "step": 58255 }, { "epoch": 15.140332640332641, "grad_norm": 34.38951873779297, "learning_rate": 8.46696520983386e-06, "loss": 0.2831, "num_input_tokens_seen": 5246208, "step": 58260 }, { "epoch": 15.141632016632016, "grad_norm": 48.37233352661133, "learning_rate": 8.462712853925784e-06, "loss": 0.3931, "num_input_tokens_seen": 5246640, "step": 58265 }, { "epoch": 15.142931392931393, "grad_norm": 21.636953353881836, "learning_rate": 8.458461348546228e-06, "loss": 0.094, "num_input_tokens_seen": 5247088, "step": 58270 }, { "epoch": 15.14423076923077, "grad_norm": 20.803058624267578, "learning_rate": 8.454210693913863e-06, "loss": 0.2881, "num_input_tokens_seen": 5247536, "step": 58275 }, { "epoch": 15.145530145530145, "grad_norm": 6.447859764099121, "learning_rate": 8.449960890247289e-06, "loss": 0.2718, "num_input_tokens_seen": 5247968, "step": 58280 }, { "epoch": 15.146829521829522, "grad_norm": 22.00739097595215, "learning_rate": 8.445711937765092e-06, "loss": 0.301, "num_input_tokens_seen": 5248400, "step": 58285 }, { "epoch": 15.148128898128897, "grad_norm": 0.3431607484817505, "learning_rate": 8.441463836685782e-06, "loss": 0.3355, "num_input_tokens_seen": 5248848, "step": 58290 }, { "epoch": 15.149428274428274, "grad_norm": 2.334041118621826, "learning_rate": 8.437216587227859e-06, "loss": 0.0611, "num_input_tokens_seen": 5249296, "step": 58295 }, { "epoch": 15.150727650727651, "grad_norm": 2.638648748397827, "learning_rate": 8.432970189609752e-06, "loss": 0.113, "num_input_tokens_seen": 5249696, "step": 58300 }, { "epoch": 15.152027027027026, "grad_norm": 2.7813093662261963, "learning_rate": 8.42872464404986e-06, "loss": 0.0122, "num_input_tokens_seen": 5250112, "step": 58305 }, { "epoch": 15.153326403326403, "grad_norm": 2.479341983795166, "learning_rate": 8.424479950766536e-06, "loss": 0.1212, "num_input_tokens_seen": 5250560, "step": 58310 }, { "epoch": 15.15462577962578, "grad_norm": 1.8771703243255615, "learning_rate": 8.420236109978102e-06, "loss": 0.1484, "num_input_tokens_seen": 5250992, "step": 58315 }, { "epoch": 15.155925155925155, "grad_norm": 28.11948585510254, "learning_rate": 8.4159931219028e-06, "loss": 0.3207, "num_input_tokens_seen": 5251472, "step": 58320 }, { "epoch": 15.157224532224532, "grad_norm": 5.9265241622924805, "learning_rate": 8.41175098675887e-06, "loss": 0.737, "num_input_tokens_seen": 5251888, "step": 58325 }, { "epoch": 15.15852390852391, "grad_norm": 0.4731350541114807, "learning_rate": 8.407509704764474e-06, "loss": 0.0489, "num_input_tokens_seen": 5252336, "step": 58330 }, { "epoch": 15.159823284823284, "grad_norm": 32.192142486572266, "learning_rate": 8.40326927613776e-06, "loss": 0.2089, "num_input_tokens_seen": 5252768, "step": 58335 }, { "epoch": 15.161122661122661, "grad_norm": 26.118066787719727, "learning_rate": 8.3990297010968e-06, "loss": 0.1145, "num_input_tokens_seen": 5253184, "step": 58340 }, { "epoch": 15.162422037422038, "grad_norm": 0.022914860397577286, "learning_rate": 8.394790979859649e-06, "loss": 0.0025, "num_input_tokens_seen": 5253632, "step": 58345 }, { "epoch": 15.163721413721413, "grad_norm": 0.016665322706103325, "learning_rate": 8.390553112644317e-06, "loss": 0.2862, "num_input_tokens_seen": 5254096, "step": 58350 }, { "epoch": 15.16502079002079, "grad_norm": 24.129228591918945, "learning_rate": 8.386316099668746e-06, "loss": 0.1001, "num_input_tokens_seen": 5254544, "step": 58355 }, { "epoch": 15.166320166320165, "grad_norm": 24.525737762451172, "learning_rate": 8.38207994115086e-06, "loss": 0.5264, "num_input_tokens_seen": 5255040, "step": 58360 }, { "epoch": 15.167619542619542, "grad_norm": 0.014180951751768589, "learning_rate": 8.377844637308521e-06, "loss": 0.0175, "num_input_tokens_seen": 5255472, "step": 58365 }, { "epoch": 15.16891891891892, "grad_norm": 0.3293640911579132, "learning_rate": 8.373610188359565e-06, "loss": 0.0143, "num_input_tokens_seen": 5255952, "step": 58370 }, { "epoch": 15.170218295218294, "grad_norm": 0.0984739363193512, "learning_rate": 8.369376594521764e-06, "loss": 0.066, "num_input_tokens_seen": 5256416, "step": 58375 }, { "epoch": 15.171517671517671, "grad_norm": 29.488473892211914, "learning_rate": 8.365143856012855e-06, "loss": 0.1134, "num_input_tokens_seen": 5256896, "step": 58380 }, { "epoch": 15.172817047817048, "grad_norm": 0.2152746617794037, "learning_rate": 8.360911973050537e-06, "loss": 0.0098, "num_input_tokens_seen": 5257376, "step": 58385 }, { "epoch": 15.174116424116423, "grad_norm": 1.020632266998291, "learning_rate": 8.356680945852466e-06, "loss": 0.2507, "num_input_tokens_seen": 5257824, "step": 58390 }, { "epoch": 15.1754158004158, "grad_norm": 0.04408806934952736, "learning_rate": 8.352450774636237e-06, "loss": 0.2459, "num_input_tokens_seen": 5258272, "step": 58395 }, { "epoch": 15.176715176715177, "grad_norm": 0.6987301707267761, "learning_rate": 8.348221459619418e-06, "loss": 0.3586, "num_input_tokens_seen": 5258704, "step": 58400 }, { "epoch": 15.178014553014552, "grad_norm": 0.32281455397605896, "learning_rate": 8.343993001019529e-06, "loss": 0.373, "num_input_tokens_seen": 5259136, "step": 58405 }, { "epoch": 15.17931392931393, "grad_norm": 2.2783596515655518, "learning_rate": 8.339765399054029e-06, "loss": 0.3202, "num_input_tokens_seen": 5259600, "step": 58410 }, { "epoch": 15.180613305613306, "grad_norm": 0.0405026338994503, "learning_rate": 8.335538653940356e-06, "loss": 0.0634, "num_input_tokens_seen": 5260048, "step": 58415 }, { "epoch": 15.181912681912682, "grad_norm": 20.19951820373535, "learning_rate": 8.331312765895899e-06, "loss": 0.4044, "num_input_tokens_seen": 5260496, "step": 58420 }, { "epoch": 15.183212058212058, "grad_norm": 1.9472051858901978, "learning_rate": 8.327087735138006e-06, "loss": 0.3778, "num_input_tokens_seen": 5260944, "step": 58425 }, { "epoch": 15.184511434511435, "grad_norm": 0.5722381472587585, "learning_rate": 8.322863561883956e-06, "loss": 0.295, "num_input_tokens_seen": 5261408, "step": 58430 }, { "epoch": 15.18581081081081, "grad_norm": 0.012001038528978825, "learning_rate": 8.318640246351023e-06, "loss": 0.0299, "num_input_tokens_seen": 5261840, "step": 58435 }, { "epoch": 15.187110187110187, "grad_norm": 22.481435775756836, "learning_rate": 8.314417788756395e-06, "loss": 0.1376, "num_input_tokens_seen": 5262272, "step": 58440 }, { "epoch": 15.188409563409563, "grad_norm": 0.5934271216392517, "learning_rate": 8.310196189317249e-06, "loss": 0.0272, "num_input_tokens_seen": 5262704, "step": 58445 }, { "epoch": 15.18970893970894, "grad_norm": 7.304489612579346, "learning_rate": 8.305975448250704e-06, "loss": 0.0251, "num_input_tokens_seen": 5263136, "step": 58450 }, { "epoch": 15.191008316008316, "grad_norm": 4.406242370605469, "learning_rate": 8.301755565773844e-06, "loss": 0.0154, "num_input_tokens_seen": 5263600, "step": 58455 }, { "epoch": 15.192307692307692, "grad_norm": 0.10329963266849518, "learning_rate": 8.29753654210369e-06, "loss": 0.1001, "num_input_tokens_seen": 5264096, "step": 58460 }, { "epoch": 15.193607068607069, "grad_norm": 37.1687126159668, "learning_rate": 8.293318377457241e-06, "loss": 0.4887, "num_input_tokens_seen": 5264512, "step": 58465 }, { "epoch": 15.194906444906445, "grad_norm": 22.089242935180664, "learning_rate": 8.289101072051434e-06, "loss": 0.1994, "num_input_tokens_seen": 5264976, "step": 58470 }, { "epoch": 15.19620582120582, "grad_norm": 4.5712480545043945, "learning_rate": 8.284884626103165e-06, "loss": 0.3758, "num_input_tokens_seen": 5265456, "step": 58475 }, { "epoch": 15.197505197505198, "grad_norm": 0.03161071985960007, "learning_rate": 8.280669039829295e-06, "loss": 0.4941, "num_input_tokens_seen": 5265904, "step": 58480 }, { "epoch": 15.198804573804575, "grad_norm": 48.653194427490234, "learning_rate": 8.276454313446633e-06, "loss": 0.6957, "num_input_tokens_seen": 5266336, "step": 58485 }, { "epoch": 15.20010395010395, "grad_norm": 15.321441650390625, "learning_rate": 8.27224044717196e-06, "loss": 0.0481, "num_input_tokens_seen": 5266752, "step": 58490 }, { "epoch": 15.201403326403327, "grad_norm": 0.8495779037475586, "learning_rate": 8.268027441221981e-06, "loss": 0.1386, "num_input_tokens_seen": 5267200, "step": 58495 }, { "epoch": 15.202702702702704, "grad_norm": 0.10274454951286316, "learning_rate": 8.26381529581339e-06, "loss": 0.1465, "num_input_tokens_seen": 5267648, "step": 58500 }, { "epoch": 15.204002079002079, "grad_norm": 10.983414649963379, "learning_rate": 8.259604011162806e-06, "loss": 0.14, "num_input_tokens_seen": 5268144, "step": 58505 }, { "epoch": 15.205301455301456, "grad_norm": 0.028874553740024567, "learning_rate": 8.255393587486834e-06, "loss": 0.1831, "num_input_tokens_seen": 5268560, "step": 58510 }, { "epoch": 15.20660083160083, "grad_norm": 22.52071189880371, "learning_rate": 8.251184025002006e-06, "loss": 0.2121, "num_input_tokens_seen": 5268976, "step": 58515 }, { "epoch": 15.207900207900208, "grad_norm": 0.0245064627379179, "learning_rate": 8.246975323924832e-06, "loss": 0.0217, "num_input_tokens_seen": 5269408, "step": 58520 }, { "epoch": 15.209199584199585, "grad_norm": 7.826742649078369, "learning_rate": 8.242767484471773e-06, "loss": 0.0197, "num_input_tokens_seen": 5269808, "step": 58525 }, { "epoch": 15.21049896049896, "grad_norm": 22.102733612060547, "learning_rate": 8.238560506859242e-06, "loss": 0.2097, "num_input_tokens_seen": 5270240, "step": 58530 }, { "epoch": 15.211798336798337, "grad_norm": 25.057090759277344, "learning_rate": 8.234354391303605e-06, "loss": 0.5185, "num_input_tokens_seen": 5270720, "step": 58535 }, { "epoch": 15.213097713097714, "grad_norm": 0.3412018418312073, "learning_rate": 8.230149138021178e-06, "loss": 0.0913, "num_input_tokens_seen": 5271184, "step": 58540 }, { "epoch": 15.214397089397089, "grad_norm": 0.043031539767980576, "learning_rate": 8.225944747228257e-06, "loss": 0.0804, "num_input_tokens_seen": 5271600, "step": 58545 }, { "epoch": 15.215696465696466, "grad_norm": 0.01764942891895771, "learning_rate": 8.22174121914106e-06, "loss": 0.299, "num_input_tokens_seen": 5272016, "step": 58550 }, { "epoch": 15.216995841995843, "grad_norm": 0.7437161803245544, "learning_rate": 8.21753855397579e-06, "loss": 0.0932, "num_input_tokens_seen": 5272464, "step": 58555 }, { "epoch": 15.218295218295218, "grad_norm": 47.76129150390625, "learning_rate": 8.213336751948595e-06, "loss": 0.1633, "num_input_tokens_seen": 5272928, "step": 58560 }, { "epoch": 15.219594594594595, "grad_norm": 0.1644972711801529, "learning_rate": 8.209135813275579e-06, "loss": 0.4308, "num_input_tokens_seen": 5273376, "step": 58565 }, { "epoch": 15.220893970893972, "grad_norm": 17.274934768676758, "learning_rate": 8.204935738172789e-06, "loss": 0.4548, "num_input_tokens_seen": 5273808, "step": 58570 }, { "epoch": 15.222193347193347, "grad_norm": 19.27803611755371, "learning_rate": 8.200736526856254e-06, "loss": 0.1463, "num_input_tokens_seen": 5274288, "step": 58575 }, { "epoch": 15.223492723492724, "grad_norm": 18.311250686645508, "learning_rate": 8.196538179541929e-06, "loss": 0.2632, "num_input_tokens_seen": 5274736, "step": 58580 }, { "epoch": 15.2247920997921, "grad_norm": 20.323482513427734, "learning_rate": 8.192340696445755e-06, "loss": 0.3313, "num_input_tokens_seen": 5275184, "step": 58585 }, { "epoch": 15.226091476091476, "grad_norm": 28.765474319458008, "learning_rate": 8.18814407778359e-06, "loss": 0.3263, "num_input_tokens_seen": 5275632, "step": 58590 }, { "epoch": 15.227390852390853, "grad_norm": 7.0215301513671875, "learning_rate": 8.18394832377129e-06, "loss": 0.3286, "num_input_tokens_seen": 5276112, "step": 58595 }, { "epoch": 15.228690228690228, "grad_norm": 0.2662881314754486, "learning_rate": 8.179753434624642e-06, "loss": 0.0121, "num_input_tokens_seen": 5276544, "step": 58600 }, { "epoch": 15.229989604989605, "grad_norm": 0.1591184139251709, "learning_rate": 8.175559410559388e-06, "loss": 0.0132, "num_input_tokens_seen": 5276992, "step": 58605 }, { "epoch": 15.231288981288982, "grad_norm": 0.009365863166749477, "learning_rate": 8.17136625179124e-06, "loss": 0.0635, "num_input_tokens_seen": 5277456, "step": 58610 }, { "epoch": 15.232588357588357, "grad_norm": 0.304379940032959, "learning_rate": 8.167173958535842e-06, "loss": 0.056, "num_input_tokens_seen": 5277888, "step": 58615 }, { "epoch": 15.233887733887734, "grad_norm": 0.34890782833099365, "learning_rate": 8.162982531008826e-06, "loss": 0.0241, "num_input_tokens_seen": 5278416, "step": 58620 }, { "epoch": 15.23518711018711, "grad_norm": 6.039422035217285, "learning_rate": 8.158791969425738e-06, "loss": 0.1649, "num_input_tokens_seen": 5278880, "step": 58625 }, { "epoch": 15.236486486486486, "grad_norm": 2.652611494064331, "learning_rate": 8.154602274002121e-06, "loss": 0.1238, "num_input_tokens_seen": 5279344, "step": 58630 }, { "epoch": 15.237785862785863, "grad_norm": 10.041239738464355, "learning_rate": 8.150413444953447e-06, "loss": 0.0263, "num_input_tokens_seen": 5279760, "step": 58635 }, { "epoch": 15.23908523908524, "grad_norm": 6.739016532897949, "learning_rate": 8.146225482495163e-06, "loss": 0.0704, "num_input_tokens_seen": 5280192, "step": 58640 }, { "epoch": 15.240384615384615, "grad_norm": 0.6926336288452148, "learning_rate": 8.14203838684264e-06, "loss": 0.3631, "num_input_tokens_seen": 5280640, "step": 58645 }, { "epoch": 15.241683991683992, "grad_norm": 35.60037612915039, "learning_rate": 8.137852158211246e-06, "loss": 0.3011, "num_input_tokens_seen": 5281072, "step": 58650 }, { "epoch": 15.242983367983369, "grad_norm": 0.2515198886394501, "learning_rate": 8.133666796816264e-06, "loss": 0.2786, "num_input_tokens_seen": 5281536, "step": 58655 }, { "epoch": 15.244282744282744, "grad_norm": 14.061664581298828, "learning_rate": 8.129482302872964e-06, "loss": 0.0543, "num_input_tokens_seen": 5282016, "step": 58660 }, { "epoch": 15.245582120582121, "grad_norm": 31.38208770751953, "learning_rate": 8.125298676596548e-06, "loss": 0.1226, "num_input_tokens_seen": 5282480, "step": 58665 }, { "epoch": 15.246881496881496, "grad_norm": 5.735791206359863, "learning_rate": 8.12111591820219e-06, "loss": 0.3489, "num_input_tokens_seen": 5282928, "step": 58670 }, { "epoch": 15.248180873180873, "grad_norm": 2.5342283248901367, "learning_rate": 8.116934027905021e-06, "loss": 0.1497, "num_input_tokens_seen": 5283408, "step": 58675 }, { "epoch": 15.24948024948025, "grad_norm": 21.206619262695312, "learning_rate": 8.112753005920104e-06, "loss": 0.2471, "num_input_tokens_seen": 5283840, "step": 58680 }, { "epoch": 15.250779625779625, "grad_norm": 5.190779209136963, "learning_rate": 8.108572852462487e-06, "loss": 0.2387, "num_input_tokens_seen": 5284272, "step": 58685 }, { "epoch": 15.252079002079002, "grad_norm": 2.605214834213257, "learning_rate": 8.104393567747146e-06, "loss": 0.1359, "num_input_tokens_seen": 5284720, "step": 58690 }, { "epoch": 15.253378378378379, "grad_norm": 0.4194662570953369, "learning_rate": 8.100215151989032e-06, "loss": 0.0053, "num_input_tokens_seen": 5285152, "step": 58695 }, { "epoch": 15.254677754677754, "grad_norm": 2.9151878356933594, "learning_rate": 8.096037605403045e-06, "loss": 0.0845, "num_input_tokens_seen": 5285584, "step": 58700 }, { "epoch": 15.255977130977131, "grad_norm": 0.0751163512468338, "learning_rate": 8.091860928204049e-06, "loss": 0.2386, "num_input_tokens_seen": 5286032, "step": 58705 }, { "epoch": 15.257276507276508, "grad_norm": 8.783041954040527, "learning_rate": 8.087685120606835e-06, "loss": 0.1192, "num_input_tokens_seen": 5286480, "step": 58710 }, { "epoch": 15.258575883575883, "grad_norm": 0.14089912176132202, "learning_rate": 8.083510182826192e-06, "loss": 0.2836, "num_input_tokens_seen": 5286960, "step": 58715 }, { "epoch": 15.25987525987526, "grad_norm": 27.002418518066406, "learning_rate": 8.079336115076818e-06, "loss": 0.3334, "num_input_tokens_seen": 5287408, "step": 58720 }, { "epoch": 15.261174636174637, "grad_norm": 8.90722942352295, "learning_rate": 8.075162917573412e-06, "loss": 0.1352, "num_input_tokens_seen": 5287856, "step": 58725 }, { "epoch": 15.262474012474012, "grad_norm": 0.10409384220838547, "learning_rate": 8.070990590530583e-06, "loss": 0.251, "num_input_tokens_seen": 5288304, "step": 58730 }, { "epoch": 15.263773388773389, "grad_norm": 1.7774302959442139, "learning_rate": 8.066819134162928e-06, "loss": 0.0206, "num_input_tokens_seen": 5288720, "step": 58735 }, { "epoch": 15.265072765072764, "grad_norm": 21.160602569580078, "learning_rate": 8.062648548685e-06, "loss": 0.3217, "num_input_tokens_seen": 5289184, "step": 58740 }, { "epoch": 15.266372141372141, "grad_norm": 45.16651916503906, "learning_rate": 8.058478834311275e-06, "loss": 0.6124, "num_input_tokens_seen": 5289648, "step": 58745 }, { "epoch": 15.267671517671518, "grad_norm": 12.087663650512695, "learning_rate": 8.054309991256225e-06, "loss": 0.1711, "num_input_tokens_seen": 5290064, "step": 58750 }, { "epoch": 15.268970893970893, "grad_norm": 0.6298661828041077, "learning_rate": 8.05014201973424e-06, "loss": 0.0953, "num_input_tokens_seen": 5290496, "step": 58755 }, { "epoch": 15.27027027027027, "grad_norm": 0.45109835267066956, "learning_rate": 8.045974919959703e-06, "loss": 0.0564, "num_input_tokens_seen": 5290960, "step": 58760 }, { "epoch": 15.271569646569647, "grad_norm": 17.929330825805664, "learning_rate": 8.04180869214691e-06, "loss": 0.222, "num_input_tokens_seen": 5291472, "step": 58765 }, { "epoch": 15.272869022869022, "grad_norm": 1.1501487493515015, "learning_rate": 8.037643336510145e-06, "loss": 0.0295, "num_input_tokens_seen": 5291904, "step": 58770 }, { "epoch": 15.2741683991684, "grad_norm": 0.21002565324306488, "learning_rate": 8.033478853263635e-06, "loss": 0.4509, "num_input_tokens_seen": 5292368, "step": 58775 }, { "epoch": 15.275467775467776, "grad_norm": 0.05360078439116478, "learning_rate": 8.029315242621572e-06, "loss": 0.0178, "num_input_tokens_seen": 5292848, "step": 58780 }, { "epoch": 15.276767151767151, "grad_norm": 1.1441885232925415, "learning_rate": 8.025152504798078e-06, "loss": 0.345, "num_input_tokens_seen": 5293312, "step": 58785 }, { "epoch": 15.278066528066528, "grad_norm": 0.40836796164512634, "learning_rate": 8.020990640007264e-06, "loss": 0.0728, "num_input_tokens_seen": 5293760, "step": 58790 }, { "epoch": 15.279365904365905, "grad_norm": 0.43949294090270996, "learning_rate": 8.016829648463173e-06, "loss": 0.0938, "num_input_tokens_seen": 5294208, "step": 58795 }, { "epoch": 15.28066528066528, "grad_norm": 0.3838717043399811, "learning_rate": 8.012669530379794e-06, "loss": 0.0891, "num_input_tokens_seen": 5294640, "step": 58800 }, { "epoch": 15.281964656964657, "grad_norm": 0.39766886830329895, "learning_rate": 8.008510285971097e-06, "loss": 0.096, "num_input_tokens_seen": 5295072, "step": 58805 }, { "epoch": 15.283264033264032, "grad_norm": 26.640104293823242, "learning_rate": 8.004351915450997e-06, "loss": 0.3655, "num_input_tokens_seen": 5295536, "step": 58810 }, { "epoch": 15.28456340956341, "grad_norm": 0.1271866261959076, "learning_rate": 8.00019441903337e-06, "loss": 0.285, "num_input_tokens_seen": 5296000, "step": 58815 }, { "epoch": 15.285862785862786, "grad_norm": 1.0416529178619385, "learning_rate": 7.996037796932026e-06, "loss": 0.0316, "num_input_tokens_seen": 5296448, "step": 58820 }, { "epoch": 15.287162162162161, "grad_norm": 4.679962635040283, "learning_rate": 7.99188204936076e-06, "loss": 0.0063, "num_input_tokens_seen": 5296960, "step": 58825 }, { "epoch": 15.288461538461538, "grad_norm": 0.02812417410314083, "learning_rate": 7.987727176533286e-06, "loss": 0.1938, "num_input_tokens_seen": 5297392, "step": 58830 }, { "epoch": 15.289760914760915, "grad_norm": 0.06016942486166954, "learning_rate": 7.983573178663315e-06, "loss": 0.0082, "num_input_tokens_seen": 5297872, "step": 58835 }, { "epoch": 15.29106029106029, "grad_norm": 0.017648350447416306, "learning_rate": 7.979420055964468e-06, "loss": 0.3721, "num_input_tokens_seen": 5298304, "step": 58840 }, { "epoch": 15.292359667359667, "grad_norm": 0.5020391941070557, "learning_rate": 7.975267808650355e-06, "loss": 0.2544, "num_input_tokens_seen": 5298768, "step": 58845 }, { "epoch": 15.293659043659044, "grad_norm": 12.936331748962402, "learning_rate": 7.971116436934534e-06, "loss": 0.3302, "num_input_tokens_seen": 5299232, "step": 58850 }, { "epoch": 15.29495841995842, "grad_norm": 17.867008209228516, "learning_rate": 7.966965941030519e-06, "loss": 0.0538, "num_input_tokens_seen": 5299712, "step": 58855 }, { "epoch": 15.296257796257796, "grad_norm": 0.08694573491811752, "learning_rate": 7.962816321151756e-06, "loss": 0.15, "num_input_tokens_seen": 5300160, "step": 58860 }, { "epoch": 15.297557172557173, "grad_norm": 1.054806113243103, "learning_rate": 7.958667577511683e-06, "loss": 0.0156, "num_input_tokens_seen": 5300608, "step": 58865 }, { "epoch": 15.298856548856548, "grad_norm": 5.304322242736816, "learning_rate": 7.954519710323663e-06, "loss": 0.1875, "num_input_tokens_seen": 5301040, "step": 58870 }, { "epoch": 15.300155925155925, "grad_norm": 4.9696855545043945, "learning_rate": 7.950372719801022e-06, "loss": 0.0221, "num_input_tokens_seen": 5301488, "step": 58875 }, { "epoch": 15.301455301455302, "grad_norm": 0.03719610348343849, "learning_rate": 7.946226606157045e-06, "loss": 0.2094, "num_input_tokens_seen": 5301936, "step": 58880 }, { "epoch": 15.302754677754677, "grad_norm": 0.5137653350830078, "learning_rate": 7.942081369604976e-06, "loss": 0.0641, "num_input_tokens_seen": 5302448, "step": 58885 }, { "epoch": 15.304054054054054, "grad_norm": 0.03179854899644852, "learning_rate": 7.937937010358012e-06, "loss": 0.1726, "num_input_tokens_seen": 5302896, "step": 58890 }, { "epoch": 15.30535343035343, "grad_norm": 33.549705505371094, "learning_rate": 7.933793528629287e-06, "loss": 0.3349, "num_input_tokens_seen": 5303344, "step": 58895 }, { "epoch": 15.306652806652806, "grad_norm": 27.036914825439453, "learning_rate": 7.929650924631924e-06, "loss": 0.2737, "num_input_tokens_seen": 5303776, "step": 58900 }, { "epoch": 15.307952182952183, "grad_norm": 4.602063179016113, "learning_rate": 7.92550919857896e-06, "loss": 0.1025, "num_input_tokens_seen": 5304208, "step": 58905 }, { "epoch": 15.309251559251559, "grad_norm": 15.68789291381836, "learning_rate": 7.921368350683428e-06, "loss": 0.1947, "num_input_tokens_seen": 5304624, "step": 58910 }, { "epoch": 15.310550935550935, "grad_norm": 1.269447922706604, "learning_rate": 7.917228381158268e-06, "loss": 0.2584, "num_input_tokens_seen": 5305104, "step": 58915 }, { "epoch": 15.311850311850312, "grad_norm": 25.1641845703125, "learning_rate": 7.913089290216436e-06, "loss": 0.2472, "num_input_tokens_seen": 5305552, "step": 58920 }, { "epoch": 15.313149688149688, "grad_norm": 8.115145683288574, "learning_rate": 7.908951078070787e-06, "loss": 0.0915, "num_input_tokens_seen": 5305984, "step": 58925 }, { "epoch": 15.314449064449065, "grad_norm": 0.04471220076084137, "learning_rate": 7.904813744934164e-06, "loss": 0.1444, "num_input_tokens_seen": 5306416, "step": 58930 }, { "epoch": 15.315748440748441, "grad_norm": 17.676877975463867, "learning_rate": 7.900677291019354e-06, "loss": 0.1777, "num_input_tokens_seen": 5306848, "step": 58935 }, { "epoch": 15.317047817047817, "grad_norm": 0.06238537281751633, "learning_rate": 7.896541716539085e-06, "loss": 0.3291, "num_input_tokens_seen": 5307280, "step": 58940 }, { "epoch": 15.318347193347194, "grad_norm": 2.8074193000793457, "learning_rate": 7.892407021706063e-06, "loss": 0.2962, "num_input_tokens_seen": 5307776, "step": 58945 }, { "epoch": 15.31964656964657, "grad_norm": 18.64365577697754, "learning_rate": 7.888273206732943e-06, "loss": 0.0668, "num_input_tokens_seen": 5308240, "step": 58950 }, { "epoch": 15.320945945945946, "grad_norm": 0.3151293694972992, "learning_rate": 7.884140271832333e-06, "loss": 0.0363, "num_input_tokens_seen": 5308688, "step": 58955 }, { "epoch": 15.322245322245323, "grad_norm": 59.6978759765625, "learning_rate": 7.88000821721678e-06, "loss": 0.3493, "num_input_tokens_seen": 5309200, "step": 58960 }, { "epoch": 15.323544698544698, "grad_norm": 0.09128808230161667, "learning_rate": 7.875877043098818e-06, "loss": 0.032, "num_input_tokens_seen": 5309632, "step": 58965 }, { "epoch": 15.324844074844075, "grad_norm": 0.13486728072166443, "learning_rate": 7.871746749690898e-06, "loss": 0.144, "num_input_tokens_seen": 5310080, "step": 58970 }, { "epoch": 15.326143451143452, "grad_norm": 1.0852911472320557, "learning_rate": 7.86761733720546e-06, "loss": 0.0804, "num_input_tokens_seen": 5310528, "step": 58975 }, { "epoch": 15.327442827442827, "grad_norm": 21.290607452392578, "learning_rate": 7.863488805854872e-06, "loss": 0.2184, "num_input_tokens_seen": 5310992, "step": 58980 }, { "epoch": 15.328742203742204, "grad_norm": 18.17439079284668, "learning_rate": 7.859361155851475e-06, "loss": 0.1599, "num_input_tokens_seen": 5311440, "step": 58985 }, { "epoch": 15.33004158004158, "grad_norm": 3.4024550914764404, "learning_rate": 7.855234387407554e-06, "loss": 0.2218, "num_input_tokens_seen": 5311888, "step": 58990 }, { "epoch": 15.331340956340956, "grad_norm": 8.44950008392334, "learning_rate": 7.851108500735365e-06, "loss": 0.3278, "num_input_tokens_seen": 5312320, "step": 58995 }, { "epoch": 15.332640332640333, "grad_norm": 1.028304100036621, "learning_rate": 7.8469834960471e-06, "loss": 0.0059, "num_input_tokens_seen": 5312800, "step": 59000 }, { "epoch": 15.33393970893971, "grad_norm": 0.8508251905441284, "learning_rate": 7.842859373554898e-06, "loss": 0.0574, "num_input_tokens_seen": 5313232, "step": 59005 }, { "epoch": 15.335239085239085, "grad_norm": 21.58206558227539, "learning_rate": 7.838736133470886e-06, "loss": 0.3267, "num_input_tokens_seen": 5313696, "step": 59010 }, { "epoch": 15.336538461538462, "grad_norm": 0.12403667718172073, "learning_rate": 7.83461377600711e-06, "loss": 0.079, "num_input_tokens_seen": 5314128, "step": 59015 }, { "epoch": 15.337837837837839, "grad_norm": 64.38304901123047, "learning_rate": 7.830492301375597e-06, "loss": 0.1634, "num_input_tokens_seen": 5314592, "step": 59020 }, { "epoch": 15.339137214137214, "grad_norm": 2.261713981628418, "learning_rate": 7.826371709788313e-06, "loss": 0.0572, "num_input_tokens_seen": 5315040, "step": 59025 }, { "epoch": 15.34043659043659, "grad_norm": 0.07298808544874191, "learning_rate": 7.822252001457195e-06, "loss": 0.5196, "num_input_tokens_seen": 5315472, "step": 59030 }, { "epoch": 15.341735966735968, "grad_norm": 20.813587188720703, "learning_rate": 7.818133176594109e-06, "loss": 0.3889, "num_input_tokens_seen": 5315936, "step": 59035 }, { "epoch": 15.343035343035343, "grad_norm": 45.047706604003906, "learning_rate": 7.814015235410905e-06, "loss": 0.7989, "num_input_tokens_seen": 5316448, "step": 59040 }, { "epoch": 15.34433471933472, "grad_norm": 4.791958332061768, "learning_rate": 7.809898178119354e-06, "loss": 0.3395, "num_input_tokens_seen": 5316896, "step": 59045 }, { "epoch": 15.345634095634095, "grad_norm": 0.006842674687504768, "learning_rate": 7.805782004931219e-06, "loss": 0.3282, "num_input_tokens_seen": 5317392, "step": 59050 }, { "epoch": 15.346933471933472, "grad_norm": 1.435109257698059, "learning_rate": 7.801666716058185e-06, "loss": 0.1832, "num_input_tokens_seen": 5317872, "step": 59055 }, { "epoch": 15.348232848232849, "grad_norm": 10.630643844604492, "learning_rate": 7.797552311711906e-06, "loss": 0.104, "num_input_tokens_seen": 5318368, "step": 59060 }, { "epoch": 15.349532224532224, "grad_norm": 0.9073244333267212, "learning_rate": 7.793438792104005e-06, "loss": 0.024, "num_input_tokens_seen": 5318816, "step": 59065 }, { "epoch": 15.3508316008316, "grad_norm": 1.9700554609298706, "learning_rate": 7.789326157446025e-06, "loss": 0.1074, "num_input_tokens_seen": 5319232, "step": 59070 }, { "epoch": 15.352130977130978, "grad_norm": 0.1445194035768509, "learning_rate": 7.785214407949498e-06, "loss": 0.0279, "num_input_tokens_seen": 5319648, "step": 59075 }, { "epoch": 15.353430353430353, "grad_norm": 0.01757773570716381, "learning_rate": 7.781103543825881e-06, "loss": 0.0101, "num_input_tokens_seen": 5320064, "step": 59080 }, { "epoch": 15.35472972972973, "grad_norm": 4.168174743652344, "learning_rate": 7.776993565286617e-06, "loss": 0.1295, "num_input_tokens_seen": 5320512, "step": 59085 }, { "epoch": 15.356029106029107, "grad_norm": 2.234128952026367, "learning_rate": 7.772884472543065e-06, "loss": 0.4173, "num_input_tokens_seen": 5320976, "step": 59090 }, { "epoch": 15.357328482328482, "grad_norm": 3.932333469390869, "learning_rate": 7.768776265806574e-06, "loss": 0.103, "num_input_tokens_seen": 5321424, "step": 59095 }, { "epoch": 15.358627858627859, "grad_norm": 11.633301734924316, "learning_rate": 7.764668945288428e-06, "loss": 0.3236, "num_input_tokens_seen": 5321856, "step": 59100 }, { "epoch": 15.359927234927236, "grad_norm": 20.30442237854004, "learning_rate": 7.760562511199882e-06, "loss": 0.194, "num_input_tokens_seen": 5322288, "step": 59105 }, { "epoch": 15.361226611226611, "grad_norm": 0.015149744227528572, "learning_rate": 7.756456963752113e-06, "loss": 0.1027, "num_input_tokens_seen": 5322736, "step": 59110 }, { "epoch": 15.362525987525988, "grad_norm": 0.06588192284107208, "learning_rate": 7.752352303156293e-06, "loss": 0.1777, "num_input_tokens_seen": 5323184, "step": 59115 }, { "epoch": 15.363825363825363, "grad_norm": 0.26605603098869324, "learning_rate": 7.748248529623514e-06, "loss": 0.3531, "num_input_tokens_seen": 5323600, "step": 59120 }, { "epoch": 15.36512474012474, "grad_norm": 0.8515812754631042, "learning_rate": 7.74414564336485e-06, "loss": 0.0998, "num_input_tokens_seen": 5324080, "step": 59125 }, { "epoch": 15.366424116424117, "grad_norm": 4.1783013343811035, "learning_rate": 7.740043644591302e-06, "loss": 0.4085, "num_input_tokens_seen": 5324544, "step": 59130 }, { "epoch": 15.367723492723492, "grad_norm": 0.0433834083378315, "learning_rate": 7.735942533513846e-06, "loss": 0.0961, "num_input_tokens_seen": 5325024, "step": 59135 }, { "epoch": 15.369022869022869, "grad_norm": 0.5964780449867249, "learning_rate": 7.731842310343415e-06, "loss": 0.0336, "num_input_tokens_seen": 5325472, "step": 59140 }, { "epoch": 15.370322245322246, "grad_norm": 19.20216178894043, "learning_rate": 7.727742975290871e-06, "loss": 0.1819, "num_input_tokens_seen": 5325936, "step": 59145 }, { "epoch": 15.371621621621621, "grad_norm": 1.4054738283157349, "learning_rate": 7.723644528567064e-06, "loss": 0.0037, "num_input_tokens_seen": 5326384, "step": 59150 }, { "epoch": 15.372920997920998, "grad_norm": 0.7232645750045776, "learning_rate": 7.719546970382766e-06, "loss": 0.0894, "num_input_tokens_seen": 5326800, "step": 59155 }, { "epoch": 15.374220374220375, "grad_norm": 3.817955493927002, "learning_rate": 7.715450300948732e-06, "loss": 0.5277, "num_input_tokens_seen": 5327232, "step": 59160 }, { "epoch": 15.37551975051975, "grad_norm": 29.022705078125, "learning_rate": 7.711354520475633e-06, "loss": 0.2478, "num_input_tokens_seen": 5327664, "step": 59165 }, { "epoch": 15.376819126819127, "grad_norm": 11.127249717712402, "learning_rate": 7.707259629174152e-06, "loss": 0.1397, "num_input_tokens_seen": 5328128, "step": 59170 }, { "epoch": 15.378118503118504, "grad_norm": 0.1576339453458786, "learning_rate": 7.703165627254869e-06, "loss": 0.1292, "num_input_tokens_seen": 5328576, "step": 59175 }, { "epoch": 15.379417879417879, "grad_norm": 0.18635240197181702, "learning_rate": 7.69907251492836e-06, "loss": 0.0065, "num_input_tokens_seen": 5329056, "step": 59180 }, { "epoch": 15.380717255717256, "grad_norm": 0.26262226700782776, "learning_rate": 7.694980292405122e-06, "loss": 0.5131, "num_input_tokens_seen": 5329504, "step": 59185 }, { "epoch": 15.382016632016631, "grad_norm": 11.87257194519043, "learning_rate": 7.690888959895634e-06, "loss": 0.3909, "num_input_tokens_seen": 5329952, "step": 59190 }, { "epoch": 15.383316008316008, "grad_norm": 0.017170166596770287, "learning_rate": 7.686798517610304e-06, "loss": 0.0557, "num_input_tokens_seen": 5330400, "step": 59195 }, { "epoch": 15.384615384615385, "grad_norm": 0.09667645394802094, "learning_rate": 7.682708965759516e-06, "loss": 0.2147, "num_input_tokens_seen": 5330832, "step": 59200 }, { "epoch": 15.38591476091476, "grad_norm": 0.27566248178482056, "learning_rate": 7.678620304553604e-06, "loss": 0.0562, "num_input_tokens_seen": 5331264, "step": 59205 }, { "epoch": 15.387214137214137, "grad_norm": 10.979432106018066, "learning_rate": 7.674532534202841e-06, "loss": 0.4563, "num_input_tokens_seen": 5331712, "step": 59210 }, { "epoch": 15.388513513513514, "grad_norm": 1.3846019506454468, "learning_rate": 7.670445654917477e-06, "loss": 0.1182, "num_input_tokens_seen": 5332160, "step": 59215 }, { "epoch": 15.38981288981289, "grad_norm": 29.12273406982422, "learning_rate": 7.66635966690769e-06, "loss": 0.394, "num_input_tokens_seen": 5332608, "step": 59220 }, { "epoch": 15.391112266112266, "grad_norm": 10.122309684753418, "learning_rate": 7.66227457038364e-06, "loss": 0.255, "num_input_tokens_seen": 5333056, "step": 59225 }, { "epoch": 15.392411642411643, "grad_norm": 0.4292343556880951, "learning_rate": 7.658190365555415e-06, "loss": 0.0213, "num_input_tokens_seen": 5333488, "step": 59230 }, { "epoch": 15.393711018711018, "grad_norm": 16.626672744750977, "learning_rate": 7.654107052633075e-06, "loss": 0.225, "num_input_tokens_seen": 5333920, "step": 59235 }, { "epoch": 15.395010395010395, "grad_norm": 0.03861122205853462, "learning_rate": 7.65002463182663e-06, "loss": 0.1993, "num_input_tokens_seen": 5334352, "step": 59240 }, { "epoch": 15.396309771309772, "grad_norm": 0.39447739720344543, "learning_rate": 7.645943103346048e-06, "loss": 0.0045, "num_input_tokens_seen": 5334784, "step": 59245 }, { "epoch": 15.397609147609147, "grad_norm": 17.87528419494629, "learning_rate": 7.641862467401234e-06, "loss": 0.0634, "num_input_tokens_seen": 5335248, "step": 59250 }, { "epoch": 15.398908523908524, "grad_norm": 2.7120134830474854, "learning_rate": 7.637782724202073e-06, "loss": 0.4031, "num_input_tokens_seen": 5335680, "step": 59255 }, { "epoch": 15.4002079002079, "grad_norm": 11.012898445129395, "learning_rate": 7.63370387395838e-06, "loss": 0.3599, "num_input_tokens_seen": 5336128, "step": 59260 }, { "epoch": 15.401507276507276, "grad_norm": 15.696005821228027, "learning_rate": 7.629625916879932e-06, "loss": 0.2858, "num_input_tokens_seen": 5336576, "step": 59265 }, { "epoch": 15.402806652806653, "grad_norm": 20.196760177612305, "learning_rate": 7.625548853176464e-06, "loss": 0.1563, "num_input_tokens_seen": 5337040, "step": 59270 }, { "epoch": 15.404106029106028, "grad_norm": 0.9429658651351929, "learning_rate": 7.621472683057668e-06, "loss": 0.0348, "num_input_tokens_seen": 5337488, "step": 59275 }, { "epoch": 15.405405405405405, "grad_norm": 26.61466407775879, "learning_rate": 7.61739740673319e-06, "loss": 0.24, "num_input_tokens_seen": 5337968, "step": 59280 }, { "epoch": 15.406704781704782, "grad_norm": 0.009420297108590603, "learning_rate": 7.613323024412608e-06, "loss": 0.1575, "num_input_tokens_seen": 5338448, "step": 59285 }, { "epoch": 15.408004158004157, "grad_norm": 0.08972857147455215, "learning_rate": 7.6092495363054935e-06, "loss": 0.4993, "num_input_tokens_seen": 5338912, "step": 59290 }, { "epoch": 15.409303534303534, "grad_norm": 22.46604347229004, "learning_rate": 7.605176942621328e-06, "loss": 0.2333, "num_input_tokens_seen": 5339376, "step": 59295 }, { "epoch": 15.410602910602911, "grad_norm": 21.83478546142578, "learning_rate": 7.6011052435695894e-06, "loss": 0.3877, "num_input_tokens_seen": 5339840, "step": 59300 }, { "epoch": 15.411902286902286, "grad_norm": 10.928727149963379, "learning_rate": 7.597034439359671e-06, "loss": 0.047, "num_input_tokens_seen": 5340304, "step": 59305 }, { "epoch": 15.413201663201663, "grad_norm": 0.023533547297120094, "learning_rate": 7.592964530200944e-06, "loss": 0.7074, "num_input_tokens_seen": 5340752, "step": 59310 }, { "epoch": 15.41450103950104, "grad_norm": 0.33605122566223145, "learning_rate": 7.58889551630273e-06, "loss": 0.0574, "num_input_tokens_seen": 5341232, "step": 59315 }, { "epoch": 15.415800415800415, "grad_norm": 2.9349474906921387, "learning_rate": 7.584827397874312e-06, "loss": 0.0324, "num_input_tokens_seen": 5341696, "step": 59320 }, { "epoch": 15.417099792099792, "grad_norm": 0.2705041170120239, "learning_rate": 7.580760175124907e-06, "loss": 0.4524, "num_input_tokens_seen": 5342160, "step": 59325 }, { "epoch": 15.41839916839917, "grad_norm": 21.8817195892334, "learning_rate": 7.576693848263686e-06, "loss": 0.4458, "num_input_tokens_seen": 5342624, "step": 59330 }, { "epoch": 15.419698544698544, "grad_norm": 2.343527317047119, "learning_rate": 7.5726284174998055e-06, "loss": 0.2364, "num_input_tokens_seen": 5343072, "step": 59335 }, { "epoch": 15.420997920997921, "grad_norm": 1.5472325086593628, "learning_rate": 7.568563883042334e-06, "loss": 0.0093, "num_input_tokens_seen": 5343488, "step": 59340 }, { "epoch": 15.422297297297296, "grad_norm": 18.929908752441406, "learning_rate": 7.564500245100325e-06, "loss": 0.1557, "num_input_tokens_seen": 5343904, "step": 59345 }, { "epoch": 15.423596673596673, "grad_norm": 0.06504055112600327, "learning_rate": 7.5604375038827725e-06, "loss": 0.0768, "num_input_tokens_seen": 5344352, "step": 59350 }, { "epoch": 15.42489604989605, "grad_norm": 9.311258316040039, "learning_rate": 7.556375659598638e-06, "loss": 0.061, "num_input_tokens_seen": 5344864, "step": 59355 }, { "epoch": 15.426195426195425, "grad_norm": 17.88375473022461, "learning_rate": 7.55231471245681e-06, "loss": 0.2177, "num_input_tokens_seen": 5345312, "step": 59360 }, { "epoch": 15.427494802494802, "grad_norm": 9.87178897857666, "learning_rate": 7.548254662666163e-06, "loss": 0.0942, "num_input_tokens_seen": 5345760, "step": 59365 }, { "epoch": 15.42879417879418, "grad_norm": 0.030823737382888794, "learning_rate": 7.54419551043549e-06, "loss": 0.004, "num_input_tokens_seen": 5346208, "step": 59370 }, { "epoch": 15.430093555093555, "grad_norm": 28.362613677978516, "learning_rate": 7.540137255973576e-06, "loss": 0.4131, "num_input_tokens_seen": 5346656, "step": 59375 }, { "epoch": 15.431392931392931, "grad_norm": 0.12232035398483276, "learning_rate": 7.536079899489121e-06, "loss": 0.1917, "num_input_tokens_seen": 5347120, "step": 59380 }, { "epoch": 15.432692307692308, "grad_norm": 11.37891960144043, "learning_rate": 7.5320234411908135e-06, "loss": 0.0656, "num_input_tokens_seen": 5347600, "step": 59385 }, { "epoch": 15.433991683991684, "grad_norm": 1.4883993864059448, "learning_rate": 7.527967881287284e-06, "loss": 0.2473, "num_input_tokens_seen": 5348016, "step": 59390 }, { "epoch": 15.43529106029106, "grad_norm": 38.822017669677734, "learning_rate": 7.5239132199871e-06, "loss": 0.1941, "num_input_tokens_seen": 5348448, "step": 59395 }, { "epoch": 15.436590436590437, "grad_norm": 0.07403257489204407, "learning_rate": 7.519859457498812e-06, "loss": 0.0012, "num_input_tokens_seen": 5348864, "step": 59400 }, { "epoch": 15.437889812889813, "grad_norm": 5.875147819519043, "learning_rate": 7.515806594030891e-06, "loss": 0.0916, "num_input_tokens_seen": 5349312, "step": 59405 }, { "epoch": 15.43918918918919, "grad_norm": 19.378461837768555, "learning_rate": 7.511754629791792e-06, "loss": 0.2505, "num_input_tokens_seen": 5349760, "step": 59410 }, { "epoch": 15.440488565488565, "grad_norm": 0.014797030948102474, "learning_rate": 7.507703564989907e-06, "loss": 0.2208, "num_input_tokens_seen": 5350208, "step": 59415 }, { "epoch": 15.441787941787942, "grad_norm": 0.343822717666626, "learning_rate": 7.503653399833596e-06, "loss": 0.024, "num_input_tokens_seen": 5350640, "step": 59420 }, { "epoch": 15.443087318087318, "grad_norm": 0.5915128588676453, "learning_rate": 7.499604134531149e-06, "loss": 0.1648, "num_input_tokens_seen": 5351056, "step": 59425 }, { "epoch": 15.444386694386694, "grad_norm": 29.604053497314453, "learning_rate": 7.495555769290835e-06, "loss": 0.2868, "num_input_tokens_seen": 5351504, "step": 59430 }, { "epoch": 15.44568607068607, "grad_norm": 0.05638326331973076, "learning_rate": 7.491508304320854e-06, "loss": 0.0444, "num_input_tokens_seen": 5351952, "step": 59435 }, { "epoch": 15.446985446985448, "grad_norm": 0.7319159507751465, "learning_rate": 7.487461739829383e-06, "loss": 0.0406, "num_input_tokens_seen": 5352400, "step": 59440 }, { "epoch": 15.448284823284823, "grad_norm": 32.898094177246094, "learning_rate": 7.483416076024527e-06, "loss": 0.2685, "num_input_tokens_seen": 5352848, "step": 59445 }, { "epoch": 15.4495841995842, "grad_norm": 0.9912049174308777, "learning_rate": 7.479371313114364e-06, "loss": 0.0675, "num_input_tokens_seen": 5353296, "step": 59450 }, { "epoch": 15.450883575883577, "grad_norm": 0.18224793672561646, "learning_rate": 7.4753274513069325e-06, "loss": 0.1838, "num_input_tokens_seen": 5353744, "step": 59455 }, { "epoch": 15.452182952182952, "grad_norm": 4.063302993774414, "learning_rate": 7.471284490810193e-06, "loss": 0.2388, "num_input_tokens_seen": 5354208, "step": 59460 }, { "epoch": 15.453482328482329, "grad_norm": 12.396018028259277, "learning_rate": 7.467242431832094e-06, "loss": 0.1004, "num_input_tokens_seen": 5354672, "step": 59465 }, { "epoch": 15.454781704781706, "grad_norm": 0.37911033630371094, "learning_rate": 7.463201274580508e-06, "loss": 0.4127, "num_input_tokens_seen": 5355184, "step": 59470 }, { "epoch": 15.45608108108108, "grad_norm": 45.635589599609375, "learning_rate": 7.4591610192632915e-06, "loss": 0.1839, "num_input_tokens_seen": 5355664, "step": 59475 }, { "epoch": 15.457380457380458, "grad_norm": 15.201897621154785, "learning_rate": 7.455121666088225e-06, "loss": 0.1557, "num_input_tokens_seen": 5356112, "step": 59480 }, { "epoch": 15.458679833679835, "grad_norm": 0.06222978234291077, "learning_rate": 7.451083215263057e-06, "loss": 0.0168, "num_input_tokens_seen": 5356544, "step": 59485 }, { "epoch": 15.45997920997921, "grad_norm": 5.256259918212891, "learning_rate": 7.447045666995498e-06, "loss": 0.0143, "num_input_tokens_seen": 5357024, "step": 59490 }, { "epoch": 15.461278586278587, "grad_norm": 14.522085189819336, "learning_rate": 7.443009021493205e-06, "loss": 0.2459, "num_input_tokens_seen": 5357456, "step": 59495 }, { "epoch": 15.462577962577962, "grad_norm": 0.05187559127807617, "learning_rate": 7.438973278963774e-06, "loss": 0.2085, "num_input_tokens_seen": 5357904, "step": 59500 }, { "epoch": 15.463877338877339, "grad_norm": 1.6563447713851929, "learning_rate": 7.434938439614781e-06, "loss": 0.0078, "num_input_tokens_seen": 5358352, "step": 59505 }, { "epoch": 15.465176715176716, "grad_norm": 2.670598030090332, "learning_rate": 7.430904503653724e-06, "loss": 0.0109, "num_input_tokens_seen": 5358816, "step": 59510 }, { "epoch": 15.46647609147609, "grad_norm": 1.8987988233566284, "learning_rate": 7.426871471288091e-06, "loss": 0.2046, "num_input_tokens_seen": 5359264, "step": 59515 }, { "epoch": 15.467775467775468, "grad_norm": 0.3820095658302307, "learning_rate": 7.422839342725291e-06, "loss": 0.0594, "num_input_tokens_seen": 5359696, "step": 59520 }, { "epoch": 15.469074844074845, "grad_norm": 0.21271514892578125, "learning_rate": 7.418808118172702e-06, "loss": 0.0844, "num_input_tokens_seen": 5360176, "step": 59525 }, { "epoch": 15.47037422037422, "grad_norm": 5.324052333831787, "learning_rate": 7.414777797837666e-06, "loss": 0.1604, "num_input_tokens_seen": 5360640, "step": 59530 }, { "epoch": 15.471673596673597, "grad_norm": 29.801523208618164, "learning_rate": 7.410748381927449e-06, "loss": 0.207, "num_input_tokens_seen": 5361072, "step": 59535 }, { "epoch": 15.472972972972974, "grad_norm": 1.1108019351959229, "learning_rate": 7.406719870649303e-06, "loss": 0.1078, "num_input_tokens_seen": 5361504, "step": 59540 }, { "epoch": 15.474272349272349, "grad_norm": 7.240163803100586, "learning_rate": 7.402692264210403e-06, "loss": 0.1522, "num_input_tokens_seen": 5361936, "step": 59545 }, { "epoch": 15.475571725571726, "grad_norm": 5.789734363555908, "learning_rate": 7.3986655628179105e-06, "loss": 0.1473, "num_input_tokens_seen": 5362368, "step": 59550 }, { "epoch": 15.476871101871103, "grad_norm": 1.4308644533157349, "learning_rate": 7.3946397666789045e-06, "loss": 0.0386, "num_input_tokens_seen": 5362832, "step": 59555 }, { "epoch": 15.478170478170478, "grad_norm": 0.7150132060050964, "learning_rate": 7.390614876000443e-06, "loss": 0.4307, "num_input_tokens_seen": 5363280, "step": 59560 }, { "epoch": 15.479469854469855, "grad_norm": 4.150107383728027, "learning_rate": 7.386590890989534e-06, "loss": 0.096, "num_input_tokens_seen": 5363712, "step": 59565 }, { "epoch": 15.48076923076923, "grad_norm": 1.6456602811813354, "learning_rate": 7.382567811853136e-06, "loss": 0.0164, "num_input_tokens_seen": 5364176, "step": 59570 }, { "epoch": 15.482068607068607, "grad_norm": 0.14956164360046387, "learning_rate": 7.3785456387981485e-06, "loss": 0.0579, "num_input_tokens_seen": 5364624, "step": 59575 }, { "epoch": 15.483367983367984, "grad_norm": 0.008133256807923317, "learning_rate": 7.374524372031452e-06, "loss": 0.0849, "num_input_tokens_seen": 5365056, "step": 59580 }, { "epoch": 15.484667359667359, "grad_norm": 10.003290176391602, "learning_rate": 7.370504011759855e-06, "loss": 0.0161, "num_input_tokens_seen": 5365488, "step": 59585 }, { "epoch": 15.485966735966736, "grad_norm": 31.63343048095703, "learning_rate": 7.3664845581901206e-06, "loss": 0.286, "num_input_tokens_seen": 5365936, "step": 59590 }, { "epoch": 15.487266112266113, "grad_norm": 1.3818066120147705, "learning_rate": 7.362466011528979e-06, "loss": 0.126, "num_input_tokens_seen": 5366416, "step": 59595 }, { "epoch": 15.488565488565488, "grad_norm": 10.975273132324219, "learning_rate": 7.358448371983112e-06, "loss": 0.1699, "num_input_tokens_seen": 5366896, "step": 59600 }, { "epoch": 15.489864864864865, "grad_norm": 19.26451301574707, "learning_rate": 7.3544316397591545e-06, "loss": 0.3404, "num_input_tokens_seen": 5367328, "step": 59605 }, { "epoch": 15.491164241164242, "grad_norm": 18.61802101135254, "learning_rate": 7.350415815063677e-06, "loss": 0.1177, "num_input_tokens_seen": 5367744, "step": 59610 }, { "epoch": 15.492463617463617, "grad_norm": 0.5918464660644531, "learning_rate": 7.3464008981032365e-06, "loss": 0.1, "num_input_tokens_seen": 5368192, "step": 59615 }, { "epoch": 15.493762993762994, "grad_norm": 0.9666264057159424, "learning_rate": 7.342386889084301e-06, "loss": 0.0827, "num_input_tokens_seen": 5368640, "step": 59620 }, { "epoch": 15.49506237006237, "grad_norm": 10.224565505981445, "learning_rate": 7.338373788213337e-06, "loss": 0.2329, "num_input_tokens_seen": 5369104, "step": 59625 }, { "epoch": 15.496361746361746, "grad_norm": 11.720184326171875, "learning_rate": 7.334361595696723e-06, "loss": 0.1217, "num_input_tokens_seen": 5369552, "step": 59630 }, { "epoch": 15.497661122661123, "grad_norm": 9.772153854370117, "learning_rate": 7.330350311740816e-06, "loss": 0.199, "num_input_tokens_seen": 5370000, "step": 59635 }, { "epoch": 15.4989604989605, "grad_norm": 0.016387445852160454, "learning_rate": 7.326339936551926e-06, "loss": 0.0693, "num_input_tokens_seen": 5370448, "step": 59640 }, { "epoch": 15.500259875259875, "grad_norm": 39.81742477416992, "learning_rate": 7.3223304703363135e-06, "loss": 0.2959, "num_input_tokens_seen": 5370896, "step": 59645 }, { "epoch": 15.501559251559252, "grad_norm": 0.41570210456848145, "learning_rate": 7.318321913300183e-06, "loss": 0.0497, "num_input_tokens_seen": 5371344, "step": 59650 }, { "epoch": 15.502858627858627, "grad_norm": 5.85787296295166, "learning_rate": 7.314314265649691e-06, "loss": 0.0514, "num_input_tokens_seen": 5371808, "step": 59655 }, { "epoch": 15.504158004158004, "grad_norm": 27.47819709777832, "learning_rate": 7.310307527590962e-06, "loss": 0.3617, "num_input_tokens_seen": 5372256, "step": 59660 }, { "epoch": 15.505457380457381, "grad_norm": 0.011949754320085049, "learning_rate": 7.306301699330065e-06, "loss": 0.3767, "num_input_tokens_seen": 5372704, "step": 59665 }, { "epoch": 15.506756756756756, "grad_norm": 2.185915470123291, "learning_rate": 7.302296781073034e-06, "loss": 0.1564, "num_input_tokens_seen": 5373168, "step": 59670 }, { "epoch": 15.508056133056133, "grad_norm": 8.832480430603027, "learning_rate": 7.298292773025828e-06, "loss": 0.2044, "num_input_tokens_seen": 5373648, "step": 59675 }, { "epoch": 15.50935550935551, "grad_norm": 2.186607837677002, "learning_rate": 7.294289675394394e-06, "loss": 0.0101, "num_input_tokens_seen": 5374080, "step": 59680 }, { "epoch": 15.510654885654885, "grad_norm": 0.5295268893241882, "learning_rate": 7.290287488384598e-06, "loss": 0.1135, "num_input_tokens_seen": 5374512, "step": 59685 }, { "epoch": 15.511954261954262, "grad_norm": 25.733535766601562, "learning_rate": 7.286286212202295e-06, "loss": 0.1658, "num_input_tokens_seen": 5374992, "step": 59690 }, { "epoch": 15.513253638253639, "grad_norm": 23.841588973999023, "learning_rate": 7.282285847053255e-06, "loss": 0.0969, "num_input_tokens_seen": 5375472, "step": 59695 }, { "epoch": 15.514553014553014, "grad_norm": 1.4379256963729858, "learning_rate": 7.27828639314323e-06, "loss": 0.3203, "num_input_tokens_seen": 5375920, "step": 59700 }, { "epoch": 15.515852390852391, "grad_norm": 0.9586043953895569, "learning_rate": 7.274287850677919e-06, "loss": 0.4822, "num_input_tokens_seen": 5376432, "step": 59705 }, { "epoch": 15.517151767151766, "grad_norm": 1.5254864692687988, "learning_rate": 7.2702902198629714e-06, "loss": 0.1963, "num_input_tokens_seen": 5376880, "step": 59710 }, { "epoch": 15.518451143451143, "grad_norm": 1.822965383529663, "learning_rate": 7.2662935009039865e-06, "loss": 0.0537, "num_input_tokens_seen": 5377328, "step": 59715 }, { "epoch": 15.51975051975052, "grad_norm": 34.73478698730469, "learning_rate": 7.262297694006512e-06, "loss": 0.1525, "num_input_tokens_seen": 5377744, "step": 59720 }, { "epoch": 15.521049896049895, "grad_norm": 9.940091133117676, "learning_rate": 7.258302799376071e-06, "loss": 0.0679, "num_input_tokens_seen": 5378224, "step": 59725 }, { "epoch": 15.522349272349272, "grad_norm": 0.923119843006134, "learning_rate": 7.254308817218109e-06, "loss": 0.2097, "num_input_tokens_seen": 5378672, "step": 59730 }, { "epoch": 15.52364864864865, "grad_norm": 1.3047277927398682, "learning_rate": 7.2503157477380455e-06, "loss": 0.3075, "num_input_tokens_seen": 5379136, "step": 59735 }, { "epoch": 15.524948024948024, "grad_norm": 7.431557655334473, "learning_rate": 7.246323591141249e-06, "loss": 0.0965, "num_input_tokens_seen": 5379568, "step": 59740 }, { "epoch": 15.526247401247401, "grad_norm": 0.006327122915536165, "learning_rate": 7.242332347633052e-06, "loss": 0.0199, "num_input_tokens_seen": 5379984, "step": 59745 }, { "epoch": 15.527546777546778, "grad_norm": 5.034720420837402, "learning_rate": 7.238342017418706e-06, "loss": 0.2721, "num_input_tokens_seen": 5380448, "step": 59750 }, { "epoch": 15.528846153846153, "grad_norm": 0.40498214960098267, "learning_rate": 7.234352600703459e-06, "loss": 0.0163, "num_input_tokens_seen": 5380912, "step": 59755 }, { "epoch": 15.53014553014553, "grad_norm": 0.07970456779003143, "learning_rate": 7.230364097692469e-06, "loss": 0.3408, "num_input_tokens_seen": 5381408, "step": 59760 }, { "epoch": 15.531444906444907, "grad_norm": 0.0008607710478827357, "learning_rate": 7.22637650859089e-06, "loss": 0.6972, "num_input_tokens_seen": 5381840, "step": 59765 }, { "epoch": 15.532744282744282, "grad_norm": 28.429460525512695, "learning_rate": 7.2223898336037875e-06, "loss": 0.1528, "num_input_tokens_seen": 5382304, "step": 59770 }, { "epoch": 15.53404365904366, "grad_norm": 0.1366247683763504, "learning_rate": 7.218404072936211e-06, "loss": 0.2966, "num_input_tokens_seen": 5382736, "step": 59775 }, { "epoch": 15.535343035343036, "grad_norm": 0.10095799714326859, "learning_rate": 7.214419226793159e-06, "loss": 0.018, "num_input_tokens_seen": 5383216, "step": 59780 }, { "epoch": 15.536642411642411, "grad_norm": 0.1662546843290329, "learning_rate": 7.210435295379558e-06, "loss": 0.0046, "num_input_tokens_seen": 5383648, "step": 59785 }, { "epoch": 15.537941787941788, "grad_norm": 37.56640625, "learning_rate": 7.206452278900322e-06, "loss": 0.3823, "num_input_tokens_seen": 5384096, "step": 59790 }, { "epoch": 15.539241164241163, "grad_norm": 6.991394996643066, "learning_rate": 7.202470177560288e-06, "loss": 0.0656, "num_input_tokens_seen": 5384592, "step": 59795 }, { "epoch": 15.54054054054054, "grad_norm": 25.501340866088867, "learning_rate": 7.198488991564273e-06, "loss": 0.4488, "num_input_tokens_seen": 5385040, "step": 59800 }, { "epoch": 15.541839916839917, "grad_norm": 16.14759635925293, "learning_rate": 7.1945087211170185e-06, "loss": 0.1433, "num_input_tokens_seen": 5385488, "step": 59805 }, { "epoch": 15.543139293139292, "grad_norm": 0.03019050881266594, "learning_rate": 7.19052936642324e-06, "loss": 0.0155, "num_input_tokens_seen": 5385920, "step": 59810 }, { "epoch": 15.54443866943867, "grad_norm": 3.4236812591552734, "learning_rate": 7.186550927687602e-06, "loss": 0.0481, "num_input_tokens_seen": 5386400, "step": 59815 }, { "epoch": 15.545738045738046, "grad_norm": 15.625367164611816, "learning_rate": 7.1825734051147235e-06, "loss": 0.1404, "num_input_tokens_seen": 5386864, "step": 59820 }, { "epoch": 15.547037422037421, "grad_norm": 31.1439151763916, "learning_rate": 7.178596798909159e-06, "loss": 0.5213, "num_input_tokens_seen": 5387328, "step": 59825 }, { "epoch": 15.548336798336798, "grad_norm": 17.545257568359375, "learning_rate": 7.174621109275445e-06, "loss": 0.216, "num_input_tokens_seen": 5387808, "step": 59830 }, { "epoch": 15.549636174636175, "grad_norm": 2.5066325664520264, "learning_rate": 7.170646336418038e-06, "loss": 0.0222, "num_input_tokens_seen": 5388256, "step": 59835 }, { "epoch": 15.55093555093555, "grad_norm": 14.905279159545898, "learning_rate": 7.166672480541384e-06, "loss": 0.1181, "num_input_tokens_seen": 5388688, "step": 59840 }, { "epoch": 15.552234927234927, "grad_norm": 13.064933776855469, "learning_rate": 7.162699541849841e-06, "loss": 0.0404, "num_input_tokens_seen": 5389152, "step": 59845 }, { "epoch": 15.553534303534304, "grad_norm": 0.06942426413297653, "learning_rate": 7.158727520547753e-06, "loss": 0.0426, "num_input_tokens_seen": 5389600, "step": 59850 }, { "epoch": 15.55483367983368, "grad_norm": 37.24554443359375, "learning_rate": 7.154756416839409e-06, "loss": 0.1908, "num_input_tokens_seen": 5390016, "step": 59855 }, { "epoch": 15.556133056133056, "grad_norm": 0.032375942915678024, "learning_rate": 7.1507862309290355e-06, "loss": 0.3752, "num_input_tokens_seen": 5390448, "step": 59860 }, { "epoch": 15.557432432432432, "grad_norm": 1.1575037240982056, "learning_rate": 7.146816963020836e-06, "loss": 0.267, "num_input_tokens_seen": 5390896, "step": 59865 }, { "epoch": 15.558731808731808, "grad_norm": 16.2596492767334, "learning_rate": 7.142848613318936e-06, "loss": 0.0762, "num_input_tokens_seen": 5391360, "step": 59870 }, { "epoch": 15.560031185031185, "grad_norm": 16.18004608154297, "learning_rate": 7.138881182027452e-06, "loss": 0.1159, "num_input_tokens_seen": 5391808, "step": 59875 }, { "epoch": 15.56133056133056, "grad_norm": 0.7410533428192139, "learning_rate": 7.134914669350415e-06, "loss": 0.0036, "num_input_tokens_seen": 5392240, "step": 59880 }, { "epoch": 15.562629937629938, "grad_norm": 17.982770919799805, "learning_rate": 7.13094907549183e-06, "loss": 0.4278, "num_input_tokens_seen": 5392688, "step": 59885 }, { "epoch": 15.563929313929314, "grad_norm": 17.173809051513672, "learning_rate": 7.126984400655659e-06, "loss": 0.3426, "num_input_tokens_seen": 5393136, "step": 59890 }, { "epoch": 15.56522869022869, "grad_norm": 11.047160148620605, "learning_rate": 7.123020645045814e-06, "loss": 0.0922, "num_input_tokens_seen": 5393568, "step": 59895 }, { "epoch": 15.566528066528067, "grad_norm": 37.00102615356445, "learning_rate": 7.1190578088661365e-06, "loss": 0.4157, "num_input_tokens_seen": 5394048, "step": 59900 }, { "epoch": 15.567827442827443, "grad_norm": 0.06871809810400009, "learning_rate": 7.115095892320456e-06, "loss": 0.0946, "num_input_tokens_seen": 5394512, "step": 59905 }, { "epoch": 15.569126819126819, "grad_norm": 0.03053581342101097, "learning_rate": 7.111134895612523e-06, "loss": 0.147, "num_input_tokens_seen": 5394944, "step": 59910 }, { "epoch": 15.570426195426196, "grad_norm": 0.005275206174701452, "learning_rate": 7.107174818946063e-06, "loss": 0.0666, "num_input_tokens_seen": 5395360, "step": 59915 }, { "epoch": 15.571725571725572, "grad_norm": 0.36901962757110596, "learning_rate": 7.103215662524753e-06, "loss": 0.2015, "num_input_tokens_seen": 5395824, "step": 59920 }, { "epoch": 15.573024948024948, "grad_norm": 19.721521377563477, "learning_rate": 7.099257426552203e-06, "loss": 0.4476, "num_input_tokens_seen": 5396256, "step": 59925 }, { "epoch": 15.574324324324325, "grad_norm": 0.7482343316078186, "learning_rate": 7.095300111232e-06, "loss": 0.048, "num_input_tokens_seen": 5396704, "step": 59930 }, { "epoch": 15.575623700623701, "grad_norm": 0.2034272700548172, "learning_rate": 7.0913437167676625e-06, "loss": 0.1969, "num_input_tokens_seen": 5397136, "step": 59935 }, { "epoch": 15.576923076923077, "grad_norm": 0.11606465280056, "learning_rate": 7.087388243362686e-06, "loss": 0.1786, "num_input_tokens_seen": 5397600, "step": 59940 }, { "epoch": 15.578222453222454, "grad_norm": 0.05213472619652748, "learning_rate": 7.083433691220484e-06, "loss": 0.2152, "num_input_tokens_seen": 5398016, "step": 59945 }, { "epoch": 15.579521829521829, "grad_norm": 8.946825981140137, "learning_rate": 7.079480060544458e-06, "loss": 0.0485, "num_input_tokens_seen": 5398480, "step": 59950 }, { "epoch": 15.580821205821206, "grad_norm": 11.590313911437988, "learning_rate": 7.075527351537939e-06, "loss": 0.1834, "num_input_tokens_seen": 5398944, "step": 59955 }, { "epoch": 15.582120582120583, "grad_norm": 0.21767868101596832, "learning_rate": 7.071575564404231e-06, "loss": 0.199, "num_input_tokens_seen": 5399408, "step": 59960 }, { "epoch": 15.583419958419958, "grad_norm": 0.1526707261800766, "learning_rate": 7.067624699346564e-06, "loss": 0.0605, "num_input_tokens_seen": 5399840, "step": 59965 }, { "epoch": 15.584719334719335, "grad_norm": 10.6679048538208, "learning_rate": 7.0636747565681475e-06, "loss": 0.1678, "num_input_tokens_seen": 5400240, "step": 59970 }, { "epoch": 15.586018711018712, "grad_norm": 1.2145105600357056, "learning_rate": 7.059725736272127e-06, "loss": 0.159, "num_input_tokens_seen": 5400656, "step": 59975 }, { "epoch": 15.587318087318087, "grad_norm": 0.47837620973587036, "learning_rate": 7.05577763866159e-06, "loss": 0.287, "num_input_tokens_seen": 5401152, "step": 59980 }, { "epoch": 15.588617463617464, "grad_norm": 0.7973915338516235, "learning_rate": 7.051830463939604e-06, "loss": 0.0907, "num_input_tokens_seen": 5401584, "step": 59985 }, { "epoch": 15.58991683991684, "grad_norm": 8.999947547912598, "learning_rate": 7.047884212309172e-06, "loss": 0.1495, "num_input_tokens_seen": 5402016, "step": 59990 }, { "epoch": 15.591216216216216, "grad_norm": 0.21916666626930237, "learning_rate": 7.043938883973266e-06, "loss": 0.2163, "num_input_tokens_seen": 5402464, "step": 59995 }, { "epoch": 15.592515592515593, "grad_norm": 0.006227502133697271, "learning_rate": 7.039994479134782e-06, "loss": 0.1329, "num_input_tokens_seen": 5402928, "step": 60000 }, { "epoch": 15.59381496881497, "grad_norm": 36.41744613647461, "learning_rate": 7.036050997996593e-06, "loss": 0.3039, "num_input_tokens_seen": 5403360, "step": 60005 }, { "epoch": 15.595114345114345, "grad_norm": 0.3564525544643402, "learning_rate": 7.032108440761509e-06, "loss": 0.0021, "num_input_tokens_seen": 5403856, "step": 60010 }, { "epoch": 15.596413721413722, "grad_norm": 0.6819756031036377, "learning_rate": 7.028166807632311e-06, "loss": 0.0633, "num_input_tokens_seen": 5404288, "step": 60015 }, { "epoch": 15.597713097713097, "grad_norm": 15.007390022277832, "learning_rate": 7.024226098811707e-06, "loss": 0.3378, "num_input_tokens_seen": 5404720, "step": 60020 }, { "epoch": 15.599012474012474, "grad_norm": 17.29479217529297, "learning_rate": 7.020286314502377e-06, "loss": 0.0867, "num_input_tokens_seen": 5405184, "step": 60025 }, { "epoch": 15.60031185031185, "grad_norm": 24.19512939453125, "learning_rate": 7.016347454906949e-06, "loss": 0.0592, "num_input_tokens_seen": 5405616, "step": 60030 }, { "epoch": 15.601611226611226, "grad_norm": 0.04840376600623131, "learning_rate": 7.012409520228011e-06, "loss": 0.2469, "num_input_tokens_seen": 5406048, "step": 60035 }, { "epoch": 15.602910602910603, "grad_norm": 0.09731530398130417, "learning_rate": 7.008472510668085e-06, "loss": 0.0048, "num_input_tokens_seen": 5406480, "step": 60040 }, { "epoch": 15.60420997920998, "grad_norm": 37.60676956176758, "learning_rate": 7.00453642642965e-06, "loss": 0.3294, "num_input_tokens_seen": 5406928, "step": 60045 }, { "epoch": 15.605509355509355, "grad_norm": 40.93989562988281, "learning_rate": 7.000601267715157e-06, "loss": 0.4861, "num_input_tokens_seen": 5407408, "step": 60050 }, { "epoch": 15.606808731808732, "grad_norm": 39.072330474853516, "learning_rate": 6.996667034726978e-06, "loss": 0.155, "num_input_tokens_seen": 5407872, "step": 60055 }, { "epoch": 15.608108108108109, "grad_norm": 0.46418917179107666, "learning_rate": 6.992733727667466e-06, "loss": 0.2482, "num_input_tokens_seen": 5408336, "step": 60060 }, { "epoch": 15.609407484407484, "grad_norm": 1.8441029787063599, "learning_rate": 6.98880134673891e-06, "loss": 0.2877, "num_input_tokens_seen": 5408768, "step": 60065 }, { "epoch": 15.61070686070686, "grad_norm": 1.3370434045791626, "learning_rate": 6.9848698921435664e-06, "loss": 0.2292, "num_input_tokens_seen": 5409200, "step": 60070 }, { "epoch": 15.612006237006238, "grad_norm": 62.32059860229492, "learning_rate": 6.980939364083616e-06, "loss": 0.3558, "num_input_tokens_seen": 5409680, "step": 60075 }, { "epoch": 15.613305613305613, "grad_norm": 20.6030216217041, "learning_rate": 6.97700976276123e-06, "loss": 0.0934, "num_input_tokens_seen": 5410160, "step": 60080 }, { "epoch": 15.61460498960499, "grad_norm": 0.05144442990422249, "learning_rate": 6.973081088378492e-06, "loss": 0.0021, "num_input_tokens_seen": 5410608, "step": 60085 }, { "epoch": 15.615904365904367, "grad_norm": 35.53403091430664, "learning_rate": 6.969153341137472e-06, "loss": 0.2715, "num_input_tokens_seen": 5411072, "step": 60090 }, { "epoch": 15.617203742203742, "grad_norm": 0.41942793130874634, "learning_rate": 6.965226521240165e-06, "loss": 0.032, "num_input_tokens_seen": 5411472, "step": 60095 }, { "epoch": 15.618503118503119, "grad_norm": 26.609590530395508, "learning_rate": 6.96130062888854e-06, "loss": 0.1175, "num_input_tokens_seen": 5411968, "step": 60100 }, { "epoch": 15.619802494802494, "grad_norm": 35.71000671386719, "learning_rate": 6.957375664284513e-06, "loss": 0.0603, "num_input_tokens_seen": 5412400, "step": 60105 }, { "epoch": 15.621101871101871, "grad_norm": 36.857505798339844, "learning_rate": 6.953451627629936e-06, "loss": 0.2344, "num_input_tokens_seen": 5412880, "step": 60110 }, { "epoch": 15.622401247401248, "grad_norm": 17.009265899658203, "learning_rate": 6.9495285191266415e-06, "loss": 0.0514, "num_input_tokens_seen": 5413312, "step": 60115 }, { "epoch": 15.623700623700623, "grad_norm": 2.042975664138794, "learning_rate": 6.945606338976382e-06, "loss": 0.0855, "num_input_tokens_seen": 5413776, "step": 60120 }, { "epoch": 15.625, "grad_norm": 18.249649047851562, "learning_rate": 6.941685087380898e-06, "loss": 0.5685, "num_input_tokens_seen": 5414256, "step": 60125 }, { "epoch": 15.626299376299377, "grad_norm": 0.35141995549201965, "learning_rate": 6.937764764541832e-06, "loss": 0.004, "num_input_tokens_seen": 5414688, "step": 60130 }, { "epoch": 15.627598752598752, "grad_norm": 0.1901138722896576, "learning_rate": 6.933845370660849e-06, "loss": 0.0154, "num_input_tokens_seen": 5415120, "step": 60135 }, { "epoch": 15.628898128898129, "grad_norm": 0.17167606949806213, "learning_rate": 6.929926905939502e-06, "loss": 0.1943, "num_input_tokens_seen": 5415552, "step": 60140 }, { "epoch": 15.630197505197506, "grad_norm": 0.8390833735466003, "learning_rate": 6.926009370579334e-06, "loss": 0.132, "num_input_tokens_seen": 5416032, "step": 60145 }, { "epoch": 15.631496881496881, "grad_norm": 19.129100799560547, "learning_rate": 6.922092764781815e-06, "loss": 0.6707, "num_input_tokens_seen": 5416480, "step": 60150 }, { "epoch": 15.632796257796258, "grad_norm": 0.11600121855735779, "learning_rate": 6.9181770887483926e-06, "loss": 0.0909, "num_input_tokens_seen": 5416912, "step": 60155 }, { "epoch": 15.634095634095633, "grad_norm": 0.2829222083091736, "learning_rate": 6.914262342680442e-06, "loss": 0.0779, "num_input_tokens_seen": 5417328, "step": 60160 }, { "epoch": 15.63539501039501, "grad_norm": 0.38271257281303406, "learning_rate": 6.9103485267793065e-06, "loss": 0.6061, "num_input_tokens_seen": 5417792, "step": 60165 }, { "epoch": 15.636694386694387, "grad_norm": 0.051074378192424774, "learning_rate": 6.90643564124629e-06, "loss": 0.2182, "num_input_tokens_seen": 5418240, "step": 60170 }, { "epoch": 15.637993762993762, "grad_norm": 8.698369979858398, "learning_rate": 6.9025236862826135e-06, "loss": 0.3687, "num_input_tokens_seen": 5418672, "step": 60175 }, { "epoch": 15.63929313929314, "grad_norm": 13.465505599975586, "learning_rate": 6.898612662089493e-06, "loss": 0.1839, "num_input_tokens_seen": 5419120, "step": 60180 }, { "epoch": 15.640592515592516, "grad_norm": 0.12085792422294617, "learning_rate": 6.894702568868058e-06, "loss": 0.1921, "num_input_tokens_seen": 5419584, "step": 60185 }, { "epoch": 15.641891891891891, "grad_norm": 0.5107007026672363, "learning_rate": 6.890793406819427e-06, "loss": 0.1223, "num_input_tokens_seen": 5420032, "step": 60190 }, { "epoch": 15.643191268191268, "grad_norm": 25.913066864013672, "learning_rate": 6.886885176144633e-06, "loss": 0.139, "num_input_tokens_seen": 5420496, "step": 60195 }, { "epoch": 15.644490644490645, "grad_norm": 33.56989288330078, "learning_rate": 6.882977877044691e-06, "loss": 0.3027, "num_input_tokens_seen": 5420928, "step": 60200 }, { "epoch": 15.64579002079002, "grad_norm": 0.06323326379060745, "learning_rate": 6.8790715097205555e-06, "loss": 0.0359, "num_input_tokens_seen": 5421360, "step": 60205 }, { "epoch": 15.647089397089397, "grad_norm": 28.04034996032715, "learning_rate": 6.875166074373144e-06, "loss": 0.3591, "num_input_tokens_seen": 5421808, "step": 60210 }, { "epoch": 15.648388773388774, "grad_norm": 0.48082444071769714, "learning_rate": 6.871261571203297e-06, "loss": 0.0494, "num_input_tokens_seen": 5422240, "step": 60215 }, { "epoch": 15.64968814968815, "grad_norm": 0.921101987361908, "learning_rate": 6.867358000411847e-06, "loss": 0.0861, "num_input_tokens_seen": 5422688, "step": 60220 }, { "epoch": 15.650987525987526, "grad_norm": 0.06241244450211525, "learning_rate": 6.8634553621995416e-06, "loss": 0.2202, "num_input_tokens_seen": 5423136, "step": 60225 }, { "epoch": 15.652286902286903, "grad_norm": 9.448624610900879, "learning_rate": 6.859553656767112e-06, "loss": 0.0943, "num_input_tokens_seen": 5423568, "step": 60230 }, { "epoch": 15.653586278586278, "grad_norm": 1.6076481342315674, "learning_rate": 6.8556528843152115e-06, "loss": 0.1333, "num_input_tokens_seen": 5424000, "step": 60235 }, { "epoch": 15.654885654885655, "grad_norm": 10.17245101928711, "learning_rate": 6.8517530450444676e-06, "loss": 0.4845, "num_input_tokens_seen": 5424432, "step": 60240 }, { "epoch": 15.65618503118503, "grad_norm": 0.014165477827191353, "learning_rate": 6.84785413915546e-06, "loss": 0.0077, "num_input_tokens_seen": 5424848, "step": 60245 }, { "epoch": 15.657484407484407, "grad_norm": 1.9206254482269287, "learning_rate": 6.8439561668487e-06, "loss": 0.379, "num_input_tokens_seen": 5425296, "step": 60250 }, { "epoch": 15.658783783783784, "grad_norm": 0.17637690901756287, "learning_rate": 6.84005912832468e-06, "loss": 0.096, "num_input_tokens_seen": 5425744, "step": 60255 }, { "epoch": 15.66008316008316, "grad_norm": 14.548257827758789, "learning_rate": 6.83616302378381e-06, "loss": 0.0389, "num_input_tokens_seen": 5426192, "step": 60260 }, { "epoch": 15.661382536382536, "grad_norm": 36.62097930908203, "learning_rate": 6.832267853426485e-06, "loss": 0.111, "num_input_tokens_seen": 5426656, "step": 60265 }, { "epoch": 15.662681912681913, "grad_norm": 24.926631927490234, "learning_rate": 6.828373617453026e-06, "loss": 0.1927, "num_input_tokens_seen": 5427120, "step": 60270 }, { "epoch": 15.663981288981288, "grad_norm": 0.005810985341668129, "learning_rate": 6.824480316063725e-06, "loss": 0.0821, "num_input_tokens_seen": 5427552, "step": 60275 }, { "epoch": 15.665280665280665, "grad_norm": 1.4965031147003174, "learning_rate": 6.820587949458815e-06, "loss": 0.1016, "num_input_tokens_seen": 5427968, "step": 60280 }, { "epoch": 15.666580041580042, "grad_norm": 7.134058952331543, "learning_rate": 6.816696517838492e-06, "loss": 0.1689, "num_input_tokens_seen": 5428400, "step": 60285 }, { "epoch": 15.667879417879417, "grad_norm": 9.236149787902832, "learning_rate": 6.812806021402882e-06, "loss": 0.0384, "num_input_tokens_seen": 5428880, "step": 60290 }, { "epoch": 15.669178794178794, "grad_norm": 24.67332649230957, "learning_rate": 6.808916460352091e-06, "loss": 0.3975, "num_input_tokens_seen": 5429360, "step": 60295 }, { "epoch": 15.670478170478171, "grad_norm": 0.420247882604599, "learning_rate": 6.80502783488616e-06, "loss": 0.077, "num_input_tokens_seen": 5429824, "step": 60300 }, { "epoch": 15.671777546777546, "grad_norm": 0.07725456357002258, "learning_rate": 6.80114014520507e-06, "loss": 0.1098, "num_input_tokens_seen": 5430256, "step": 60305 }, { "epoch": 15.673076923076923, "grad_norm": 0.6093904376029968, "learning_rate": 6.7972533915087815e-06, "loss": 0.4109, "num_input_tokens_seen": 5430720, "step": 60310 }, { "epoch": 15.674376299376299, "grad_norm": 0.2587928771972656, "learning_rate": 6.793367573997192e-06, "loss": 0.086, "num_input_tokens_seen": 5431184, "step": 60315 }, { "epoch": 15.675675675675675, "grad_norm": 0.2458004653453827, "learning_rate": 6.789482692870158e-06, "loss": 0.0257, "num_input_tokens_seen": 5431664, "step": 60320 }, { "epoch": 15.676975051975052, "grad_norm": 3.9887895584106445, "learning_rate": 6.785598748327474e-06, "loss": 0.2225, "num_input_tokens_seen": 5432096, "step": 60325 }, { "epoch": 15.678274428274428, "grad_norm": 0.8183041214942932, "learning_rate": 6.781715740568903e-06, "loss": 0.2055, "num_input_tokens_seen": 5432512, "step": 60330 }, { "epoch": 15.679573804573804, "grad_norm": 1.6696603298187256, "learning_rate": 6.77783366979414e-06, "loss": 0.0125, "num_input_tokens_seen": 5432992, "step": 60335 }, { "epoch": 15.680873180873181, "grad_norm": 0.06354686617851257, "learning_rate": 6.77395253620286e-06, "loss": 0.1522, "num_input_tokens_seen": 5433408, "step": 60340 }, { "epoch": 15.682172557172557, "grad_norm": 0.2926183044910431, "learning_rate": 6.770072339994657e-06, "loss": 0.2124, "num_input_tokens_seen": 5433904, "step": 60345 }, { "epoch": 15.683471933471933, "grad_norm": 44.60129928588867, "learning_rate": 6.766193081369099e-06, "loss": 0.3169, "num_input_tokens_seen": 5434336, "step": 60350 }, { "epoch": 15.68477130977131, "grad_norm": 0.10352116823196411, "learning_rate": 6.762314760525704e-06, "loss": 0.0211, "num_input_tokens_seen": 5434768, "step": 60355 }, { "epoch": 15.686070686070686, "grad_norm": 0.0046845367178320885, "learning_rate": 6.75843737766394e-06, "loss": 0.024, "num_input_tokens_seen": 5435280, "step": 60360 }, { "epoch": 15.687370062370062, "grad_norm": 39.303871154785156, "learning_rate": 6.754560932983223e-06, "loss": 0.6248, "num_input_tokens_seen": 5435776, "step": 60365 }, { "epoch": 15.68866943866944, "grad_norm": 1.808047890663147, "learning_rate": 6.750685426682909e-06, "loss": 0.4857, "num_input_tokens_seen": 5436224, "step": 60370 }, { "epoch": 15.689968814968815, "grad_norm": 33.148826599121094, "learning_rate": 6.746810858962338e-06, "loss": 0.2416, "num_input_tokens_seen": 5436672, "step": 60375 }, { "epoch": 15.691268191268192, "grad_norm": 1.1620516777038574, "learning_rate": 6.742937230020757e-06, "loss": 0.2761, "num_input_tokens_seen": 5437152, "step": 60380 }, { "epoch": 15.692567567567568, "grad_norm": 2.22493577003479, "learning_rate": 6.739064540057424e-06, "loss": 0.1713, "num_input_tokens_seen": 5437616, "step": 60385 }, { "epoch": 15.693866943866944, "grad_norm": 0.9415680766105652, "learning_rate": 6.735192789271491e-06, "loss": 0.0082, "num_input_tokens_seen": 5438048, "step": 60390 }, { "epoch": 15.69516632016632, "grad_norm": 11.162361145019531, "learning_rate": 6.7313219778621015e-06, "loss": 0.2082, "num_input_tokens_seen": 5438464, "step": 60395 }, { "epoch": 15.696465696465696, "grad_norm": 10.851336479187012, "learning_rate": 6.727452106028317e-06, "loss": 0.0297, "num_input_tokens_seen": 5438928, "step": 60400 }, { "epoch": 15.697765072765073, "grad_norm": 24.724472045898438, "learning_rate": 6.723583173969189e-06, "loss": 0.2224, "num_input_tokens_seen": 5439376, "step": 60405 }, { "epoch": 15.69906444906445, "grad_norm": 14.183197021484375, "learning_rate": 6.719715181883682e-06, "loss": 0.254, "num_input_tokens_seen": 5439808, "step": 60410 }, { "epoch": 15.700363825363825, "grad_norm": 0.13193944096565247, "learning_rate": 6.715848129970737e-06, "loss": 0.1477, "num_input_tokens_seen": 5440304, "step": 60415 }, { "epoch": 15.701663201663202, "grad_norm": 0.013652000576257706, "learning_rate": 6.711982018429242e-06, "loss": 0.1855, "num_input_tokens_seen": 5440752, "step": 60420 }, { "epoch": 15.702962577962579, "grad_norm": 25.976980209350586, "learning_rate": 6.708116847458043e-06, "loss": 0.1574, "num_input_tokens_seen": 5441168, "step": 60425 }, { "epoch": 15.704261954261954, "grad_norm": 3.3757338523864746, "learning_rate": 6.704252617255918e-06, "loss": 0.2309, "num_input_tokens_seen": 5441648, "step": 60430 }, { "epoch": 15.70556133056133, "grad_norm": 31.11640167236328, "learning_rate": 6.700389328021608e-06, "loss": 0.3464, "num_input_tokens_seen": 5442112, "step": 60435 }, { "epoch": 15.706860706860708, "grad_norm": 0.07784628868103027, "learning_rate": 6.696526979953812e-06, "loss": 0.0569, "num_input_tokens_seen": 5442560, "step": 60440 }, { "epoch": 15.708160083160083, "grad_norm": 4.773194789886475, "learning_rate": 6.692665573251167e-06, "loss": 0.4986, "num_input_tokens_seen": 5442992, "step": 60445 }, { "epoch": 15.70945945945946, "grad_norm": 1.464890956878662, "learning_rate": 6.688805108112273e-06, "loss": 0.0888, "num_input_tokens_seen": 5443456, "step": 60450 }, { "epoch": 15.710758835758837, "grad_norm": 3.2970240116119385, "learning_rate": 6.684945584735675e-06, "loss": 0.1443, "num_input_tokens_seen": 5443904, "step": 60455 }, { "epoch": 15.712058212058212, "grad_norm": 2.428816080093384, "learning_rate": 6.68108700331988e-06, "loss": 0.0441, "num_input_tokens_seen": 5444352, "step": 60460 }, { "epoch": 15.713357588357589, "grad_norm": 0.1235482394695282, "learning_rate": 6.677229364063328e-06, "loss": 0.0081, "num_input_tokens_seen": 5444800, "step": 60465 }, { "epoch": 15.714656964656964, "grad_norm": 0.4100012481212616, "learning_rate": 6.673372667164435e-06, "loss": 0.3573, "num_input_tokens_seen": 5445280, "step": 60470 }, { "epoch": 15.71595634095634, "grad_norm": 0.08314037322998047, "learning_rate": 6.669516912821535e-06, "loss": 0.0977, "num_input_tokens_seen": 5445712, "step": 60475 }, { "epoch": 15.717255717255718, "grad_norm": 3.0769593715667725, "learning_rate": 6.665662101232953e-06, "loss": 0.5308, "num_input_tokens_seen": 5446176, "step": 60480 }, { "epoch": 15.718555093555093, "grad_norm": 0.010354246944189072, "learning_rate": 6.6618082325969275e-06, "loss": 0.1599, "num_input_tokens_seen": 5446640, "step": 60485 }, { "epoch": 15.71985446985447, "grad_norm": 1.1589787006378174, "learning_rate": 6.657955307111674e-06, "loss": 0.2964, "num_input_tokens_seen": 5447056, "step": 60490 }, { "epoch": 15.721153846153847, "grad_norm": 2.189568519592285, "learning_rate": 6.654103324975355e-06, "loss": 0.4231, "num_input_tokens_seen": 5447472, "step": 60495 }, { "epoch": 15.722453222453222, "grad_norm": 1.8910545110702515, "learning_rate": 6.650252286386088e-06, "loss": 0.2765, "num_input_tokens_seen": 5447920, "step": 60500 }, { "epoch": 15.723752598752599, "grad_norm": 26.658061981201172, "learning_rate": 6.646402191541929e-06, "loss": 0.0672, "num_input_tokens_seen": 5448384, "step": 60505 }, { "epoch": 15.725051975051976, "grad_norm": 5.577621936798096, "learning_rate": 6.642553040640881e-06, "loss": 0.0167, "num_input_tokens_seen": 5448800, "step": 60510 }, { "epoch": 15.72635135135135, "grad_norm": 0.5859291553497314, "learning_rate": 6.638704833880929e-06, "loss": 0.2582, "num_input_tokens_seen": 5449232, "step": 60515 }, { "epoch": 15.727650727650728, "grad_norm": 0.522725522518158, "learning_rate": 6.634857571459971e-06, "loss": 0.2891, "num_input_tokens_seen": 5449696, "step": 60520 }, { "epoch": 15.728950103950105, "grad_norm": 0.07109030336141586, "learning_rate": 6.6310112535758864e-06, "loss": 0.0235, "num_input_tokens_seen": 5450192, "step": 60525 }, { "epoch": 15.73024948024948, "grad_norm": 0.036144427955150604, "learning_rate": 6.627165880426492e-06, "loss": 0.1812, "num_input_tokens_seen": 5450656, "step": 60530 }, { "epoch": 15.731548856548857, "grad_norm": 0.20440897345542908, "learning_rate": 6.623321452209569e-06, "loss": 0.0368, "num_input_tokens_seen": 5451072, "step": 60535 }, { "epoch": 15.732848232848234, "grad_norm": 26.954816818237305, "learning_rate": 6.619477969122822e-06, "loss": 0.1653, "num_input_tokens_seen": 5451552, "step": 60540 }, { "epoch": 15.734147609147609, "grad_norm": 8.203350067138672, "learning_rate": 6.615635431363942e-06, "loss": 0.115, "num_input_tokens_seen": 5451984, "step": 60545 }, { "epoch": 15.735446985446986, "grad_norm": 0.012880897149443626, "learning_rate": 6.611793839130542e-06, "loss": 0.2887, "num_input_tokens_seen": 5452432, "step": 60550 }, { "epoch": 15.736746361746361, "grad_norm": 0.09688162803649902, "learning_rate": 6.607953192620209e-06, "loss": 0.0323, "num_input_tokens_seen": 5452864, "step": 60555 }, { "epoch": 15.738045738045738, "grad_norm": 24.791776657104492, "learning_rate": 6.604113492030459e-06, "loss": 0.3132, "num_input_tokens_seen": 5453360, "step": 60560 }, { "epoch": 15.739345114345115, "grad_norm": 0.04379134625196457, "learning_rate": 6.600274737558779e-06, "loss": 0.3767, "num_input_tokens_seen": 5453808, "step": 60565 }, { "epoch": 15.74064449064449, "grad_norm": 10.837414741516113, "learning_rate": 6.5964369294026054e-06, "loss": 0.2029, "num_input_tokens_seen": 5454272, "step": 60570 }, { "epoch": 15.741943866943867, "grad_norm": 0.5058153867721558, "learning_rate": 6.5926000677593055e-06, "loss": 0.2818, "num_input_tokens_seen": 5454720, "step": 60575 }, { "epoch": 15.743243243243244, "grad_norm": 9.593855857849121, "learning_rate": 6.58876415282623e-06, "loss": 0.1771, "num_input_tokens_seen": 5455168, "step": 60580 }, { "epoch": 15.744542619542619, "grad_norm": 10.359880447387695, "learning_rate": 6.5849291848006454e-06, "loss": 0.1381, "num_input_tokens_seen": 5455568, "step": 60585 }, { "epoch": 15.745841995841996, "grad_norm": 35.825252532958984, "learning_rate": 6.581095163879808e-06, "loss": 0.2072, "num_input_tokens_seen": 5456016, "step": 60590 }, { "epoch": 15.747141372141373, "grad_norm": 0.555543839931488, "learning_rate": 6.577262090260885e-06, "loss": 0.0209, "num_input_tokens_seen": 5456432, "step": 60595 }, { "epoch": 15.748440748440748, "grad_norm": 0.9372357726097107, "learning_rate": 6.573429964141025e-06, "loss": 0.4554, "num_input_tokens_seen": 5456864, "step": 60600 }, { "epoch": 15.749740124740125, "grad_norm": 0.05519627034664154, "learning_rate": 6.569598785717318e-06, "loss": 0.0561, "num_input_tokens_seen": 5457296, "step": 60605 }, { "epoch": 15.7510395010395, "grad_norm": 0.007529068738222122, "learning_rate": 6.56576855518681e-06, "loss": 0.1262, "num_input_tokens_seen": 5457760, "step": 60610 }, { "epoch": 15.752338877338877, "grad_norm": 0.019422395154833794, "learning_rate": 6.561939272746484e-06, "loss": 0.1916, "num_input_tokens_seen": 5458224, "step": 60615 }, { "epoch": 15.753638253638254, "grad_norm": 9.823936462402344, "learning_rate": 6.558110938593293e-06, "loss": 0.2302, "num_input_tokens_seen": 5458656, "step": 60620 }, { "epoch": 15.75493762993763, "grad_norm": 1.1148000955581665, "learning_rate": 6.554283552924118e-06, "loss": 0.6017, "num_input_tokens_seen": 5459104, "step": 60625 }, { "epoch": 15.756237006237006, "grad_norm": 0.03264666721224785, "learning_rate": 6.550457115935815e-06, "loss": 0.1722, "num_input_tokens_seen": 5459616, "step": 60630 }, { "epoch": 15.757536382536383, "grad_norm": 0.12957864999771118, "learning_rate": 6.5466316278251894e-06, "loss": 0.105, "num_input_tokens_seen": 5460048, "step": 60635 }, { "epoch": 15.758835758835758, "grad_norm": 0.02210676670074463, "learning_rate": 6.5428070887889686e-06, "loss": 0.0573, "num_input_tokens_seen": 5460496, "step": 60640 }, { "epoch": 15.760135135135135, "grad_norm": 1.835422396659851, "learning_rate": 6.538983499023876e-06, "loss": 0.197, "num_input_tokens_seen": 5460928, "step": 60645 }, { "epoch": 15.761434511434512, "grad_norm": 0.02442365512251854, "learning_rate": 6.5351608587265415e-06, "loss": 0.0019, "num_input_tokens_seen": 5461376, "step": 60650 }, { "epoch": 15.762733887733887, "grad_norm": 0.001504388521425426, "learning_rate": 6.531339168093581e-06, "loss": 0.0518, "num_input_tokens_seen": 5461792, "step": 60655 }, { "epoch": 15.764033264033264, "grad_norm": 0.1628742665052414, "learning_rate": 6.527518427321539e-06, "loss": 0.0234, "num_input_tokens_seen": 5462208, "step": 60660 }, { "epoch": 15.765332640332641, "grad_norm": 0.1365910917520523, "learning_rate": 6.523698636606923e-06, "loss": 0.1827, "num_input_tokens_seen": 5462672, "step": 60665 }, { "epoch": 15.766632016632016, "grad_norm": 10.630881309509277, "learning_rate": 6.519879796146189e-06, "loss": 0.0514, "num_input_tokens_seen": 5463136, "step": 60670 }, { "epoch": 15.767931392931393, "grad_norm": 0.13509118556976318, "learning_rate": 6.5160619061357504e-06, "loss": 0.0595, "num_input_tokens_seen": 5463568, "step": 60675 }, { "epoch": 15.76923076923077, "grad_norm": 21.068626403808594, "learning_rate": 6.512244966771952e-06, "loss": 0.1215, "num_input_tokens_seen": 5464032, "step": 60680 }, { "epoch": 15.770530145530145, "grad_norm": 19.232746124267578, "learning_rate": 6.508428978251116e-06, "loss": 0.4947, "num_input_tokens_seen": 5464496, "step": 60685 }, { "epoch": 15.771829521829522, "grad_norm": 19.61745834350586, "learning_rate": 6.504613940769486e-06, "loss": 0.3005, "num_input_tokens_seen": 5464976, "step": 60690 }, { "epoch": 15.773128898128899, "grad_norm": 3.2490134239196777, "learning_rate": 6.500799854523293e-06, "loss": 0.1659, "num_input_tokens_seen": 5465456, "step": 60695 }, { "epoch": 15.774428274428274, "grad_norm": 0.06531275063753128, "learning_rate": 6.496986719708681e-06, "loss": 0.0112, "num_input_tokens_seen": 5465872, "step": 60700 }, { "epoch": 15.775727650727651, "grad_norm": 0.019784094765782356, "learning_rate": 6.493174536521768e-06, "loss": 0.6038, "num_input_tokens_seen": 5466320, "step": 60705 }, { "epoch": 15.777027027027026, "grad_norm": 7.849476337432861, "learning_rate": 6.48936330515863e-06, "loss": 0.1257, "num_input_tokens_seen": 5466736, "step": 60710 }, { "epoch": 15.778326403326403, "grad_norm": 0.5541486144065857, "learning_rate": 6.485553025815266e-06, "loss": 0.0064, "num_input_tokens_seen": 5467216, "step": 60715 }, { "epoch": 15.77962577962578, "grad_norm": 24.701204299926758, "learning_rate": 6.481743698687659e-06, "loss": 0.2134, "num_input_tokens_seen": 5467648, "step": 60720 }, { "epoch": 15.780925155925155, "grad_norm": 3.1164703369140625, "learning_rate": 6.477935323971707e-06, "loss": 0.4616, "num_input_tokens_seen": 5468080, "step": 60725 }, { "epoch": 15.782224532224532, "grad_norm": 0.996415913105011, "learning_rate": 6.4741279018632975e-06, "loss": 0.3035, "num_input_tokens_seen": 5468544, "step": 60730 }, { "epoch": 15.78352390852391, "grad_norm": 32.513851165771484, "learning_rate": 6.47032143255823e-06, "loss": 0.195, "num_input_tokens_seen": 5468976, "step": 60735 }, { "epoch": 15.784823284823284, "grad_norm": 0.04915459081530571, "learning_rate": 6.466515916252288e-06, "loss": 0.0472, "num_input_tokens_seen": 5469408, "step": 60740 }, { "epoch": 15.786122661122661, "grad_norm": 0.04175514727830887, "learning_rate": 6.46271135314119e-06, "loss": 0.2767, "num_input_tokens_seen": 5469856, "step": 60745 }, { "epoch": 15.787422037422038, "grad_norm": 23.78070831298828, "learning_rate": 6.4589077434206145e-06, "loss": 0.0909, "num_input_tokens_seen": 5470288, "step": 60750 }, { "epoch": 15.788721413721413, "grad_norm": 16.138843536376953, "learning_rate": 6.455105087286173e-06, "loss": 0.3099, "num_input_tokens_seen": 5470720, "step": 60755 }, { "epoch": 15.79002079002079, "grad_norm": 0.01677953638136387, "learning_rate": 6.451303384933455e-06, "loss": 0.0991, "num_input_tokens_seen": 5471184, "step": 60760 }, { "epoch": 15.791320166320165, "grad_norm": 0.3557049036026001, "learning_rate": 6.447502636557972e-06, "loss": 0.0472, "num_input_tokens_seen": 5471584, "step": 60765 }, { "epoch": 15.792619542619542, "grad_norm": 14.377896308898926, "learning_rate": 6.443702842355201e-06, "loss": 0.3414, "num_input_tokens_seen": 5472016, "step": 60770 }, { "epoch": 15.79391891891892, "grad_norm": 0.018237777054309845, "learning_rate": 6.439904002520572e-06, "loss": 0.0364, "num_input_tokens_seen": 5472528, "step": 60775 }, { "epoch": 15.795218295218294, "grad_norm": 15.77314567565918, "learning_rate": 6.436106117249463e-06, "loss": 0.0847, "num_input_tokens_seen": 5472992, "step": 60780 }, { "epoch": 15.796517671517671, "grad_norm": 32.48936080932617, "learning_rate": 6.4323091867372095e-06, "loss": 0.3298, "num_input_tokens_seen": 5473472, "step": 60785 }, { "epoch": 15.797817047817048, "grad_norm": 0.9219591617584229, "learning_rate": 6.42851321117908e-06, "loss": 0.0603, "num_input_tokens_seen": 5473952, "step": 60790 }, { "epoch": 15.799116424116423, "grad_norm": 0.23102958500385284, "learning_rate": 6.424718190770315e-06, "loss": 0.2234, "num_input_tokens_seen": 5474368, "step": 60795 }, { "epoch": 15.8004158004158, "grad_norm": 0.25855088233947754, "learning_rate": 6.4209241257060875e-06, "loss": 0.123, "num_input_tokens_seen": 5474768, "step": 60800 }, { "epoch": 15.801715176715177, "grad_norm": 0.050550173968076706, "learning_rate": 6.417131016181538e-06, "loss": 0.3482, "num_input_tokens_seen": 5475216, "step": 60805 }, { "epoch": 15.803014553014552, "grad_norm": 25.001922607421875, "learning_rate": 6.413338862391741e-06, "loss": 0.2091, "num_input_tokens_seen": 5475616, "step": 60810 }, { "epoch": 15.80431392931393, "grad_norm": 0.5249037146568298, "learning_rate": 6.4095476645317346e-06, "loss": 0.3753, "num_input_tokens_seen": 5476048, "step": 60815 }, { "epoch": 15.805613305613306, "grad_norm": 3.7582545280456543, "learning_rate": 6.405757422796502e-06, "loss": 0.1982, "num_input_tokens_seen": 5476496, "step": 60820 }, { "epoch": 15.806912681912682, "grad_norm": 3.3911499977111816, "learning_rate": 6.401968137380993e-06, "loss": 0.1753, "num_input_tokens_seen": 5476960, "step": 60825 }, { "epoch": 15.808212058212058, "grad_norm": 0.00827656127512455, "learning_rate": 6.398179808480078e-06, "loss": 0.2261, "num_input_tokens_seen": 5477392, "step": 60830 }, { "epoch": 15.809511434511435, "grad_norm": 25.887014389038086, "learning_rate": 6.394392436288593e-06, "loss": 0.3363, "num_input_tokens_seen": 5477856, "step": 60835 }, { "epoch": 15.81081081081081, "grad_norm": 43.35775375366211, "learning_rate": 6.390606021001342e-06, "loss": 0.378, "num_input_tokens_seen": 5478320, "step": 60840 }, { "epoch": 15.812110187110187, "grad_norm": 1.097551941871643, "learning_rate": 6.386820562813043e-06, "loss": 0.3512, "num_input_tokens_seen": 5478736, "step": 60845 }, { "epoch": 15.813409563409563, "grad_norm": 0.18229587376117706, "learning_rate": 6.383036061918399e-06, "loss": 0.1263, "num_input_tokens_seen": 5479168, "step": 60850 }, { "epoch": 15.81470893970894, "grad_norm": 4.276384353637695, "learning_rate": 6.379252518512047e-06, "loss": 0.2623, "num_input_tokens_seen": 5479632, "step": 60855 }, { "epoch": 15.816008316008316, "grad_norm": 7.0579423904418945, "learning_rate": 6.375469932788586e-06, "loss": 0.2011, "num_input_tokens_seen": 5480080, "step": 60860 }, { "epoch": 15.817307692307692, "grad_norm": 16.993383407592773, "learning_rate": 6.371688304942544e-06, "loss": 0.2483, "num_input_tokens_seen": 5480576, "step": 60865 }, { "epoch": 15.818607068607069, "grad_norm": 2.020273447036743, "learning_rate": 6.367907635168433e-06, "loss": 0.2404, "num_input_tokens_seen": 5481040, "step": 60870 }, { "epoch": 15.819906444906445, "grad_norm": 4.280598163604736, "learning_rate": 6.364127923660673e-06, "loss": 0.0525, "num_input_tokens_seen": 5481488, "step": 60875 }, { "epoch": 15.82120582120582, "grad_norm": 27.7834529876709, "learning_rate": 6.360349170613672e-06, "loss": 0.1589, "num_input_tokens_seen": 5481952, "step": 60880 }, { "epoch": 15.822505197505198, "grad_norm": 16.083942413330078, "learning_rate": 6.356571376221771e-06, "loss": 0.0781, "num_input_tokens_seen": 5482432, "step": 60885 }, { "epoch": 15.823804573804575, "grad_norm": 0.01623385399580002, "learning_rate": 6.352794540679274e-06, "loss": 0.1667, "num_input_tokens_seen": 5482880, "step": 60890 }, { "epoch": 15.82510395010395, "grad_norm": 32.80573654174805, "learning_rate": 6.349018664180425e-06, "loss": 0.2294, "num_input_tokens_seen": 5483344, "step": 60895 }, { "epoch": 15.826403326403327, "grad_norm": 0.14350156486034393, "learning_rate": 6.345243746919405e-06, "loss": 0.0202, "num_input_tokens_seen": 5483840, "step": 60900 }, { "epoch": 15.827702702702704, "grad_norm": 1.016421914100647, "learning_rate": 6.341469789090385e-06, "loss": 0.0044, "num_input_tokens_seen": 5484336, "step": 60905 }, { "epoch": 15.829002079002079, "grad_norm": 3.3368818759918213, "learning_rate": 6.3376967908874406e-06, "loss": 0.5615, "num_input_tokens_seen": 5484784, "step": 60910 }, { "epoch": 15.830301455301456, "grad_norm": 44.77924728393555, "learning_rate": 6.3339247525046305e-06, "loss": 0.1194, "num_input_tokens_seen": 5485248, "step": 60915 }, { "epoch": 15.83160083160083, "grad_norm": 0.016621826216578484, "learning_rate": 6.3301536741359575e-06, "loss": 0.6774, "num_input_tokens_seen": 5485728, "step": 60920 }, { "epoch": 15.832900207900208, "grad_norm": 1.9381859302520752, "learning_rate": 6.326383555975376e-06, "loss": 0.0709, "num_input_tokens_seen": 5486192, "step": 60925 }, { "epoch": 15.834199584199585, "grad_norm": 16.441770553588867, "learning_rate": 6.322614398216775e-06, "loss": 0.2718, "num_input_tokens_seen": 5486640, "step": 60930 }, { "epoch": 15.83549896049896, "grad_norm": 1.0762685537338257, "learning_rate": 6.318846201054018e-06, "loss": 0.2159, "num_input_tokens_seen": 5487104, "step": 60935 }, { "epoch": 15.836798336798337, "grad_norm": 16.6635799407959, "learning_rate": 6.315078964680893e-06, "loss": 0.3473, "num_input_tokens_seen": 5487568, "step": 60940 }, { "epoch": 15.838097713097714, "grad_norm": 14.360658645629883, "learning_rate": 6.311312689291166e-06, "loss": 0.1447, "num_input_tokens_seen": 5488016, "step": 60945 }, { "epoch": 15.839397089397089, "grad_norm": 26.914337158203125, "learning_rate": 6.3075473750785265e-06, "loss": 0.4063, "num_input_tokens_seen": 5488464, "step": 60950 }, { "epoch": 15.840696465696466, "grad_norm": 8.99326229095459, "learning_rate": 6.303783022236637e-06, "loss": 0.1675, "num_input_tokens_seen": 5488896, "step": 60955 }, { "epoch": 15.841995841995843, "grad_norm": 0.020906995981931686, "learning_rate": 6.300019630959109e-06, "loss": 0.3327, "num_input_tokens_seen": 5489328, "step": 60960 }, { "epoch": 15.843295218295218, "grad_norm": 0.3957289159297943, "learning_rate": 6.296257201439479e-06, "loss": 0.4821, "num_input_tokens_seen": 5489760, "step": 60965 }, { "epoch": 15.844594594594595, "grad_norm": 27.1728458404541, "learning_rate": 6.29249573387127e-06, "loss": 0.2778, "num_input_tokens_seen": 5490224, "step": 60970 }, { "epoch": 15.845893970893972, "grad_norm": 0.03244146704673767, "learning_rate": 6.288735228447923e-06, "loss": 0.179, "num_input_tokens_seen": 5490640, "step": 60975 }, { "epoch": 15.847193347193347, "grad_norm": 1.70183527469635, "learning_rate": 6.284975685362859e-06, "loss": 0.0079, "num_input_tokens_seen": 5491088, "step": 60980 }, { "epoch": 15.848492723492724, "grad_norm": 0.008271822705864906, "learning_rate": 6.28121710480942e-06, "loss": 0.0471, "num_input_tokens_seen": 5491584, "step": 60985 }, { "epoch": 15.8497920997921, "grad_norm": 0.46354910731315613, "learning_rate": 6.277459486980922e-06, "loss": 0.2728, "num_input_tokens_seen": 5492064, "step": 60990 }, { "epoch": 15.851091476091476, "grad_norm": 32.68458938598633, "learning_rate": 6.273702832070621e-06, "loss": 0.1715, "num_input_tokens_seen": 5492512, "step": 60995 }, { "epoch": 15.852390852390853, "grad_norm": 47.76652908325195, "learning_rate": 6.269947140271732e-06, "loss": 0.2081, "num_input_tokens_seen": 5492944, "step": 61000 }, { "epoch": 15.853690228690228, "grad_norm": 25.81533432006836, "learning_rate": 6.2661924117774025e-06, "loss": 0.1405, "num_input_tokens_seen": 5493408, "step": 61005 }, { "epoch": 15.854989604989605, "grad_norm": 29.339889526367188, "learning_rate": 6.262438646780752e-06, "loss": 0.1611, "num_input_tokens_seen": 5493872, "step": 61010 }, { "epoch": 15.856288981288982, "grad_norm": 0.03613622859120369, "learning_rate": 6.258685845474829e-06, "loss": 0.1351, "num_input_tokens_seen": 5494272, "step": 61015 }, { "epoch": 15.857588357588357, "grad_norm": 4.734645843505859, "learning_rate": 6.254934008052657e-06, "loss": 0.1455, "num_input_tokens_seen": 5494704, "step": 61020 }, { "epoch": 15.858887733887734, "grad_norm": 0.5259156823158264, "learning_rate": 6.251183134707184e-06, "loss": 0.0082, "num_input_tokens_seen": 5495152, "step": 61025 }, { "epoch": 15.86018711018711, "grad_norm": 0.495864599943161, "learning_rate": 6.247433225631324e-06, "loss": 0.1566, "num_input_tokens_seen": 5495600, "step": 61030 }, { "epoch": 15.861486486486486, "grad_norm": 0.20503324270248413, "learning_rate": 6.243684281017953e-06, "loss": 0.0877, "num_input_tokens_seen": 5496032, "step": 61035 }, { "epoch": 15.862785862785863, "grad_norm": 0.1095590814948082, "learning_rate": 6.2399363010598594e-06, "loss": 0.1372, "num_input_tokens_seen": 5496448, "step": 61040 }, { "epoch": 15.86408523908524, "grad_norm": 1.7602839469909668, "learning_rate": 6.236189285949825e-06, "loss": 0.003, "num_input_tokens_seen": 5496944, "step": 61045 }, { "epoch": 15.865384615384615, "grad_norm": 0.3399859368801117, "learning_rate": 6.2324432358805476e-06, "loss": 0.0497, "num_input_tokens_seen": 5497424, "step": 61050 }, { "epoch": 15.866683991683992, "grad_norm": 0.6785440444946289, "learning_rate": 6.228698151044704e-06, "loss": 0.0086, "num_input_tokens_seen": 5497872, "step": 61055 }, { "epoch": 15.867983367983367, "grad_norm": 49.3286247253418, "learning_rate": 6.224954031634891e-06, "loss": 0.3511, "num_input_tokens_seen": 5498320, "step": 61060 }, { "epoch": 15.869282744282744, "grad_norm": 0.1203090026974678, "learning_rate": 6.221210877843683e-06, "loss": 0.0081, "num_input_tokens_seen": 5498752, "step": 61065 }, { "epoch": 15.870582120582121, "grad_norm": 2.53914475440979, "learning_rate": 6.217468689863595e-06, "loss": 0.4165, "num_input_tokens_seen": 5499200, "step": 61070 }, { "epoch": 15.871881496881496, "grad_norm": 0.31293588876724243, "learning_rate": 6.213727467887093e-06, "loss": 0.1193, "num_input_tokens_seen": 5499616, "step": 61075 }, { "epoch": 15.873180873180873, "grad_norm": 0.09609495103359222, "learning_rate": 6.209987212106583e-06, "loss": 0.3027, "num_input_tokens_seen": 5500048, "step": 61080 }, { "epoch": 15.87448024948025, "grad_norm": 0.3948988914489746, "learning_rate": 6.206247922714439e-06, "loss": 0.2803, "num_input_tokens_seen": 5500512, "step": 61085 }, { "epoch": 15.875779625779625, "grad_norm": 34.29566955566406, "learning_rate": 6.202509599902973e-06, "loss": 0.1877, "num_input_tokens_seen": 5500960, "step": 61090 }, { "epoch": 15.877079002079002, "grad_norm": 34.45696258544922, "learning_rate": 6.1987722438644395e-06, "loss": 0.163, "num_input_tokens_seen": 5501408, "step": 61095 }, { "epoch": 15.878378378378379, "grad_norm": 26.722900390625, "learning_rate": 6.195035854791068e-06, "loss": 0.4732, "num_input_tokens_seen": 5501872, "step": 61100 }, { "epoch": 15.879677754677754, "grad_norm": 0.3470434248447418, "learning_rate": 6.191300432875017e-06, "loss": 0.0437, "num_input_tokens_seen": 5502320, "step": 61105 }, { "epoch": 15.880977130977131, "grad_norm": 0.08582452684640884, "learning_rate": 6.187565978308416e-06, "loss": 0.0796, "num_input_tokens_seen": 5502768, "step": 61110 }, { "epoch": 15.882276507276508, "grad_norm": 27.46121597290039, "learning_rate": 6.1838324912833116e-06, "loss": 0.0548, "num_input_tokens_seen": 5503216, "step": 61115 }, { "epoch": 15.883575883575883, "grad_norm": 16.207975387573242, "learning_rate": 6.18009997199174e-06, "loss": 0.2152, "num_input_tokens_seen": 5503664, "step": 61120 }, { "epoch": 15.88487525987526, "grad_norm": 4.237418174743652, "learning_rate": 6.1763684206256525e-06, "loss": 0.3157, "num_input_tokens_seen": 5504096, "step": 61125 }, { "epoch": 15.886174636174637, "grad_norm": 26.584522247314453, "learning_rate": 6.172637837376974e-06, "loss": 0.2509, "num_input_tokens_seen": 5504544, "step": 61130 }, { "epoch": 15.887474012474012, "grad_norm": 0.9342781901359558, "learning_rate": 6.1689082224375695e-06, "loss": 0.1476, "num_input_tokens_seen": 5504992, "step": 61135 }, { "epoch": 15.888773388773389, "grad_norm": 35.072505950927734, "learning_rate": 6.165179575999267e-06, "loss": 0.4312, "num_input_tokens_seen": 5505440, "step": 61140 }, { "epoch": 15.890072765072766, "grad_norm": 19.63032341003418, "learning_rate": 6.161451898253814e-06, "loss": 0.1767, "num_input_tokens_seen": 5505904, "step": 61145 }, { "epoch": 15.891372141372141, "grad_norm": 23.862459182739258, "learning_rate": 6.1577251893929514e-06, "loss": 0.0598, "num_input_tokens_seen": 5506336, "step": 61150 }, { "epoch": 15.892671517671518, "grad_norm": 38.20109176635742, "learning_rate": 6.1539994496083356e-06, "loss": 0.2181, "num_input_tokens_seen": 5506784, "step": 61155 }, { "epoch": 15.893970893970893, "grad_norm": 21.581907272338867, "learning_rate": 6.150274679091577e-06, "loss": 0.1275, "num_input_tokens_seen": 5507264, "step": 61160 }, { "epoch": 15.89527027027027, "grad_norm": 3.4278998374938965, "learning_rate": 6.1465508780342545e-06, "loss": 0.1502, "num_input_tokens_seen": 5507680, "step": 61165 }, { "epoch": 15.896569646569647, "grad_norm": 0.1665598601102829, "learning_rate": 6.142828046627883e-06, "loss": 0.2048, "num_input_tokens_seen": 5508096, "step": 61170 }, { "epoch": 15.897869022869022, "grad_norm": 22.945938110351562, "learning_rate": 6.139106185063942e-06, "loss": 0.0553, "num_input_tokens_seen": 5508544, "step": 61175 }, { "epoch": 15.8991683991684, "grad_norm": 0.0527908019721508, "learning_rate": 6.135385293533832e-06, "loss": 0.072, "num_input_tokens_seen": 5508992, "step": 61180 }, { "epoch": 15.900467775467776, "grad_norm": 1.5532087087631226, "learning_rate": 6.13166537222894e-06, "loss": 0.1321, "num_input_tokens_seen": 5509424, "step": 61185 }, { "epoch": 15.901767151767151, "grad_norm": 29.36910057067871, "learning_rate": 6.127946421340569e-06, "loss": 0.2455, "num_input_tokens_seen": 5509872, "step": 61190 }, { "epoch": 15.903066528066528, "grad_norm": 0.5058948993682861, "learning_rate": 6.124228441060004e-06, "loss": 0.1078, "num_input_tokens_seen": 5510288, "step": 61195 }, { "epoch": 15.904365904365905, "grad_norm": 6.028045654296875, "learning_rate": 6.120511431578449e-06, "loss": 0.0241, "num_input_tokens_seen": 5510752, "step": 61200 }, { "epoch": 15.90566528066528, "grad_norm": 0.06324297934770584, "learning_rate": 6.116795393087079e-06, "loss": 0.255, "num_input_tokens_seen": 5511168, "step": 61205 }, { "epoch": 15.906964656964657, "grad_norm": 0.002677696291357279, "learning_rate": 6.113080325777018e-06, "loss": 0.0007, "num_input_tokens_seen": 5511632, "step": 61210 }, { "epoch": 15.908264033264032, "grad_norm": 10.434296607971191, "learning_rate": 6.1093662298393365e-06, "loss": 0.0271, "num_input_tokens_seen": 5512064, "step": 61215 }, { "epoch": 15.90956340956341, "grad_norm": 5.877240180969238, "learning_rate": 6.105653105465051e-06, "loss": 0.1721, "num_input_tokens_seen": 5512496, "step": 61220 }, { "epoch": 15.910862785862786, "grad_norm": 0.22931751608848572, "learning_rate": 6.101940952845122e-06, "loss": 0.0028, "num_input_tokens_seen": 5512912, "step": 61225 }, { "epoch": 15.912162162162161, "grad_norm": 2.2295234203338623, "learning_rate": 6.098229772170486e-06, "loss": 0.0403, "num_input_tokens_seen": 5513360, "step": 61230 }, { "epoch": 15.913461538461538, "grad_norm": 24.479040145874023, "learning_rate": 6.094519563631995e-06, "loss": 0.5086, "num_input_tokens_seen": 5513824, "step": 61235 }, { "epoch": 15.914760914760915, "grad_norm": 33.75544357299805, "learning_rate": 6.090810327420479e-06, "loss": 0.2181, "num_input_tokens_seen": 5514272, "step": 61240 }, { "epoch": 15.91606029106029, "grad_norm": 0.291789710521698, "learning_rate": 6.087102063726704e-06, "loss": 0.0035, "num_input_tokens_seen": 5514720, "step": 61245 }, { "epoch": 15.917359667359667, "grad_norm": 34.94759750366211, "learning_rate": 6.0833947727414e-06, "loss": 0.32, "num_input_tokens_seen": 5515184, "step": 61250 }, { "epoch": 15.918659043659044, "grad_norm": 0.4337502717971802, "learning_rate": 6.079688454655219e-06, "loss": 0.0012, "num_input_tokens_seen": 5515616, "step": 61255 }, { "epoch": 15.91995841995842, "grad_norm": 0.6911080479621887, "learning_rate": 6.075983109658798e-06, "loss": 0.0156, "num_input_tokens_seen": 5516064, "step": 61260 }, { "epoch": 15.921257796257796, "grad_norm": 15.323532104492188, "learning_rate": 6.072278737942691e-06, "loss": 0.2773, "num_input_tokens_seen": 5516512, "step": 61265 }, { "epoch": 15.922557172557173, "grad_norm": 0.02680499665439129, "learning_rate": 6.06857533969743e-06, "loss": 0.1713, "num_input_tokens_seen": 5516976, "step": 61270 }, { "epoch": 15.923856548856548, "grad_norm": 21.034387588500977, "learning_rate": 6.0648729151134705e-06, "loss": 0.48, "num_input_tokens_seen": 5517392, "step": 61275 }, { "epoch": 15.925155925155925, "grad_norm": 0.3361453413963318, "learning_rate": 6.0611714643812406e-06, "loss": 0.116, "num_input_tokens_seen": 5517824, "step": 61280 }, { "epoch": 15.926455301455302, "grad_norm": 4.270551681518555, "learning_rate": 6.057470987691116e-06, "loss": 0.0353, "num_input_tokens_seen": 5518288, "step": 61285 }, { "epoch": 15.927754677754677, "grad_norm": 0.11448266357183456, "learning_rate": 6.0537714852334e-06, "loss": 0.0556, "num_input_tokens_seen": 5518736, "step": 61290 }, { "epoch": 15.929054054054054, "grad_norm": 0.019175788387656212, "learning_rate": 6.05007295719838e-06, "loss": 0.5915, "num_input_tokens_seen": 5519184, "step": 61295 }, { "epoch": 15.93035343035343, "grad_norm": 22.36864471435547, "learning_rate": 6.046375403776256e-06, "loss": 0.1303, "num_input_tokens_seen": 5519680, "step": 61300 }, { "epoch": 15.931652806652806, "grad_norm": 0.12118156999349594, "learning_rate": 6.0426788251572105e-06, "loss": 0.2512, "num_input_tokens_seen": 5520112, "step": 61305 }, { "epoch": 15.932952182952183, "grad_norm": 0.040105175226926804, "learning_rate": 6.038983221531352e-06, "loss": 0.0418, "num_input_tokens_seen": 5520528, "step": 61310 }, { "epoch": 15.934251559251559, "grad_norm": 3.0393624305725098, "learning_rate": 6.0352885930887556e-06, "loss": 0.3272, "num_input_tokens_seen": 5520944, "step": 61315 }, { "epoch": 15.935550935550935, "grad_norm": 29.464576721191406, "learning_rate": 6.031594940019436e-06, "loss": 0.1446, "num_input_tokens_seen": 5521392, "step": 61320 }, { "epoch": 15.936850311850312, "grad_norm": 19.55331802368164, "learning_rate": 6.027902262513372e-06, "loss": 0.3093, "num_input_tokens_seen": 5521840, "step": 61325 }, { "epoch": 15.938149688149688, "grad_norm": 10.052051544189453, "learning_rate": 6.024210560760463e-06, "loss": 0.1018, "num_input_tokens_seen": 5522304, "step": 61330 }, { "epoch": 15.939449064449065, "grad_norm": 0.025017032399773598, "learning_rate": 6.0205198349505945e-06, "loss": 0.2205, "num_input_tokens_seen": 5522736, "step": 61335 }, { "epoch": 15.940748440748441, "grad_norm": 8.489240646362305, "learning_rate": 6.01683008527357e-06, "loss": 0.1567, "num_input_tokens_seen": 5523184, "step": 61340 }, { "epoch": 15.942047817047817, "grad_norm": 4.651142597198486, "learning_rate": 6.0131413119191685e-06, "loss": 0.4664, "num_input_tokens_seen": 5523648, "step": 61345 }, { "epoch": 15.943347193347194, "grad_norm": 0.03300207853317261, "learning_rate": 6.009453515077096e-06, "loss": 0.1767, "num_input_tokens_seen": 5524048, "step": 61350 }, { "epoch": 15.94464656964657, "grad_norm": 2.519416570663452, "learning_rate": 6.005766694937026e-06, "loss": 0.2421, "num_input_tokens_seen": 5524480, "step": 61355 }, { "epoch": 15.945945945945946, "grad_norm": 0.02966753952205181, "learning_rate": 6.00208085168858e-06, "loss": 0.1481, "num_input_tokens_seen": 5524912, "step": 61360 }, { "epoch": 15.947245322245323, "grad_norm": 0.23201526701450348, "learning_rate": 5.99839598552131e-06, "loss": 0.2174, "num_input_tokens_seen": 5525376, "step": 61365 }, { "epoch": 15.948544698544698, "grad_norm": 27.82019805908203, "learning_rate": 5.994712096624752e-06, "loss": 0.4563, "num_input_tokens_seen": 5525808, "step": 61370 }, { "epoch": 15.949844074844075, "grad_norm": 0.1438213586807251, "learning_rate": 5.9910291851883515e-06, "loss": 0.1669, "num_input_tokens_seen": 5526240, "step": 61375 }, { "epoch": 15.951143451143452, "grad_norm": 0.010060598142445087, "learning_rate": 5.9873472514015316e-06, "loss": 0.0315, "num_input_tokens_seen": 5526736, "step": 61380 }, { "epoch": 15.952442827442827, "grad_norm": 12.713530540466309, "learning_rate": 5.9836662954536634e-06, "loss": 0.3249, "num_input_tokens_seen": 5527216, "step": 61385 }, { "epoch": 15.953742203742204, "grad_norm": 1.65645432472229, "learning_rate": 5.979986317534064e-06, "loss": 0.5066, "num_input_tokens_seen": 5527664, "step": 61390 }, { "epoch": 15.95504158004158, "grad_norm": 32.298255920410156, "learning_rate": 5.976307317831984e-06, "loss": 0.3528, "num_input_tokens_seen": 5528144, "step": 61395 }, { "epoch": 15.956340956340956, "grad_norm": 0.5584366917610168, "learning_rate": 5.972629296536655e-06, "loss": 0.1066, "num_input_tokens_seen": 5528624, "step": 61400 }, { "epoch": 15.957640332640333, "grad_norm": 2.7447168827056885, "learning_rate": 5.968952253837224e-06, "loss": 0.0228, "num_input_tokens_seen": 5529088, "step": 61405 }, { "epoch": 15.95893970893971, "grad_norm": 28.53691864013672, "learning_rate": 5.96527618992282e-06, "loss": 0.0443, "num_input_tokens_seen": 5529536, "step": 61410 }, { "epoch": 15.960239085239085, "grad_norm": 0.06837685406208038, "learning_rate": 5.961601104982495e-06, "loss": 0.003, "num_input_tokens_seen": 5530000, "step": 61415 }, { "epoch": 15.961538461538462, "grad_norm": 0.5815280079841614, "learning_rate": 5.957926999205265e-06, "loss": 0.0028, "num_input_tokens_seen": 5530448, "step": 61420 }, { "epoch": 15.962837837837839, "grad_norm": 0.04541000351309776, "learning_rate": 5.954253872780102e-06, "loss": 0.4672, "num_input_tokens_seen": 5530912, "step": 61425 }, { "epoch": 15.964137214137214, "grad_norm": 2.2575430870056152, "learning_rate": 5.950581725895904e-06, "loss": 0.081, "num_input_tokens_seen": 5531392, "step": 61430 }, { "epoch": 15.96543659043659, "grad_norm": 24.290578842163086, "learning_rate": 5.946910558741548e-06, "loss": 0.1934, "num_input_tokens_seen": 5531904, "step": 61435 }, { "epoch": 15.966735966735968, "grad_norm": 0.013157200068235397, "learning_rate": 5.943240371505829e-06, "loss": 0.4011, "num_input_tokens_seen": 5532336, "step": 61440 }, { "epoch": 15.968035343035343, "grad_norm": 0.005159643478691578, "learning_rate": 5.939571164377525e-06, "loss": 0.0333, "num_input_tokens_seen": 5532832, "step": 61445 }, { "epoch": 15.96933471933472, "grad_norm": 0.2604715824127197, "learning_rate": 5.935902937545332e-06, "loss": 0.1428, "num_input_tokens_seen": 5533264, "step": 61450 }, { "epoch": 15.970634095634095, "grad_norm": 0.4383471608161926, "learning_rate": 5.932235691197918e-06, "loss": 0.0814, "num_input_tokens_seen": 5533712, "step": 61455 }, { "epoch": 15.971933471933472, "grad_norm": 31.09187126159668, "learning_rate": 5.928569425523891e-06, "loss": 0.2925, "num_input_tokens_seen": 5534144, "step": 61460 }, { "epoch": 15.973232848232849, "grad_norm": 23.948423385620117, "learning_rate": 5.924904140711818e-06, "loss": 0.0375, "num_input_tokens_seen": 5534576, "step": 61465 }, { "epoch": 15.974532224532224, "grad_norm": 0.7877588272094727, "learning_rate": 5.921239836950196e-06, "loss": 0.2036, "num_input_tokens_seen": 5535024, "step": 61470 }, { "epoch": 15.9758316008316, "grad_norm": 26.153024673461914, "learning_rate": 5.9175765144274976e-06, "loss": 0.0862, "num_input_tokens_seen": 5535456, "step": 61475 }, { "epoch": 15.977130977130978, "grad_norm": 0.23801115155220032, "learning_rate": 5.9139141733321215e-06, "loss": 0.4148, "num_input_tokens_seen": 5535952, "step": 61480 }, { "epoch": 15.978430353430353, "grad_norm": 3.0067334175109863, "learning_rate": 5.910252813852421e-06, "loss": 0.6204, "num_input_tokens_seen": 5536448, "step": 61485 }, { "epoch": 15.97972972972973, "grad_norm": 0.448496550321579, "learning_rate": 5.9065924361767084e-06, "loss": 0.0959, "num_input_tokens_seen": 5536880, "step": 61490 }, { "epoch": 15.981029106029107, "grad_norm": 40.323184967041016, "learning_rate": 5.902933040493241e-06, "loss": 0.4261, "num_input_tokens_seen": 5537328, "step": 61495 }, { "epoch": 15.982328482328482, "grad_norm": 21.29443359375, "learning_rate": 5.899274626990234e-06, "loss": 0.6419, "num_input_tokens_seen": 5537744, "step": 61500 }, { "epoch": 15.983627858627859, "grad_norm": 0.2105310559272766, "learning_rate": 5.8956171958558266e-06, "loss": 0.0026, "num_input_tokens_seen": 5538208, "step": 61505 }, { "epoch": 15.984927234927234, "grad_norm": 11.004325866699219, "learning_rate": 5.891960747278136e-06, "loss": 0.1278, "num_input_tokens_seen": 5538672, "step": 61510 }, { "epoch": 15.986226611226611, "grad_norm": 26.884111404418945, "learning_rate": 5.888305281445208e-06, "loss": 0.5197, "num_input_tokens_seen": 5539088, "step": 61515 }, { "epoch": 15.987525987525988, "grad_norm": 3.5515105724334717, "learning_rate": 5.8846507985450574e-06, "loss": 0.2758, "num_input_tokens_seen": 5539504, "step": 61520 }, { "epoch": 15.988825363825363, "grad_norm": 0.7237357497215271, "learning_rate": 5.880997298765628e-06, "loss": 0.1798, "num_input_tokens_seen": 5539984, "step": 61525 }, { "epoch": 15.99012474012474, "grad_norm": 0.06823675334453583, "learning_rate": 5.877344782294822e-06, "loss": 0.2156, "num_input_tokens_seen": 5540448, "step": 61530 }, { "epoch": 15.991424116424117, "grad_norm": 0.6948089599609375, "learning_rate": 5.8736932493205e-06, "loss": 0.0133, "num_input_tokens_seen": 5540864, "step": 61535 }, { "epoch": 15.992723492723492, "grad_norm": 0.013535944744944572, "learning_rate": 5.870042700030464e-06, "loss": 0.1135, "num_input_tokens_seen": 5541296, "step": 61540 }, { "epoch": 15.994022869022869, "grad_norm": 1.795771598815918, "learning_rate": 5.866393134612463e-06, "loss": 0.0742, "num_input_tokens_seen": 5541760, "step": 61545 }, { "epoch": 15.995322245322246, "grad_norm": 0.3616703152656555, "learning_rate": 5.862744553254188e-06, "loss": 0.0814, "num_input_tokens_seen": 5542256, "step": 61550 }, { "epoch": 15.996621621621621, "grad_norm": 31.239696502685547, "learning_rate": 5.859096956143306e-06, "loss": 0.1648, "num_input_tokens_seen": 5542688, "step": 61555 }, { "epoch": 15.997920997920998, "grad_norm": 0.8797478079795837, "learning_rate": 5.855450343467397e-06, "loss": 0.207, "num_input_tokens_seen": 5543120, "step": 61560 }, { "epoch": 15.999220374220375, "grad_norm": 0.7011630535125732, "learning_rate": 5.8518047154140245e-06, "loss": 0.0483, "num_input_tokens_seen": 5543648, "step": 61565 }, { "epoch": 16.0, "eval_loss": 0.5153894424438477, "eval_runtime": 13.1904, "eval_samples_per_second": 64.896, "eval_steps_per_second": 32.448, "num_input_tokens_seen": 5543848, "step": 61568 }, { "epoch": 16.00051975051975, "grad_norm": 11.10312271118164, "learning_rate": 5.848160072170681e-06, "loss": 0.0529, "num_input_tokens_seen": 5544056, "step": 61570 }, { "epoch": 16.001819126819125, "grad_norm": 3.256416082382202, "learning_rate": 5.844516413924822e-06, "loss": 0.0845, "num_input_tokens_seen": 5544504, "step": 61575 }, { "epoch": 16.003118503118504, "grad_norm": 12.533747673034668, "learning_rate": 5.840873740863828e-06, "loss": 0.0725, "num_input_tokens_seen": 5544936, "step": 61580 }, { "epoch": 16.00441787941788, "grad_norm": 0.0660879909992218, "learning_rate": 5.8372320531750655e-06, "loss": 0.1973, "num_input_tokens_seen": 5545368, "step": 61585 }, { "epoch": 16.005717255717254, "grad_norm": 0.30764177441596985, "learning_rate": 5.833591351045811e-06, "loss": 0.0468, "num_input_tokens_seen": 5545848, "step": 61590 }, { "epoch": 16.007016632016633, "grad_norm": 20.263254165649414, "learning_rate": 5.829951634663325e-06, "loss": 0.0644, "num_input_tokens_seen": 5546264, "step": 61595 }, { "epoch": 16.008316008316008, "grad_norm": 23.67718505859375, "learning_rate": 5.826312904214781e-06, "loss": 0.3802, "num_input_tokens_seen": 5546728, "step": 61600 }, { "epoch": 16.009615384615383, "grad_norm": 0.5437511801719666, "learning_rate": 5.82267515988735e-06, "loss": 0.0114, "num_input_tokens_seen": 5547256, "step": 61605 }, { "epoch": 16.010914760914762, "grad_norm": 21.813600540161133, "learning_rate": 5.8190384018681075e-06, "loss": 0.1976, "num_input_tokens_seen": 5547720, "step": 61610 }, { "epoch": 16.012214137214137, "grad_norm": 0.8136678338050842, "learning_rate": 5.815402630344094e-06, "loss": 0.3996, "num_input_tokens_seen": 5548200, "step": 61615 }, { "epoch": 16.013513513513512, "grad_norm": 14.432491302490234, "learning_rate": 5.811767845502311e-06, "loss": 0.0358, "num_input_tokens_seen": 5548680, "step": 61620 }, { "epoch": 16.01481288981289, "grad_norm": 3.5717356204986572, "learning_rate": 5.808134047529687e-06, "loss": 0.0799, "num_input_tokens_seen": 5549128, "step": 61625 }, { "epoch": 16.016112266112266, "grad_norm": 7.617029666900635, "learning_rate": 5.804501236613116e-06, "loss": 0.0412, "num_input_tokens_seen": 5549608, "step": 61630 }, { "epoch": 16.01741164241164, "grad_norm": 0.060597974807024, "learning_rate": 5.8008694129394385e-06, "loss": 0.0843, "num_input_tokens_seen": 5550072, "step": 61635 }, { "epoch": 16.01871101871102, "grad_norm": 18.960248947143555, "learning_rate": 5.797238576695452e-06, "loss": 0.2191, "num_input_tokens_seen": 5550536, "step": 61640 }, { "epoch": 16.020010395010395, "grad_norm": 23.844863891601562, "learning_rate": 5.7936087280678755e-06, "loss": 0.065, "num_input_tokens_seen": 5550952, "step": 61645 }, { "epoch": 16.02130977130977, "grad_norm": 0.7924285531044006, "learning_rate": 5.789979867243414e-06, "loss": 0.0426, "num_input_tokens_seen": 5551384, "step": 61650 }, { "epoch": 16.02260914760915, "grad_norm": 3.8409361839294434, "learning_rate": 5.786351994408684e-06, "loss": 0.0086, "num_input_tokens_seen": 5551896, "step": 61655 }, { "epoch": 16.023908523908524, "grad_norm": 0.6118131279945374, "learning_rate": 5.782725109750289e-06, "loss": 0.0033, "num_input_tokens_seen": 5552344, "step": 61660 }, { "epoch": 16.0252079002079, "grad_norm": 0.511377215385437, "learning_rate": 5.77909921345475e-06, "loss": 0.0346, "num_input_tokens_seen": 5552792, "step": 61665 }, { "epoch": 16.026507276507278, "grad_norm": 0.30028247833251953, "learning_rate": 5.775474305708553e-06, "loss": 0.0014, "num_input_tokens_seen": 5553272, "step": 61670 }, { "epoch": 16.027806652806653, "grad_norm": 6.434573650360107, "learning_rate": 5.771850386698138e-06, "loss": 0.1733, "num_input_tokens_seen": 5553752, "step": 61675 }, { "epoch": 16.02910602910603, "grad_norm": 0.6991629004478455, "learning_rate": 5.768227456609879e-06, "loss": 0.0158, "num_input_tokens_seen": 5554184, "step": 61680 }, { "epoch": 16.030405405405407, "grad_norm": 31.492156982421875, "learning_rate": 5.764605515630112e-06, "loss": 0.2692, "num_input_tokens_seen": 5554616, "step": 61685 }, { "epoch": 16.031704781704782, "grad_norm": 0.08502174913883209, "learning_rate": 5.760984563945107e-06, "loss": 0.0946, "num_input_tokens_seen": 5555048, "step": 61690 }, { "epoch": 16.033004158004157, "grad_norm": 12.544625282287598, "learning_rate": 5.757364601741108e-06, "loss": 0.0399, "num_input_tokens_seen": 5555496, "step": 61695 }, { "epoch": 16.034303534303536, "grad_norm": 8.071928977966309, "learning_rate": 5.753745629204277e-06, "loss": 0.1247, "num_input_tokens_seen": 5555912, "step": 61700 }, { "epoch": 16.03560291060291, "grad_norm": 3.8204457759857178, "learning_rate": 5.750127646520747e-06, "loss": 0.3687, "num_input_tokens_seen": 5556360, "step": 61705 }, { "epoch": 16.036902286902286, "grad_norm": 0.08638967573642731, "learning_rate": 5.7465106538766e-06, "loss": 0.1106, "num_input_tokens_seen": 5556824, "step": 61710 }, { "epoch": 16.03820166320166, "grad_norm": 22.083656311035156, "learning_rate": 5.742894651457864e-06, "loss": 0.1496, "num_input_tokens_seen": 5557256, "step": 61715 }, { "epoch": 16.03950103950104, "grad_norm": 0.038922347128391266, "learning_rate": 5.739279639450501e-06, "loss": 0.0013, "num_input_tokens_seen": 5557704, "step": 61720 }, { "epoch": 16.040800415800415, "grad_norm": 2.595140218734741, "learning_rate": 5.735665618040445e-06, "loss": 0.0648, "num_input_tokens_seen": 5558136, "step": 61725 }, { "epoch": 16.04209979209979, "grad_norm": 0.302635133266449, "learning_rate": 5.732052587413561e-06, "loss": 0.0396, "num_input_tokens_seen": 5558568, "step": 61730 }, { "epoch": 16.04339916839917, "grad_norm": 23.111125946044922, "learning_rate": 5.728440547755679e-06, "loss": 0.0882, "num_input_tokens_seen": 5559000, "step": 61735 }, { "epoch": 16.044698544698544, "grad_norm": 0.05340702459216118, "learning_rate": 5.72482949925256e-06, "loss": 0.0542, "num_input_tokens_seen": 5559512, "step": 61740 }, { "epoch": 16.04599792099792, "grad_norm": 5.4280548095703125, "learning_rate": 5.721219442089926e-06, "loss": 0.2196, "num_input_tokens_seen": 5559976, "step": 61745 }, { "epoch": 16.0472972972973, "grad_norm": 21.147838592529297, "learning_rate": 5.717610376453455e-06, "loss": 0.2915, "num_input_tokens_seen": 5560376, "step": 61750 }, { "epoch": 16.048596673596673, "grad_norm": 26.102325439453125, "learning_rate": 5.714002302528751e-06, "loss": 0.1446, "num_input_tokens_seen": 5560840, "step": 61755 }, { "epoch": 16.04989604989605, "grad_norm": 0.6012158393859863, "learning_rate": 5.7103952205013965e-06, "loss": 0.1512, "num_input_tokens_seen": 5561272, "step": 61760 }, { "epoch": 16.051195426195427, "grad_norm": 31.855432510375977, "learning_rate": 5.706789130556889e-06, "loss": 0.3052, "num_input_tokens_seen": 5561688, "step": 61765 }, { "epoch": 16.052494802494802, "grad_norm": 0.17990192770957947, "learning_rate": 5.70318403288071e-06, "loss": 0.2178, "num_input_tokens_seen": 5562152, "step": 61770 }, { "epoch": 16.053794178794178, "grad_norm": 55.40945816040039, "learning_rate": 5.699579927658258e-06, "loss": 0.4226, "num_input_tokens_seen": 5562600, "step": 61775 }, { "epoch": 16.055093555093556, "grad_norm": 0.03345254436135292, "learning_rate": 5.695976815074905e-06, "loss": 0.3369, "num_input_tokens_seen": 5563048, "step": 61780 }, { "epoch": 16.05639293139293, "grad_norm": 0.3335062563419342, "learning_rate": 5.69237469531596e-06, "loss": 0.0256, "num_input_tokens_seen": 5563496, "step": 61785 }, { "epoch": 16.057692307692307, "grad_norm": 0.03862197324633598, "learning_rate": 5.688773568566691e-06, "loss": 0.0064, "num_input_tokens_seen": 5563944, "step": 61790 }, { "epoch": 16.058991683991685, "grad_norm": 0.5715624094009399, "learning_rate": 5.685173435012292e-06, "loss": 0.2657, "num_input_tokens_seen": 5564424, "step": 61795 }, { "epoch": 16.06029106029106, "grad_norm": 0.12530921399593353, "learning_rate": 5.6815742948379384e-06, "loss": 0.0026, "num_input_tokens_seen": 5564872, "step": 61800 }, { "epoch": 16.061590436590436, "grad_norm": 0.019535748288035393, "learning_rate": 5.677976148228728e-06, "loss": 0.0614, "num_input_tokens_seen": 5565320, "step": 61805 }, { "epoch": 16.062889812889814, "grad_norm": 35.986663818359375, "learning_rate": 5.674378995369712e-06, "loss": 0.4103, "num_input_tokens_seen": 5565768, "step": 61810 }, { "epoch": 16.06418918918919, "grad_norm": 23.592796325683594, "learning_rate": 5.670782836445901e-06, "loss": 0.2209, "num_input_tokens_seen": 5566232, "step": 61815 }, { "epoch": 16.065488565488565, "grad_norm": 0.01663343608379364, "learning_rate": 5.667187671642246e-06, "loss": 0.0788, "num_input_tokens_seen": 5566664, "step": 61820 }, { "epoch": 16.066787941787943, "grad_norm": 0.01971890963613987, "learning_rate": 5.663593501143663e-06, "loss": 0.1011, "num_input_tokens_seen": 5567080, "step": 61825 }, { "epoch": 16.06808731808732, "grad_norm": 1.0469359159469604, "learning_rate": 5.660000325134987e-06, "loss": 0.0026, "num_input_tokens_seen": 5567560, "step": 61830 }, { "epoch": 16.069386694386694, "grad_norm": 35.50575637817383, "learning_rate": 5.656408143801028e-06, "loss": 0.1381, "num_input_tokens_seen": 5568024, "step": 61835 }, { "epoch": 16.070686070686072, "grad_norm": 50.958988189697266, "learning_rate": 5.6528169573265286e-06, "loss": 0.2278, "num_input_tokens_seen": 5568472, "step": 61840 }, { "epoch": 16.071985446985448, "grad_norm": 0.027247264981269836, "learning_rate": 5.649226765896199e-06, "loss": 0.2744, "num_input_tokens_seen": 5568920, "step": 61845 }, { "epoch": 16.073284823284823, "grad_norm": 7.5139312744140625, "learning_rate": 5.645637569694662e-06, "loss": 0.0367, "num_input_tokens_seen": 5569400, "step": 61850 }, { "epoch": 16.074584199584198, "grad_norm": 4.54836893081665, "learning_rate": 5.642049368906544e-06, "loss": 0.3891, "num_input_tokens_seen": 5569864, "step": 61855 }, { "epoch": 16.075883575883577, "grad_norm": 0.0031448735389858484, "learning_rate": 5.638462163716366e-06, "loss": 0.0055, "num_input_tokens_seen": 5570296, "step": 61860 }, { "epoch": 16.07718295218295, "grad_norm": 3.183291435241699, "learning_rate": 5.634875954308638e-06, "loss": 0.0483, "num_input_tokens_seen": 5570728, "step": 61865 }, { "epoch": 16.078482328482327, "grad_norm": 0.024656992405653, "learning_rate": 5.631290740867795e-06, "loss": 0.2142, "num_input_tokens_seen": 5571192, "step": 61870 }, { "epoch": 16.079781704781706, "grad_norm": 0.023415368050336838, "learning_rate": 5.627706523578219e-06, "loss": 0.0361, "num_input_tokens_seen": 5571640, "step": 61875 }, { "epoch": 16.08108108108108, "grad_norm": 9.129988670349121, "learning_rate": 5.624123302624259e-06, "loss": 0.0209, "num_input_tokens_seen": 5572104, "step": 61880 }, { "epoch": 16.082380457380456, "grad_norm": 0.018242811784148216, "learning_rate": 5.620541078190203e-06, "loss": 0.049, "num_input_tokens_seen": 5572536, "step": 61885 }, { "epoch": 16.083679833679835, "grad_norm": 4.907149791717529, "learning_rate": 5.616959850460296e-06, "loss": 0.0437, "num_input_tokens_seen": 5572984, "step": 61890 }, { "epoch": 16.08497920997921, "grad_norm": 0.2409883290529251, "learning_rate": 5.613379619618705e-06, "loss": 0.2189, "num_input_tokens_seen": 5573400, "step": 61895 }, { "epoch": 16.086278586278585, "grad_norm": 0.029393993318080902, "learning_rate": 5.609800385849587e-06, "loss": 0.2685, "num_input_tokens_seen": 5573832, "step": 61900 }, { "epoch": 16.087577962577964, "grad_norm": 0.572056233882904, "learning_rate": 5.6062221493370035e-06, "loss": 0.0158, "num_input_tokens_seen": 5574296, "step": 61905 }, { "epoch": 16.08887733887734, "grad_norm": 0.042111027985811234, "learning_rate": 5.602644910265006e-06, "loss": 0.1095, "num_input_tokens_seen": 5574728, "step": 61910 }, { "epoch": 16.090176715176714, "grad_norm": 5.2855119705200195, "learning_rate": 5.5990686688175585e-06, "loss": 0.022, "num_input_tokens_seen": 5575160, "step": 61915 }, { "epoch": 16.091476091476093, "grad_norm": 0.06012026220560074, "learning_rate": 5.5954934251786e-06, "loss": 0.0108, "num_input_tokens_seen": 5575592, "step": 61920 }, { "epoch": 16.092775467775468, "grad_norm": 0.0800006166100502, "learning_rate": 5.591919179532007e-06, "loss": 0.2747, "num_input_tokens_seen": 5576008, "step": 61925 }, { "epoch": 16.094074844074843, "grad_norm": 28.07623291015625, "learning_rate": 5.588345932061612e-06, "loss": 0.067, "num_input_tokens_seen": 5576488, "step": 61930 }, { "epoch": 16.09537422037422, "grad_norm": 0.05279017612338066, "learning_rate": 5.584773682951186e-06, "loss": 0.0114, "num_input_tokens_seen": 5576936, "step": 61935 }, { "epoch": 16.096673596673597, "grad_norm": 26.32378578186035, "learning_rate": 5.581202432384444e-06, "loss": 0.1113, "num_input_tokens_seen": 5577400, "step": 61940 }, { "epoch": 16.097972972972972, "grad_norm": 0.012717816047370434, "learning_rate": 5.577632180545075e-06, "loss": 0.395, "num_input_tokens_seen": 5577848, "step": 61945 }, { "epoch": 16.09927234927235, "grad_norm": 0.046243466436862946, "learning_rate": 5.574062927616685e-06, "loss": 0.1271, "num_input_tokens_seen": 5578312, "step": 61950 }, { "epoch": 16.100571725571726, "grad_norm": 59.289249420166016, "learning_rate": 5.570494673782853e-06, "loss": 0.333, "num_input_tokens_seen": 5578776, "step": 61955 }, { "epoch": 16.1018711018711, "grad_norm": 17.883724212646484, "learning_rate": 5.566927419227094e-06, "loss": 0.0423, "num_input_tokens_seen": 5579224, "step": 61960 }, { "epoch": 16.10317047817048, "grad_norm": 0.020857980474829674, "learning_rate": 5.563361164132888e-06, "loss": 0.054, "num_input_tokens_seen": 5579688, "step": 61965 }, { "epoch": 16.104469854469855, "grad_norm": 29.222904205322266, "learning_rate": 5.559795908683632e-06, "loss": 0.1279, "num_input_tokens_seen": 5580152, "step": 61970 }, { "epoch": 16.10576923076923, "grad_norm": 49.91115951538086, "learning_rate": 5.556231653062705e-06, "loss": 0.2499, "num_input_tokens_seen": 5580600, "step": 61975 }, { "epoch": 16.10706860706861, "grad_norm": 29.18434715270996, "learning_rate": 5.552668397453409e-06, "loss": 0.0712, "num_input_tokens_seen": 5581048, "step": 61980 }, { "epoch": 16.108367983367984, "grad_norm": 1.112810492515564, "learning_rate": 5.549106142039018e-06, "loss": 0.3326, "num_input_tokens_seen": 5581480, "step": 61985 }, { "epoch": 16.10966735966736, "grad_norm": 0.006396469660103321, "learning_rate": 5.545544887002726e-06, "loss": 0.0065, "num_input_tokens_seen": 5581928, "step": 61990 }, { "epoch": 16.110966735966738, "grad_norm": 0.16334719955921173, "learning_rate": 5.541984632527702e-06, "loss": 0.0026, "num_input_tokens_seen": 5582344, "step": 61995 }, { "epoch": 16.112266112266113, "grad_norm": 0.09024950861930847, "learning_rate": 5.53842537879706e-06, "loss": 0.0029, "num_input_tokens_seen": 5582776, "step": 62000 }, { "epoch": 16.113565488565488, "grad_norm": 14.679418563842773, "learning_rate": 5.534867125993839e-06, "loss": 0.0264, "num_input_tokens_seen": 5583208, "step": 62005 }, { "epoch": 16.114864864864863, "grad_norm": 0.11539506167173386, "learning_rate": 5.531309874301061e-06, "loss": 0.4728, "num_input_tokens_seen": 5583688, "step": 62010 }, { "epoch": 16.116164241164242, "grad_norm": 0.0437091663479805, "learning_rate": 5.527753623901663e-06, "loss": 0.0027, "num_input_tokens_seen": 5584088, "step": 62015 }, { "epoch": 16.117463617463617, "grad_norm": 0.07197492569684982, "learning_rate": 5.524198374978559e-06, "loss": 0.5481, "num_input_tokens_seen": 5584520, "step": 62020 }, { "epoch": 16.118762993762992, "grad_norm": 25.861406326293945, "learning_rate": 5.520644127714589e-06, "loss": 0.1163, "num_input_tokens_seen": 5584968, "step": 62025 }, { "epoch": 16.12006237006237, "grad_norm": 18.274253845214844, "learning_rate": 5.517090882292552e-06, "loss": 0.0265, "num_input_tokens_seen": 5585416, "step": 62030 }, { "epoch": 16.121361746361746, "grad_norm": 2.038156032562256, "learning_rate": 5.5135386388952024e-06, "loss": 0.0034, "num_input_tokens_seen": 5585880, "step": 62035 }, { "epoch": 16.12266112266112, "grad_norm": 2.530580759048462, "learning_rate": 5.509987397705238e-06, "loss": 0.0381, "num_input_tokens_seen": 5586312, "step": 62040 }, { "epoch": 16.1239604989605, "grad_norm": 19.573198318481445, "learning_rate": 5.506437158905287e-06, "loss": 0.1397, "num_input_tokens_seen": 5586712, "step": 62045 }, { "epoch": 16.125259875259875, "grad_norm": 3.574406862258911, "learning_rate": 5.50288792267796e-06, "loss": 0.0687, "num_input_tokens_seen": 5587160, "step": 62050 }, { "epoch": 16.12655925155925, "grad_norm": 3.0026049613952637, "learning_rate": 5.499339689205779e-06, "loss": 0.0609, "num_input_tokens_seen": 5587640, "step": 62055 }, { "epoch": 16.12785862785863, "grad_norm": 11.69472885131836, "learning_rate": 5.49579245867125e-06, "loss": 0.0166, "num_input_tokens_seen": 5588104, "step": 62060 }, { "epoch": 16.129158004158004, "grad_norm": 50.84427261352539, "learning_rate": 5.492246231256798e-06, "loss": 0.3359, "num_input_tokens_seen": 5588552, "step": 62065 }, { "epoch": 16.13045738045738, "grad_norm": 0.0958409234881401, "learning_rate": 5.488701007144812e-06, "loss": 0.3418, "num_input_tokens_seen": 5589000, "step": 62070 }, { "epoch": 16.131756756756758, "grad_norm": 0.2927394211292267, "learning_rate": 5.485156786517634e-06, "loss": 0.1266, "num_input_tokens_seen": 5589432, "step": 62075 }, { "epoch": 16.133056133056133, "grad_norm": 0.04873281717300415, "learning_rate": 5.481613569557536e-06, "loss": 0.0336, "num_input_tokens_seen": 5589880, "step": 62080 }, { "epoch": 16.134355509355508, "grad_norm": 0.9087634086608887, "learning_rate": 5.47807135644676e-06, "loss": 0.0014, "num_input_tokens_seen": 5590344, "step": 62085 }, { "epoch": 16.135654885654887, "grad_norm": 48.064395904541016, "learning_rate": 5.474530147367471e-06, "loss": 0.2497, "num_input_tokens_seen": 5590808, "step": 62090 }, { "epoch": 16.136954261954262, "grad_norm": 0.007179105654358864, "learning_rate": 5.4709899425018144e-06, "loss": 0.3827, "num_input_tokens_seen": 5591240, "step": 62095 }, { "epoch": 16.138253638253637, "grad_norm": 0.03871528059244156, "learning_rate": 5.467450742031841e-06, "loss": 0.2393, "num_input_tokens_seen": 5591672, "step": 62100 }, { "epoch": 16.139553014553016, "grad_norm": 3.512495994567871, "learning_rate": 5.4639125461396045e-06, "loss": 0.1931, "num_input_tokens_seen": 5592152, "step": 62105 }, { "epoch": 16.14085239085239, "grad_norm": 49.79820251464844, "learning_rate": 5.460375355007058e-06, "loss": 0.4683, "num_input_tokens_seen": 5592600, "step": 62110 }, { "epoch": 16.142151767151766, "grad_norm": 0.6250742673873901, "learning_rate": 5.4568391688161355e-06, "loss": 0.0614, "num_input_tokens_seen": 5593080, "step": 62115 }, { "epoch": 16.143451143451145, "grad_norm": 0.19807642698287964, "learning_rate": 5.453303987748695e-06, "loss": 0.0033, "num_input_tokens_seen": 5593512, "step": 62120 }, { "epoch": 16.14475051975052, "grad_norm": 0.056030914187431335, "learning_rate": 5.449769811986563e-06, "loss": 0.0792, "num_input_tokens_seen": 5593944, "step": 62125 }, { "epoch": 16.146049896049895, "grad_norm": 51.57534408569336, "learning_rate": 5.4462366417114965e-06, "loss": 0.2419, "num_input_tokens_seen": 5594424, "step": 62130 }, { "epoch": 16.147349272349274, "grad_norm": 2.7758054733276367, "learning_rate": 5.442704477105215e-06, "loss": 0.2128, "num_input_tokens_seen": 5594856, "step": 62135 }, { "epoch": 16.14864864864865, "grad_norm": 47.77667236328125, "learning_rate": 5.439173318349389e-06, "loss": 0.3004, "num_input_tokens_seen": 5595272, "step": 62140 }, { "epoch": 16.149948024948024, "grad_norm": 25.496349334716797, "learning_rate": 5.435643165625614e-06, "loss": 0.0326, "num_input_tokens_seen": 5595736, "step": 62145 }, { "epoch": 16.151247401247403, "grad_norm": 0.04953543841838837, "learning_rate": 5.432114019115464e-06, "loss": 0.004, "num_input_tokens_seen": 5596184, "step": 62150 }, { "epoch": 16.152546777546778, "grad_norm": 0.02629607357084751, "learning_rate": 5.42858587900043e-06, "loss": 0.2203, "num_input_tokens_seen": 5596648, "step": 62155 }, { "epoch": 16.153846153846153, "grad_norm": 50.6107177734375, "learning_rate": 5.425058745461986e-06, "loss": 0.1263, "num_input_tokens_seen": 5597112, "step": 62160 }, { "epoch": 16.15514553014553, "grad_norm": 0.04274819791316986, "learning_rate": 5.4215326186815185e-06, "loss": 0.2394, "num_input_tokens_seen": 5597544, "step": 62165 }, { "epoch": 16.156444906444907, "grad_norm": 0.011181375943124294, "learning_rate": 5.418007498840388e-06, "loss": 0.2502, "num_input_tokens_seen": 5597992, "step": 62170 }, { "epoch": 16.157744282744282, "grad_norm": 0.8873308897018433, "learning_rate": 5.4144833861198925e-06, "loss": 0.276, "num_input_tokens_seen": 5598456, "step": 62175 }, { "epoch": 16.159043659043657, "grad_norm": 8.31442642211914, "learning_rate": 5.410960280701291e-06, "loss": 0.4008, "num_input_tokens_seen": 5598904, "step": 62180 }, { "epoch": 16.160343035343036, "grad_norm": 0.01356274913996458, "learning_rate": 5.407438182765764e-06, "loss": 0.0006, "num_input_tokens_seen": 5599352, "step": 62185 }, { "epoch": 16.16164241164241, "grad_norm": 0.10113801807165146, "learning_rate": 5.403917092494473e-06, "loss": 0.0296, "num_input_tokens_seen": 5599800, "step": 62190 }, { "epoch": 16.162941787941786, "grad_norm": 2.1442387104034424, "learning_rate": 5.400397010068492e-06, "loss": 0.0169, "num_input_tokens_seen": 5600264, "step": 62195 }, { "epoch": 16.164241164241165, "grad_norm": 2.408531904220581, "learning_rate": 5.396877935668882e-06, "loss": 0.0861, "num_input_tokens_seen": 5600712, "step": 62200 }, { "epoch": 16.16554054054054, "grad_norm": 0.004136254079639912, "learning_rate": 5.3933598694766135e-06, "loss": 0.0046, "num_input_tokens_seen": 5601144, "step": 62205 }, { "epoch": 16.166839916839916, "grad_norm": 65.10449981689453, "learning_rate": 5.389842811672635e-06, "loss": 0.3716, "num_input_tokens_seen": 5601576, "step": 62210 }, { "epoch": 16.168139293139294, "grad_norm": 0.5987812876701355, "learning_rate": 5.386326762437835e-06, "loss": 0.231, "num_input_tokens_seen": 5602040, "step": 62215 }, { "epoch": 16.16943866943867, "grad_norm": 40.85253143310547, "learning_rate": 5.3828117219530374e-06, "loss": 0.1764, "num_input_tokens_seen": 5602520, "step": 62220 }, { "epoch": 16.170738045738045, "grad_norm": 0.5245772004127502, "learning_rate": 5.379297690399035e-06, "loss": 0.3579, "num_input_tokens_seen": 5602952, "step": 62225 }, { "epoch": 16.172037422037423, "grad_norm": 1.8599987030029297, "learning_rate": 5.375784667956546e-06, "loss": 0.2631, "num_input_tokens_seen": 5603432, "step": 62230 }, { "epoch": 16.1733367983368, "grad_norm": 0.008209111168980598, "learning_rate": 5.37227265480626e-06, "loss": 0.0255, "num_input_tokens_seen": 5603896, "step": 62235 }, { "epoch": 16.174636174636174, "grad_norm": 0.28697240352630615, "learning_rate": 5.368761651128792e-06, "loss": 0.1957, "num_input_tokens_seen": 5604344, "step": 62240 }, { "epoch": 16.175935550935552, "grad_norm": 0.646445631980896, "learning_rate": 5.3652516571047225e-06, "loss": 0.2416, "num_input_tokens_seen": 5604792, "step": 62245 }, { "epoch": 16.177234927234927, "grad_norm": 1.5898804664611816, "learning_rate": 5.361742672914572e-06, "loss": 0.003, "num_input_tokens_seen": 5605272, "step": 62250 }, { "epoch": 16.178534303534303, "grad_norm": 0.5817267894744873, "learning_rate": 5.35823469873882e-06, "loss": 0.0378, "num_input_tokens_seen": 5605752, "step": 62255 }, { "epoch": 16.17983367983368, "grad_norm": 22.996902465820312, "learning_rate": 5.35472773475787e-06, "loss": 0.1449, "num_input_tokens_seen": 5606200, "step": 62260 }, { "epoch": 16.181133056133056, "grad_norm": 0.009482871741056442, "learning_rate": 5.351221781152102e-06, "loss": 0.0058, "num_input_tokens_seen": 5606664, "step": 62265 }, { "epoch": 16.18243243243243, "grad_norm": 44.80192184448242, "learning_rate": 5.347716838101827e-06, "loss": 0.1172, "num_input_tokens_seen": 5607144, "step": 62270 }, { "epoch": 16.18373180873181, "grad_norm": 0.18773996829986572, "learning_rate": 5.344212905787296e-06, "loss": 0.4716, "num_input_tokens_seen": 5607624, "step": 62275 }, { "epoch": 16.185031185031185, "grad_norm": 0.4869508743286133, "learning_rate": 5.340709984388728e-06, "loss": 0.1473, "num_input_tokens_seen": 5608056, "step": 62280 }, { "epoch": 16.18633056133056, "grad_norm": 32.23733901977539, "learning_rate": 5.337208074086284e-06, "loss": 0.4677, "num_input_tokens_seen": 5608488, "step": 62285 }, { "epoch": 16.18762993762994, "grad_norm": 7.571673393249512, "learning_rate": 5.333707175060074e-06, "loss": 0.2707, "num_input_tokens_seen": 5608936, "step": 62290 }, { "epoch": 16.188929313929314, "grad_norm": 15.605782508850098, "learning_rate": 5.330207287490141e-06, "loss": 0.1694, "num_input_tokens_seen": 5609384, "step": 62295 }, { "epoch": 16.19022869022869, "grad_norm": 0.9176924824714661, "learning_rate": 5.3267084115565e-06, "loss": 0.1264, "num_input_tokens_seen": 5609816, "step": 62300 }, { "epoch": 16.191528066528065, "grad_norm": 0.00692480755969882, "learning_rate": 5.3232105474390895e-06, "loss": 0.1514, "num_input_tokens_seen": 5610248, "step": 62305 }, { "epoch": 16.192827442827443, "grad_norm": 0.8516720533370972, "learning_rate": 5.3197136953178215e-06, "loss": 0.0093, "num_input_tokens_seen": 5610696, "step": 62310 }, { "epoch": 16.19412681912682, "grad_norm": 3.942544937133789, "learning_rate": 5.316217855372527e-06, "loss": 0.1019, "num_input_tokens_seen": 5611176, "step": 62315 }, { "epoch": 16.195426195426194, "grad_norm": 0.024039292708039284, "learning_rate": 5.312723027783006e-06, "loss": 0.1287, "num_input_tokens_seen": 5611608, "step": 62320 }, { "epoch": 16.196725571725572, "grad_norm": 0.006879974622279406, "learning_rate": 5.309229212729009e-06, "loss": 0.0069, "num_input_tokens_seen": 5612072, "step": 62325 }, { "epoch": 16.198024948024948, "grad_norm": 1.1930714845657349, "learning_rate": 5.305736410390222e-06, "loss": 0.2817, "num_input_tokens_seen": 5612552, "step": 62330 }, { "epoch": 16.199324324324323, "grad_norm": 12.643402099609375, "learning_rate": 5.302244620946284e-06, "loss": 0.3366, "num_input_tokens_seen": 5613032, "step": 62335 }, { "epoch": 16.2006237006237, "grad_norm": 0.7209774851799011, "learning_rate": 5.29875384457677e-06, "loss": 0.0158, "num_input_tokens_seen": 5613464, "step": 62340 }, { "epoch": 16.201923076923077, "grad_norm": 0.03194255754351616, "learning_rate": 5.295264081461232e-06, "loss": 0.1085, "num_input_tokens_seen": 5613896, "step": 62345 }, { "epoch": 16.203222453222452, "grad_norm": 17.57684898376465, "learning_rate": 5.291775331779125e-06, "loss": 0.5694, "num_input_tokens_seen": 5614328, "step": 62350 }, { "epoch": 16.20452182952183, "grad_norm": 0.0983862578868866, "learning_rate": 5.288287595709915e-06, "loss": 0.0388, "num_input_tokens_seen": 5614792, "step": 62355 }, { "epoch": 16.205821205821206, "grad_norm": 53.71980667114258, "learning_rate": 5.284800873432949e-06, "loss": 0.572, "num_input_tokens_seen": 5615256, "step": 62360 }, { "epoch": 16.20712058212058, "grad_norm": 0.009431831538677216, "learning_rate": 5.281315165127573e-06, "loss": 0.0035, "num_input_tokens_seen": 5615736, "step": 62365 }, { "epoch": 16.20841995841996, "grad_norm": 0.4314210116863251, "learning_rate": 5.277830470973047e-06, "loss": 0.0651, "num_input_tokens_seen": 5616184, "step": 62370 }, { "epoch": 16.209719334719335, "grad_norm": 28.204153060913086, "learning_rate": 5.274346791148601e-06, "loss": 0.1839, "num_input_tokens_seen": 5616600, "step": 62375 }, { "epoch": 16.21101871101871, "grad_norm": 1.1991888284683228, "learning_rate": 5.270864125833394e-06, "loss": 0.0152, "num_input_tokens_seen": 5617048, "step": 62380 }, { "epoch": 16.21231808731809, "grad_norm": 14.622662544250488, "learning_rate": 5.267382475206548e-06, "loss": 0.2868, "num_input_tokens_seen": 5617512, "step": 62385 }, { "epoch": 16.213617463617464, "grad_norm": 0.2727442979812622, "learning_rate": 5.263901839447128e-06, "loss": 0.1518, "num_input_tokens_seen": 5618024, "step": 62390 }, { "epoch": 16.21491683991684, "grad_norm": 13.050612449645996, "learning_rate": 5.260422218734154e-06, "loss": 0.0247, "num_input_tokens_seen": 5618456, "step": 62395 }, { "epoch": 16.216216216216218, "grad_norm": 0.15258508920669556, "learning_rate": 5.256943613246579e-06, "loss": 0.1508, "num_input_tokens_seen": 5618904, "step": 62400 }, { "epoch": 16.217515592515593, "grad_norm": 3.4745452404022217, "learning_rate": 5.2534660231633036e-06, "loss": 0.0227, "num_input_tokens_seen": 5619368, "step": 62405 }, { "epoch": 16.218814968814968, "grad_norm": 0.02534792199730873, "learning_rate": 5.249989448663195e-06, "loss": 0.0037, "num_input_tokens_seen": 5619864, "step": 62410 }, { "epoch": 16.220114345114347, "grad_norm": 1.2955313920974731, "learning_rate": 5.24651388992505e-06, "loss": 0.1025, "num_input_tokens_seen": 5620296, "step": 62415 }, { "epoch": 16.22141372141372, "grad_norm": 0.3694905936717987, "learning_rate": 5.243039347127621e-06, "loss": 0.0464, "num_input_tokens_seen": 5620744, "step": 62420 }, { "epoch": 16.222713097713097, "grad_norm": 22.132606506347656, "learning_rate": 5.2395658204496075e-06, "loss": 0.022, "num_input_tokens_seen": 5621224, "step": 62425 }, { "epoch": 16.224012474012476, "grad_norm": 46.983097076416016, "learning_rate": 5.236093310069667e-06, "loss": 0.2409, "num_input_tokens_seen": 5621688, "step": 62430 }, { "epoch": 16.22531185031185, "grad_norm": 0.48579344153404236, "learning_rate": 5.232621816166375e-06, "loss": 0.0497, "num_input_tokens_seen": 5622152, "step": 62435 }, { "epoch": 16.226611226611226, "grad_norm": 2.2955493927001953, "learning_rate": 5.22915133891829e-06, "loss": 0.1787, "num_input_tokens_seen": 5622632, "step": 62440 }, { "epoch": 16.227910602910605, "grad_norm": 0.11424868553876877, "learning_rate": 5.225681878503891e-06, "loss": 0.0833, "num_input_tokens_seen": 5623048, "step": 62445 }, { "epoch": 16.22920997920998, "grad_norm": 42.372554779052734, "learning_rate": 5.222213435101625e-06, "loss": 0.2311, "num_input_tokens_seen": 5623512, "step": 62450 }, { "epoch": 16.230509355509355, "grad_norm": 1.337876558303833, "learning_rate": 5.218746008889863e-06, "loss": 0.3602, "num_input_tokens_seen": 5623976, "step": 62455 }, { "epoch": 16.23180873180873, "grad_norm": 0.056743621826171875, "learning_rate": 5.2152796000469514e-06, "loss": 0.0008, "num_input_tokens_seen": 5624392, "step": 62460 }, { "epoch": 16.23310810810811, "grad_norm": 0.24425528943538666, "learning_rate": 5.2118142087511705e-06, "loss": 0.1238, "num_input_tokens_seen": 5624856, "step": 62465 }, { "epoch": 16.234407484407484, "grad_norm": 0.7819458246231079, "learning_rate": 5.20834983518074e-06, "loss": 0.0064, "num_input_tokens_seen": 5625336, "step": 62470 }, { "epoch": 16.23570686070686, "grad_norm": 9.44438648223877, "learning_rate": 5.2048864795138454e-06, "loss": 0.1582, "num_input_tokens_seen": 5625768, "step": 62475 }, { "epoch": 16.237006237006238, "grad_norm": 27.041667938232422, "learning_rate": 5.2014241419286015e-06, "loss": 0.4492, "num_input_tokens_seen": 5626184, "step": 62480 }, { "epoch": 16.238305613305613, "grad_norm": 0.03139127790927887, "learning_rate": 5.197962822603092e-06, "loss": 0.2503, "num_input_tokens_seen": 5626616, "step": 62485 }, { "epoch": 16.239604989604988, "grad_norm": 0.39601561427116394, "learning_rate": 5.194502521715316e-06, "loss": 0.0588, "num_input_tokens_seen": 5627128, "step": 62490 }, { "epoch": 16.240904365904367, "grad_norm": 16.48758316040039, "learning_rate": 5.191043239443258e-06, "loss": 0.3872, "num_input_tokens_seen": 5627624, "step": 62495 }, { "epoch": 16.242203742203742, "grad_norm": 11.755330085754395, "learning_rate": 5.187584975964823e-06, "loss": 0.1534, "num_input_tokens_seen": 5628088, "step": 62500 }, { "epoch": 16.243503118503117, "grad_norm": 0.07843486219644547, "learning_rate": 5.184127731457883e-06, "loss": 0.0217, "num_input_tokens_seen": 5628520, "step": 62505 }, { "epoch": 16.244802494802496, "grad_norm": 0.028304284438490868, "learning_rate": 5.1806715061002345e-06, "loss": 0.038, "num_input_tokens_seen": 5628952, "step": 62510 }, { "epoch": 16.24610187110187, "grad_norm": 0.151132270693779, "learning_rate": 5.177216300069645e-06, "loss": 0.1199, "num_input_tokens_seen": 5629368, "step": 62515 }, { "epoch": 16.247401247401246, "grad_norm": 1.5799167156219482, "learning_rate": 5.173762113543809e-06, "loss": 0.0927, "num_input_tokens_seen": 5629832, "step": 62520 }, { "epoch": 16.248700623700625, "grad_norm": 0.16507896780967712, "learning_rate": 5.1703089467003916e-06, "loss": 0.0896, "num_input_tokens_seen": 5630280, "step": 62525 }, { "epoch": 16.25, "grad_norm": 0.07840108126401901, "learning_rate": 5.16685679971698e-06, "loss": 0.1097, "num_input_tokens_seen": 5630744, "step": 62530 }, { "epoch": 16.251299376299375, "grad_norm": 0.3937022387981415, "learning_rate": 5.163405672771124e-06, "loss": 0.1063, "num_input_tokens_seen": 5631208, "step": 62535 }, { "epoch": 16.252598752598754, "grad_norm": 1.6377612352371216, "learning_rate": 5.159955566040325e-06, "loss": 0.3471, "num_input_tokens_seen": 5631656, "step": 62540 }, { "epoch": 16.25389812889813, "grad_norm": 0.47918879985809326, "learning_rate": 5.156506479702019e-06, "loss": 0.0205, "num_input_tokens_seen": 5632104, "step": 62545 }, { "epoch": 16.255197505197504, "grad_norm": 6.840686798095703, "learning_rate": 5.153058413933601e-06, "loss": 0.1964, "num_input_tokens_seen": 5632568, "step": 62550 }, { "epoch": 16.256496881496883, "grad_norm": 5.6834845542907715, "learning_rate": 5.149611368912402e-06, "loss": 0.0453, "num_input_tokens_seen": 5632984, "step": 62555 }, { "epoch": 16.257796257796258, "grad_norm": 2.683832883834839, "learning_rate": 5.146165344815715e-06, "loss": 0.7743, "num_input_tokens_seen": 5633400, "step": 62560 }, { "epoch": 16.259095634095633, "grad_norm": 0.004057873506098986, "learning_rate": 5.142720341820759e-06, "loss": 0.1814, "num_input_tokens_seen": 5633896, "step": 62565 }, { "epoch": 16.260395010395012, "grad_norm": 49.116302490234375, "learning_rate": 5.139276360104725e-06, "loss": 0.1644, "num_input_tokens_seen": 5634328, "step": 62570 }, { "epoch": 16.261694386694387, "grad_norm": 0.2875165641307831, "learning_rate": 5.135833399844736e-06, "loss": 0.1145, "num_input_tokens_seen": 5634792, "step": 62575 }, { "epoch": 16.262993762993762, "grad_norm": 0.4346505403518677, "learning_rate": 5.1323914612178726e-06, "loss": 0.1745, "num_input_tokens_seen": 5635240, "step": 62580 }, { "epoch": 16.26429313929314, "grad_norm": 0.008697155863046646, "learning_rate": 5.1289505444011486e-06, "loss": 0.1018, "num_input_tokens_seen": 5635704, "step": 62585 }, { "epoch": 16.265592515592516, "grad_norm": 0.005336332134902477, "learning_rate": 5.125510649571543e-06, "loss": 0.0137, "num_input_tokens_seen": 5636136, "step": 62590 }, { "epoch": 16.26689189189189, "grad_norm": 37.573299407958984, "learning_rate": 5.122071776905971e-06, "loss": 0.0805, "num_input_tokens_seen": 5636600, "step": 62595 }, { "epoch": 16.26819126819127, "grad_norm": 2.3208024501800537, "learning_rate": 5.118633926581276e-06, "loss": 0.0758, "num_input_tokens_seen": 5637048, "step": 62600 }, { "epoch": 16.269490644490645, "grad_norm": 2.4338905811309814, "learning_rate": 5.115197098774302e-06, "loss": 0.004, "num_input_tokens_seen": 5637480, "step": 62605 }, { "epoch": 16.27079002079002, "grad_norm": 9.422821998596191, "learning_rate": 5.111761293661788e-06, "loss": 0.1915, "num_input_tokens_seen": 5637960, "step": 62610 }, { "epoch": 16.272089397089395, "grad_norm": 41.65452575683594, "learning_rate": 5.108326511420453e-06, "loss": 0.2673, "num_input_tokens_seen": 5638424, "step": 62615 }, { "epoch": 16.273388773388774, "grad_norm": 38.10240936279297, "learning_rate": 5.104892752226939e-06, "loss": 0.1631, "num_input_tokens_seen": 5638872, "step": 62620 }, { "epoch": 16.27468814968815, "grad_norm": 0.6828449368476868, "learning_rate": 5.101460016257859e-06, "loss": 0.3169, "num_input_tokens_seen": 5639352, "step": 62625 }, { "epoch": 16.275987525987524, "grad_norm": 4.208892822265625, "learning_rate": 5.09802830368975e-06, "loss": 0.011, "num_input_tokens_seen": 5639816, "step": 62630 }, { "epoch": 16.277286902286903, "grad_norm": 0.0017274422571063042, "learning_rate": 5.094597614699115e-06, "loss": 0.3361, "num_input_tokens_seen": 5640280, "step": 62635 }, { "epoch": 16.27858627858628, "grad_norm": 0.16962848603725433, "learning_rate": 5.091167949462397e-06, "loss": 0.2119, "num_input_tokens_seen": 5640744, "step": 62640 }, { "epoch": 16.279885654885653, "grad_norm": 23.264324188232422, "learning_rate": 5.0877393081559924e-06, "loss": 0.1133, "num_input_tokens_seen": 5641192, "step": 62645 }, { "epoch": 16.281185031185032, "grad_norm": 0.8308889865875244, "learning_rate": 5.084311690956229e-06, "loss": 0.3325, "num_input_tokens_seen": 5641624, "step": 62650 }, { "epoch": 16.282484407484407, "grad_norm": 4.671375274658203, "learning_rate": 5.080885098039404e-06, "loss": 0.4661, "num_input_tokens_seen": 5642120, "step": 62655 }, { "epoch": 16.283783783783782, "grad_norm": 0.2949128746986389, "learning_rate": 5.077459529581741e-06, "loss": 0.0423, "num_input_tokens_seen": 5642584, "step": 62660 }, { "epoch": 16.28508316008316, "grad_norm": 55.2840461730957, "learning_rate": 5.074034985759421e-06, "loss": 0.1889, "num_input_tokens_seen": 5643080, "step": 62665 }, { "epoch": 16.286382536382536, "grad_norm": 0.07869498431682587, "learning_rate": 5.0706114667485705e-06, "loss": 0.0206, "num_input_tokens_seen": 5643512, "step": 62670 }, { "epoch": 16.28768191268191, "grad_norm": 0.5514888167381287, "learning_rate": 5.06718897272527e-06, "loss": 0.1885, "num_input_tokens_seen": 5643928, "step": 62675 }, { "epoch": 16.28898128898129, "grad_norm": 0.25438469648361206, "learning_rate": 5.063767503865543e-06, "loss": 0.042, "num_input_tokens_seen": 5644392, "step": 62680 }, { "epoch": 16.290280665280665, "grad_norm": 0.208369642496109, "learning_rate": 5.060347060345352e-06, "loss": 0.0125, "num_input_tokens_seen": 5644856, "step": 62685 }, { "epoch": 16.29158004158004, "grad_norm": 0.004503167700022459, "learning_rate": 5.056927642340622e-06, "loss": 0.1287, "num_input_tokens_seen": 5645304, "step": 62690 }, { "epoch": 16.29287941787942, "grad_norm": 55.981021881103516, "learning_rate": 5.053509250027205e-06, "loss": 0.1318, "num_input_tokens_seen": 5645752, "step": 62695 }, { "epoch": 16.294178794178794, "grad_norm": 1.0830718278884888, "learning_rate": 5.050091883580926e-06, "loss": 0.1484, "num_input_tokens_seen": 5646216, "step": 62700 }, { "epoch": 16.29547817047817, "grad_norm": 0.61662757396698, "learning_rate": 5.0466755431775316e-06, "loss": 0.3182, "num_input_tokens_seen": 5646664, "step": 62705 }, { "epoch": 16.296777546777548, "grad_norm": 1.9676952362060547, "learning_rate": 5.04326022899273e-06, "loss": 0.0547, "num_input_tokens_seen": 5647160, "step": 62710 }, { "epoch": 16.298076923076923, "grad_norm": 51.04533386230469, "learning_rate": 5.039845941202178e-06, "loss": 0.7436, "num_input_tokens_seen": 5647592, "step": 62715 }, { "epoch": 16.2993762993763, "grad_norm": 22.609800338745117, "learning_rate": 5.036432679981482e-06, "loss": 0.0959, "num_input_tokens_seen": 5648072, "step": 62720 }, { "epoch": 16.300675675675677, "grad_norm": 0.03312987834215164, "learning_rate": 5.033020445506179e-06, "loss": 0.0057, "num_input_tokens_seen": 5648584, "step": 62725 }, { "epoch": 16.301975051975052, "grad_norm": 0.7744158506393433, "learning_rate": 5.02960923795176e-06, "loss": 0.1378, "num_input_tokens_seen": 5649032, "step": 62730 }, { "epoch": 16.303274428274428, "grad_norm": 60.597530364990234, "learning_rate": 5.026199057493678e-06, "loss": 0.2636, "num_input_tokens_seen": 5649448, "step": 62735 }, { "epoch": 16.304573804573806, "grad_norm": 0.020491817966103554, "learning_rate": 5.022789904307312e-06, "loss": 0.3114, "num_input_tokens_seen": 5649880, "step": 62740 }, { "epoch": 16.30587318087318, "grad_norm": 9.5020112991333, "learning_rate": 5.0193817785679995e-06, "loss": 0.2308, "num_input_tokens_seen": 5650312, "step": 62745 }, { "epoch": 16.307172557172557, "grad_norm": 0.8793408870697021, "learning_rate": 5.01597468045103e-06, "loss": 0.0036, "num_input_tokens_seen": 5650712, "step": 62750 }, { "epoch": 16.308471933471935, "grad_norm": 0.07290413230657578, "learning_rate": 5.012568610131635e-06, "loss": 0.0884, "num_input_tokens_seen": 5651128, "step": 62755 }, { "epoch": 16.30977130977131, "grad_norm": 0.41698601841926575, "learning_rate": 5.00916356778498e-06, "loss": 0.0255, "num_input_tokens_seen": 5651560, "step": 62760 }, { "epoch": 16.311070686070686, "grad_norm": 0.004309272859245539, "learning_rate": 5.005759553586206e-06, "loss": 0.0611, "num_input_tokens_seen": 5652024, "step": 62765 }, { "epoch": 16.31237006237006, "grad_norm": 0.08046533167362213, "learning_rate": 5.002356567710367e-06, "loss": 0.1438, "num_input_tokens_seen": 5652472, "step": 62770 }, { "epoch": 16.31366943866944, "grad_norm": 0.9143178462982178, "learning_rate": 4.998954610332499e-06, "loss": 0.0497, "num_input_tokens_seen": 5652888, "step": 62775 }, { "epoch": 16.314968814968815, "grad_norm": 10.018155097961426, "learning_rate": 4.9955536816275516e-06, "loss": 0.0323, "num_input_tokens_seen": 5653352, "step": 62780 }, { "epoch": 16.31626819126819, "grad_norm": 0.6539980173110962, "learning_rate": 4.992153781770448e-06, "loss": 0.3066, "num_input_tokens_seen": 5653800, "step": 62785 }, { "epoch": 16.31756756756757, "grad_norm": 0.3096317648887634, "learning_rate": 4.98875491093605e-06, "loss": 0.0049, "num_input_tokens_seen": 5654264, "step": 62790 }, { "epoch": 16.318866943866944, "grad_norm": 6.259189605712891, "learning_rate": 4.985357069299154e-06, "loss": 0.0112, "num_input_tokens_seen": 5654728, "step": 62795 }, { "epoch": 16.32016632016632, "grad_norm": 0.16576659679412842, "learning_rate": 4.981960257034529e-06, "loss": 0.2416, "num_input_tokens_seen": 5655208, "step": 62800 }, { "epoch": 16.321465696465697, "grad_norm": 0.009014887735247612, "learning_rate": 4.978564474316863e-06, "loss": 0.0014, "num_input_tokens_seen": 5655656, "step": 62805 }, { "epoch": 16.322765072765073, "grad_norm": 0.0331982783973217, "learning_rate": 4.975169721320813e-06, "loss": 0.1727, "num_input_tokens_seen": 5656120, "step": 62810 }, { "epoch": 16.324064449064448, "grad_norm": 0.02606900781393051, "learning_rate": 4.971775998220968e-06, "loss": 0.1057, "num_input_tokens_seen": 5656584, "step": 62815 }, { "epoch": 16.325363825363826, "grad_norm": 10.124631881713867, "learning_rate": 4.96838330519187e-06, "loss": 0.2941, "num_input_tokens_seen": 5657032, "step": 62820 }, { "epoch": 16.3266632016632, "grad_norm": 4.577242374420166, "learning_rate": 4.964991642408015e-06, "loss": 0.0112, "num_input_tokens_seen": 5657464, "step": 62825 }, { "epoch": 16.327962577962577, "grad_norm": 25.766613006591797, "learning_rate": 4.96160101004384e-06, "loss": 0.1594, "num_input_tokens_seen": 5657912, "step": 62830 }, { "epoch": 16.329261954261955, "grad_norm": 1.4412479400634766, "learning_rate": 4.958211408273722e-06, "loss": 0.003, "num_input_tokens_seen": 5658344, "step": 62835 }, { "epoch": 16.33056133056133, "grad_norm": 16.48862648010254, "learning_rate": 4.954822837271997e-06, "loss": 0.0497, "num_input_tokens_seen": 5658792, "step": 62840 }, { "epoch": 16.331860706860706, "grad_norm": 1.6972901821136475, "learning_rate": 4.951435297212937e-06, "loss": 0.2971, "num_input_tokens_seen": 5659224, "step": 62845 }, { "epoch": 16.333160083160084, "grad_norm": 0.06238845735788345, "learning_rate": 4.948048788270768e-06, "loss": 0.0394, "num_input_tokens_seen": 5659688, "step": 62850 }, { "epoch": 16.33445945945946, "grad_norm": 0.8174605965614319, "learning_rate": 4.9446633106196685e-06, "loss": 0.2711, "num_input_tokens_seen": 5660152, "step": 62855 }, { "epoch": 16.335758835758835, "grad_norm": 0.07012400031089783, "learning_rate": 4.9412788644337436e-06, "loss": 0.0248, "num_input_tokens_seen": 5660584, "step": 62860 }, { "epoch": 16.337058212058214, "grad_norm": 26.432832717895508, "learning_rate": 4.937895449887075e-06, "loss": 0.1686, "num_input_tokens_seen": 5661048, "step": 62865 }, { "epoch": 16.33835758835759, "grad_norm": 9.679201126098633, "learning_rate": 4.934513067153657e-06, "loss": 0.0628, "num_input_tokens_seen": 5661464, "step": 62870 }, { "epoch": 16.339656964656964, "grad_norm": 0.8060035109519958, "learning_rate": 4.9311317164074664e-06, "loss": 0.0769, "num_input_tokens_seen": 5661944, "step": 62875 }, { "epoch": 16.340956340956343, "grad_norm": 4.177271366119385, "learning_rate": 4.927751397822391e-06, "loss": 0.0054, "num_input_tokens_seen": 5662408, "step": 62880 }, { "epoch": 16.342255717255718, "grad_norm": 1.8603081703186035, "learning_rate": 4.924372111572298e-06, "loss": 0.0036, "num_input_tokens_seen": 5662856, "step": 62885 }, { "epoch": 16.343555093555093, "grad_norm": 15.35351276397705, "learning_rate": 4.92099385783098e-06, "loss": 0.0333, "num_input_tokens_seen": 5663256, "step": 62890 }, { "epoch": 16.34485446985447, "grad_norm": 0.0075781019404530525, "learning_rate": 4.917616636772193e-06, "loss": 0.1852, "num_input_tokens_seen": 5663688, "step": 62895 }, { "epoch": 16.346153846153847, "grad_norm": 1.9493703842163086, "learning_rate": 4.914240448569621e-06, "loss": 0.2484, "num_input_tokens_seen": 5664136, "step": 62900 }, { "epoch": 16.347453222453222, "grad_norm": 2.313260555267334, "learning_rate": 4.91086529339691e-06, "loss": 0.3928, "num_input_tokens_seen": 5664584, "step": 62905 }, { "epoch": 16.348752598752597, "grad_norm": 0.49843525886535645, "learning_rate": 4.907491171427642e-06, "loss": 0.0796, "num_input_tokens_seen": 5665032, "step": 62910 }, { "epoch": 16.350051975051976, "grad_norm": 0.04985577240586281, "learning_rate": 4.90411808283536e-06, "loss": 0.0692, "num_input_tokens_seen": 5665528, "step": 62915 }, { "epoch": 16.35135135135135, "grad_norm": 0.023761792108416557, "learning_rate": 4.900746027793535e-06, "loss": 0.0827, "num_input_tokens_seen": 5666008, "step": 62920 }, { "epoch": 16.352650727650726, "grad_norm": 0.6475006341934204, "learning_rate": 4.897375006475599e-06, "loss": 0.1277, "num_input_tokens_seen": 5666440, "step": 62925 }, { "epoch": 16.353950103950105, "grad_norm": 0.03561472147703171, "learning_rate": 4.894005019054934e-06, "loss": 0.4905, "num_input_tokens_seen": 5666840, "step": 62930 }, { "epoch": 16.35524948024948, "grad_norm": 0.5280241966247559, "learning_rate": 4.890636065704848e-06, "loss": 0.412, "num_input_tokens_seen": 5667320, "step": 62935 }, { "epoch": 16.356548856548855, "grad_norm": 0.009992588311433792, "learning_rate": 4.887268146598625e-06, "loss": 0.3717, "num_input_tokens_seen": 5667736, "step": 62940 }, { "epoch": 16.357848232848234, "grad_norm": 0.0134065430611372, "learning_rate": 4.883901261909465e-06, "loss": 0.0091, "num_input_tokens_seen": 5668184, "step": 62945 }, { "epoch": 16.35914760914761, "grad_norm": 1.2066240310668945, "learning_rate": 4.880535411810544e-06, "loss": 0.7774, "num_input_tokens_seen": 5668664, "step": 62950 }, { "epoch": 16.360446985446984, "grad_norm": 17.701709747314453, "learning_rate": 4.877170596474959e-06, "loss": 0.3548, "num_input_tokens_seen": 5669128, "step": 62955 }, { "epoch": 16.361746361746363, "grad_norm": 0.03195576369762421, "learning_rate": 4.873806816075771e-06, "loss": 0.0564, "num_input_tokens_seen": 5669608, "step": 62960 }, { "epoch": 16.363045738045738, "grad_norm": 29.048198699951172, "learning_rate": 4.870444070785981e-06, "loss": 0.2137, "num_input_tokens_seen": 5670088, "step": 62965 }, { "epoch": 16.364345114345113, "grad_norm": 26.534671783447266, "learning_rate": 4.867082360778547e-06, "loss": 0.1001, "num_input_tokens_seen": 5670552, "step": 62970 }, { "epoch": 16.365644490644492, "grad_norm": 0.020092349499464035, "learning_rate": 4.86372168622635e-06, "loss": 0.1077, "num_input_tokens_seen": 5670984, "step": 62975 }, { "epoch": 16.366943866943867, "grad_norm": 0.4423021972179413, "learning_rate": 4.860362047302247e-06, "loss": 0.3849, "num_input_tokens_seen": 5671480, "step": 62980 }, { "epoch": 16.368243243243242, "grad_norm": 4.719958305358887, "learning_rate": 4.8570034441790226e-06, "loss": 0.156, "num_input_tokens_seen": 5671912, "step": 62985 }, { "epoch": 16.36954261954262, "grad_norm": 0.7989321947097778, "learning_rate": 4.853645877029403e-06, "loss": 0.2336, "num_input_tokens_seen": 5672360, "step": 62990 }, { "epoch": 16.370841995841996, "grad_norm": 0.27244070172309875, "learning_rate": 4.850289346026079e-06, "loss": 0.0562, "num_input_tokens_seen": 5672792, "step": 62995 }, { "epoch": 16.37214137214137, "grad_norm": 1.6341904401779175, "learning_rate": 4.8469338513416814e-06, "loss": 0.1142, "num_input_tokens_seen": 5673272, "step": 63000 }, { "epoch": 16.37344074844075, "grad_norm": 11.099861145019531, "learning_rate": 4.843579393148792e-06, "loss": 0.27, "num_input_tokens_seen": 5673752, "step": 63005 }, { "epoch": 16.374740124740125, "grad_norm": 0.04285592585802078, "learning_rate": 4.8402259716199215e-06, "loss": 0.0926, "num_input_tokens_seen": 5674200, "step": 63010 }, { "epoch": 16.3760395010395, "grad_norm": 0.01840118318796158, "learning_rate": 4.836873586927551e-06, "loss": 0.1036, "num_input_tokens_seen": 5674648, "step": 63015 }, { "epoch": 16.37733887733888, "grad_norm": 0.02065632864832878, "learning_rate": 4.833522239244085e-06, "loss": 0.6447, "num_input_tokens_seen": 5675080, "step": 63020 }, { "epoch": 16.378638253638254, "grad_norm": 0.09988600760698318, "learning_rate": 4.8301719287419e-06, "loss": 0.1313, "num_input_tokens_seen": 5675544, "step": 63025 }, { "epoch": 16.37993762993763, "grad_norm": 28.312646865844727, "learning_rate": 4.826822655593291e-06, "loss": 0.2226, "num_input_tokens_seen": 5675944, "step": 63030 }, { "epoch": 16.381237006237008, "grad_norm": 0.0015112449182197452, "learning_rate": 4.823474419970522e-06, "loss": 0.0532, "num_input_tokens_seen": 5676392, "step": 63035 }, { "epoch": 16.382536382536383, "grad_norm": 10.291675567626953, "learning_rate": 4.8201272220457945e-06, "loss": 0.2251, "num_input_tokens_seen": 5676824, "step": 63040 }, { "epoch": 16.383835758835758, "grad_norm": 1.5412895679473877, "learning_rate": 4.816781061991268e-06, "loss": 0.0045, "num_input_tokens_seen": 5677272, "step": 63045 }, { "epoch": 16.385135135135137, "grad_norm": 24.00050163269043, "learning_rate": 4.81343593997903e-06, "loss": 0.0731, "num_input_tokens_seen": 5677672, "step": 63050 }, { "epoch": 16.386434511434512, "grad_norm": 2.0203590393066406, "learning_rate": 4.810091856181118e-06, "loss": 0.0458, "num_input_tokens_seen": 5678136, "step": 63055 }, { "epoch": 16.387733887733887, "grad_norm": 0.05907975509762764, "learning_rate": 4.806748810769529e-06, "loss": 0.0382, "num_input_tokens_seen": 5678536, "step": 63060 }, { "epoch": 16.389033264033262, "grad_norm": 3.0942373275756836, "learning_rate": 4.803406803916194e-06, "loss": 0.1605, "num_input_tokens_seen": 5678984, "step": 63065 }, { "epoch": 16.39033264033264, "grad_norm": 17.332500457763672, "learning_rate": 4.800065835792996e-06, "loss": 0.0611, "num_input_tokens_seen": 5679416, "step": 63070 }, { "epoch": 16.391632016632016, "grad_norm": 55.76210403442383, "learning_rate": 4.796725906571769e-06, "loss": 0.5909, "num_input_tokens_seen": 5679880, "step": 63075 }, { "epoch": 16.39293139293139, "grad_norm": 26.211746215820312, "learning_rate": 4.793387016424294e-06, "loss": 0.1981, "num_input_tokens_seen": 5680360, "step": 63080 }, { "epoch": 16.39423076923077, "grad_norm": 0.014502168633043766, "learning_rate": 4.790049165522278e-06, "loss": 0.0374, "num_input_tokens_seen": 5680792, "step": 63085 }, { "epoch": 16.395530145530145, "grad_norm": 2.1562016010284424, "learning_rate": 4.786712354037404e-06, "loss": 0.2965, "num_input_tokens_seen": 5681240, "step": 63090 }, { "epoch": 16.39682952182952, "grad_norm": 0.06164783611893654, "learning_rate": 4.783376582141275e-06, "loss": 0.0004, "num_input_tokens_seen": 5681688, "step": 63095 }, { "epoch": 16.3981288981289, "grad_norm": 0.45840156078338623, "learning_rate": 4.7800418500054565e-06, "loss": 0.0646, "num_input_tokens_seen": 5682104, "step": 63100 }, { "epoch": 16.399428274428274, "grad_norm": 0.10320155322551727, "learning_rate": 4.776708157801463e-06, "loss": 0.071, "num_input_tokens_seen": 5682536, "step": 63105 }, { "epoch": 16.40072765072765, "grad_norm": 0.04307475686073303, "learning_rate": 4.77337550570075e-06, "loss": 0.0016, "num_input_tokens_seen": 5682984, "step": 63110 }, { "epoch": 16.402027027027028, "grad_norm": 1.0566813945770264, "learning_rate": 4.770043893874715e-06, "loss": 0.0053, "num_input_tokens_seen": 5683432, "step": 63115 }, { "epoch": 16.403326403326403, "grad_norm": 0.49552419781684875, "learning_rate": 4.7667133224947006e-06, "loss": 0.1299, "num_input_tokens_seen": 5683848, "step": 63120 }, { "epoch": 16.40462577962578, "grad_norm": 0.6315469741821289, "learning_rate": 4.763383791732012e-06, "loss": 0.0022, "num_input_tokens_seen": 5684296, "step": 63125 }, { "epoch": 16.405925155925157, "grad_norm": 0.006794393993914127, "learning_rate": 4.760055301757879e-06, "loss": 0.045, "num_input_tokens_seen": 5684728, "step": 63130 }, { "epoch": 16.407224532224532, "grad_norm": 0.271806925535202, "learning_rate": 4.756727852743495e-06, "loss": 0.1209, "num_input_tokens_seen": 5685176, "step": 63135 }, { "epoch": 16.408523908523907, "grad_norm": 0.09255529940128326, "learning_rate": 4.753401444859995e-06, "loss": 0.0334, "num_input_tokens_seen": 5685624, "step": 63140 }, { "epoch": 16.409823284823286, "grad_norm": 59.19647979736328, "learning_rate": 4.750076078278462e-06, "loss": 0.3521, "num_input_tokens_seen": 5686088, "step": 63145 }, { "epoch": 16.41112266112266, "grad_norm": 24.704938888549805, "learning_rate": 4.746751753169915e-06, "loss": 0.1048, "num_input_tokens_seen": 5686568, "step": 63150 }, { "epoch": 16.412422037422036, "grad_norm": 58.014225006103516, "learning_rate": 4.743428469705335e-06, "loss": 0.3258, "num_input_tokens_seen": 5687048, "step": 63155 }, { "epoch": 16.413721413721415, "grad_norm": 0.03148861601948738, "learning_rate": 4.740106228055632e-06, "loss": 0.2814, "num_input_tokens_seen": 5687512, "step": 63160 }, { "epoch": 16.41502079002079, "grad_norm": 0.03554689511656761, "learning_rate": 4.736785028391685e-06, "loss": 0.209, "num_input_tokens_seen": 5687960, "step": 63165 }, { "epoch": 16.416320166320165, "grad_norm": 0.08603759109973907, "learning_rate": 4.733464870884291e-06, "loss": 0.0844, "num_input_tokens_seen": 5688440, "step": 63170 }, { "epoch": 16.417619542619544, "grad_norm": 28.079654693603516, "learning_rate": 4.730145755704218e-06, "loss": 0.3537, "num_input_tokens_seen": 5688904, "step": 63175 }, { "epoch": 16.41891891891892, "grad_norm": 0.7179645895957947, "learning_rate": 4.726827683022178e-06, "loss": 0.4325, "num_input_tokens_seen": 5689336, "step": 63180 }, { "epoch": 16.420218295218294, "grad_norm": 0.035700924694538116, "learning_rate": 4.7235106530088085e-06, "loss": 0.0496, "num_input_tokens_seen": 5689784, "step": 63185 }, { "epoch": 16.421517671517673, "grad_norm": 0.00575387803837657, "learning_rate": 4.720194665834721e-06, "loss": 0.0028, "num_input_tokens_seen": 5690200, "step": 63190 }, { "epoch": 16.42281704781705, "grad_norm": 22.545576095581055, "learning_rate": 4.7168797216704445e-06, "loss": 0.4021, "num_input_tokens_seen": 5690664, "step": 63195 }, { "epoch": 16.424116424116423, "grad_norm": 3.6898093223571777, "learning_rate": 4.713565820686486e-06, "loss": 0.0706, "num_input_tokens_seen": 5691128, "step": 63200 }, { "epoch": 16.4254158004158, "grad_norm": 0.139745831489563, "learning_rate": 4.710252963053268e-06, "loss": 0.0061, "num_input_tokens_seen": 5691592, "step": 63205 }, { "epoch": 16.426715176715177, "grad_norm": 18.684171676635742, "learning_rate": 4.706941148941179e-06, "loss": 0.0378, "num_input_tokens_seen": 5692072, "step": 63210 }, { "epoch": 16.428014553014552, "grad_norm": 26.214542388916016, "learning_rate": 4.703630378520554e-06, "loss": 0.0779, "num_input_tokens_seen": 5692472, "step": 63215 }, { "epoch": 16.429313929313928, "grad_norm": 0.05081212520599365, "learning_rate": 4.7003206519616706e-06, "loss": 0.3509, "num_input_tokens_seen": 5692936, "step": 63220 }, { "epoch": 16.430613305613306, "grad_norm": 1.4197722673416138, "learning_rate": 4.6970119694347394e-06, "loss": 0.2667, "num_input_tokens_seen": 5693448, "step": 63225 }, { "epoch": 16.43191268191268, "grad_norm": 9.478899002075195, "learning_rate": 4.693704331109943e-06, "loss": 0.1248, "num_input_tokens_seen": 5693896, "step": 63230 }, { "epoch": 16.433212058212057, "grad_norm": 0.5079766511917114, "learning_rate": 4.690397737157384e-06, "loss": 0.1292, "num_input_tokens_seen": 5694344, "step": 63235 }, { "epoch": 16.434511434511435, "grad_norm": 0.0051168943755328655, "learning_rate": 4.687092187747136e-06, "loss": 0.1064, "num_input_tokens_seen": 5694824, "step": 63240 }, { "epoch": 16.43581081081081, "grad_norm": 0.03611985221505165, "learning_rate": 4.68378768304919e-06, "loss": 0.2106, "num_input_tokens_seen": 5695256, "step": 63245 }, { "epoch": 16.437110187110186, "grad_norm": 1.1332730054855347, "learning_rate": 4.680484223233511e-06, "loss": 0.4923, "num_input_tokens_seen": 5695720, "step": 63250 }, { "epoch": 16.438409563409564, "grad_norm": 0.08402756601572037, "learning_rate": 4.677181808470005e-06, "loss": 0.1882, "num_input_tokens_seen": 5696152, "step": 63255 }, { "epoch": 16.43970893970894, "grad_norm": 0.0015980496536940336, "learning_rate": 4.673880438928505e-06, "loss": 0.0397, "num_input_tokens_seen": 5696584, "step": 63260 }, { "epoch": 16.441008316008315, "grad_norm": 42.45761489868164, "learning_rate": 4.670580114778813e-06, "loss": 0.3442, "num_input_tokens_seen": 5697032, "step": 63265 }, { "epoch": 16.442307692307693, "grad_norm": 0.0017116125673055649, "learning_rate": 4.66728083619066e-06, "loss": 0.0113, "num_input_tokens_seen": 5697496, "step": 63270 }, { "epoch": 16.44360706860707, "grad_norm": 0.08979400247335434, "learning_rate": 4.663982603333741e-06, "loss": 0.075, "num_input_tokens_seen": 5697944, "step": 63275 }, { "epoch": 16.444906444906444, "grad_norm": 0.01650191657245159, "learning_rate": 4.660685416377677e-06, "loss": 0.2732, "num_input_tokens_seen": 5698376, "step": 63280 }, { "epoch": 16.446205821205822, "grad_norm": 0.5462944507598877, "learning_rate": 4.65738927549205e-06, "loss": 0.0014, "num_input_tokens_seen": 5698824, "step": 63285 }, { "epoch": 16.447505197505198, "grad_norm": 2.6237354278564453, "learning_rate": 4.654094180846383e-06, "loss": 0.0793, "num_input_tokens_seen": 5699256, "step": 63290 }, { "epoch": 16.448804573804573, "grad_norm": 0.11127163469791412, "learning_rate": 4.650800132610153e-06, "loss": 0.2048, "num_input_tokens_seen": 5699704, "step": 63295 }, { "epoch": 16.45010395010395, "grad_norm": 22.40705680847168, "learning_rate": 4.647507130952763e-06, "loss": 0.0772, "num_input_tokens_seen": 5700152, "step": 63300 }, { "epoch": 16.451403326403327, "grad_norm": 31.011112213134766, "learning_rate": 4.6442151760435895e-06, "loss": 0.0758, "num_input_tokens_seen": 5700632, "step": 63305 }, { "epoch": 16.4527027027027, "grad_norm": 0.1794019192457199, "learning_rate": 4.640924268051933e-06, "loss": 0.0608, "num_input_tokens_seen": 5701080, "step": 63310 }, { "epoch": 16.45400207900208, "grad_norm": 0.1270427256822586, "learning_rate": 4.637634407147043e-06, "loss": 0.468, "num_input_tokens_seen": 5701560, "step": 63315 }, { "epoch": 16.455301455301456, "grad_norm": 0.08856356889009476, "learning_rate": 4.6343455934981235e-06, "loss": 0.0735, "num_input_tokens_seen": 5702040, "step": 63320 }, { "epoch": 16.45660083160083, "grad_norm": 0.00481558870524168, "learning_rate": 4.631057827274327e-06, "loss": 0.04, "num_input_tokens_seen": 5702520, "step": 63325 }, { "epoch": 16.45790020790021, "grad_norm": 0.07706804573535919, "learning_rate": 4.627771108644746e-06, "loss": 0.0024, "num_input_tokens_seen": 5702984, "step": 63330 }, { "epoch": 16.459199584199585, "grad_norm": 55.111576080322266, "learning_rate": 4.624485437778414e-06, "loss": 0.3865, "num_input_tokens_seen": 5703416, "step": 63335 }, { "epoch": 16.46049896049896, "grad_norm": 0.07717664539813995, "learning_rate": 4.621200814844323e-06, "loss": 0.0022, "num_input_tokens_seen": 5703896, "step": 63340 }, { "epoch": 16.46179833679834, "grad_norm": 0.0025719727855175734, "learning_rate": 4.617917240011394e-06, "loss": 0.1948, "num_input_tokens_seen": 5704360, "step": 63345 }, { "epoch": 16.463097713097714, "grad_norm": 3.502149820327759, "learning_rate": 4.61463471344851e-06, "loss": 0.2004, "num_input_tokens_seen": 5704824, "step": 63350 }, { "epoch": 16.46439708939709, "grad_norm": 30.093908309936523, "learning_rate": 4.611353235324494e-06, "loss": 0.2179, "num_input_tokens_seen": 5705272, "step": 63355 }, { "epoch": 16.465696465696467, "grad_norm": 0.1896183043718338, "learning_rate": 4.608072805808125e-06, "loss": 0.0005, "num_input_tokens_seen": 5705768, "step": 63360 }, { "epoch": 16.466995841995843, "grad_norm": 3.481964588165283, "learning_rate": 4.604793425068102e-06, "loss": 0.0423, "num_input_tokens_seen": 5706232, "step": 63365 }, { "epoch": 16.468295218295218, "grad_norm": 44.79933547973633, "learning_rate": 4.601515093273101e-06, "loss": 0.095, "num_input_tokens_seen": 5706696, "step": 63370 }, { "epoch": 16.469594594594593, "grad_norm": 0.1977854073047638, "learning_rate": 4.598237810591727e-06, "loss": 0.1, "num_input_tokens_seen": 5707096, "step": 63375 }, { "epoch": 16.47089397089397, "grad_norm": 0.0963514894247055, "learning_rate": 4.594961577192522e-06, "loss": 0.0043, "num_input_tokens_seen": 5707560, "step": 63380 }, { "epoch": 16.472193347193347, "grad_norm": 18.338336944580078, "learning_rate": 4.5916863932439934e-06, "loss": 0.0881, "num_input_tokens_seen": 5708008, "step": 63385 }, { "epoch": 16.473492723492722, "grad_norm": 47.58101272583008, "learning_rate": 4.58841225891459e-06, "loss": 0.1298, "num_input_tokens_seen": 5708440, "step": 63390 }, { "epoch": 16.4747920997921, "grad_norm": 0.02025737054646015, "learning_rate": 4.585139174372704e-06, "loss": 0.0024, "num_input_tokens_seen": 5708888, "step": 63395 }, { "epoch": 16.476091476091476, "grad_norm": 0.006894504185765982, "learning_rate": 4.581867139786669e-06, "loss": 0.2526, "num_input_tokens_seen": 5709336, "step": 63400 }, { "epoch": 16.47739085239085, "grad_norm": 0.09279008209705353, "learning_rate": 4.578596155324774e-06, "loss": 0.0083, "num_input_tokens_seen": 5709784, "step": 63405 }, { "epoch": 16.47869022869023, "grad_norm": 6.169641017913818, "learning_rate": 4.57532622115524e-06, "loss": 0.1108, "num_input_tokens_seen": 5710264, "step": 63410 }, { "epoch": 16.479989604989605, "grad_norm": 7.841452121734619, "learning_rate": 4.572057337446256e-06, "loss": 0.0289, "num_input_tokens_seen": 5710680, "step": 63415 }, { "epoch": 16.48128898128898, "grad_norm": 15.1566801071167, "learning_rate": 4.56878950436593e-06, "loss": 0.0197, "num_input_tokens_seen": 5711112, "step": 63420 }, { "epoch": 16.48258835758836, "grad_norm": 13.978263854980469, "learning_rate": 4.565522722082336e-06, "loss": 0.17, "num_input_tokens_seen": 5711544, "step": 63425 }, { "epoch": 16.483887733887734, "grad_norm": 0.24259547889232635, "learning_rate": 4.562256990763486e-06, "loss": 0.0026, "num_input_tokens_seen": 5711976, "step": 63430 }, { "epoch": 16.48518711018711, "grad_norm": 28.657657623291016, "learning_rate": 4.558992310577348e-06, "loss": 0.1468, "num_input_tokens_seen": 5712408, "step": 63435 }, { "epoch": 16.486486486486488, "grad_norm": 21.49627113342285, "learning_rate": 4.5557286816918205e-06, "loss": 0.1573, "num_input_tokens_seen": 5712840, "step": 63440 }, { "epoch": 16.487785862785863, "grad_norm": 2.036142110824585, "learning_rate": 4.552466104274752e-06, "loss": 0.2124, "num_input_tokens_seen": 5713288, "step": 63445 }, { "epoch": 16.489085239085238, "grad_norm": 0.014261203818023205, "learning_rate": 4.549204578493949e-06, "loss": 0.0039, "num_input_tokens_seen": 5713816, "step": 63450 }, { "epoch": 16.490384615384617, "grad_norm": 2.1928048133850098, "learning_rate": 4.5459441045171414e-06, "loss": 0.3131, "num_input_tokens_seen": 5714280, "step": 63455 }, { "epoch": 16.491683991683992, "grad_norm": 29.241952896118164, "learning_rate": 4.54268468251203e-06, "loss": 0.2739, "num_input_tokens_seen": 5714712, "step": 63460 }, { "epoch": 16.492983367983367, "grad_norm": 16.36642837524414, "learning_rate": 4.5394263126462445e-06, "loss": 0.1132, "num_input_tokens_seen": 5715128, "step": 63465 }, { "epoch": 16.494282744282746, "grad_norm": 4.921795845031738, "learning_rate": 4.5361689950873725e-06, "loss": 0.2139, "num_input_tokens_seen": 5715624, "step": 63470 }, { "epoch": 16.49558212058212, "grad_norm": 22.555011749267578, "learning_rate": 4.532912730002934e-06, "loss": 0.0977, "num_input_tokens_seen": 5716104, "step": 63475 }, { "epoch": 16.496881496881496, "grad_norm": 29.29011344909668, "learning_rate": 4.52965751756041e-06, "loss": 0.19, "num_input_tokens_seen": 5716536, "step": 63480 }, { "epoch": 16.498180873180875, "grad_norm": 6.152042865753174, "learning_rate": 4.526403357927206e-06, "loss": 0.0311, "num_input_tokens_seen": 5716984, "step": 63485 }, { "epoch": 16.49948024948025, "grad_norm": 0.001599145820364356, "learning_rate": 4.5231502512707e-06, "loss": 0.0987, "num_input_tokens_seen": 5717432, "step": 63490 }, { "epoch": 16.500779625779625, "grad_norm": 0.012155001983046532, "learning_rate": 4.519898197758191e-06, "loss": 0.1421, "num_input_tokens_seen": 5717864, "step": 63495 }, { "epoch": 16.502079002079, "grad_norm": 1.329084038734436, "learning_rate": 4.516647197556942e-06, "loss": 0.3501, "num_input_tokens_seen": 5718296, "step": 63500 }, { "epoch": 16.50337837837838, "grad_norm": 0.1397242248058319, "learning_rate": 4.513397250834159e-06, "loss": 0.0529, "num_input_tokens_seen": 5718792, "step": 63505 }, { "epoch": 16.504677754677754, "grad_norm": 0.12416888028383255, "learning_rate": 4.510148357756977e-06, "loss": 0.0846, "num_input_tokens_seen": 5719240, "step": 63510 }, { "epoch": 16.50597713097713, "grad_norm": 34.036338806152344, "learning_rate": 4.506900518492504e-06, "loss": 0.2783, "num_input_tokens_seen": 5719672, "step": 63515 }, { "epoch": 16.507276507276508, "grad_norm": 0.3661005198955536, "learning_rate": 4.503653733207769e-06, "loss": 0.4086, "num_input_tokens_seen": 5720088, "step": 63520 }, { "epoch": 16.508575883575883, "grad_norm": 2.2291581630706787, "learning_rate": 4.500408002069767e-06, "loss": 0.1553, "num_input_tokens_seen": 5720568, "step": 63525 }, { "epoch": 16.50987525987526, "grad_norm": 0.00830561388283968, "learning_rate": 4.497163325245416e-06, "loss": 0.0046, "num_input_tokens_seen": 5721000, "step": 63530 }, { "epoch": 16.511174636174637, "grad_norm": 34.43952560424805, "learning_rate": 4.4939197029016e-06, "loss": 0.0725, "num_input_tokens_seen": 5721448, "step": 63535 }, { "epoch": 16.512474012474012, "grad_norm": 0.003657130291685462, "learning_rate": 4.49067713520514e-06, "loss": 0.1489, "num_input_tokens_seen": 5721912, "step": 63540 }, { "epoch": 16.513773388773387, "grad_norm": 0.7838333249092102, "learning_rate": 4.487435622322814e-06, "loss": 0.0434, "num_input_tokens_seen": 5722360, "step": 63545 }, { "epoch": 16.515072765072766, "grad_norm": 45.015525817871094, "learning_rate": 4.4841951644213235e-06, "loss": 0.3692, "num_input_tokens_seen": 5722776, "step": 63550 }, { "epoch": 16.51637214137214, "grad_norm": 5.079973220825195, "learning_rate": 4.480955761667338e-06, "loss": 0.3027, "num_input_tokens_seen": 5723224, "step": 63555 }, { "epoch": 16.517671517671516, "grad_norm": 0.25992655754089355, "learning_rate": 4.4777174142274506e-06, "loss": 0.0005, "num_input_tokens_seen": 5723704, "step": 63560 }, { "epoch": 16.518970893970895, "grad_norm": 22.575632095336914, "learning_rate": 4.474480122268226e-06, "loss": 0.0339, "num_input_tokens_seen": 5724152, "step": 63565 }, { "epoch": 16.52027027027027, "grad_norm": 0.04569080099463463, "learning_rate": 4.47124388595615e-06, "loss": 0.0009, "num_input_tokens_seen": 5724584, "step": 63570 }, { "epoch": 16.521569646569645, "grad_norm": 0.016592366620898247, "learning_rate": 4.468008705457671e-06, "loss": 0.1711, "num_input_tokens_seen": 5725032, "step": 63575 }, { "epoch": 16.522869022869024, "grad_norm": 0.01135889533907175, "learning_rate": 4.464774580939185e-06, "loss": 0.0182, "num_input_tokens_seen": 5725480, "step": 63580 }, { "epoch": 16.5241683991684, "grad_norm": 3.220764398574829, "learning_rate": 4.461541512567011e-06, "loss": 0.046, "num_input_tokens_seen": 5725896, "step": 63585 }, { "epoch": 16.525467775467774, "grad_norm": 0.02168402448296547, "learning_rate": 4.458309500507441e-06, "loss": 0.3027, "num_input_tokens_seen": 5726344, "step": 63590 }, { "epoch": 16.526767151767153, "grad_norm": 0.510559618473053, "learning_rate": 4.455078544926689e-06, "loss": 0.0151, "num_input_tokens_seen": 5726792, "step": 63595 }, { "epoch": 16.528066528066528, "grad_norm": 0.018911361694335938, "learning_rate": 4.451848645990933e-06, "loss": 0.0018, "num_input_tokens_seen": 5727240, "step": 63600 }, { "epoch": 16.529365904365903, "grad_norm": 0.023705242201685905, "learning_rate": 4.448619803866291e-06, "loss": 0.2227, "num_input_tokens_seen": 5727672, "step": 63605 }, { "epoch": 16.530665280665282, "grad_norm": 0.010858136229217052, "learning_rate": 4.445392018718827e-06, "loss": 0.0023, "num_input_tokens_seen": 5728104, "step": 63610 }, { "epoch": 16.531964656964657, "grad_norm": 2.6388635635375977, "learning_rate": 4.442165290714542e-06, "loss": 0.1803, "num_input_tokens_seen": 5728536, "step": 63615 }, { "epoch": 16.533264033264032, "grad_norm": 6.695408821105957, "learning_rate": 4.4389396200194e-06, "loss": 0.0221, "num_input_tokens_seen": 5728984, "step": 63620 }, { "epoch": 16.53456340956341, "grad_norm": 0.09858476370573044, "learning_rate": 4.4357150067992876e-06, "loss": 0.3164, "num_input_tokens_seen": 5729432, "step": 63625 }, { "epoch": 16.535862785862786, "grad_norm": 1.926587700843811, "learning_rate": 4.432491451220061e-06, "loss": 0.0366, "num_input_tokens_seen": 5729880, "step": 63630 }, { "epoch": 16.53716216216216, "grad_norm": 4.329178333282471, "learning_rate": 4.4292689534475015e-06, "loss": 0.2312, "num_input_tokens_seen": 5730296, "step": 63635 }, { "epoch": 16.53846153846154, "grad_norm": 19.13875389099121, "learning_rate": 4.4260475136473495e-06, "loss": 0.3312, "num_input_tokens_seen": 5730744, "step": 63640 }, { "epoch": 16.539760914760915, "grad_norm": 0.8043538331985474, "learning_rate": 4.422827131985291e-06, "loss": 0.1494, "num_input_tokens_seen": 5731176, "step": 63645 }, { "epoch": 16.54106029106029, "grad_norm": 0.04459971934556961, "learning_rate": 4.419607808626946e-06, "loss": 0.0012, "num_input_tokens_seen": 5731608, "step": 63650 }, { "epoch": 16.54235966735967, "grad_norm": 26.296796798706055, "learning_rate": 4.416389543737895e-06, "loss": 0.2854, "num_input_tokens_seen": 5732040, "step": 63655 }, { "epoch": 16.543659043659044, "grad_norm": 0.049254246056079865, "learning_rate": 4.413172337483645e-06, "loss": 0.2896, "num_input_tokens_seen": 5732488, "step": 63660 }, { "epoch": 16.54495841995842, "grad_norm": 0.20948441326618195, "learning_rate": 4.409956190029674e-06, "loss": 0.3917, "num_input_tokens_seen": 5732952, "step": 63665 }, { "epoch": 16.546257796257795, "grad_norm": 0.04307189956307411, "learning_rate": 4.406741101541378e-06, "loss": 0.5075, "num_input_tokens_seen": 5733384, "step": 63670 }, { "epoch": 16.547557172557173, "grad_norm": 0.15051597356796265, "learning_rate": 4.4035270721841215e-06, "loss": 0.2384, "num_input_tokens_seen": 5733800, "step": 63675 }, { "epoch": 16.54885654885655, "grad_norm": 30.890296936035156, "learning_rate": 4.400314102123199e-06, "loss": 0.086, "num_input_tokens_seen": 5734280, "step": 63680 }, { "epoch": 16.550155925155924, "grad_norm": 3.5778722763061523, "learning_rate": 4.397102191523869e-06, "loss": 0.1984, "num_input_tokens_seen": 5734712, "step": 63685 }, { "epoch": 16.551455301455302, "grad_norm": 2.3047313690185547, "learning_rate": 4.393891340551304e-06, "loss": 0.003, "num_input_tokens_seen": 5735160, "step": 63690 }, { "epoch": 16.552754677754677, "grad_norm": 49.82772445678711, "learning_rate": 4.390681549370659e-06, "loss": 0.2289, "num_input_tokens_seen": 5735576, "step": 63695 }, { "epoch": 16.554054054054053, "grad_norm": 5.795137882232666, "learning_rate": 4.387472818147012e-06, "loss": 0.0191, "num_input_tokens_seen": 5736008, "step": 63700 }, { "epoch": 16.55535343035343, "grad_norm": 5.873415470123291, "learning_rate": 4.384265147045377e-06, "loss": 0.139, "num_input_tokens_seen": 5736472, "step": 63705 }, { "epoch": 16.556652806652806, "grad_norm": 0.051376137882471085, "learning_rate": 4.381058536230742e-06, "loss": 0.0027, "num_input_tokens_seen": 5736888, "step": 63710 }, { "epoch": 16.55795218295218, "grad_norm": 0.015281399711966515, "learning_rate": 4.377852985868019e-06, "loss": 0.2227, "num_input_tokens_seen": 5737352, "step": 63715 }, { "epoch": 16.55925155925156, "grad_norm": 0.18431918323040009, "learning_rate": 4.3746484961220856e-06, "loss": 0.0154, "num_input_tokens_seen": 5737816, "step": 63720 }, { "epoch": 16.560550935550935, "grad_norm": 35.77360916137695, "learning_rate": 4.371445067157734e-06, "loss": 0.358, "num_input_tokens_seen": 5738264, "step": 63725 }, { "epoch": 16.56185031185031, "grad_norm": 0.9360783696174622, "learning_rate": 4.368242699139735e-06, "loss": 0.2505, "num_input_tokens_seen": 5738728, "step": 63730 }, { "epoch": 16.56314968814969, "grad_norm": 0.685850977897644, "learning_rate": 4.365041392232777e-06, "loss": 0.485, "num_input_tokens_seen": 5739160, "step": 63735 }, { "epoch": 16.564449064449065, "grad_norm": 28.57065773010254, "learning_rate": 4.3618411466015165e-06, "loss": 0.0838, "num_input_tokens_seen": 5739624, "step": 63740 }, { "epoch": 16.56574844074844, "grad_norm": 11.548256874084473, "learning_rate": 4.358641962410537e-06, "loss": 0.4147, "num_input_tokens_seen": 5740072, "step": 63745 }, { "epoch": 16.56704781704782, "grad_norm": 21.65971565246582, "learning_rate": 4.355443839824375e-06, "loss": 0.3537, "num_input_tokens_seen": 5740488, "step": 63750 }, { "epoch": 16.568347193347194, "grad_norm": 31.98305320739746, "learning_rate": 4.352246779007518e-06, "loss": 0.368, "num_input_tokens_seen": 5740952, "step": 63755 }, { "epoch": 16.56964656964657, "grad_norm": 0.3230973780155182, "learning_rate": 4.3490507801244005e-06, "loss": 0.0397, "num_input_tokens_seen": 5741416, "step": 63760 }, { "epoch": 16.570945945945947, "grad_norm": 0.5696945190429688, "learning_rate": 4.3458558433393885e-06, "loss": 0.064, "num_input_tokens_seen": 5741880, "step": 63765 }, { "epoch": 16.572245322245323, "grad_norm": 0.01897605136036873, "learning_rate": 4.342661968816791e-06, "loss": 0.0024, "num_input_tokens_seen": 5742328, "step": 63770 }, { "epoch": 16.573544698544698, "grad_norm": 82.56449890136719, "learning_rate": 4.33946915672089e-06, "loss": 0.1757, "num_input_tokens_seen": 5742792, "step": 63775 }, { "epoch": 16.574844074844076, "grad_norm": 1.6516677141189575, "learning_rate": 4.33627740721588e-06, "loss": 0.0251, "num_input_tokens_seen": 5743256, "step": 63780 }, { "epoch": 16.57614345114345, "grad_norm": 0.9449262619018555, "learning_rate": 4.333086720465923e-06, "loss": 0.0049, "num_input_tokens_seen": 5743688, "step": 63785 }, { "epoch": 16.577442827442827, "grad_norm": 0.37507161498069763, "learning_rate": 4.329897096635116e-06, "loss": 0.0138, "num_input_tokens_seen": 5744136, "step": 63790 }, { "epoch": 16.578742203742205, "grad_norm": 0.31404930353164673, "learning_rate": 4.326708535887514e-06, "loss": 0.138, "num_input_tokens_seen": 5744600, "step": 63795 }, { "epoch": 16.58004158004158, "grad_norm": 2.162461042404175, "learning_rate": 4.323521038387094e-06, "loss": 0.2138, "num_input_tokens_seen": 5745064, "step": 63800 }, { "epoch": 16.581340956340956, "grad_norm": 0.29647892713546753, "learning_rate": 4.320334604297801e-06, "loss": 0.0093, "num_input_tokens_seen": 5745512, "step": 63805 }, { "epoch": 16.58264033264033, "grad_norm": 5.142972469329834, "learning_rate": 4.3171492337835105e-06, "loss": 0.0064, "num_input_tokens_seen": 5746008, "step": 63810 }, { "epoch": 16.58393970893971, "grad_norm": 1.3580337762832642, "learning_rate": 4.31396492700806e-06, "loss": 0.0411, "num_input_tokens_seen": 5746456, "step": 63815 }, { "epoch": 16.585239085239085, "grad_norm": 0.0024647582322359085, "learning_rate": 4.310781684135199e-06, "loss": 0.0022, "num_input_tokens_seen": 5746904, "step": 63820 }, { "epoch": 16.58653846153846, "grad_norm": 0.8052776455879211, "learning_rate": 4.307599505328672e-06, "loss": 0.382, "num_input_tokens_seen": 5747352, "step": 63825 }, { "epoch": 16.58783783783784, "grad_norm": 0.26866352558135986, "learning_rate": 4.3044183907521244e-06, "loss": 0.0051, "num_input_tokens_seen": 5747768, "step": 63830 }, { "epoch": 16.589137214137214, "grad_norm": 0.15858326852321625, "learning_rate": 4.301238340569172e-06, "loss": 0.003, "num_input_tokens_seen": 5748248, "step": 63835 }, { "epoch": 16.59043659043659, "grad_norm": 0.4629608392715454, "learning_rate": 4.298059354943365e-06, "loss": 0.0061, "num_input_tokens_seen": 5748760, "step": 63840 }, { "epoch": 16.591735966735968, "grad_norm": 17.863752365112305, "learning_rate": 4.294881434038195e-06, "loss": 0.0947, "num_input_tokens_seen": 5749208, "step": 63845 }, { "epoch": 16.593035343035343, "grad_norm": 0.0793105810880661, "learning_rate": 4.291704578017114e-06, "loss": 0.283, "num_input_tokens_seen": 5749640, "step": 63850 }, { "epoch": 16.594334719334718, "grad_norm": 0.120737224817276, "learning_rate": 4.288528787043505e-06, "loss": 0.0887, "num_input_tokens_seen": 5750088, "step": 63855 }, { "epoch": 16.595634095634097, "grad_norm": 25.644935607910156, "learning_rate": 4.285354061280713e-06, "loss": 0.061, "num_input_tokens_seen": 5750536, "step": 63860 }, { "epoch": 16.596933471933472, "grad_norm": 0.7472628355026245, "learning_rate": 4.282180400892002e-06, "loss": 0.1868, "num_input_tokens_seen": 5750968, "step": 63865 }, { "epoch": 16.598232848232847, "grad_norm": 0.011871443130075932, "learning_rate": 4.279007806040611e-06, "loss": 0.061, "num_input_tokens_seen": 5751416, "step": 63870 }, { "epoch": 16.599532224532226, "grad_norm": 3.850497007369995, "learning_rate": 4.275836276889697e-06, "loss": 0.0069, "num_input_tokens_seen": 5751848, "step": 63875 }, { "epoch": 16.6008316008316, "grad_norm": 0.43564993143081665, "learning_rate": 4.272665813602386e-06, "loss": 0.1081, "num_input_tokens_seen": 5752280, "step": 63880 }, { "epoch": 16.602130977130976, "grad_norm": 0.005573916248977184, "learning_rate": 4.269496416341725e-06, "loss": 0.2366, "num_input_tokens_seen": 5752776, "step": 63885 }, { "epoch": 16.603430353430355, "grad_norm": 21.62871742248535, "learning_rate": 4.266328085270726e-06, "loss": 0.4635, "num_input_tokens_seen": 5753192, "step": 63890 }, { "epoch": 16.60472972972973, "grad_norm": 55.003257751464844, "learning_rate": 4.2631608205523424e-06, "loss": 0.4052, "num_input_tokens_seen": 5753640, "step": 63895 }, { "epoch": 16.606029106029105, "grad_norm": 11.381539344787598, "learning_rate": 4.25999462234947e-06, "loss": 0.0466, "num_input_tokens_seen": 5754136, "step": 63900 }, { "epoch": 16.607328482328484, "grad_norm": 0.06681716442108154, "learning_rate": 4.256829490824949e-06, "loss": 0.0091, "num_input_tokens_seen": 5754584, "step": 63905 }, { "epoch": 16.60862785862786, "grad_norm": 4.443567276000977, "learning_rate": 4.253665426141554e-06, "loss": 0.0162, "num_input_tokens_seen": 5755016, "step": 63910 }, { "epoch": 16.609927234927234, "grad_norm": 0.18333286046981812, "learning_rate": 4.25050242846203e-06, "loss": 0.0266, "num_input_tokens_seen": 5755432, "step": 63915 }, { "epoch": 16.611226611226613, "grad_norm": 0.12208230048418045, "learning_rate": 4.247340497949043e-06, "loss": 0.1579, "num_input_tokens_seen": 5755880, "step": 63920 }, { "epoch": 16.612525987525988, "grad_norm": 48.625614166259766, "learning_rate": 4.244179634765219e-06, "loss": 0.2973, "num_input_tokens_seen": 5756312, "step": 63925 }, { "epoch": 16.613825363825363, "grad_norm": 1.8817403316497803, "learning_rate": 4.241019839073124e-06, "loss": 0.2091, "num_input_tokens_seen": 5756760, "step": 63930 }, { "epoch": 16.61512474012474, "grad_norm": 0.005396897904574871, "learning_rate": 4.237861111035271e-06, "loss": 0.0008, "num_input_tokens_seen": 5757224, "step": 63935 }, { "epoch": 16.616424116424117, "grad_norm": 2.5404043197631836, "learning_rate": 4.234703450814112e-06, "loss": 0.2696, "num_input_tokens_seen": 5757656, "step": 63940 }, { "epoch": 16.617723492723492, "grad_norm": 1.5347830057144165, "learning_rate": 4.231546858572055e-06, "loss": 0.0037, "num_input_tokens_seen": 5758120, "step": 63945 }, { "epoch": 16.61902286902287, "grad_norm": 50.71528244018555, "learning_rate": 4.2283913344714375e-06, "loss": 0.1343, "num_input_tokens_seen": 5758616, "step": 63950 }, { "epoch": 16.620322245322246, "grad_norm": 21.82352638244629, "learning_rate": 4.225236878674563e-06, "loss": 0.2552, "num_input_tokens_seen": 5759064, "step": 63955 }, { "epoch": 16.62162162162162, "grad_norm": 0.026563623920083046, "learning_rate": 4.222083491343653e-06, "loss": 0.4554, "num_input_tokens_seen": 5759528, "step": 63960 }, { "epoch": 16.622920997921, "grad_norm": 4.832674503326416, "learning_rate": 4.218931172640899e-06, "loss": 0.0059, "num_input_tokens_seen": 5759928, "step": 63965 }, { "epoch": 16.624220374220375, "grad_norm": 0.010954577475786209, "learning_rate": 4.215779922728432e-06, "loss": 0.1869, "num_input_tokens_seen": 5760392, "step": 63970 }, { "epoch": 16.62551975051975, "grad_norm": 0.09837339073419571, "learning_rate": 4.212629741768312e-06, "loss": 0.2195, "num_input_tokens_seen": 5760808, "step": 63975 }, { "epoch": 16.626819126819125, "grad_norm": 43.335121154785156, "learning_rate": 4.209480629922569e-06, "loss": 0.1657, "num_input_tokens_seen": 5761240, "step": 63980 }, { "epoch": 16.628118503118504, "grad_norm": 0.025544432923197746, "learning_rate": 4.206332587353149e-06, "loss": 0.2991, "num_input_tokens_seen": 5761672, "step": 63985 }, { "epoch": 16.62941787941788, "grad_norm": 22.650161743164062, "learning_rate": 4.203185614221975e-06, "loss": 0.1599, "num_input_tokens_seen": 5762104, "step": 63990 }, { "epoch": 16.630717255717254, "grad_norm": 36.04249572753906, "learning_rate": 4.2000397106908865e-06, "loss": 0.1053, "num_input_tokens_seen": 5762584, "step": 63995 }, { "epoch": 16.632016632016633, "grad_norm": 0.5908635258674622, "learning_rate": 4.196894876921684e-06, "loss": 0.1975, "num_input_tokens_seen": 5763032, "step": 64000 }, { "epoch": 16.633316008316008, "grad_norm": 0.0033595114946365356, "learning_rate": 4.1937511130761106e-06, "loss": 0.1235, "num_input_tokens_seen": 5763480, "step": 64005 }, { "epoch": 16.634615384615383, "grad_norm": 0.01170043833553791, "learning_rate": 4.190608419315858e-06, "loss": 0.0182, "num_input_tokens_seen": 5763960, "step": 64010 }, { "epoch": 16.635914760914762, "grad_norm": 0.0059038144536316395, "learning_rate": 4.18746679580255e-06, "loss": 0.0531, "num_input_tokens_seen": 5764392, "step": 64015 }, { "epoch": 16.637214137214137, "grad_norm": 5.025929927825928, "learning_rate": 4.18432624269777e-06, "loss": 0.0155, "num_input_tokens_seen": 5764840, "step": 64020 }, { "epoch": 16.638513513513512, "grad_norm": 0.5157904624938965, "learning_rate": 4.18118676016303e-06, "loss": 0.1737, "num_input_tokens_seen": 5765304, "step": 64025 }, { "epoch": 16.63981288981289, "grad_norm": 0.040540408343076706, "learning_rate": 4.178048348359809e-06, "loss": 0.0093, "num_input_tokens_seen": 5765784, "step": 64030 }, { "epoch": 16.641112266112266, "grad_norm": 34.08998107910156, "learning_rate": 4.174911007449505e-06, "loss": 0.265, "num_input_tokens_seen": 5766264, "step": 64035 }, { "epoch": 16.64241164241164, "grad_norm": 0.12964260578155518, "learning_rate": 4.171774737593484e-06, "loss": 0.1807, "num_input_tokens_seen": 5766680, "step": 64040 }, { "epoch": 16.64371101871102, "grad_norm": 0.0070334868505597115, "learning_rate": 4.168639538953048e-06, "loss": 0.0433, "num_input_tokens_seen": 5767112, "step": 64045 }, { "epoch": 16.645010395010395, "grad_norm": 0.20043854415416718, "learning_rate": 4.165505411689435e-06, "loss": 0.0081, "num_input_tokens_seen": 5767528, "step": 64050 }, { "epoch": 16.64630977130977, "grad_norm": 53.367183685302734, "learning_rate": 4.162372355963848e-06, "loss": 0.2891, "num_input_tokens_seen": 5768040, "step": 64055 }, { "epoch": 16.64760914760915, "grad_norm": 0.0713341012597084, "learning_rate": 4.159240371937412e-06, "loss": 0.0021, "num_input_tokens_seen": 5768456, "step": 64060 }, { "epoch": 16.648908523908524, "grad_norm": 3.732013463973999, "learning_rate": 4.1561094597712155e-06, "loss": 0.0614, "num_input_tokens_seen": 5768920, "step": 64065 }, { "epoch": 16.6502079002079, "grad_norm": 1.2781931161880493, "learning_rate": 4.152979619626271e-06, "loss": 0.0593, "num_input_tokens_seen": 5769384, "step": 64070 }, { "epoch": 16.651507276507278, "grad_norm": 0.10350203514099121, "learning_rate": 4.149850851663569e-06, "loss": 0.0156, "num_input_tokens_seen": 5769848, "step": 64075 }, { "epoch": 16.652806652806653, "grad_norm": 0.027846558019518852, "learning_rate": 4.146723156044014e-06, "loss": 0.0018, "num_input_tokens_seen": 5770280, "step": 64080 }, { "epoch": 16.65410602910603, "grad_norm": 9.106023788452148, "learning_rate": 4.143596532928468e-06, "loss": 0.1104, "num_input_tokens_seen": 5770744, "step": 64085 }, { "epoch": 16.655405405405407, "grad_norm": 1.6538846492767334, "learning_rate": 4.140470982477735e-06, "loss": 0.0822, "num_input_tokens_seen": 5771208, "step": 64090 }, { "epoch": 16.656704781704782, "grad_norm": 0.13860449194908142, "learning_rate": 4.137346504852569e-06, "loss": 0.0024, "num_input_tokens_seen": 5771672, "step": 64095 }, { "epoch": 16.658004158004157, "grad_norm": 3.9131789207458496, "learning_rate": 4.134223100213655e-06, "loss": 0.5043, "num_input_tokens_seen": 5772104, "step": 64100 }, { "epoch": 16.659303534303533, "grad_norm": 8.981365203857422, "learning_rate": 4.131100768721641e-06, "loss": 0.029, "num_input_tokens_seen": 5772568, "step": 64105 }, { "epoch": 16.66060291060291, "grad_norm": 74.79828643798828, "learning_rate": 4.127979510537117e-06, "loss": 0.4861, "num_input_tokens_seen": 5773032, "step": 64110 }, { "epoch": 16.661902286902286, "grad_norm": 0.0065336572006344795, "learning_rate": 4.1248593258205986e-06, "loss": 0.1072, "num_input_tokens_seen": 5773480, "step": 64115 }, { "epoch": 16.66320166320166, "grad_norm": 11.282339096069336, "learning_rate": 4.121740214732572e-06, "loss": 0.0341, "num_input_tokens_seen": 5773928, "step": 64120 }, { "epoch": 16.66450103950104, "grad_norm": 0.038332268595695496, "learning_rate": 4.1186221774334455e-06, "loss": 0.2509, "num_input_tokens_seen": 5774392, "step": 64125 }, { "epoch": 16.665800415800415, "grad_norm": 0.07148951292037964, "learning_rate": 4.115505214083596e-06, "loss": 0.3098, "num_input_tokens_seen": 5774840, "step": 64130 }, { "epoch": 16.66709979209979, "grad_norm": 0.23013044893741608, "learning_rate": 4.112389324843313e-06, "loss": 0.2796, "num_input_tokens_seen": 5775288, "step": 64135 }, { "epoch": 16.66839916839917, "grad_norm": 1.3645530939102173, "learning_rate": 4.109274509872865e-06, "loss": 0.0575, "num_input_tokens_seen": 5775720, "step": 64140 }, { "epoch": 16.669698544698544, "grad_norm": 2.2707085609436035, "learning_rate": 4.106160769332443e-06, "loss": 0.0337, "num_input_tokens_seen": 5776184, "step": 64145 }, { "epoch": 16.67099792099792, "grad_norm": 41.337432861328125, "learning_rate": 4.103048103382198e-06, "loss": 0.2115, "num_input_tokens_seen": 5776632, "step": 64150 }, { "epoch": 16.6722972972973, "grad_norm": 13.663277626037598, "learning_rate": 4.099936512182207e-06, "loss": 0.0301, "num_input_tokens_seen": 5777080, "step": 64155 }, { "epoch": 16.673596673596673, "grad_norm": 2.759129285812378, "learning_rate": 4.096825995892512e-06, "loss": 0.4173, "num_input_tokens_seen": 5777512, "step": 64160 }, { "epoch": 16.67489604989605, "grad_norm": 12.349565505981445, "learning_rate": 4.093716554673085e-06, "loss": 0.2993, "num_input_tokens_seen": 5777928, "step": 64165 }, { "epoch": 16.676195426195427, "grad_norm": 0.14064830541610718, "learning_rate": 4.090608188683842e-06, "loss": 0.019, "num_input_tokens_seen": 5778360, "step": 64170 }, { "epoch": 16.677494802494802, "grad_norm": 12.111611366271973, "learning_rate": 4.087500898084651e-06, "loss": 0.7799, "num_input_tokens_seen": 5778824, "step": 64175 }, { "epoch": 16.678794178794178, "grad_norm": 0.19823302328586578, "learning_rate": 4.084394683035328e-06, "loss": 0.347, "num_input_tokens_seen": 5779256, "step": 64180 }, { "epoch": 16.680093555093556, "grad_norm": 0.0010824176715686917, "learning_rate": 4.081289543695635e-06, "loss": 0.0957, "num_input_tokens_seen": 5779704, "step": 64185 }, { "epoch": 16.68139293139293, "grad_norm": 2.614184617996216, "learning_rate": 4.078185480225255e-06, "loss": 0.027, "num_input_tokens_seen": 5780136, "step": 64190 }, { "epoch": 16.682692307692307, "grad_norm": 0.013083935715258121, "learning_rate": 4.075082492783849e-06, "loss": 0.0457, "num_input_tokens_seen": 5780600, "step": 64195 }, { "epoch": 16.683991683991685, "grad_norm": 0.04273996502161026, "learning_rate": 4.071980581530993e-06, "loss": 0.023, "num_input_tokens_seen": 5781080, "step": 64200 }, { "epoch": 16.68529106029106, "grad_norm": 0.006236481945961714, "learning_rate": 4.068879746626236e-06, "loss": 0.011, "num_input_tokens_seen": 5781544, "step": 64205 }, { "epoch": 16.686590436590436, "grad_norm": 0.18804702162742615, "learning_rate": 4.06577998822904e-06, "loss": 0.0033, "num_input_tokens_seen": 5781960, "step": 64210 }, { "epoch": 16.687889812889814, "grad_norm": 0.021205052733421326, "learning_rate": 4.062681306498839e-06, "loss": 0.1297, "num_input_tokens_seen": 5782408, "step": 64215 }, { "epoch": 16.68918918918919, "grad_norm": 0.4520753026008606, "learning_rate": 4.059583701595002e-06, "loss": 0.6254, "num_input_tokens_seen": 5782824, "step": 64220 }, { "epoch": 16.690488565488565, "grad_norm": 0.6100247502326965, "learning_rate": 4.056487173676843e-06, "loss": 0.3244, "num_input_tokens_seen": 5783288, "step": 64225 }, { "epoch": 16.691787941787943, "grad_norm": 1.3192259073257446, "learning_rate": 4.053391722903616e-06, "loss": 0.0085, "num_input_tokens_seen": 5783704, "step": 64230 }, { "epoch": 16.69308731808732, "grad_norm": 8.696769714355469, "learning_rate": 4.050297349434515e-06, "loss": 0.0145, "num_input_tokens_seen": 5784136, "step": 64235 }, { "epoch": 16.694386694386694, "grad_norm": 0.274177223443985, "learning_rate": 4.047204053428702e-06, "loss": 0.1321, "num_input_tokens_seen": 5784552, "step": 64240 }, { "epoch": 16.695686070686072, "grad_norm": 2.472564935684204, "learning_rate": 4.044111835045256e-06, "loss": 0.0161, "num_input_tokens_seen": 5784968, "step": 64245 }, { "epoch": 16.696985446985448, "grad_norm": 1.941023349761963, "learning_rate": 4.041020694443215e-06, "loss": 0.088, "num_input_tokens_seen": 5785416, "step": 64250 }, { "epoch": 16.698284823284823, "grad_norm": 29.870399475097656, "learning_rate": 4.037930631781561e-06, "loss": 0.0802, "num_input_tokens_seen": 5785864, "step": 64255 }, { "epoch": 16.6995841995842, "grad_norm": 0.03858229145407677, "learning_rate": 4.0348416472192256e-06, "loss": 0.0008, "num_input_tokens_seen": 5786328, "step": 64260 }, { "epoch": 16.700883575883577, "grad_norm": 1.087660312652588, "learning_rate": 4.031753740915067e-06, "loss": 0.1228, "num_input_tokens_seen": 5786776, "step": 64265 }, { "epoch": 16.70218295218295, "grad_norm": 53.80739974975586, "learning_rate": 4.028666913027912e-06, "loss": 0.3546, "num_input_tokens_seen": 5787208, "step": 64270 }, { "epoch": 16.703482328482327, "grad_norm": 2.4316818714141846, "learning_rate": 4.025581163716505e-06, "loss": 0.0033, "num_input_tokens_seen": 5787672, "step": 64275 }, { "epoch": 16.704781704781706, "grad_norm": 15.86796760559082, "learning_rate": 4.0224964931395604e-06, "loss": 0.0689, "num_input_tokens_seen": 5788120, "step": 64280 }, { "epoch": 16.70608108108108, "grad_norm": 5.878835678100586, "learning_rate": 4.019412901455716e-06, "loss": 0.0225, "num_input_tokens_seen": 5788632, "step": 64285 }, { "epoch": 16.707380457380456, "grad_norm": 19.401308059692383, "learning_rate": 4.0163303888235664e-06, "loss": 0.9927, "num_input_tokens_seen": 5789064, "step": 64290 }, { "epoch": 16.708679833679835, "grad_norm": 0.07945764064788818, "learning_rate": 4.013248955401658e-06, "loss": 0.3855, "num_input_tokens_seen": 5789496, "step": 64295 }, { "epoch": 16.70997920997921, "grad_norm": 0.010524038225412369, "learning_rate": 4.010168601348458e-06, "loss": 0.0203, "num_input_tokens_seen": 5789976, "step": 64300 }, { "epoch": 16.711278586278585, "grad_norm": 1.394058108329773, "learning_rate": 4.007089326822405e-06, "loss": 0.0022, "num_input_tokens_seen": 5790424, "step": 64305 }, { "epoch": 16.712577962577964, "grad_norm": 0.4821906089782715, "learning_rate": 4.004011131981855e-06, "loss": 0.2371, "num_input_tokens_seen": 5790856, "step": 64310 }, { "epoch": 16.71387733887734, "grad_norm": 48.35317611694336, "learning_rate": 4.000934016985136e-06, "loss": 0.3883, "num_input_tokens_seen": 5791320, "step": 64315 }, { "epoch": 16.715176715176714, "grad_norm": 25.2842960357666, "learning_rate": 3.997857981990491e-06, "loss": 0.4407, "num_input_tokens_seen": 5791768, "step": 64320 }, { "epoch": 16.716476091476093, "grad_norm": 0.6546992659568787, "learning_rate": 3.9947830271561435e-06, "loss": 0.2287, "num_input_tokens_seen": 5792200, "step": 64325 }, { "epoch": 16.717775467775468, "grad_norm": 62.22885513305664, "learning_rate": 3.991709152640224e-06, "loss": 0.2727, "num_input_tokens_seen": 5792616, "step": 64330 }, { "epoch": 16.719074844074843, "grad_norm": 2.0250468254089355, "learning_rate": 3.988636358600839e-06, "loss": 0.2047, "num_input_tokens_seen": 5793112, "step": 64335 }, { "epoch": 16.72037422037422, "grad_norm": 0.37365636229515076, "learning_rate": 3.9855646451960136e-06, "loss": 0.0009, "num_input_tokens_seen": 5793592, "step": 64340 }, { "epoch": 16.721673596673597, "grad_norm": 10.535489082336426, "learning_rate": 3.982494012583737e-06, "loss": 0.1631, "num_input_tokens_seen": 5794072, "step": 64345 }, { "epoch": 16.722972972972972, "grad_norm": 26.051441192626953, "learning_rate": 3.9794244609219265e-06, "loss": 0.3197, "num_input_tokens_seen": 5794520, "step": 64350 }, { "epoch": 16.72427234927235, "grad_norm": 0.12824711203575134, "learning_rate": 3.976355990368458e-06, "loss": 0.0013, "num_input_tokens_seen": 5794968, "step": 64355 }, { "epoch": 16.725571725571726, "grad_norm": 1.7077431678771973, "learning_rate": 3.973288601081152e-06, "loss": 0.0021, "num_input_tokens_seen": 5795416, "step": 64360 }, { "epoch": 16.7268711018711, "grad_norm": 0.9851018786430359, "learning_rate": 3.970222293217751e-06, "loss": 0.0013, "num_input_tokens_seen": 5795896, "step": 64365 }, { "epoch": 16.72817047817048, "grad_norm": 0.0016068321419879794, "learning_rate": 3.9671570669359754e-06, "loss": 0.2739, "num_input_tokens_seen": 5796360, "step": 64370 }, { "epoch": 16.729469854469855, "grad_norm": 29.842472076416016, "learning_rate": 3.9640929223934555e-06, "loss": 0.1055, "num_input_tokens_seen": 5796776, "step": 64375 }, { "epoch": 16.73076923076923, "grad_norm": 48.344642639160156, "learning_rate": 3.9610298597478e-06, "loss": 0.3283, "num_input_tokens_seen": 5797240, "step": 64380 }, { "epoch": 16.73206860706861, "grad_norm": 17.08184051513672, "learning_rate": 3.957967879156533e-06, "loss": 0.1662, "num_input_tokens_seen": 5797672, "step": 64385 }, { "epoch": 16.733367983367984, "grad_norm": 69.62858581542969, "learning_rate": 3.954906980777137e-06, "loss": 0.5371, "num_input_tokens_seen": 5798104, "step": 64390 }, { "epoch": 16.73466735966736, "grad_norm": 0.5419595241546631, "learning_rate": 3.951847164767042e-06, "loss": 0.1411, "num_input_tokens_seen": 5798568, "step": 64395 }, { "epoch": 16.735966735966738, "grad_norm": 2.0174529552459717, "learning_rate": 3.948788431283617e-06, "loss": 0.0941, "num_input_tokens_seen": 5799048, "step": 64400 }, { "epoch": 16.737266112266113, "grad_norm": 0.029144341126084328, "learning_rate": 3.945730780484172e-06, "loss": 0.0343, "num_input_tokens_seen": 5799496, "step": 64405 }, { "epoch": 16.738565488565488, "grad_norm": 0.001669332035817206, "learning_rate": 3.942674212525968e-06, "loss": 0.1502, "num_input_tokens_seen": 5799928, "step": 64410 }, { "epoch": 16.739864864864863, "grad_norm": 13.431437492370605, "learning_rate": 3.939618727566202e-06, "loss": 0.2627, "num_input_tokens_seen": 5800360, "step": 64415 }, { "epoch": 16.741164241164242, "grad_norm": 0.000860502477735281, "learning_rate": 3.93656432576203e-06, "loss": 0.1432, "num_input_tokens_seen": 5800840, "step": 64420 }, { "epoch": 16.742463617463617, "grad_norm": 0.36088505387306213, "learning_rate": 3.933511007270529e-06, "loss": 0.2071, "num_input_tokens_seen": 5801272, "step": 64425 }, { "epoch": 16.743762993762992, "grad_norm": 0.004234896507114172, "learning_rate": 3.93045877224874e-06, "loss": 0.1932, "num_input_tokens_seen": 5801672, "step": 64430 }, { "epoch": 16.74506237006237, "grad_norm": 0.5009610056877136, "learning_rate": 3.927407620853654e-06, "loss": 0.1737, "num_input_tokens_seen": 5802104, "step": 64435 }, { "epoch": 16.746361746361746, "grad_norm": 0.048687271773815155, "learning_rate": 3.9243575532421775e-06, "loss": 0.14, "num_input_tokens_seen": 5802552, "step": 64440 }, { "epoch": 16.74766112266112, "grad_norm": 0.01176443137228489, "learning_rate": 3.921308569571192e-06, "loss": 0.0768, "num_input_tokens_seen": 5803016, "step": 64445 }, { "epoch": 16.7489604989605, "grad_norm": 0.3556937277317047, "learning_rate": 3.918260669997498e-06, "loss": 0.1087, "num_input_tokens_seen": 5803496, "step": 64450 }, { "epoch": 16.750259875259875, "grad_norm": 0.00826847180724144, "learning_rate": 3.9152138546778625e-06, "loss": 0.2305, "num_input_tokens_seen": 5803944, "step": 64455 }, { "epoch": 16.75155925155925, "grad_norm": 8.96237850189209, "learning_rate": 3.912168123768975e-06, "loss": 0.2428, "num_input_tokens_seen": 5804408, "step": 64460 }, { "epoch": 16.75285862785863, "grad_norm": 3.101952314376831, "learning_rate": 3.909123477427487e-06, "loss": 0.3427, "num_input_tokens_seen": 5804856, "step": 64465 }, { "epoch": 16.754158004158004, "grad_norm": 0.576799213886261, "learning_rate": 3.906079915809984e-06, "loss": 0.2181, "num_input_tokens_seen": 5805304, "step": 64470 }, { "epoch": 16.75545738045738, "grad_norm": 6.601759433746338, "learning_rate": 3.903037439073012e-06, "loss": 0.1083, "num_input_tokens_seen": 5805704, "step": 64475 }, { "epoch": 16.756756756756758, "grad_norm": 52.0338249206543, "learning_rate": 3.8999960473730304e-06, "loss": 0.5238, "num_input_tokens_seen": 5806168, "step": 64480 }, { "epoch": 16.758056133056133, "grad_norm": 28.977516174316406, "learning_rate": 3.896955740866476e-06, "loss": 0.3985, "num_input_tokens_seen": 5806648, "step": 64485 }, { "epoch": 16.759355509355508, "grad_norm": 0.29236623644828796, "learning_rate": 3.8939165197097065e-06, "loss": 0.0029, "num_input_tokens_seen": 5807112, "step": 64490 }, { "epoch": 16.760654885654887, "grad_norm": 0.049969423562288284, "learning_rate": 3.8908783840590295e-06, "loss": 0.2008, "num_input_tokens_seen": 5807576, "step": 64495 }, { "epoch": 16.761954261954262, "grad_norm": 1.026556372642517, "learning_rate": 3.887841334070702e-06, "loss": 0.6151, "num_input_tokens_seen": 5807992, "step": 64500 }, { "epoch": 16.763253638253637, "grad_norm": 2.016439914703369, "learning_rate": 3.88480536990092e-06, "loss": 0.1948, "num_input_tokens_seen": 5808424, "step": 64505 }, { "epoch": 16.764553014553016, "grad_norm": 0.035514239221811295, "learning_rate": 3.8817704917058415e-06, "loss": 0.0045, "num_input_tokens_seen": 5808872, "step": 64510 }, { "epoch": 16.76585239085239, "grad_norm": 0.21307840943336487, "learning_rate": 3.87873669964153e-06, "loss": 0.089, "num_input_tokens_seen": 5809304, "step": 64515 }, { "epoch": 16.767151767151766, "grad_norm": 0.43055376410484314, "learning_rate": 3.875703993864036e-06, "loss": 0.136, "num_input_tokens_seen": 5809784, "step": 64520 }, { "epoch": 16.768451143451145, "grad_norm": 45.361724853515625, "learning_rate": 3.872672374529321e-06, "loss": 0.2408, "num_input_tokens_seen": 5810216, "step": 64525 }, { "epoch": 16.76975051975052, "grad_norm": 0.01066364161670208, "learning_rate": 3.869641841793311e-06, "loss": 0.2021, "num_input_tokens_seen": 5810648, "step": 64530 }, { "epoch": 16.771049896049895, "grad_norm": 1.393053412437439, "learning_rate": 3.866612395811864e-06, "loss": 0.0623, "num_input_tokens_seen": 5811080, "step": 64535 }, { "epoch": 16.772349272349274, "grad_norm": 1.1424956321716309, "learning_rate": 3.863584036740792e-06, "loss": 0.1289, "num_input_tokens_seen": 5811512, "step": 64540 }, { "epoch": 16.77364864864865, "grad_norm": 20.999107360839844, "learning_rate": 3.860556764735842e-06, "loss": 0.1612, "num_input_tokens_seen": 5811960, "step": 64545 }, { "epoch": 16.774948024948024, "grad_norm": 14.378334999084473, "learning_rate": 3.85753057995272e-06, "loss": 0.3919, "num_input_tokens_seen": 5812376, "step": 64550 }, { "epoch": 16.776247401247403, "grad_norm": 2.1810789108276367, "learning_rate": 3.854505482547057e-06, "loss": 0.0179, "num_input_tokens_seen": 5812808, "step": 64555 }, { "epoch": 16.777546777546778, "grad_norm": 0.016083765774965286, "learning_rate": 3.851481472674434e-06, "loss": 0.3304, "num_input_tokens_seen": 5813224, "step": 64560 }, { "epoch": 16.778846153846153, "grad_norm": 0.21728633344173431, "learning_rate": 3.848458550490386e-06, "loss": 0.0318, "num_input_tokens_seen": 5813688, "step": 64565 }, { "epoch": 16.78014553014553, "grad_norm": 0.8949851393699646, "learning_rate": 3.845436716150369e-06, "loss": 0.5241, "num_input_tokens_seen": 5814152, "step": 64570 }, { "epoch": 16.781444906444907, "grad_norm": 41.33950424194336, "learning_rate": 3.842415969809823e-06, "loss": 0.2232, "num_input_tokens_seen": 5814648, "step": 64575 }, { "epoch": 16.782744282744282, "grad_norm": 49.18943786621094, "learning_rate": 3.839396311624091e-06, "loss": 0.3652, "num_input_tokens_seen": 5815064, "step": 64580 }, { "epoch": 16.784043659043657, "grad_norm": 1.758313536643982, "learning_rate": 3.83637774174849e-06, "loss": 0.0045, "num_input_tokens_seen": 5815512, "step": 64585 }, { "epoch": 16.785343035343036, "grad_norm": 4.666263580322266, "learning_rate": 3.833360260338251e-06, "loss": 0.0492, "num_input_tokens_seen": 5815960, "step": 64590 }, { "epoch": 16.78664241164241, "grad_norm": 0.04868542030453682, "learning_rate": 3.8303438675485804e-06, "loss": 0.4683, "num_input_tokens_seen": 5816440, "step": 64595 }, { "epoch": 16.787941787941786, "grad_norm": 24.978092193603516, "learning_rate": 3.827328563534602e-06, "loss": 0.1691, "num_input_tokens_seen": 5816904, "step": 64600 }, { "epoch": 16.789241164241165, "grad_norm": 18.054222106933594, "learning_rate": 3.824314348451405e-06, "loss": 0.1464, "num_input_tokens_seen": 5817336, "step": 64605 }, { "epoch": 16.79054054054054, "grad_norm": 0.132056325674057, "learning_rate": 3.821301222454008e-06, "loss": 0.3523, "num_input_tokens_seen": 5817800, "step": 64610 }, { "epoch": 16.791839916839916, "grad_norm": 1.9146610498428345, "learning_rate": 3.818289185697388e-06, "loss": 0.2818, "num_input_tokens_seen": 5818264, "step": 64615 }, { "epoch": 16.793139293139294, "grad_norm": 10.093439102172852, "learning_rate": 3.8152782383364506e-06, "loss": 0.0126, "num_input_tokens_seen": 5818728, "step": 64620 }, { "epoch": 16.79443866943867, "grad_norm": 7.587639331817627, "learning_rate": 3.812268380526046e-06, "loss": 0.2441, "num_input_tokens_seen": 5819192, "step": 64625 }, { "epoch": 16.795738045738045, "grad_norm": 4.123500347137451, "learning_rate": 3.8092596124209874e-06, "loss": 0.1825, "num_input_tokens_seen": 5819624, "step": 64630 }, { "epoch": 16.797037422037423, "grad_norm": 0.04104897379875183, "learning_rate": 3.8062519341760027e-06, "loss": 0.024, "num_input_tokens_seen": 5820056, "step": 64635 }, { "epoch": 16.7983367983368, "grad_norm": 0.007572715636342764, "learning_rate": 3.8032453459457884e-06, "loss": 0.0409, "num_input_tokens_seen": 5820536, "step": 64640 }, { "epoch": 16.799636174636174, "grad_norm": 0.7297230362892151, "learning_rate": 3.800239847884976e-06, "loss": 0.2122, "num_input_tokens_seen": 5821016, "step": 64645 }, { "epoch": 16.800935550935552, "grad_norm": 0.010577556677162647, "learning_rate": 3.7972354401481474e-06, "loss": 0.1127, "num_input_tokens_seen": 5821480, "step": 64650 }, { "epoch": 16.802234927234927, "grad_norm": 0.010395386256277561, "learning_rate": 3.794232122889807e-06, "loss": 0.0649, "num_input_tokens_seen": 5821912, "step": 64655 }, { "epoch": 16.803534303534303, "grad_norm": 61.21519470214844, "learning_rate": 3.7912298962644367e-06, "loss": 0.443, "num_input_tokens_seen": 5822344, "step": 64660 }, { "epoch": 16.80483367983368, "grad_norm": 61.809486389160156, "learning_rate": 3.7882287604264236e-06, "loss": 0.2566, "num_input_tokens_seen": 5822792, "step": 64665 }, { "epoch": 16.806133056133056, "grad_norm": 0.22022950649261475, "learning_rate": 3.7852287155301396e-06, "loss": 0.493, "num_input_tokens_seen": 5823240, "step": 64670 }, { "epoch": 16.80743243243243, "grad_norm": 0.06847235560417175, "learning_rate": 3.7822297617298606e-06, "loss": 0.2847, "num_input_tokens_seen": 5823688, "step": 64675 }, { "epoch": 16.80873180873181, "grad_norm": 1.8506956100463867, "learning_rate": 3.7792318991798386e-06, "loss": 0.3714, "num_input_tokens_seen": 5824152, "step": 64680 }, { "epoch": 16.810031185031185, "grad_norm": 13.192906379699707, "learning_rate": 3.7762351280342556e-06, "loss": 0.1709, "num_input_tokens_seen": 5824600, "step": 64685 }, { "epoch": 16.81133056133056, "grad_norm": 0.02548981085419655, "learning_rate": 3.77323944844723e-06, "loss": 0.0517, "num_input_tokens_seen": 5825080, "step": 64690 }, { "epoch": 16.81262993762994, "grad_norm": 18.156862258911133, "learning_rate": 3.7702448605728467e-06, "loss": 0.0328, "num_input_tokens_seen": 5825544, "step": 64695 }, { "epoch": 16.813929313929314, "grad_norm": 4.920527458190918, "learning_rate": 3.7672513645651024e-06, "loss": 0.0242, "num_input_tokens_seen": 5826008, "step": 64700 }, { "epoch": 16.81522869022869, "grad_norm": 0.004925962071865797, "learning_rate": 3.764258960577971e-06, "loss": 0.061, "num_input_tokens_seen": 5826456, "step": 64705 }, { "epoch": 16.816528066528065, "grad_norm": 24.127765655517578, "learning_rate": 3.7612676487653432e-06, "loss": 0.0471, "num_input_tokens_seen": 5826888, "step": 64710 }, { "epoch": 16.817827442827443, "grad_norm": 27.43914031982422, "learning_rate": 3.7582774292810708e-06, "loss": 0.0637, "num_input_tokens_seen": 5827336, "step": 64715 }, { "epoch": 16.81912681912682, "grad_norm": 0.08468978106975555, "learning_rate": 3.755288302278942e-06, "loss": 0.3914, "num_input_tokens_seen": 5827800, "step": 64720 }, { "epoch": 16.820426195426194, "grad_norm": 6.143866539001465, "learning_rate": 3.7523002679126978e-06, "loss": 0.2159, "num_input_tokens_seen": 5828248, "step": 64725 }, { "epoch": 16.821725571725572, "grad_norm": 0.23973435163497925, "learning_rate": 3.749313326336004e-06, "loss": 0.0563, "num_input_tokens_seen": 5828712, "step": 64730 }, { "epoch": 16.823024948024948, "grad_norm": 44.61142349243164, "learning_rate": 3.7463274777024936e-06, "loss": 0.091, "num_input_tokens_seen": 5829144, "step": 64735 }, { "epoch": 16.824324324324323, "grad_norm": 36.970130920410156, "learning_rate": 3.7433427221657186e-06, "loss": 0.1675, "num_input_tokens_seen": 5829624, "step": 64740 }, { "epoch": 16.8256237006237, "grad_norm": 0.12936803698539734, "learning_rate": 3.7403590598792003e-06, "loss": 0.0089, "num_input_tokens_seen": 5830104, "step": 64745 }, { "epoch": 16.826923076923077, "grad_norm": 1.1815732717514038, "learning_rate": 3.7373764909963806e-06, "loss": 0.0024, "num_input_tokens_seen": 5830552, "step": 64750 }, { "epoch": 16.828222453222452, "grad_norm": 0.22406694293022156, "learning_rate": 3.7343950156706608e-06, "loss": 0.2509, "num_input_tokens_seen": 5830968, "step": 64755 }, { "epoch": 16.82952182952183, "grad_norm": 0.46984922885894775, "learning_rate": 3.7314146340553857e-06, "loss": 0.0344, "num_input_tokens_seen": 5831432, "step": 64760 }, { "epoch": 16.830821205821206, "grad_norm": 0.008496219292283058, "learning_rate": 3.7284353463038297e-06, "loss": 0.2569, "num_input_tokens_seen": 5831864, "step": 64765 }, { "epoch": 16.83212058212058, "grad_norm": 33.27170944213867, "learning_rate": 3.7254571525692283e-06, "loss": 0.1829, "num_input_tokens_seen": 5832264, "step": 64770 }, { "epoch": 16.83341995841996, "grad_norm": 21.219472885131836, "learning_rate": 3.7224800530047484e-06, "loss": 0.097, "num_input_tokens_seen": 5832776, "step": 64775 }, { "epoch": 16.834719334719335, "grad_norm": 0.01946166716516018, "learning_rate": 3.7195040477635084e-06, "loss": 0.0147, "num_input_tokens_seen": 5833240, "step": 64780 }, { "epoch": 16.83601871101871, "grad_norm": 10.93491268157959, "learning_rate": 3.7165291369985618e-06, "loss": 0.0362, "num_input_tokens_seen": 5833720, "step": 64785 }, { "epoch": 16.83731808731809, "grad_norm": 7.319782733917236, "learning_rate": 3.713555320862913e-06, "loss": 0.0502, "num_input_tokens_seen": 5834152, "step": 64790 }, { "epoch": 16.838617463617464, "grad_norm": 32.9675407409668, "learning_rate": 3.710582599509513e-06, "loss": 0.1037, "num_input_tokens_seen": 5834584, "step": 64795 }, { "epoch": 16.83991683991684, "grad_norm": 26.25266456604004, "learning_rate": 3.7076109730912494e-06, "loss": 0.2036, "num_input_tokens_seen": 5835032, "step": 64800 }, { "epoch": 16.841216216216218, "grad_norm": 18.65506362915039, "learning_rate": 3.7046404417609538e-06, "loss": 0.0932, "num_input_tokens_seen": 5835480, "step": 64805 }, { "epoch": 16.842515592515593, "grad_norm": 1.450421690940857, "learning_rate": 3.701671005671406e-06, "loss": 0.0041, "num_input_tokens_seen": 5835896, "step": 64810 }, { "epoch": 16.843814968814968, "grad_norm": 0.7955573797225952, "learning_rate": 3.6987026649753286e-06, "loss": 0.161, "num_input_tokens_seen": 5836312, "step": 64815 }, { "epoch": 16.845114345114347, "grad_norm": 31.440589904785156, "learning_rate": 3.695735419825369e-06, "loss": 0.0796, "num_input_tokens_seen": 5836776, "step": 64820 }, { "epoch": 16.84641372141372, "grad_norm": 8.141753196716309, "learning_rate": 3.6927692703741634e-06, "loss": 0.4437, "num_input_tokens_seen": 5837176, "step": 64825 }, { "epoch": 16.847713097713097, "grad_norm": 0.006536898203194141, "learning_rate": 3.6898042167742426e-06, "loss": 0.1327, "num_input_tokens_seen": 5837640, "step": 64830 }, { "epoch": 16.849012474012476, "grad_norm": 0.1740415245294571, "learning_rate": 3.6868402591781175e-06, "loss": 0.139, "num_input_tokens_seen": 5838072, "step": 64835 }, { "epoch": 16.85031185031185, "grad_norm": 15.172887802124023, "learning_rate": 3.6838773977382113e-06, "loss": 0.0237, "num_input_tokens_seen": 5838488, "step": 64840 }, { "epoch": 16.851611226611226, "grad_norm": 2.1565024852752686, "learning_rate": 3.6809156326069183e-06, "loss": 0.2425, "num_input_tokens_seen": 5838984, "step": 64845 }, { "epoch": 16.852910602910605, "grad_norm": 20.240079879760742, "learning_rate": 3.6779549639365584e-06, "loss": 0.1394, "num_input_tokens_seen": 5839432, "step": 64850 }, { "epoch": 16.85420997920998, "grad_norm": 0.010852512903511524, "learning_rate": 3.6749953918794043e-06, "loss": 0.5607, "num_input_tokens_seen": 5839880, "step": 64855 }, { "epoch": 16.855509355509355, "grad_norm": 3.0052578449249268, "learning_rate": 3.6720369165876704e-06, "loss": 0.2382, "num_input_tokens_seen": 5840344, "step": 64860 }, { "epoch": 16.856808731808734, "grad_norm": 42.712528228759766, "learning_rate": 3.6690795382135186e-06, "loss": 0.1683, "num_input_tokens_seen": 5840776, "step": 64865 }, { "epoch": 16.85810810810811, "grad_norm": 0.006412145681679249, "learning_rate": 3.6661232569090375e-06, "loss": 0.0949, "num_input_tokens_seen": 5841256, "step": 64870 }, { "epoch": 16.859407484407484, "grad_norm": 0.6958885192871094, "learning_rate": 3.663168072826284e-06, "loss": 0.1441, "num_input_tokens_seen": 5841704, "step": 64875 }, { "epoch": 16.86070686070686, "grad_norm": 2.060246706008911, "learning_rate": 3.660213986117242e-06, "loss": 0.0713, "num_input_tokens_seen": 5842152, "step": 64880 }, { "epoch": 16.862006237006238, "grad_norm": 2.415407419204712, "learning_rate": 3.6572609969338365e-06, "loss": 0.3256, "num_input_tokens_seen": 5842584, "step": 64885 }, { "epoch": 16.863305613305613, "grad_norm": 0.09054262936115265, "learning_rate": 3.6543091054279445e-06, "loss": 0.0092, "num_input_tokens_seen": 5843048, "step": 64890 }, { "epoch": 16.864604989604988, "grad_norm": 0.18083147704601288, "learning_rate": 3.6513583117513906e-06, "loss": 0.083, "num_input_tokens_seen": 5843496, "step": 64895 }, { "epoch": 16.865904365904367, "grad_norm": 0.4954017698764801, "learning_rate": 3.648408616055937e-06, "loss": 0.0419, "num_input_tokens_seen": 5843944, "step": 64900 }, { "epoch": 16.867203742203742, "grad_norm": 47.26149368286133, "learning_rate": 3.6454600184932824e-06, "loss": 0.0729, "num_input_tokens_seen": 5844360, "step": 64905 }, { "epoch": 16.868503118503117, "grad_norm": 29.829498291015625, "learning_rate": 3.6425125192150854e-06, "loss": 0.0477, "num_input_tokens_seen": 5844824, "step": 64910 }, { "epoch": 16.869802494802496, "grad_norm": 13.700488090515137, "learning_rate": 3.639566118372928e-06, "loss": 0.0927, "num_input_tokens_seen": 5845288, "step": 64915 }, { "epoch": 16.87110187110187, "grad_norm": 12.000036239624023, "learning_rate": 3.6366208161183578e-06, "loss": 0.1126, "num_input_tokens_seen": 5845704, "step": 64920 }, { "epoch": 16.872401247401246, "grad_norm": 4.912631034851074, "learning_rate": 3.633676612602843e-06, "loss": 0.0319, "num_input_tokens_seen": 5846120, "step": 64925 }, { "epoch": 16.873700623700625, "grad_norm": 0.018386557698249817, "learning_rate": 3.6307335079778097e-06, "loss": 0.0015, "num_input_tokens_seen": 5846552, "step": 64930 }, { "epoch": 16.875, "grad_norm": 51.43284225463867, "learning_rate": 3.6277915023946285e-06, "loss": 0.1723, "num_input_tokens_seen": 5847016, "step": 64935 }, { "epoch": 16.876299376299375, "grad_norm": 0.1846330612897873, "learning_rate": 3.6248505960046144e-06, "loss": 0.0019, "num_input_tokens_seen": 5847464, "step": 64940 }, { "epoch": 16.877598752598754, "grad_norm": 0.5142890810966492, "learning_rate": 3.6219107889590155e-06, "loss": 0.0026, "num_input_tokens_seen": 5847896, "step": 64945 }, { "epoch": 16.87889812889813, "grad_norm": 0.3472701907157898, "learning_rate": 3.6189720814090223e-06, "loss": 0.2225, "num_input_tokens_seen": 5848344, "step": 64950 }, { "epoch": 16.880197505197504, "grad_norm": 0.010091508738696575, "learning_rate": 3.616034473505786e-06, "loss": 0.0446, "num_input_tokens_seen": 5848776, "step": 64955 }, { "epoch": 16.881496881496883, "grad_norm": 0.0028123906813561916, "learning_rate": 3.6130979654003805e-06, "loss": 0.288, "num_input_tokens_seen": 5849224, "step": 64960 }, { "epoch": 16.882796257796258, "grad_norm": 0.06600093096494675, "learning_rate": 3.6101625572438373e-06, "loss": 0.1574, "num_input_tokens_seen": 5849672, "step": 64965 }, { "epoch": 16.884095634095633, "grad_norm": 1.3602428436279297, "learning_rate": 3.6072282491871306e-06, "loss": 0.0321, "num_input_tokens_seen": 5850136, "step": 64970 }, { "epoch": 16.885395010395012, "grad_norm": 0.49800607562065125, "learning_rate": 3.6042950413811784e-06, "loss": 0.0232, "num_input_tokens_seen": 5850552, "step": 64975 }, { "epoch": 16.886694386694387, "grad_norm": 46.13571548461914, "learning_rate": 3.6013629339768264e-06, "loss": 0.2433, "num_input_tokens_seen": 5850984, "step": 64980 }, { "epoch": 16.887993762993762, "grad_norm": 38.205894470214844, "learning_rate": 3.5984319271248844e-06, "loss": 0.1442, "num_input_tokens_seen": 5851416, "step": 64985 }, { "epoch": 16.88929313929314, "grad_norm": 0.11690835654735565, "learning_rate": 3.5955020209760935e-06, "loss": 0.2432, "num_input_tokens_seen": 5851864, "step": 64990 }, { "epoch": 16.890592515592516, "grad_norm": 1.1507612466812134, "learning_rate": 3.592573215681144e-06, "loss": 0.1679, "num_input_tokens_seen": 5852296, "step": 64995 }, { "epoch": 16.89189189189189, "grad_norm": 0.4787977635860443, "learning_rate": 3.589645511390663e-06, "loss": 0.0243, "num_input_tokens_seen": 5852744, "step": 65000 }, { "epoch": 16.893191268191266, "grad_norm": 3.575246810913086, "learning_rate": 3.5867189082552266e-06, "loss": 0.0145, "num_input_tokens_seen": 5853208, "step": 65005 }, { "epoch": 16.894490644490645, "grad_norm": 0.007169740274548531, "learning_rate": 3.583793406425359e-06, "loss": 0.2516, "num_input_tokens_seen": 5853704, "step": 65010 }, { "epoch": 16.89579002079002, "grad_norm": 18.377662658691406, "learning_rate": 3.5808690060515126e-06, "loss": 0.034, "num_input_tokens_seen": 5854152, "step": 65015 }, { "epoch": 16.897089397089395, "grad_norm": 0.4508435130119324, "learning_rate": 3.5779457072841e-06, "loss": 0.0013, "num_input_tokens_seen": 5854632, "step": 65020 }, { "epoch": 16.898388773388774, "grad_norm": 3.118044853210449, "learning_rate": 3.575023510273462e-06, "loss": 0.0052, "num_input_tokens_seen": 5855112, "step": 65025 }, { "epoch": 16.89968814968815, "grad_norm": 39.64502716064453, "learning_rate": 3.572102415169898e-06, "loss": 0.3949, "num_input_tokens_seen": 5855560, "step": 65030 }, { "epoch": 16.900987525987524, "grad_norm": 0.12396999448537827, "learning_rate": 3.569182422123629e-06, "loss": 0.0016, "num_input_tokens_seen": 5856024, "step": 65035 }, { "epoch": 16.902286902286903, "grad_norm": 27.412227630615234, "learning_rate": 3.566263531284847e-06, "loss": 0.2809, "num_input_tokens_seen": 5856472, "step": 65040 }, { "epoch": 16.90358627858628, "grad_norm": 43.18556594848633, "learning_rate": 3.5633457428036644e-06, "loss": 0.1569, "num_input_tokens_seen": 5856920, "step": 65045 }, { "epoch": 16.904885654885653, "grad_norm": 18.10059928894043, "learning_rate": 3.5604290568301586e-06, "loss": 0.0676, "num_input_tokens_seen": 5857416, "step": 65050 }, { "epoch": 16.906185031185032, "grad_norm": 0.0017357979668304324, "learning_rate": 3.557513473514321e-06, "loss": 0.0062, "num_input_tokens_seen": 5857832, "step": 65055 }, { "epoch": 16.907484407484407, "grad_norm": 0.04935428872704506, "learning_rate": 3.5545989930061167e-06, "loss": 0.0388, "num_input_tokens_seen": 5858280, "step": 65060 }, { "epoch": 16.908783783783782, "grad_norm": 1.0674951076507568, "learning_rate": 3.55168561545543e-06, "loss": 0.0724, "num_input_tokens_seen": 5858744, "step": 65065 }, { "epoch": 16.91008316008316, "grad_norm": 0.49392175674438477, "learning_rate": 3.5487733410121015e-06, "loss": 0.6559, "num_input_tokens_seen": 5859160, "step": 65070 }, { "epoch": 16.911382536382536, "grad_norm": 38.0620002746582, "learning_rate": 3.54586216982592e-06, "loss": 0.1164, "num_input_tokens_seen": 5859640, "step": 65075 }, { "epoch": 16.91268191268191, "grad_norm": 0.009998321533203125, "learning_rate": 3.542952102046598e-06, "loss": 0.1065, "num_input_tokens_seen": 5860104, "step": 65080 }, { "epoch": 16.91398128898129, "grad_norm": 48.527435302734375, "learning_rate": 3.540043137823812e-06, "loss": 0.2968, "num_input_tokens_seen": 5860552, "step": 65085 }, { "epoch": 16.915280665280665, "grad_norm": 10.25533390045166, "learning_rate": 3.537135277307166e-06, "loss": 0.0191, "num_input_tokens_seen": 5861032, "step": 65090 }, { "epoch": 16.91658004158004, "grad_norm": 0.05117224529385567, "learning_rate": 3.534228520646224e-06, "loss": 0.0062, "num_input_tokens_seen": 5861480, "step": 65095 }, { "epoch": 16.91787941787942, "grad_norm": 6.564939498901367, "learning_rate": 3.531322867990469e-06, "loss": 0.1118, "num_input_tokens_seen": 5861912, "step": 65100 }, { "epoch": 16.919178794178794, "grad_norm": 7.473042964935303, "learning_rate": 3.5284183194893488e-06, "loss": 0.103, "num_input_tokens_seen": 5862456, "step": 65105 }, { "epoch": 16.92047817047817, "grad_norm": 1.4835667610168457, "learning_rate": 3.525514875292249e-06, "loss": 0.4825, "num_input_tokens_seen": 5862888, "step": 65110 }, { "epoch": 16.921777546777548, "grad_norm": 0.04434321075677872, "learning_rate": 3.5226125355485e-06, "loss": 0.0303, "num_input_tokens_seen": 5863336, "step": 65115 }, { "epoch": 16.923076923076923, "grad_norm": 0.8608607053756714, "learning_rate": 3.5197113004073604e-06, "loss": 0.2614, "num_input_tokens_seen": 5863784, "step": 65120 }, { "epoch": 16.9243762993763, "grad_norm": 0.5919907689094543, "learning_rate": 3.516811170018056e-06, "loss": 0.1515, "num_input_tokens_seen": 5864232, "step": 65125 }, { "epoch": 16.925675675675677, "grad_norm": 0.0284701120108366, "learning_rate": 3.5139121445297322e-06, "loss": 0.1897, "num_input_tokens_seen": 5864744, "step": 65130 }, { "epoch": 16.926975051975052, "grad_norm": 31.781307220458984, "learning_rate": 3.5110142240914993e-06, "loss": 0.5165, "num_input_tokens_seen": 5865160, "step": 65135 }, { "epoch": 16.928274428274428, "grad_norm": 0.09379357099533081, "learning_rate": 3.5081174088523904e-06, "loss": 0.1781, "num_input_tokens_seen": 5865624, "step": 65140 }, { "epoch": 16.929573804573806, "grad_norm": 4.565099716186523, "learning_rate": 3.5052216989613943e-06, "loss": 0.0086, "num_input_tokens_seen": 5866024, "step": 65145 }, { "epoch": 16.93087318087318, "grad_norm": 4.291586875915527, "learning_rate": 3.502327094567448e-06, "loss": 0.0043, "num_input_tokens_seen": 5866472, "step": 65150 }, { "epoch": 16.932172557172557, "grad_norm": 0.9482961893081665, "learning_rate": 3.49943359581941e-06, "loss": 0.0864, "num_input_tokens_seen": 5866936, "step": 65155 }, { "epoch": 16.933471933471935, "grad_norm": 0.06007583811879158, "learning_rate": 3.4965412028661103e-06, "loss": 0.0009, "num_input_tokens_seen": 5867368, "step": 65160 }, { "epoch": 16.93477130977131, "grad_norm": 0.006283082999289036, "learning_rate": 3.4936499158562925e-06, "loss": 0.0163, "num_input_tokens_seen": 5867832, "step": 65165 }, { "epoch": 16.936070686070686, "grad_norm": 0.2736717760562897, "learning_rate": 3.490759734938673e-06, "loss": 0.1412, "num_input_tokens_seen": 5868264, "step": 65170 }, { "epoch": 16.93737006237006, "grad_norm": 0.2503165900707245, "learning_rate": 3.4878706602618856e-06, "loss": 0.0067, "num_input_tokens_seen": 5868712, "step": 65175 }, { "epoch": 16.93866943866944, "grad_norm": 0.19026610255241394, "learning_rate": 3.48498269197452e-06, "loss": 0.0073, "num_input_tokens_seen": 5869160, "step": 65180 }, { "epoch": 16.939968814968815, "grad_norm": 24.473587036132812, "learning_rate": 3.48209583022511e-06, "loss": 0.163, "num_input_tokens_seen": 5869624, "step": 65185 }, { "epoch": 16.94126819126819, "grad_norm": 0.0038043404929339886, "learning_rate": 3.4792100751621343e-06, "loss": 0.0727, "num_input_tokens_seen": 5870056, "step": 65190 }, { "epoch": 16.94256756756757, "grad_norm": 1.960840106010437, "learning_rate": 3.4763254269339963e-06, "loss": 0.2519, "num_input_tokens_seen": 5870520, "step": 65195 }, { "epoch": 16.943866943866944, "grad_norm": 0.05589928478002548, "learning_rate": 3.4734418856890743e-06, "loss": 0.6628, "num_input_tokens_seen": 5870984, "step": 65200 }, { "epoch": 16.94516632016632, "grad_norm": 0.4872594177722931, "learning_rate": 3.470559451575661e-06, "loss": 0.1746, "num_input_tokens_seen": 5871416, "step": 65205 }, { "epoch": 16.946465696465697, "grad_norm": 0.6357964873313904, "learning_rate": 3.4676781247419963e-06, "loss": 0.0071, "num_input_tokens_seen": 5871864, "step": 65210 }, { "epoch": 16.947765072765073, "grad_norm": 0.012280846014618874, "learning_rate": 3.464797905336278e-06, "loss": 0.0069, "num_input_tokens_seen": 5872312, "step": 65215 }, { "epoch": 16.949064449064448, "grad_norm": 0.1759394407272339, "learning_rate": 3.461918793506638e-06, "loss": 0.0517, "num_input_tokens_seen": 5872760, "step": 65220 }, { "epoch": 16.950363825363826, "grad_norm": 3.8490891456604004, "learning_rate": 3.459040789401155e-06, "loss": 0.016, "num_input_tokens_seen": 5873208, "step": 65225 }, { "epoch": 16.9516632016632, "grad_norm": 0.05841316655278206, "learning_rate": 3.4561638931678407e-06, "loss": 0.0007, "num_input_tokens_seen": 5873672, "step": 65230 }, { "epoch": 16.952962577962577, "grad_norm": 6.314404487609863, "learning_rate": 3.4532881049546634e-06, "loss": 0.5887, "num_input_tokens_seen": 5874104, "step": 65235 }, { "epoch": 16.954261954261955, "grad_norm": 0.009201322682201862, "learning_rate": 3.450413424909518e-06, "loss": 0.1313, "num_input_tokens_seen": 5874568, "step": 65240 }, { "epoch": 16.95556133056133, "grad_norm": 0.0008104218286462128, "learning_rate": 3.4475398531802648e-06, "loss": 0.1955, "num_input_tokens_seen": 5875016, "step": 65245 }, { "epoch": 16.956860706860706, "grad_norm": 35.295806884765625, "learning_rate": 3.4446673899146825e-06, "loss": 0.2893, "num_input_tokens_seen": 5875496, "step": 65250 }, { "epoch": 16.958160083160084, "grad_norm": 0.18158000707626343, "learning_rate": 3.4417960352605083e-06, "loss": 0.1424, "num_input_tokens_seen": 5875912, "step": 65255 }, { "epoch": 16.95945945945946, "grad_norm": 5.456106662750244, "learning_rate": 3.4389257893654186e-06, "loss": 0.063, "num_input_tokens_seen": 5876376, "step": 65260 }, { "epoch": 16.960758835758835, "grad_norm": 0.5277323126792908, "learning_rate": 3.4360566523770426e-06, "loss": 0.1619, "num_input_tokens_seen": 5876840, "step": 65265 }, { "epoch": 16.962058212058214, "grad_norm": 9.3340482711792, "learning_rate": 3.4331886244429345e-06, "loss": 0.0461, "num_input_tokens_seen": 5877240, "step": 65270 }, { "epoch": 16.96335758835759, "grad_norm": 29.541187286376953, "learning_rate": 3.430321705710593e-06, "loss": 0.2278, "num_input_tokens_seen": 5877688, "step": 65275 }, { "epoch": 16.964656964656964, "grad_norm": 3.4305427074432373, "learning_rate": 3.4274558963274806e-06, "loss": 0.1977, "num_input_tokens_seen": 5878120, "step": 65280 }, { "epoch": 16.965956340956343, "grad_norm": 0.009185203351080418, "learning_rate": 3.424591196440974e-06, "loss": 0.393, "num_input_tokens_seen": 5878600, "step": 65285 }, { "epoch": 16.967255717255718, "grad_norm": 30.124792098999023, "learning_rate": 3.421727606198416e-06, "loss": 0.2888, "num_input_tokens_seen": 5879064, "step": 65290 }, { "epoch": 16.968555093555093, "grad_norm": 53.68299865722656, "learning_rate": 3.418865125747081e-06, "loss": 0.4431, "num_input_tokens_seen": 5879512, "step": 65295 }, { "epoch": 16.96985446985447, "grad_norm": 0.011666498146951199, "learning_rate": 3.4160037552341955e-06, "loss": 0.0082, "num_input_tokens_seen": 5879976, "step": 65300 }, { "epoch": 16.971153846153847, "grad_norm": 12.226218223571777, "learning_rate": 3.413143494806914e-06, "loss": 0.2088, "num_input_tokens_seen": 5880408, "step": 65305 }, { "epoch": 16.972453222453222, "grad_norm": 34.66500473022461, "learning_rate": 3.410284344612352e-06, "loss": 0.3515, "num_input_tokens_seen": 5880856, "step": 65310 }, { "epoch": 16.973752598752597, "grad_norm": 0.13478446006774902, "learning_rate": 3.4074263047975454e-06, "loss": 0.3331, "num_input_tokens_seen": 5881336, "step": 65315 }, { "epoch": 16.975051975051976, "grad_norm": 0.026255114004015923, "learning_rate": 3.404569375509492e-06, "loss": 0.0956, "num_input_tokens_seen": 5881768, "step": 65320 }, { "epoch": 16.97635135135135, "grad_norm": 0.06987357884645462, "learning_rate": 3.401713556895131e-06, "loss": 0.0672, "num_input_tokens_seen": 5882216, "step": 65325 }, { "epoch": 16.977650727650726, "grad_norm": 0.03298592567443848, "learning_rate": 3.3988588491013385e-06, "loss": 0.0188, "num_input_tokens_seen": 5882648, "step": 65330 }, { "epoch": 16.978950103950105, "grad_norm": 2.9350991249084473, "learning_rate": 3.3960052522749335e-06, "loss": 0.0091, "num_input_tokens_seen": 5883096, "step": 65335 }, { "epoch": 16.98024948024948, "grad_norm": 8.35311222076416, "learning_rate": 3.3931527665626738e-06, "loss": 0.1468, "num_input_tokens_seen": 5883528, "step": 65340 }, { "epoch": 16.981548856548855, "grad_norm": 1.5146441459655762, "learning_rate": 3.3903013921112755e-06, "loss": 0.0047, "num_input_tokens_seen": 5884008, "step": 65345 }, { "epoch": 16.982848232848234, "grad_norm": 1.7454239130020142, "learning_rate": 3.387451129067376e-06, "loss": 0.108, "num_input_tokens_seen": 5884488, "step": 65350 }, { "epoch": 16.98414760914761, "grad_norm": 1.278247594833374, "learning_rate": 3.3846019775775727e-06, "loss": 0.0681, "num_input_tokens_seen": 5884936, "step": 65355 }, { "epoch": 16.985446985446984, "grad_norm": 0.9361855983734131, "learning_rate": 3.381753937788401e-06, "loss": 0.3127, "num_input_tokens_seen": 5885352, "step": 65360 }, { "epoch": 16.986746361746363, "grad_norm": 0.013016813434660435, "learning_rate": 3.378907009846341e-06, "loss": 0.1927, "num_input_tokens_seen": 5885768, "step": 65365 }, { "epoch": 16.988045738045738, "grad_norm": 37.003662109375, "learning_rate": 3.3760611938978087e-06, "loss": 0.0755, "num_input_tokens_seen": 5886216, "step": 65370 }, { "epoch": 16.989345114345113, "grad_norm": 0.9728963375091553, "learning_rate": 3.3732164900891706e-06, "loss": 0.2788, "num_input_tokens_seen": 5886712, "step": 65375 }, { "epoch": 16.990644490644492, "grad_norm": 0.0013351417146623135, "learning_rate": 3.370372898566726e-06, "loss": 0.2282, "num_input_tokens_seen": 5887160, "step": 65380 }, { "epoch": 16.991943866943867, "grad_norm": 0.5237013101577759, "learning_rate": 3.3675304194767333e-06, "loss": 0.0867, "num_input_tokens_seen": 5887624, "step": 65385 }, { "epoch": 16.993243243243242, "grad_norm": 0.0019981416407972574, "learning_rate": 3.3646890529653726e-06, "loss": 0.1309, "num_input_tokens_seen": 5888056, "step": 65390 }, { "epoch": 16.99454261954262, "grad_norm": 69.89488220214844, "learning_rate": 3.3618487991787822e-06, "loss": 0.2752, "num_input_tokens_seen": 5888488, "step": 65395 }, { "epoch": 16.995841995841996, "grad_norm": 0.45443737506866455, "learning_rate": 3.3590096582630487e-06, "loss": 0.0545, "num_input_tokens_seen": 5888952, "step": 65400 }, { "epoch": 16.99714137214137, "grad_norm": 0.13270285725593567, "learning_rate": 3.356171630364177e-06, "loss": 0.1622, "num_input_tokens_seen": 5889368, "step": 65405 }, { "epoch": 16.99844074844075, "grad_norm": 32.658931732177734, "learning_rate": 3.3533347156281427e-06, "loss": 0.0568, "num_input_tokens_seen": 5889832, "step": 65410 }, { "epoch": 16.999740124740125, "grad_norm": 0.01654585637152195, "learning_rate": 3.350498914200839e-06, "loss": 0.1082, "num_input_tokens_seen": 5890280, "step": 65415 }, { "epoch": 17.0, "eval_loss": 0.6811466217041016, "eval_runtime": 13.1862, "eval_samples_per_second": 64.917, "eval_steps_per_second": 32.458, "num_input_tokens_seen": 5890320, "step": 65416 }, { "epoch": 17.0010395010395, "grad_norm": 0.22002749145030975, "learning_rate": 3.3476642262281256e-06, "loss": 0.0613, "num_input_tokens_seen": 5890672, "step": 65420 }, { "epoch": 17.00233887733888, "grad_norm": 33.37185287475586, "learning_rate": 3.3448306518557795e-06, "loss": 0.1345, "num_input_tokens_seen": 5891120, "step": 65425 }, { "epoch": 17.003638253638254, "grad_norm": 0.09966805577278137, "learning_rate": 3.341998191229545e-06, "loss": 0.1082, "num_input_tokens_seen": 5891552, "step": 65430 }, { "epoch": 17.00493762993763, "grad_norm": 0.03301281854510307, "learning_rate": 3.3391668444950926e-06, "loss": 0.0773, "num_input_tokens_seen": 5892016, "step": 65435 }, { "epoch": 17.006237006237008, "grad_norm": 2.291002035140991, "learning_rate": 3.336336611798052e-06, "loss": 0.1759, "num_input_tokens_seen": 5892448, "step": 65440 }, { "epoch": 17.007536382536383, "grad_norm": 22.036222457885742, "learning_rate": 3.333507493283969e-06, "loss": 0.0179, "num_input_tokens_seen": 5892944, "step": 65445 }, { "epoch": 17.008835758835758, "grad_norm": 18.654647827148438, "learning_rate": 3.3306794890983623e-06, "loss": 0.0573, "num_input_tokens_seen": 5893392, "step": 65450 }, { "epoch": 17.010135135135137, "grad_norm": 2.172494649887085, "learning_rate": 3.327852599386666e-06, "loss": 0.0089, "num_input_tokens_seen": 5893872, "step": 65455 }, { "epoch": 17.011434511434512, "grad_norm": 0.552488386631012, "learning_rate": 3.3250268242942807e-06, "loss": 0.0096, "num_input_tokens_seen": 5894336, "step": 65460 }, { "epoch": 17.012733887733887, "grad_norm": 3.371920585632324, "learning_rate": 3.3222021639665286e-06, "loss": 0.0973, "num_input_tokens_seen": 5894768, "step": 65465 }, { "epoch": 17.014033264033262, "grad_norm": 3.0361921787261963, "learning_rate": 3.3193786185486907e-06, "loss": 0.0831, "num_input_tokens_seen": 5895232, "step": 65470 }, { "epoch": 17.01533264033264, "grad_norm": 4.826587200164795, "learning_rate": 3.3165561881859873e-06, "loss": 0.0207, "num_input_tokens_seen": 5895664, "step": 65475 }, { "epoch": 17.016632016632016, "grad_norm": 0.003699866123497486, "learning_rate": 3.313734873023572e-06, "loss": 0.2054, "num_input_tokens_seen": 5896128, "step": 65480 }, { "epoch": 17.01793139293139, "grad_norm": 37.31619644165039, "learning_rate": 3.3109146732065554e-06, "loss": 0.2095, "num_input_tokens_seen": 5896576, "step": 65485 }, { "epoch": 17.01923076923077, "grad_norm": 6.414492607116699, "learning_rate": 3.3080955888799726e-06, "loss": 0.0786, "num_input_tokens_seen": 5896976, "step": 65490 }, { "epoch": 17.020530145530145, "grad_norm": 0.007214268669486046, "learning_rate": 3.305277620188826e-06, "loss": 0.0015, "num_input_tokens_seen": 5897456, "step": 65495 }, { "epoch": 17.02182952182952, "grad_norm": 1.370173692703247, "learning_rate": 3.302460767278029e-06, "loss": 0.129, "num_input_tokens_seen": 5897936, "step": 65500 }, { "epoch": 17.0231288981289, "grad_norm": 0.37570247054100037, "learning_rate": 3.299645030292467e-06, "loss": 0.3044, "num_input_tokens_seen": 5898368, "step": 65505 }, { "epoch": 17.024428274428274, "grad_norm": 0.1875959187746048, "learning_rate": 3.2968304093769525e-06, "loss": 0.3475, "num_input_tokens_seen": 5898800, "step": 65510 }, { "epoch": 17.02572765072765, "grad_norm": 4.959316253662109, "learning_rate": 3.2940169046762504e-06, "loss": 0.0234, "num_input_tokens_seen": 5899232, "step": 65515 }, { "epoch": 17.027027027027028, "grad_norm": 36.289695739746094, "learning_rate": 3.29120451633505e-06, "loss": 0.0699, "num_input_tokens_seen": 5899680, "step": 65520 }, { "epoch": 17.028326403326403, "grad_norm": 1.6321756839752197, "learning_rate": 3.2883932444980086e-06, "loss": 0.1678, "num_input_tokens_seen": 5900112, "step": 65525 }, { "epoch": 17.02962577962578, "grad_norm": 15.847110748291016, "learning_rate": 3.285583089309702e-06, "loss": 0.0308, "num_input_tokens_seen": 5900608, "step": 65530 }, { "epoch": 17.030925155925157, "grad_norm": 0.502312421798706, "learning_rate": 3.2827740509146667e-06, "loss": 0.0356, "num_input_tokens_seen": 5901040, "step": 65535 }, { "epoch": 17.032224532224532, "grad_norm": 0.18442966043949127, "learning_rate": 3.2799661294573624e-06, "loss": 0.0074, "num_input_tokens_seen": 5901504, "step": 65540 }, { "epoch": 17.033523908523907, "grad_norm": 0.014491647481918335, "learning_rate": 3.277159325082213e-06, "loss": 0.0253, "num_input_tokens_seen": 5901920, "step": 65545 }, { "epoch": 17.034823284823286, "grad_norm": 0.04888612776994705, "learning_rate": 3.2743536379335804e-06, "loss": 0.3545, "num_input_tokens_seen": 5902400, "step": 65550 }, { "epoch": 17.03612266112266, "grad_norm": 0.007073222193866968, "learning_rate": 3.271549068155749e-06, "loss": 0.0005, "num_input_tokens_seen": 5902864, "step": 65555 }, { "epoch": 17.037422037422036, "grad_norm": 0.6230740547180176, "learning_rate": 3.268745615892976e-06, "loss": 0.0016, "num_input_tokens_seen": 5903296, "step": 65560 }, { "epoch": 17.038721413721415, "grad_norm": 0.02158336713910103, "learning_rate": 3.2659432812894296e-06, "loss": 0.0378, "num_input_tokens_seen": 5903728, "step": 65565 }, { "epoch": 17.04002079002079, "grad_norm": 0.03546890988945961, "learning_rate": 3.263142064489247e-06, "loss": 0.0614, "num_input_tokens_seen": 5904176, "step": 65570 }, { "epoch": 17.041320166320165, "grad_norm": 0.03578518331050873, "learning_rate": 3.2603419656364957e-06, "loss": 0.0012, "num_input_tokens_seen": 5904640, "step": 65575 }, { "epoch": 17.042619542619544, "grad_norm": 7.299557685852051, "learning_rate": 3.257542984875192e-06, "loss": 0.0304, "num_input_tokens_seen": 5905072, "step": 65580 }, { "epoch": 17.04391891891892, "grad_norm": 31.236099243164062, "learning_rate": 3.2547451223492786e-06, "loss": 0.1141, "num_input_tokens_seen": 5905504, "step": 65585 }, { "epoch": 17.045218295218294, "grad_norm": 27.25640869140625, "learning_rate": 3.2519483782026654e-06, "loss": 0.3225, "num_input_tokens_seen": 5905920, "step": 65590 }, { "epoch": 17.046517671517673, "grad_norm": 0.023167386651039124, "learning_rate": 3.2491527525791794e-06, "loss": 0.6291, "num_input_tokens_seen": 5906384, "step": 65595 }, { "epoch": 17.04781704781705, "grad_norm": 0.1943470686674118, "learning_rate": 3.2463582456226103e-06, "loss": 0.0037, "num_input_tokens_seen": 5906832, "step": 65600 }, { "epoch": 17.049116424116423, "grad_norm": 0.27175799012184143, "learning_rate": 3.2435648574766776e-06, "loss": 0.0009, "num_input_tokens_seen": 5907312, "step": 65605 }, { "epoch": 17.050415800415802, "grad_norm": 0.07966407388448715, "learning_rate": 3.2407725882850516e-06, "loss": 0.2057, "num_input_tokens_seen": 5907776, "step": 65610 }, { "epoch": 17.051715176715177, "grad_norm": 1.6456347703933716, "learning_rate": 3.2379814381913426e-06, "loss": 0.4297, "num_input_tokens_seen": 5908272, "step": 65615 }, { "epoch": 17.053014553014552, "grad_norm": 0.004768866579979658, "learning_rate": 3.235191407339097e-06, "loss": 0.0081, "num_input_tokens_seen": 5908704, "step": 65620 }, { "epoch": 17.054313929313928, "grad_norm": 0.6136952042579651, "learning_rate": 3.232402495871814e-06, "loss": 0.0015, "num_input_tokens_seen": 5909120, "step": 65625 }, { "epoch": 17.055613305613306, "grad_norm": 70.48396301269531, "learning_rate": 3.2296147039329234e-06, "loss": 0.2552, "num_input_tokens_seen": 5909568, "step": 65630 }, { "epoch": 17.05691268191268, "grad_norm": 0.1848834753036499, "learning_rate": 3.2268280316658127e-06, "loss": 0.0014, "num_input_tokens_seen": 5910016, "step": 65635 }, { "epoch": 17.058212058212057, "grad_norm": 40.78546142578125, "learning_rate": 3.224042479213793e-06, "loss": 0.4583, "num_input_tokens_seen": 5910464, "step": 65640 }, { "epoch": 17.059511434511435, "grad_norm": 0.014126386493444443, "learning_rate": 3.221258046720135e-06, "loss": 0.014, "num_input_tokens_seen": 5910912, "step": 65645 }, { "epoch": 17.06081081081081, "grad_norm": 4.792492389678955, "learning_rate": 3.218474734328042e-06, "loss": 0.143, "num_input_tokens_seen": 5911392, "step": 65650 }, { "epoch": 17.062110187110186, "grad_norm": 0.6975143551826477, "learning_rate": 3.2156925421806677e-06, "loss": 0.0023, "num_input_tokens_seen": 5911840, "step": 65655 }, { "epoch": 17.063409563409564, "grad_norm": 1.237798810005188, "learning_rate": 3.2129114704210957e-06, "loss": 0.0013, "num_input_tokens_seen": 5912288, "step": 65660 }, { "epoch": 17.06470893970894, "grad_norm": 0.5958967208862305, "learning_rate": 3.2101315191923663e-06, "loss": 0.0948, "num_input_tokens_seen": 5912752, "step": 65665 }, { "epoch": 17.066008316008315, "grad_norm": 0.7264894247055054, "learning_rate": 3.2073526886374494e-06, "loss": 0.241, "num_input_tokens_seen": 5913168, "step": 65670 }, { "epoch": 17.067307692307693, "grad_norm": 46.29376220703125, "learning_rate": 3.204574978899261e-06, "loss": 0.0958, "num_input_tokens_seen": 5913648, "step": 65675 }, { "epoch": 17.06860706860707, "grad_norm": 0.02660014107823372, "learning_rate": 3.201798390120664e-06, "loss": 0.0012, "num_input_tokens_seen": 5914064, "step": 65680 }, { "epoch": 17.069906444906444, "grad_norm": 31.98958396911621, "learning_rate": 3.199022922444461e-06, "loss": 0.2371, "num_input_tokens_seen": 5914496, "step": 65685 }, { "epoch": 17.071205821205822, "grad_norm": 61.7490119934082, "learning_rate": 3.196248576013405e-06, "loss": 0.3886, "num_input_tokens_seen": 5914960, "step": 65690 }, { "epoch": 17.072505197505198, "grad_norm": 0.5345394611358643, "learning_rate": 3.193475350970171e-06, "loss": 0.0151, "num_input_tokens_seen": 5915392, "step": 65695 }, { "epoch": 17.073804573804573, "grad_norm": 46.70460510253906, "learning_rate": 3.1907032474573968e-06, "loss": 0.2606, "num_input_tokens_seen": 5915840, "step": 65700 }, { "epoch": 17.07510395010395, "grad_norm": 0.5222121477127075, "learning_rate": 3.1879322656176463e-06, "loss": 0.0078, "num_input_tokens_seen": 5916336, "step": 65705 }, { "epoch": 17.076403326403327, "grad_norm": 0.004080860875546932, "learning_rate": 3.1851624055934447e-06, "loss": 0.0102, "num_input_tokens_seen": 5916800, "step": 65710 }, { "epoch": 17.0777027027027, "grad_norm": 4.673410415649414, "learning_rate": 3.1823936675272393e-06, "loss": 0.0278, "num_input_tokens_seen": 5917280, "step": 65715 }, { "epoch": 17.07900207900208, "grad_norm": 0.029841521754860878, "learning_rate": 3.1796260515614297e-06, "loss": 0.1659, "num_input_tokens_seen": 5917712, "step": 65720 }, { "epoch": 17.080301455301456, "grad_norm": 46.50455093383789, "learning_rate": 3.1768595578383604e-06, "loss": 0.1277, "num_input_tokens_seen": 5918176, "step": 65725 }, { "epoch": 17.08160083160083, "grad_norm": 2.0278432369232178, "learning_rate": 3.1740941865003175e-06, "loss": 0.003, "num_input_tokens_seen": 5918592, "step": 65730 }, { "epoch": 17.08290020790021, "grad_norm": 24.16109275817871, "learning_rate": 3.171329937689524e-06, "loss": 0.0568, "num_input_tokens_seen": 5919072, "step": 65735 }, { "epoch": 17.084199584199585, "grad_norm": 7.903863906860352, "learning_rate": 3.168566811548143e-06, "loss": 0.0107, "num_input_tokens_seen": 5919520, "step": 65740 }, { "epoch": 17.08549896049896, "grad_norm": 1.8271832466125488, "learning_rate": 3.165804808218292e-06, "loss": 0.0908, "num_input_tokens_seen": 5919968, "step": 65745 }, { "epoch": 17.08679833679834, "grad_norm": 0.005473838187754154, "learning_rate": 3.1630439278420188e-06, "loss": 0.3735, "num_input_tokens_seen": 5920416, "step": 65750 }, { "epoch": 17.088097713097714, "grad_norm": 0.08510183542966843, "learning_rate": 3.1602841705613143e-06, "loss": 0.0055, "num_input_tokens_seen": 5920912, "step": 65755 }, { "epoch": 17.08939708939709, "grad_norm": 0.36927640438079834, "learning_rate": 3.1575255365181243e-06, "loss": 0.0286, "num_input_tokens_seen": 5921376, "step": 65760 }, { "epoch": 17.090696465696464, "grad_norm": 0.019607393071055412, "learning_rate": 3.1547680258543295e-06, "loss": 0.2513, "num_input_tokens_seen": 5921792, "step": 65765 }, { "epoch": 17.091995841995843, "grad_norm": 0.2848895788192749, "learning_rate": 3.152011638711741e-06, "loss": 0.0171, "num_input_tokens_seen": 5922224, "step": 65770 }, { "epoch": 17.093295218295218, "grad_norm": 0.31391096115112305, "learning_rate": 3.149256375232132e-06, "loss": 0.2423, "num_input_tokens_seen": 5922704, "step": 65775 }, { "epoch": 17.094594594594593, "grad_norm": 0.002776453038677573, "learning_rate": 3.146502235557197e-06, "loss": 0.0002, "num_input_tokens_seen": 5923168, "step": 65780 }, { "epoch": 17.09589397089397, "grad_norm": 0.304913192987442, "learning_rate": 3.1437492198285985e-06, "loss": 0.0198, "num_input_tokens_seen": 5923616, "step": 65785 }, { "epoch": 17.097193347193347, "grad_norm": 0.1603088527917862, "learning_rate": 3.1409973281879085e-06, "loss": 0.0018, "num_input_tokens_seen": 5924064, "step": 65790 }, { "epoch": 17.098492723492722, "grad_norm": 46.18783950805664, "learning_rate": 3.1382465607766807e-06, "loss": 0.6162, "num_input_tokens_seen": 5924512, "step": 65795 }, { "epoch": 17.0997920997921, "grad_norm": 0.3025359511375427, "learning_rate": 3.1354969177363804e-06, "loss": 0.0072, "num_input_tokens_seen": 5924960, "step": 65800 }, { "epoch": 17.101091476091476, "grad_norm": 0.15554462373256683, "learning_rate": 3.1327483992084154e-06, "loss": 0.0812, "num_input_tokens_seen": 5925424, "step": 65805 }, { "epoch": 17.10239085239085, "grad_norm": 0.13865654170513153, "learning_rate": 3.13000100533416e-06, "loss": 0.3617, "num_input_tokens_seen": 5925872, "step": 65810 }, { "epoch": 17.10369022869023, "grad_norm": 0.08911784738302231, "learning_rate": 3.1272547362549004e-06, "loss": 0.006, "num_input_tokens_seen": 5926304, "step": 65815 }, { "epoch": 17.104989604989605, "grad_norm": 0.034416064620018005, "learning_rate": 3.124509592111888e-06, "loss": 0.0763, "num_input_tokens_seen": 5926768, "step": 65820 }, { "epoch": 17.10628898128898, "grad_norm": 0.030365360900759697, "learning_rate": 3.1217655730463093e-06, "loss": 0.0507, "num_input_tokens_seen": 5927216, "step": 65825 }, { "epoch": 17.10758835758836, "grad_norm": 41.981529235839844, "learning_rate": 3.119022679199293e-06, "loss": 0.0891, "num_input_tokens_seen": 5927680, "step": 65830 }, { "epoch": 17.108887733887734, "grad_norm": 0.21887369453907013, "learning_rate": 3.1162809107118996e-06, "loss": 0.2301, "num_input_tokens_seen": 5928128, "step": 65835 }, { "epoch": 17.11018711018711, "grad_norm": 24.814302444458008, "learning_rate": 3.113540267725154e-06, "loss": 0.0668, "num_input_tokens_seen": 5928576, "step": 65840 }, { "epoch": 17.111486486486488, "grad_norm": 29.015037536621094, "learning_rate": 3.1108007503799967e-06, "loss": 0.081, "num_input_tokens_seen": 5929040, "step": 65845 }, { "epoch": 17.112785862785863, "grad_norm": 0.6715565323829651, "learning_rate": 3.1080623588173372e-06, "loss": 0.053, "num_input_tokens_seen": 5929472, "step": 65850 }, { "epoch": 17.114085239085238, "grad_norm": 0.20893003046512604, "learning_rate": 3.1053250931779993e-06, "loss": 0.002, "num_input_tokens_seen": 5929968, "step": 65855 }, { "epoch": 17.115384615384617, "grad_norm": 0.5025666356086731, "learning_rate": 3.1025889536027696e-06, "loss": 0.1432, "num_input_tokens_seen": 5930416, "step": 65860 }, { "epoch": 17.116683991683992, "grad_norm": 0.01076843123883009, "learning_rate": 3.099853940232378e-06, "loss": 0.1945, "num_input_tokens_seen": 5930912, "step": 65865 }, { "epoch": 17.117983367983367, "grad_norm": 1.0387386083602905, "learning_rate": 3.097120053207475e-06, "loss": 0.0033, "num_input_tokens_seen": 5931392, "step": 65870 }, { "epoch": 17.119282744282746, "grad_norm": 8.234017372131348, "learning_rate": 3.094387292668682e-06, "loss": 0.2383, "num_input_tokens_seen": 5931872, "step": 65875 }, { "epoch": 17.12058212058212, "grad_norm": 63.35744857788086, "learning_rate": 3.0916556587565316e-06, "loss": 0.1565, "num_input_tokens_seen": 5932336, "step": 65880 }, { "epoch": 17.121881496881496, "grad_norm": 48.96257400512695, "learning_rate": 3.08892515161153e-06, "loss": 0.0727, "num_input_tokens_seen": 5932752, "step": 65885 }, { "epoch": 17.123180873180875, "grad_norm": 0.015946699306368828, "learning_rate": 3.0861957713740953e-06, "loss": 0.0006, "num_input_tokens_seen": 5933216, "step": 65890 }, { "epoch": 17.12448024948025, "grad_norm": 4.887509822845459, "learning_rate": 3.0834675181846073e-06, "loss": 0.0066, "num_input_tokens_seen": 5933664, "step": 65895 }, { "epoch": 17.125779625779625, "grad_norm": 0.00952752586454153, "learning_rate": 3.0807403921833873e-06, "loss": 0.0089, "num_input_tokens_seen": 5934160, "step": 65900 }, { "epoch": 17.127079002079004, "grad_norm": 4.2594428062438965, "learning_rate": 3.078014393510695e-06, "loss": 0.0061, "num_input_tokens_seen": 5934624, "step": 65905 }, { "epoch": 17.12837837837838, "grad_norm": 0.0643036887049675, "learning_rate": 3.0752895223067207e-06, "loss": 0.0008, "num_input_tokens_seen": 5935088, "step": 65910 }, { "epoch": 17.129677754677754, "grad_norm": 36.72945022583008, "learning_rate": 3.0725657787116197e-06, "loss": 0.4002, "num_input_tokens_seen": 5935552, "step": 65915 }, { "epoch": 17.13097713097713, "grad_norm": 51.40106201171875, "learning_rate": 3.0698431628654655e-06, "loss": 0.255, "num_input_tokens_seen": 5936000, "step": 65920 }, { "epoch": 17.132276507276508, "grad_norm": 55.52214431762695, "learning_rate": 3.0671216749082936e-06, "loss": 0.3485, "num_input_tokens_seen": 5936432, "step": 65925 }, { "epoch": 17.133575883575883, "grad_norm": 2.2505600452423096, "learning_rate": 3.0644013149800672e-06, "loss": 0.017, "num_input_tokens_seen": 5936864, "step": 65930 }, { "epoch": 17.13487525987526, "grad_norm": 19.522663116455078, "learning_rate": 3.061682083220696e-06, "loss": 0.1453, "num_input_tokens_seen": 5937312, "step": 65935 }, { "epoch": 17.136174636174637, "grad_norm": 2.7182857990264893, "learning_rate": 3.0589639797700408e-06, "loss": 0.0049, "num_input_tokens_seen": 5937760, "step": 65940 }, { "epoch": 17.137474012474012, "grad_norm": 0.4127901792526245, "learning_rate": 3.056247004767887e-06, "loss": 0.018, "num_input_tokens_seen": 5938256, "step": 65945 }, { "epoch": 17.138773388773387, "grad_norm": 0.05249530076980591, "learning_rate": 3.0535311583539812e-06, "loss": 0.1276, "num_input_tokens_seen": 5938720, "step": 65950 }, { "epoch": 17.140072765072766, "grad_norm": 0.7274909615516663, "learning_rate": 3.050816440667989e-06, "loss": 0.0614, "num_input_tokens_seen": 5939168, "step": 65955 }, { "epoch": 17.14137214137214, "grad_norm": 0.015506255440413952, "learning_rate": 3.0481028518495435e-06, "loss": 0.1729, "num_input_tokens_seen": 5939632, "step": 65960 }, { "epoch": 17.142671517671516, "grad_norm": 0.3655087947845459, "learning_rate": 3.045390392038197e-06, "loss": 0.0348, "num_input_tokens_seen": 5940064, "step": 65965 }, { "epoch": 17.143970893970895, "grad_norm": 34.37735366821289, "learning_rate": 3.0426790613734575e-06, "loss": 0.1493, "num_input_tokens_seen": 5940512, "step": 65970 }, { "epoch": 17.14527027027027, "grad_norm": 0.8983282446861267, "learning_rate": 3.039968859994774e-06, "loss": 0.012, "num_input_tokens_seen": 5940944, "step": 65975 }, { "epoch": 17.146569646569645, "grad_norm": 7.745205402374268, "learning_rate": 3.0372597880415355e-06, "loss": 0.0351, "num_input_tokens_seen": 5941376, "step": 65980 }, { "epoch": 17.147869022869024, "grad_norm": 5.53961706161499, "learning_rate": 3.0345518456530665e-06, "loss": 0.1231, "num_input_tokens_seen": 5941856, "step": 65985 }, { "epoch": 17.1491683991684, "grad_norm": 1.6012223958969116, "learning_rate": 3.031845032968647e-06, "loss": 0.0143, "num_input_tokens_seen": 5942352, "step": 65990 }, { "epoch": 17.150467775467774, "grad_norm": 0.4063987731933594, "learning_rate": 3.0291393501274884e-06, "loss": 0.1973, "num_input_tokens_seen": 5942816, "step": 65995 }, { "epoch": 17.151767151767153, "grad_norm": 0.0364639088511467, "learning_rate": 3.026434797268737e-06, "loss": 0.0011, "num_input_tokens_seen": 5943264, "step": 66000 }, { "epoch": 17.153066528066528, "grad_norm": 0.0029326898511499166, "learning_rate": 3.0237313745314992e-06, "loss": 0.3401, "num_input_tokens_seen": 5943728, "step": 66005 }, { "epoch": 17.154365904365903, "grad_norm": 6.027264595031738, "learning_rate": 3.021029082054813e-06, "loss": 0.6025, "num_input_tokens_seen": 5944160, "step": 66010 }, { "epoch": 17.155665280665282, "grad_norm": 1.8868578672409058, "learning_rate": 3.018327919977665e-06, "loss": 0.3436, "num_input_tokens_seen": 5944608, "step": 66015 }, { "epoch": 17.156964656964657, "grad_norm": 0.017841672524809837, "learning_rate": 3.015627888438971e-06, "loss": 0.0003, "num_input_tokens_seen": 5945040, "step": 66020 }, { "epoch": 17.158264033264032, "grad_norm": 1.2941383123397827, "learning_rate": 3.0129289875776013e-06, "loss": 0.0117, "num_input_tokens_seen": 5945504, "step": 66025 }, { "epoch": 17.15956340956341, "grad_norm": 3.5655243396759033, "learning_rate": 3.0102312175323556e-06, "loss": 0.0354, "num_input_tokens_seen": 5945984, "step": 66030 }, { "epoch": 17.160862785862786, "grad_norm": 0.023798083886504173, "learning_rate": 3.007534578441995e-06, "loss": 0.0003, "num_input_tokens_seen": 5946416, "step": 66035 }, { "epoch": 17.16216216216216, "grad_norm": 0.8028311729431152, "learning_rate": 3.004839070445192e-06, "loss": 0.1829, "num_input_tokens_seen": 5946816, "step": 66040 }, { "epoch": 17.16346153846154, "grad_norm": 0.7195708751678467, "learning_rate": 3.0021446936806e-06, "loss": 0.0535, "num_input_tokens_seen": 5947280, "step": 66045 }, { "epoch": 17.164760914760915, "grad_norm": 0.04828266054391861, "learning_rate": 2.9994514482867803e-06, "loss": 0.0023, "num_input_tokens_seen": 5947744, "step": 66050 }, { "epoch": 17.16606029106029, "grad_norm": 10.395332336425781, "learning_rate": 2.9967593344022576e-06, "loss": 0.0161, "num_input_tokens_seen": 5948224, "step": 66055 }, { "epoch": 17.16735966735967, "grad_norm": 0.005414127837866545, "learning_rate": 2.994068352165483e-06, "loss": 0.0754, "num_input_tokens_seen": 5948656, "step": 66060 }, { "epoch": 17.168659043659044, "grad_norm": 0.002426563762128353, "learning_rate": 2.991378501714856e-06, "loss": 0.3355, "num_input_tokens_seen": 5949088, "step": 66065 }, { "epoch": 17.16995841995842, "grad_norm": 36.550472259521484, "learning_rate": 2.9886897831887166e-06, "loss": 0.2814, "num_input_tokens_seen": 5949536, "step": 66070 }, { "epoch": 17.171257796257795, "grad_norm": 12.063055992126465, "learning_rate": 2.9860021967253543e-06, "loss": 0.0244, "num_input_tokens_seen": 5949984, "step": 66075 }, { "epoch": 17.172557172557173, "grad_norm": 0.11182692646980286, "learning_rate": 2.983315742462997e-06, "loss": 0.161, "num_input_tokens_seen": 5950464, "step": 66080 }, { "epoch": 17.17385654885655, "grad_norm": 0.07549615204334259, "learning_rate": 2.9806304205397986e-06, "loss": 0.1979, "num_input_tokens_seen": 5950944, "step": 66085 }, { "epoch": 17.175155925155924, "grad_norm": 57.91401290893555, "learning_rate": 2.977946231093884e-06, "loss": 0.2599, "num_input_tokens_seen": 5951376, "step": 66090 }, { "epoch": 17.176455301455302, "grad_norm": 0.014510144479572773, "learning_rate": 2.9752631742632876e-06, "loss": 0.0336, "num_input_tokens_seen": 5951840, "step": 66095 }, { "epoch": 17.177754677754677, "grad_norm": 0.08852934092283249, "learning_rate": 2.9725812501860157e-06, "loss": 0.0616, "num_input_tokens_seen": 5952288, "step": 66100 }, { "epoch": 17.179054054054053, "grad_norm": 55.83833694458008, "learning_rate": 2.9699004589999913e-06, "loss": 0.466, "num_input_tokens_seen": 5952736, "step": 66105 }, { "epoch": 17.18035343035343, "grad_norm": 0.6515964865684509, "learning_rate": 2.9672208008430928e-06, "loss": 0.0903, "num_input_tokens_seen": 5953216, "step": 66110 }, { "epoch": 17.181652806652806, "grad_norm": 0.2091246396303177, "learning_rate": 2.964542275853141e-06, "loss": 0.0376, "num_input_tokens_seen": 5953696, "step": 66115 }, { "epoch": 17.18295218295218, "grad_norm": 0.038386713713407516, "learning_rate": 2.9618648841678977e-06, "loss": 0.2829, "num_input_tokens_seen": 5954144, "step": 66120 }, { "epoch": 17.18425155925156, "grad_norm": 1.8247910737991333, "learning_rate": 2.9591886259250605e-06, "loss": 0.0054, "num_input_tokens_seen": 5954576, "step": 66125 }, { "epoch": 17.185550935550935, "grad_norm": 79.69412231445312, "learning_rate": 2.956513501262265e-06, "loss": 0.1577, "num_input_tokens_seen": 5955024, "step": 66130 }, { "epoch": 17.18685031185031, "grad_norm": 0.3316718637943268, "learning_rate": 2.953839510317105e-06, "loss": 0.0043, "num_input_tokens_seen": 5955456, "step": 66135 }, { "epoch": 17.18814968814969, "grad_norm": 0.5469847917556763, "learning_rate": 2.951166653227097e-06, "loss": 0.0067, "num_input_tokens_seen": 5955888, "step": 66140 }, { "epoch": 17.189449064449065, "grad_norm": 53.047584533691406, "learning_rate": 2.9484949301297166e-06, "loss": 0.1678, "num_input_tokens_seen": 5956320, "step": 66145 }, { "epoch": 17.19074844074844, "grad_norm": 0.45498067140579224, "learning_rate": 2.9458243411623677e-06, "loss": 0.0243, "num_input_tokens_seen": 5956768, "step": 66150 }, { "epoch": 17.19204781704782, "grad_norm": 0.05152153596282005, "learning_rate": 2.9431548864624127e-06, "loss": 0.0586, "num_input_tokens_seen": 5957200, "step": 66155 }, { "epoch": 17.193347193347194, "grad_norm": 77.62367248535156, "learning_rate": 2.940486566167128e-06, "loss": 0.3531, "num_input_tokens_seen": 5957696, "step": 66160 }, { "epoch": 17.19464656964657, "grad_norm": 4.275397777557373, "learning_rate": 2.9378193804137617e-06, "loss": 0.0778, "num_input_tokens_seen": 5958128, "step": 66165 }, { "epoch": 17.195945945945947, "grad_norm": 0.5842382311820984, "learning_rate": 2.9351533293394797e-06, "loss": 0.0023, "num_input_tokens_seen": 5958592, "step": 66170 }, { "epoch": 17.197245322245323, "grad_norm": 0.020553655922412872, "learning_rate": 2.932488413081408e-06, "loss": 0.0769, "num_input_tokens_seen": 5959056, "step": 66175 }, { "epoch": 17.198544698544698, "grad_norm": 43.71327209472656, "learning_rate": 2.9298246317765954e-06, "loss": 0.2071, "num_input_tokens_seen": 5959504, "step": 66180 }, { "epoch": 17.199844074844076, "grad_norm": 0.3908367455005646, "learning_rate": 2.9271619855620493e-06, "loss": 0.2071, "num_input_tokens_seen": 5959952, "step": 66185 }, { "epoch": 17.20114345114345, "grad_norm": 0.014270184561610222, "learning_rate": 2.924500474574715e-06, "loss": 0.0002, "num_input_tokens_seen": 5960432, "step": 66190 }, { "epoch": 17.202442827442827, "grad_norm": 14.191758155822754, "learning_rate": 2.92184009895147e-06, "loss": 0.0907, "num_input_tokens_seen": 5960832, "step": 66195 }, { "epoch": 17.203742203742205, "grad_norm": 0.31864267587661743, "learning_rate": 2.919180858829146e-06, "loss": 0.018, "num_input_tokens_seen": 5961312, "step": 66200 }, { "epoch": 17.20504158004158, "grad_norm": 0.03105732798576355, "learning_rate": 2.916522754344503e-06, "loss": 0.4611, "num_input_tokens_seen": 5961760, "step": 66205 }, { "epoch": 17.206340956340956, "grad_norm": 3.2896056175231934, "learning_rate": 2.9138657856342596e-06, "loss": 0.166, "num_input_tokens_seen": 5962208, "step": 66210 }, { "epoch": 17.20764033264033, "grad_norm": 4.1426544189453125, "learning_rate": 2.911209952835056e-06, "loss": 0.0039, "num_input_tokens_seen": 5962688, "step": 66215 }, { "epoch": 17.20893970893971, "grad_norm": 7.605896472930908, "learning_rate": 2.9085552560834894e-06, "loss": 0.084, "num_input_tokens_seen": 5963136, "step": 66220 }, { "epoch": 17.210239085239085, "grad_norm": 1.164795994758606, "learning_rate": 2.9059016955160916e-06, "loss": 0.0253, "num_input_tokens_seen": 5963568, "step": 66225 }, { "epoch": 17.21153846153846, "grad_norm": 42.81319046020508, "learning_rate": 2.9032492712693426e-06, "loss": 0.0637, "num_input_tokens_seen": 5964048, "step": 66230 }, { "epoch": 17.21283783783784, "grad_norm": 0.026075096800923347, "learning_rate": 2.9005979834796555e-06, "loss": 0.0412, "num_input_tokens_seen": 5964512, "step": 66235 }, { "epoch": 17.214137214137214, "grad_norm": 19.504940032958984, "learning_rate": 2.8979478322833902e-06, "loss": 0.1818, "num_input_tokens_seen": 5964960, "step": 66240 }, { "epoch": 17.21543659043659, "grad_norm": 0.5433707237243652, "learning_rate": 2.895298817816841e-06, "loss": 0.2143, "num_input_tokens_seen": 5965392, "step": 66245 }, { "epoch": 17.216735966735968, "grad_norm": 8.874675750732422, "learning_rate": 2.892650940216257e-06, "loss": 0.1141, "num_input_tokens_seen": 5965856, "step": 66250 }, { "epoch": 17.218035343035343, "grad_norm": 0.8917912244796753, "learning_rate": 2.890004199617813e-06, "loss": 0.0093, "num_input_tokens_seen": 5966288, "step": 66255 }, { "epoch": 17.219334719334718, "grad_norm": 27.15206527709961, "learning_rate": 2.887358596157638e-06, "loss": 0.1819, "num_input_tokens_seen": 5966704, "step": 66260 }, { "epoch": 17.220634095634097, "grad_norm": 0.05639604851603508, "learning_rate": 2.884714129971805e-06, "loss": 0.0348, "num_input_tokens_seen": 5967168, "step": 66265 }, { "epoch": 17.221933471933472, "grad_norm": 46.78656768798828, "learning_rate": 2.8820708011963073e-06, "loss": 0.5004, "num_input_tokens_seen": 5967600, "step": 66270 }, { "epoch": 17.223232848232847, "grad_norm": 0.005346811842173338, "learning_rate": 2.879428609967105e-06, "loss": 0.2119, "num_input_tokens_seen": 5968048, "step": 66275 }, { "epoch": 17.224532224532226, "grad_norm": 0.008572855032980442, "learning_rate": 2.8767875564200795e-06, "loss": 0.2355, "num_input_tokens_seen": 5968464, "step": 66280 }, { "epoch": 17.2258316008316, "grad_norm": 0.007382375653833151, "learning_rate": 2.8741476406910715e-06, "loss": 0.1685, "num_input_tokens_seen": 5968928, "step": 66285 }, { "epoch": 17.227130977130976, "grad_norm": 22.837020874023438, "learning_rate": 2.8715088629158422e-06, "loss": 0.3811, "num_input_tokens_seen": 5969376, "step": 66290 }, { "epoch": 17.228430353430355, "grad_norm": 0.034322235733270645, "learning_rate": 2.868871223230124e-06, "loss": 0.5555, "num_input_tokens_seen": 5969840, "step": 66295 }, { "epoch": 17.22972972972973, "grad_norm": 62.01904296875, "learning_rate": 2.866234721769559e-06, "loss": 0.1492, "num_input_tokens_seen": 5970304, "step": 66300 }, { "epoch": 17.231029106029105, "grad_norm": 1.0809780359268188, "learning_rate": 2.8635993586697553e-06, "loss": 0.0069, "num_input_tokens_seen": 5970736, "step": 66305 }, { "epoch": 17.232328482328484, "grad_norm": 0.028208734467625618, "learning_rate": 2.8609651340662403e-06, "loss": 0.0247, "num_input_tokens_seen": 5971200, "step": 66310 }, { "epoch": 17.23362785862786, "grad_norm": 2.279043436050415, "learning_rate": 2.858332048094506e-06, "loss": 0.0019, "num_input_tokens_seen": 5971696, "step": 66315 }, { "epoch": 17.234927234927234, "grad_norm": 18.68308448791504, "learning_rate": 2.855700100889966e-06, "loss": 0.1645, "num_input_tokens_seen": 5972128, "step": 66320 }, { "epoch": 17.236226611226613, "grad_norm": 39.79240417480469, "learning_rate": 2.8530692925879872e-06, "loss": 0.0541, "num_input_tokens_seen": 5972624, "step": 66325 }, { "epoch": 17.237525987525988, "grad_norm": 48.48699188232422, "learning_rate": 2.850439623323878e-06, "loss": 0.1146, "num_input_tokens_seen": 5973120, "step": 66330 }, { "epoch": 17.238825363825363, "grad_norm": 0.005277839954942465, "learning_rate": 2.8478110932328773e-06, "loss": 0.1894, "num_input_tokens_seen": 5973568, "step": 66335 }, { "epoch": 17.24012474012474, "grad_norm": 0.08948664367198944, "learning_rate": 2.8451837024501826e-06, "loss": 0.0016, "num_input_tokens_seen": 5974016, "step": 66340 }, { "epoch": 17.241424116424117, "grad_norm": 11.392789840698242, "learning_rate": 2.842557451110914e-06, "loss": 0.084, "num_input_tokens_seen": 5974496, "step": 66345 }, { "epoch": 17.242723492723492, "grad_norm": 33.123939514160156, "learning_rate": 2.8399323393501514e-06, "loss": 0.1607, "num_input_tokens_seen": 5974960, "step": 66350 }, { "epoch": 17.24402286902287, "grad_norm": 32.5616455078125, "learning_rate": 2.8373083673028934e-06, "loss": 0.3062, "num_input_tokens_seen": 5975376, "step": 66355 }, { "epoch": 17.245322245322246, "grad_norm": 13.500175476074219, "learning_rate": 2.8346855351041036e-06, "loss": 0.1477, "num_input_tokens_seen": 5975808, "step": 66360 }, { "epoch": 17.24662162162162, "grad_norm": 0.2153995931148529, "learning_rate": 2.8320638428886742e-06, "loss": 0.0091, "num_input_tokens_seen": 5976256, "step": 66365 }, { "epoch": 17.247920997920996, "grad_norm": 0.38291579484939575, "learning_rate": 2.829443290791445e-06, "loss": 0.021, "num_input_tokens_seen": 5976704, "step": 66370 }, { "epoch": 17.249220374220375, "grad_norm": 0.1708667129278183, "learning_rate": 2.826823878947188e-06, "loss": 0.0072, "num_input_tokens_seen": 5977136, "step": 66375 }, { "epoch": 17.25051975051975, "grad_norm": 0.006176898255944252, "learning_rate": 2.8242056074906266e-06, "loss": 0.0007, "num_input_tokens_seen": 5977616, "step": 66380 }, { "epoch": 17.251819126819125, "grad_norm": 0.09699735790491104, "learning_rate": 2.8215884765564193e-06, "loss": 0.1284, "num_input_tokens_seen": 5978064, "step": 66385 }, { "epoch": 17.253118503118504, "grad_norm": 0.03938796743750572, "learning_rate": 2.8189724862791617e-06, "loss": 0.1186, "num_input_tokens_seen": 5978496, "step": 66390 }, { "epoch": 17.25441787941788, "grad_norm": 0.36940819025039673, "learning_rate": 2.8163576367934042e-06, "loss": 0.1408, "num_input_tokens_seen": 5978944, "step": 66395 }, { "epoch": 17.255717255717254, "grad_norm": 23.125240325927734, "learning_rate": 2.813743928233625e-06, "loss": 0.0574, "num_input_tokens_seen": 5979376, "step": 66400 }, { "epoch": 17.257016632016633, "grad_norm": 33.08842849731445, "learning_rate": 2.8111313607342623e-06, "loss": 0.1226, "num_input_tokens_seen": 5979808, "step": 66405 }, { "epoch": 17.258316008316008, "grad_norm": 0.029619434848427773, "learning_rate": 2.808519934429668e-06, "loss": 0.4184, "num_input_tokens_seen": 5980256, "step": 66410 }, { "epoch": 17.259615384615383, "grad_norm": 0.7232184410095215, "learning_rate": 2.8059096494541607e-06, "loss": 0.0039, "num_input_tokens_seen": 5980704, "step": 66415 }, { "epoch": 17.260914760914762, "grad_norm": 8.395977020263672, "learning_rate": 2.80330050594198e-06, "loss": 0.0074, "num_input_tokens_seen": 5981184, "step": 66420 }, { "epoch": 17.262214137214137, "grad_norm": 31.683393478393555, "learning_rate": 2.8006925040273275e-06, "loss": 0.0494, "num_input_tokens_seen": 5981616, "step": 66425 }, { "epoch": 17.263513513513512, "grad_norm": 0.05791200324892998, "learning_rate": 2.7980856438443255e-06, "loss": 0.0324, "num_input_tokens_seen": 5982064, "step": 66430 }, { "epoch": 17.26481288981289, "grad_norm": 90.17208099365234, "learning_rate": 2.7954799255270502e-06, "loss": 0.2963, "num_input_tokens_seen": 5982512, "step": 66435 }, { "epoch": 17.266112266112266, "grad_norm": 1.0273430347442627, "learning_rate": 2.79287534920952e-06, "loss": 0.1814, "num_input_tokens_seen": 5982960, "step": 66440 }, { "epoch": 17.26741164241164, "grad_norm": 0.08663394302129745, "learning_rate": 2.790271915025691e-06, "loss": 0.16, "num_input_tokens_seen": 5983408, "step": 66445 }, { "epoch": 17.26871101871102, "grad_norm": 13.342482566833496, "learning_rate": 2.7876696231094596e-06, "loss": 0.0243, "num_input_tokens_seen": 5983872, "step": 66450 }, { "epoch": 17.270010395010395, "grad_norm": 46.02462387084961, "learning_rate": 2.785068473594657e-06, "loss": 0.1228, "num_input_tokens_seen": 5984336, "step": 66455 }, { "epoch": 17.27130977130977, "grad_norm": 0.049618132412433624, "learning_rate": 2.7824684666150706e-06, "loss": 0.5128, "num_input_tokens_seen": 5984816, "step": 66460 }, { "epoch": 17.27260914760915, "grad_norm": 0.00046198273776099086, "learning_rate": 2.7798696023044163e-06, "loss": 0.0082, "num_input_tokens_seen": 5985264, "step": 66465 }, { "epoch": 17.273908523908524, "grad_norm": 0.9414069652557373, "learning_rate": 2.777271880796359e-06, "loss": 0.0534, "num_input_tokens_seen": 5985680, "step": 66470 }, { "epoch": 17.2752079002079, "grad_norm": 0.003418227192014456, "learning_rate": 2.7746753022244996e-06, "loss": 0.0791, "num_input_tokens_seen": 5986096, "step": 66475 }, { "epoch": 17.276507276507278, "grad_norm": 0.0052238209173083305, "learning_rate": 2.7720798667223934e-06, "loss": 0.0006, "num_input_tokens_seen": 5986512, "step": 66480 }, { "epoch": 17.277806652806653, "grad_norm": 48.26325988769531, "learning_rate": 2.7694855744235083e-06, "loss": 0.1065, "num_input_tokens_seen": 5987008, "step": 66485 }, { "epoch": 17.27910602910603, "grad_norm": 0.3878176510334015, "learning_rate": 2.766892425461287e-06, "loss": 0.0225, "num_input_tokens_seen": 5987456, "step": 66490 }, { "epoch": 17.280405405405407, "grad_norm": 17.076156616210938, "learning_rate": 2.7643004199690873e-06, "loss": 0.0369, "num_input_tokens_seen": 5987920, "step": 66495 }, { "epoch": 17.281704781704782, "grad_norm": 42.790992736816406, "learning_rate": 2.761709558080225e-06, "loss": 0.102, "num_input_tokens_seen": 5988368, "step": 66500 }, { "epoch": 17.283004158004157, "grad_norm": 21.72791862487793, "learning_rate": 2.759119839927943e-06, "loss": 0.1956, "num_input_tokens_seen": 5988800, "step": 66505 }, { "epoch": 17.284303534303533, "grad_norm": 0.0018968046642839909, "learning_rate": 2.75653126564544e-06, "loss": 0.1103, "num_input_tokens_seen": 5989264, "step": 66510 }, { "epoch": 17.28560291060291, "grad_norm": 0.005004378501325846, "learning_rate": 2.753943835365849e-06, "loss": 0.1675, "num_input_tokens_seen": 5989728, "step": 66515 }, { "epoch": 17.286902286902286, "grad_norm": 50.06737518310547, "learning_rate": 2.7513575492222387e-06, "loss": 0.5364, "num_input_tokens_seen": 5990208, "step": 66520 }, { "epoch": 17.28820166320166, "grad_norm": 0.004447632934898138, "learning_rate": 2.7487724073476327e-06, "loss": 0.2021, "num_input_tokens_seen": 5990656, "step": 66525 }, { "epoch": 17.28950103950104, "grad_norm": 0.11663271486759186, "learning_rate": 2.746188409874975e-06, "loss": 0.0014, "num_input_tokens_seen": 5991104, "step": 66530 }, { "epoch": 17.290800415800415, "grad_norm": 3.2647488117218018, "learning_rate": 2.743605556937176e-06, "loss": 0.178, "num_input_tokens_seen": 5991568, "step": 66535 }, { "epoch": 17.29209979209979, "grad_norm": 0.033136434853076935, "learning_rate": 2.741023848667057e-06, "loss": 0.0489, "num_input_tokens_seen": 5992032, "step": 66540 }, { "epoch": 17.29339916839917, "grad_norm": 0.054113615304231644, "learning_rate": 2.73844328519742e-06, "loss": 0.0019, "num_input_tokens_seen": 5992544, "step": 66545 }, { "epoch": 17.294698544698544, "grad_norm": 38.96712112426758, "learning_rate": 2.7358638666609702e-06, "loss": 0.0858, "num_input_tokens_seen": 5993008, "step": 66550 }, { "epoch": 17.29599792099792, "grad_norm": 0.09156083315610886, "learning_rate": 2.7332855931903794e-06, "loss": 0.0258, "num_input_tokens_seen": 5993456, "step": 66555 }, { "epoch": 17.2972972972973, "grad_norm": 0.22684380412101746, "learning_rate": 2.7307084649182385e-06, "loss": 0.0023, "num_input_tokens_seen": 5993872, "step": 66560 }, { "epoch": 17.298596673596673, "grad_norm": 44.471988677978516, "learning_rate": 2.728132481977105e-06, "loss": 0.3456, "num_input_tokens_seen": 5994336, "step": 66565 }, { "epoch": 17.29989604989605, "grad_norm": 0.006887953262776136, "learning_rate": 2.725557644499452e-06, "loss": 0.0002, "num_input_tokens_seen": 5994800, "step": 66570 }, { "epoch": 17.301195426195427, "grad_norm": 0.6777684688568115, "learning_rate": 2.722983952617714e-06, "loss": 0.0153, "num_input_tokens_seen": 5995248, "step": 66575 }, { "epoch": 17.302494802494802, "grad_norm": 0.000494167732540518, "learning_rate": 2.72041140646426e-06, "loss": 0.0893, "num_input_tokens_seen": 5995712, "step": 66580 }, { "epoch": 17.303794178794178, "grad_norm": 0.5540873408317566, "learning_rate": 2.7178400061713875e-06, "loss": 0.0617, "num_input_tokens_seen": 5996160, "step": 66585 }, { "epoch": 17.305093555093556, "grad_norm": 0.22554834187030792, "learning_rate": 2.7152697518713603e-06, "loss": 0.0458, "num_input_tokens_seen": 5996624, "step": 66590 }, { "epoch": 17.30639293139293, "grad_norm": 0.007869650609791279, "learning_rate": 2.7127006436963557e-06, "loss": 0.0002, "num_input_tokens_seen": 5997040, "step": 66595 }, { "epoch": 17.307692307692307, "grad_norm": 28.496355056762695, "learning_rate": 2.710132681778518e-06, "loss": 0.0369, "num_input_tokens_seen": 5997504, "step": 66600 }, { "epoch": 17.308991683991685, "grad_norm": 0.1308044046163559, "learning_rate": 2.707565866249909e-06, "loss": 0.009, "num_input_tokens_seen": 5997936, "step": 66605 }, { "epoch": 17.31029106029106, "grad_norm": 1.7114064693450928, "learning_rate": 2.7050001972425463e-06, "loss": 0.5127, "num_input_tokens_seen": 5998352, "step": 66610 }, { "epoch": 17.311590436590436, "grad_norm": 0.40036001801490784, "learning_rate": 2.7024356748883844e-06, "loss": 0.0033, "num_input_tokens_seen": 5998784, "step": 66615 }, { "epoch": 17.312889812889814, "grad_norm": 0.019716808572411537, "learning_rate": 2.6998722993193282e-06, "loss": 0.2171, "num_input_tokens_seen": 5999232, "step": 66620 }, { "epoch": 17.31418918918919, "grad_norm": 0.0015213999431580305, "learning_rate": 2.6973100706672e-06, "loss": 0.1586, "num_input_tokens_seen": 5999680, "step": 66625 }, { "epoch": 17.315488565488565, "grad_norm": 9.475445747375488, "learning_rate": 2.6947489890637887e-06, "loss": 0.0263, "num_input_tokens_seen": 6000112, "step": 66630 }, { "epoch": 17.316787941787943, "grad_norm": 2.391602039337158, "learning_rate": 2.6921890546408034e-06, "loss": 0.1367, "num_input_tokens_seen": 6000576, "step": 66635 }, { "epoch": 17.31808731808732, "grad_norm": 0.01158206071704626, "learning_rate": 2.6896302675299134e-06, "loss": 0.0313, "num_input_tokens_seen": 6001008, "step": 66640 }, { "epoch": 17.319386694386694, "grad_norm": 0.004851041827350855, "learning_rate": 2.687072627862713e-06, "loss": 0.0003, "num_input_tokens_seen": 6001472, "step": 66645 }, { "epoch": 17.320686070686072, "grad_norm": 0.0038931844756007195, "learning_rate": 2.6845161357707454e-06, "loss": 0.0012, "num_input_tokens_seen": 6001936, "step": 66650 }, { "epoch": 17.321985446985448, "grad_norm": 0.001099331653676927, "learning_rate": 2.6819607913855017e-06, "loss": 0.029, "num_input_tokens_seen": 6002400, "step": 66655 }, { "epoch": 17.323284823284823, "grad_norm": 0.1621357798576355, "learning_rate": 2.679406594838391e-06, "loss": 0.0089, "num_input_tokens_seen": 6002848, "step": 66660 }, { "epoch": 17.3245841995842, "grad_norm": 65.25855255126953, "learning_rate": 2.676853546260791e-06, "loss": 0.4311, "num_input_tokens_seen": 6003328, "step": 66665 }, { "epoch": 17.325883575883577, "grad_norm": 14.185497283935547, "learning_rate": 2.674301645783997e-06, "loss": 0.0326, "num_input_tokens_seen": 6003792, "step": 66670 }, { "epoch": 17.32718295218295, "grad_norm": 0.0021905486937612295, "learning_rate": 2.6717508935392682e-06, "loss": 0.0064, "num_input_tokens_seen": 6004240, "step": 66675 }, { "epoch": 17.328482328482327, "grad_norm": 5.979043006896973, "learning_rate": 2.669201289657777e-06, "loss": 0.1533, "num_input_tokens_seen": 6004688, "step": 66680 }, { "epoch": 17.329781704781706, "grad_norm": 0.08731409907341003, "learning_rate": 2.666652834270661e-06, "loss": 0.0176, "num_input_tokens_seen": 6005088, "step": 66685 }, { "epoch": 17.33108108108108, "grad_norm": 0.00694042956456542, "learning_rate": 2.6641055275089866e-06, "loss": 0.1569, "num_input_tokens_seen": 6005536, "step": 66690 }, { "epoch": 17.332380457380456, "grad_norm": 5.724339962005615, "learning_rate": 2.6615593695037745e-06, "loss": 0.1518, "num_input_tokens_seen": 6005984, "step": 66695 }, { "epoch": 17.333679833679835, "grad_norm": 72.91517639160156, "learning_rate": 2.6590143603859614e-06, "loss": 0.472, "num_input_tokens_seen": 6006416, "step": 66700 }, { "epoch": 17.33497920997921, "grad_norm": 10.629586219787598, "learning_rate": 2.656470500286451e-06, "loss": 0.4773, "num_input_tokens_seen": 6006896, "step": 66705 }, { "epoch": 17.336278586278585, "grad_norm": 0.09098358452320099, "learning_rate": 2.6539277893360692e-06, "loss": 0.0116, "num_input_tokens_seen": 6007344, "step": 66710 }, { "epoch": 17.337577962577964, "grad_norm": 0.06634046882390976, "learning_rate": 2.6513862276655894e-06, "loss": 0.1902, "num_input_tokens_seen": 6007776, "step": 66715 }, { "epoch": 17.33887733887734, "grad_norm": 14.950343132019043, "learning_rate": 2.648845815405729e-06, "loss": 0.2588, "num_input_tokens_seen": 6008224, "step": 66720 }, { "epoch": 17.340176715176714, "grad_norm": 26.950183868408203, "learning_rate": 2.6463065526871445e-06, "loss": 0.0464, "num_input_tokens_seen": 6008688, "step": 66725 }, { "epoch": 17.341476091476093, "grad_norm": 0.004949300549924374, "learning_rate": 2.6437684396404344e-06, "loss": 0.1269, "num_input_tokens_seen": 6009200, "step": 66730 }, { "epoch": 17.342775467775468, "grad_norm": 0.1158275380730629, "learning_rate": 2.6412314763961332e-06, "loss": 0.3687, "num_input_tokens_seen": 6009632, "step": 66735 }, { "epoch": 17.344074844074843, "grad_norm": 0.11704689264297485, "learning_rate": 2.6386956630847226e-06, "loss": 0.0282, "num_input_tokens_seen": 6010096, "step": 66740 }, { "epoch": 17.34537422037422, "grad_norm": 0.749038815498352, "learning_rate": 2.636160999836615e-06, "loss": 0.1131, "num_input_tokens_seen": 6010528, "step": 66745 }, { "epoch": 17.346673596673597, "grad_norm": 54.87866973876953, "learning_rate": 2.6336274867821808e-06, "loss": 0.0364, "num_input_tokens_seen": 6011024, "step": 66750 }, { "epoch": 17.347972972972972, "grad_norm": 0.007881004363298416, "learning_rate": 2.6310951240517107e-06, "loss": 0.0168, "num_input_tokens_seen": 6011488, "step": 66755 }, { "epoch": 17.34927234927235, "grad_norm": 0.004326158203184605, "learning_rate": 2.62856391177545e-06, "loss": 0.0732, "num_input_tokens_seen": 6011952, "step": 66760 }, { "epoch": 17.350571725571726, "grad_norm": 10.217530250549316, "learning_rate": 2.626033850083584e-06, "loss": 0.0471, "num_input_tokens_seen": 6012432, "step": 66765 }, { "epoch": 17.3518711018711, "grad_norm": 3.005856513977051, "learning_rate": 2.623504939106239e-06, "loss": 0.0495, "num_input_tokens_seen": 6012848, "step": 66770 }, { "epoch": 17.35317047817048, "grad_norm": 0.37420451641082764, "learning_rate": 2.620977178973474e-06, "loss": 0.0119, "num_input_tokens_seen": 6013312, "step": 66775 }, { "epoch": 17.354469854469855, "grad_norm": 0.05876799672842026, "learning_rate": 2.6184505698152922e-06, "loss": 0.026, "num_input_tokens_seen": 6013744, "step": 66780 }, { "epoch": 17.35576923076923, "grad_norm": 0.0036263882648199797, "learning_rate": 2.615925111761647e-06, "loss": 0.1835, "num_input_tokens_seen": 6014192, "step": 66785 }, { "epoch": 17.35706860706861, "grad_norm": 0.26505231857299805, "learning_rate": 2.613400804942412e-06, "loss": 0.0186, "num_input_tokens_seen": 6014592, "step": 66790 }, { "epoch": 17.358367983367984, "grad_norm": 0.09122191369533539, "learning_rate": 2.610877649487431e-06, "loss": 0.2492, "num_input_tokens_seen": 6015040, "step": 66795 }, { "epoch": 17.35966735966736, "grad_norm": 1.9474209547042847, "learning_rate": 2.6083556455264613e-06, "loss": 0.0145, "num_input_tokens_seen": 6015536, "step": 66800 }, { "epoch": 17.360966735966738, "grad_norm": 0.02428685687482357, "learning_rate": 2.6058347931892187e-06, "loss": 0.0042, "num_input_tokens_seen": 6015984, "step": 66805 }, { "epoch": 17.362266112266113, "grad_norm": 38.3499755859375, "learning_rate": 2.6033150926053463e-06, "loss": 0.0913, "num_input_tokens_seen": 6016416, "step": 66810 }, { "epoch": 17.363565488565488, "grad_norm": 0.13214866816997528, "learning_rate": 2.6007965439044436e-06, "loss": 0.0013, "num_input_tokens_seen": 6016880, "step": 66815 }, { "epoch": 17.364864864864863, "grad_norm": 1.8656264543533325, "learning_rate": 2.598279147216029e-06, "loss": 0.0241, "num_input_tokens_seen": 6017360, "step": 66820 }, { "epoch": 17.366164241164242, "grad_norm": 1.802897334098816, "learning_rate": 2.595762902669585e-06, "loss": 0.0865, "num_input_tokens_seen": 6017808, "step": 66825 }, { "epoch": 17.367463617463617, "grad_norm": 0.3334549367427826, "learning_rate": 2.5932478103945196e-06, "loss": 0.0395, "num_input_tokens_seen": 6018240, "step": 66830 }, { "epoch": 17.368762993762992, "grad_norm": 0.12761539220809937, "learning_rate": 2.5907338705201954e-06, "loss": 0.0014, "num_input_tokens_seen": 6018768, "step": 66835 }, { "epoch": 17.37006237006237, "grad_norm": 10.28475570678711, "learning_rate": 2.5882210831758987e-06, "loss": 0.0102, "num_input_tokens_seen": 6019200, "step": 66840 }, { "epoch": 17.371361746361746, "grad_norm": 0.0029666549526154995, "learning_rate": 2.585709448490858e-06, "loss": 0.0399, "num_input_tokens_seen": 6019648, "step": 66845 }, { "epoch": 17.37266112266112, "grad_norm": 0.7788248062133789, "learning_rate": 2.5831989665942664e-06, "loss": 0.0331, "num_input_tokens_seen": 6020080, "step": 66850 }, { "epoch": 17.3739604989605, "grad_norm": 68.7806167602539, "learning_rate": 2.5806896376152218e-06, "loss": 0.4054, "num_input_tokens_seen": 6020544, "step": 66855 }, { "epoch": 17.375259875259875, "grad_norm": 0.16935712099075317, "learning_rate": 2.578181461682794e-06, "loss": 0.2002, "num_input_tokens_seen": 6020992, "step": 66860 }, { "epoch": 17.37655925155925, "grad_norm": 90.17131042480469, "learning_rate": 2.5756744389259734e-06, "loss": 0.3296, "num_input_tokens_seen": 6021440, "step": 66865 }, { "epoch": 17.37785862785863, "grad_norm": 14.26917839050293, "learning_rate": 2.573168569473711e-06, "loss": 0.0136, "num_input_tokens_seen": 6021888, "step": 66870 }, { "epoch": 17.379158004158004, "grad_norm": 0.22079598903656006, "learning_rate": 2.570663853454869e-06, "loss": 0.3472, "num_input_tokens_seen": 6022352, "step": 66875 }, { "epoch": 17.38045738045738, "grad_norm": 0.019570400938391685, "learning_rate": 2.568160290998281e-06, "loss": 0.0006, "num_input_tokens_seen": 6022752, "step": 66880 }, { "epoch": 17.381756756756758, "grad_norm": 10.470484733581543, "learning_rate": 2.5656578822327e-06, "loss": 0.0399, "num_input_tokens_seen": 6023280, "step": 66885 }, { "epoch": 17.383056133056133, "grad_norm": 1.918688416481018, "learning_rate": 2.5631566272868335e-06, "loss": 0.017, "num_input_tokens_seen": 6023712, "step": 66890 }, { "epoch": 17.384355509355508, "grad_norm": 0.004979857709258795, "learning_rate": 2.560656526289315e-06, "loss": 0.0016, "num_input_tokens_seen": 6024176, "step": 66895 }, { "epoch": 17.385654885654887, "grad_norm": 0.09415611624717712, "learning_rate": 2.5581575793687307e-06, "loss": 0.0035, "num_input_tokens_seen": 6024672, "step": 66900 }, { "epoch": 17.386954261954262, "grad_norm": 32.87767791748047, "learning_rate": 2.555659786653611e-06, "loss": 0.1786, "num_input_tokens_seen": 6025104, "step": 66905 }, { "epoch": 17.388253638253637, "grad_norm": 0.21431781351566315, "learning_rate": 2.5531631482724085e-06, "loss": 0.0397, "num_input_tokens_seen": 6025568, "step": 66910 }, { "epoch": 17.389553014553016, "grad_norm": 5.7036285400390625, "learning_rate": 2.550667664353534e-06, "loss": 0.0422, "num_input_tokens_seen": 6026016, "step": 66915 }, { "epoch": 17.39085239085239, "grad_norm": 0.6234648823738098, "learning_rate": 2.5481733350253305e-06, "loss": 0.0958, "num_input_tokens_seen": 6026448, "step": 66920 }, { "epoch": 17.392151767151766, "grad_norm": 33.78053283691406, "learning_rate": 2.545680160416089e-06, "loss": 0.0992, "num_input_tokens_seen": 6026880, "step": 66925 }, { "epoch": 17.393451143451145, "grad_norm": 0.023604581132531166, "learning_rate": 2.5431881406540266e-06, "loss": 0.001, "num_input_tokens_seen": 6027312, "step": 66930 }, { "epoch": 17.39475051975052, "grad_norm": 3.9341418743133545, "learning_rate": 2.540697275867315e-06, "loss": 0.0064, "num_input_tokens_seen": 6027792, "step": 66935 }, { "epoch": 17.396049896049895, "grad_norm": 0.4600234627723694, "learning_rate": 2.5382075661840613e-06, "loss": 0.0931, "num_input_tokens_seen": 6028240, "step": 66940 }, { "epoch": 17.397349272349274, "grad_norm": 6.228116035461426, "learning_rate": 2.535719011732321e-06, "loss": 0.0812, "num_input_tokens_seen": 6028688, "step": 66945 }, { "epoch": 17.39864864864865, "grad_norm": 0.0029769984539598227, "learning_rate": 2.533231612640069e-06, "loss": 0.0354, "num_input_tokens_seen": 6029168, "step": 66950 }, { "epoch": 17.399948024948024, "grad_norm": 0.1634892374277115, "learning_rate": 2.530745369035248e-06, "loss": 0.0488, "num_input_tokens_seen": 6029616, "step": 66955 }, { "epoch": 17.401247401247403, "grad_norm": 9.12902545928955, "learning_rate": 2.5282602810457163e-06, "loss": 0.0345, "num_input_tokens_seen": 6030080, "step": 66960 }, { "epoch": 17.402546777546778, "grad_norm": 1.1678493022918701, "learning_rate": 2.525776348799297e-06, "loss": 0.2554, "num_input_tokens_seen": 6030560, "step": 66965 }, { "epoch": 17.403846153846153, "grad_norm": 0.04669348895549774, "learning_rate": 2.5232935724237266e-06, "loss": 0.0091, "num_input_tokens_seen": 6030992, "step": 66970 }, { "epoch": 17.40514553014553, "grad_norm": 1.0559395551681519, "learning_rate": 2.520811952046703e-06, "loss": 0.1718, "num_input_tokens_seen": 6031472, "step": 66975 }, { "epoch": 17.406444906444907, "grad_norm": 0.00512583227828145, "learning_rate": 2.5183314877958663e-06, "loss": 0.019, "num_input_tokens_seen": 6031920, "step": 66980 }, { "epoch": 17.407744282744282, "grad_norm": 112.86791229248047, "learning_rate": 2.515852179798775e-06, "loss": 0.3731, "num_input_tokens_seen": 6032384, "step": 66985 }, { "epoch": 17.409043659043657, "grad_norm": 0.8506327867507935, "learning_rate": 2.513374028182955e-06, "loss": 0.0069, "num_input_tokens_seen": 6032816, "step": 66990 }, { "epoch": 17.410343035343036, "grad_norm": 1.7054803371429443, "learning_rate": 2.5108970330758515e-06, "loss": 0.2468, "num_input_tokens_seen": 6033312, "step": 66995 }, { "epoch": 17.41164241164241, "grad_norm": 57.83625030517578, "learning_rate": 2.5084211946048654e-06, "loss": 0.4116, "num_input_tokens_seen": 6033744, "step": 67000 }, { "epoch": 17.412941787941786, "grad_norm": 28.794767379760742, "learning_rate": 2.5059465128973198e-06, "loss": 0.1083, "num_input_tokens_seen": 6034176, "step": 67005 }, { "epoch": 17.414241164241165, "grad_norm": 0.1686638742685318, "learning_rate": 2.503472988080502e-06, "loss": 0.0688, "num_input_tokens_seen": 6034656, "step": 67010 }, { "epoch": 17.41554054054054, "grad_norm": 3.9239726066589355, "learning_rate": 2.501000620281621e-06, "loss": 0.0549, "num_input_tokens_seen": 6035120, "step": 67015 }, { "epoch": 17.416839916839916, "grad_norm": 44.983680725097656, "learning_rate": 2.498529409627842e-06, "loss": 0.4072, "num_input_tokens_seen": 6035584, "step": 67020 }, { "epoch": 17.418139293139294, "grad_norm": 0.011467279866337776, "learning_rate": 2.49605935624625e-06, "loss": 0.0008, "num_input_tokens_seen": 6036080, "step": 67025 }, { "epoch": 17.41943866943867, "grad_norm": 3.608194351196289, "learning_rate": 2.493590460263892e-06, "loss": 0.3493, "num_input_tokens_seen": 6036480, "step": 67030 }, { "epoch": 17.420738045738045, "grad_norm": 0.018180115148425102, "learning_rate": 2.49112272180774e-06, "loss": 0.0006, "num_input_tokens_seen": 6036944, "step": 67035 }, { "epoch": 17.422037422037423, "grad_norm": 75.36062622070312, "learning_rate": 2.4886561410047053e-06, "loss": 0.1565, "num_input_tokens_seen": 6037392, "step": 67040 }, { "epoch": 17.4233367983368, "grad_norm": 0.08757665008306503, "learning_rate": 2.486190717981665e-06, "loss": 0.081, "num_input_tokens_seen": 6037840, "step": 67045 }, { "epoch": 17.424636174636174, "grad_norm": 0.10723333805799484, "learning_rate": 2.4837264528654007e-06, "loss": 0.007, "num_input_tokens_seen": 6038256, "step": 67050 }, { "epoch": 17.425935550935552, "grad_norm": 68.44014739990234, "learning_rate": 2.4812633457826667e-06, "loss": 0.1238, "num_input_tokens_seen": 6038688, "step": 67055 }, { "epoch": 17.427234927234927, "grad_norm": 42.399593353271484, "learning_rate": 2.478801396860128e-06, "loss": 0.6794, "num_input_tokens_seen": 6039136, "step": 67060 }, { "epoch": 17.428534303534303, "grad_norm": 1.0382132530212402, "learning_rate": 2.476340606224417e-06, "loss": 0.0045, "num_input_tokens_seen": 6039632, "step": 67065 }, { "epoch": 17.42983367983368, "grad_norm": 0.10255592316389084, "learning_rate": 2.473880974002088e-06, "loss": 0.0092, "num_input_tokens_seen": 6040080, "step": 67070 }, { "epoch": 17.431133056133056, "grad_norm": 0.13932569324970245, "learning_rate": 2.4714225003196423e-06, "loss": 0.0574, "num_input_tokens_seen": 6040496, "step": 67075 }, { "epoch": 17.43243243243243, "grad_norm": 0.0038871464785188437, "learning_rate": 2.4689651853035205e-06, "loss": 0.034, "num_input_tokens_seen": 6040944, "step": 67080 }, { "epoch": 17.43373180873181, "grad_norm": 0.003201870247721672, "learning_rate": 2.4665090290801162e-06, "loss": 0.0023, "num_input_tokens_seen": 6041376, "step": 67085 }, { "epoch": 17.435031185031185, "grad_norm": 0.08913856744766235, "learning_rate": 2.4640540317757365e-06, "loss": 0.0002, "num_input_tokens_seen": 6041808, "step": 67090 }, { "epoch": 17.43633056133056, "grad_norm": 0.004935866687446833, "learning_rate": 2.4616001935166554e-06, "loss": 0.0086, "num_input_tokens_seen": 6042272, "step": 67095 }, { "epoch": 17.43762993762994, "grad_norm": 1.2095668315887451, "learning_rate": 2.4591475144290722e-06, "loss": 0.0012, "num_input_tokens_seen": 6042704, "step": 67100 }, { "epoch": 17.438929313929314, "grad_norm": 0.49114710092544556, "learning_rate": 2.4566959946391243e-06, "loss": 0.0031, "num_input_tokens_seen": 6043184, "step": 67105 }, { "epoch": 17.44022869022869, "grad_norm": 0.0018931971862912178, "learning_rate": 2.4542456342729033e-06, "loss": 0.1695, "num_input_tokens_seen": 6043632, "step": 67110 }, { "epoch": 17.441528066528065, "grad_norm": 6.65235710144043, "learning_rate": 2.4517964334564297e-06, "loss": 0.0105, "num_input_tokens_seen": 6044112, "step": 67115 }, { "epoch": 17.442827442827443, "grad_norm": 56.678314208984375, "learning_rate": 2.449348392315676e-06, "loss": 0.2709, "num_input_tokens_seen": 6044544, "step": 67120 }, { "epoch": 17.44412681912682, "grad_norm": 1.9005409479141235, "learning_rate": 2.4469015109765345e-06, "loss": 0.0218, "num_input_tokens_seen": 6045008, "step": 67125 }, { "epoch": 17.445426195426194, "grad_norm": 53.59406280517578, "learning_rate": 2.4444557895648643e-06, "loss": 0.202, "num_input_tokens_seen": 6045440, "step": 67130 }, { "epoch": 17.446725571725572, "grad_norm": 0.04321856424212456, "learning_rate": 2.4420112282064394e-06, "loss": 0.1413, "num_input_tokens_seen": 6045856, "step": 67135 }, { "epoch": 17.448024948024948, "grad_norm": 37.076171875, "learning_rate": 2.439567827026995e-06, "loss": 0.0417, "num_input_tokens_seen": 6046320, "step": 67140 }, { "epoch": 17.449324324324323, "grad_norm": 0.1986786276102066, "learning_rate": 2.4371255861521917e-06, "loss": 0.0142, "num_input_tokens_seen": 6046784, "step": 67145 }, { "epoch": 17.4506237006237, "grad_norm": 0.013769597746431828, "learning_rate": 2.4346845057076352e-06, "loss": 0.0396, "num_input_tokens_seen": 6047200, "step": 67150 }, { "epoch": 17.451923076923077, "grad_norm": 0.6747086048126221, "learning_rate": 2.4322445858188768e-06, "loss": 0.2186, "num_input_tokens_seen": 6047648, "step": 67155 }, { "epoch": 17.453222453222452, "grad_norm": 0.12389732152223587, "learning_rate": 2.429805826611409e-06, "loss": 0.0157, "num_input_tokens_seen": 6048080, "step": 67160 }, { "epoch": 17.45452182952183, "grad_norm": 0.0027997044380754232, "learning_rate": 2.4273682282106467e-06, "loss": 0.0157, "num_input_tokens_seen": 6048528, "step": 67165 }, { "epoch": 17.455821205821206, "grad_norm": 0.00482620345428586, "learning_rate": 2.424931790741969e-06, "loss": 0.0128, "num_input_tokens_seen": 6048976, "step": 67170 }, { "epoch": 17.45712058212058, "grad_norm": 0.0021419005934149027, "learning_rate": 2.42249651433068e-06, "loss": 0.2894, "num_input_tokens_seen": 6049424, "step": 67175 }, { "epoch": 17.45841995841996, "grad_norm": 0.11878031492233276, "learning_rate": 2.4200623991020243e-06, "loss": 0.1292, "num_input_tokens_seen": 6049904, "step": 67180 }, { "epoch": 17.459719334719335, "grad_norm": 3.443225145339966, "learning_rate": 2.417629445181194e-06, "loss": 0.0285, "num_input_tokens_seen": 6050352, "step": 67185 }, { "epoch": 17.46101871101871, "grad_norm": 21.14944839477539, "learning_rate": 2.4151976526933157e-06, "loss": 0.055, "num_input_tokens_seen": 6050784, "step": 67190 }, { "epoch": 17.46231808731809, "grad_norm": 45.77634811401367, "learning_rate": 2.4127670217634707e-06, "loss": 0.122, "num_input_tokens_seen": 6051232, "step": 67195 }, { "epoch": 17.463617463617464, "grad_norm": 2.219864845275879, "learning_rate": 2.410337552516653e-06, "loss": 0.2377, "num_input_tokens_seen": 6051664, "step": 67200 }, { "epoch": 17.46491683991684, "grad_norm": 12.899465560913086, "learning_rate": 2.4079092450778244e-06, "loss": 0.0126, "num_input_tokens_seen": 6052128, "step": 67205 }, { "epoch": 17.466216216216218, "grad_norm": 0.04228658229112625, "learning_rate": 2.405482099571865e-06, "loss": 0.0659, "num_input_tokens_seen": 6052560, "step": 67210 }, { "epoch": 17.467515592515593, "grad_norm": 64.41431427001953, "learning_rate": 2.4030561161236172e-06, "loss": 0.1794, "num_input_tokens_seen": 6053040, "step": 67215 }, { "epoch": 17.468814968814968, "grad_norm": 0.015350512228906155, "learning_rate": 2.4006312948578387e-06, "loss": 0.0014, "num_input_tokens_seen": 6053456, "step": 67220 }, { "epoch": 17.470114345114347, "grad_norm": 45.08639144897461, "learning_rate": 2.398207635899244e-06, "loss": 0.2788, "num_input_tokens_seen": 6053904, "step": 67225 }, { "epoch": 17.47141372141372, "grad_norm": 0.08320393413305283, "learning_rate": 2.395785139372489e-06, "loss": 0.0126, "num_input_tokens_seen": 6054368, "step": 67230 }, { "epoch": 17.472713097713097, "grad_norm": 0.027254300191998482, "learning_rate": 2.3933638054021654e-06, "loss": 0.1461, "num_input_tokens_seen": 6054848, "step": 67235 }, { "epoch": 17.474012474012476, "grad_norm": 0.03319592401385307, "learning_rate": 2.390943634112805e-06, "loss": 0.0007, "num_input_tokens_seen": 6055296, "step": 67240 }, { "epoch": 17.47531185031185, "grad_norm": 0.012119166553020477, "learning_rate": 2.3885246256288685e-06, "loss": 0.0002, "num_input_tokens_seen": 6055744, "step": 67245 }, { "epoch": 17.476611226611226, "grad_norm": 21.012170791625977, "learning_rate": 2.3861067800747845e-06, "loss": 0.1904, "num_input_tokens_seen": 6056160, "step": 67250 }, { "epoch": 17.477910602910605, "grad_norm": 27.899578094482422, "learning_rate": 2.3836900975748894e-06, "loss": 0.0382, "num_input_tokens_seen": 6056624, "step": 67255 }, { "epoch": 17.47920997920998, "grad_norm": 0.06007979065179825, "learning_rate": 2.3812745782534813e-06, "loss": 0.0006, "num_input_tokens_seen": 6057056, "step": 67260 }, { "epoch": 17.480509355509355, "grad_norm": 0.10380040109157562, "learning_rate": 2.378860222234794e-06, "loss": 0.2226, "num_input_tokens_seen": 6057552, "step": 67265 }, { "epoch": 17.48180873180873, "grad_norm": 0.6955769062042236, "learning_rate": 2.3764470296430056e-06, "loss": 0.3281, "num_input_tokens_seen": 6058016, "step": 67270 }, { "epoch": 17.48310810810811, "grad_norm": 0.017749320715665817, "learning_rate": 2.3740350006022175e-06, "loss": 0.1786, "num_input_tokens_seen": 6058432, "step": 67275 }, { "epoch": 17.484407484407484, "grad_norm": 0.029587246477603912, "learning_rate": 2.371624135236497e-06, "loss": 0.0001, "num_input_tokens_seen": 6058848, "step": 67280 }, { "epoch": 17.48570686070686, "grad_norm": 19.2428035736084, "learning_rate": 2.3692144336698195e-06, "loss": 0.0587, "num_input_tokens_seen": 6059296, "step": 67285 }, { "epoch": 17.487006237006238, "grad_norm": 1.0392504930496216, "learning_rate": 2.3668058960261307e-06, "loss": 0.0299, "num_input_tokens_seen": 6059744, "step": 67290 }, { "epoch": 17.488305613305613, "grad_norm": 6.132096290588379, "learning_rate": 2.364398522429298e-06, "loss": 0.0041, "num_input_tokens_seen": 6060208, "step": 67295 }, { "epoch": 17.489604989604988, "grad_norm": 0.020765943452715874, "learning_rate": 2.3619923130031446e-06, "loss": 0.2851, "num_input_tokens_seen": 6060640, "step": 67300 }, { "epoch": 17.490904365904367, "grad_norm": 0.4713906943798065, "learning_rate": 2.359587267871416e-06, "loss": 0.0038, "num_input_tokens_seen": 6061056, "step": 67305 }, { "epoch": 17.492203742203742, "grad_norm": 1.2156577110290527, "learning_rate": 2.357183387157802e-06, "loss": 0.0762, "num_input_tokens_seen": 6061520, "step": 67310 }, { "epoch": 17.493503118503117, "grad_norm": 54.38288497924805, "learning_rate": 2.3547806709859483e-06, "loss": 0.59, "num_input_tokens_seen": 6061968, "step": 67315 }, { "epoch": 17.494802494802496, "grad_norm": 0.0013599744997918606, "learning_rate": 2.352379119479417e-06, "loss": 0.0037, "num_input_tokens_seen": 6062400, "step": 67320 }, { "epoch": 17.49610187110187, "grad_norm": 0.0020491823088377714, "learning_rate": 2.349978732761726e-06, "loss": 0.1064, "num_input_tokens_seen": 6062832, "step": 67325 }, { "epoch": 17.497401247401246, "grad_norm": 5.917274475097656, "learning_rate": 2.3475795109563326e-06, "loss": 0.7247, "num_input_tokens_seen": 6063248, "step": 67330 }, { "epoch": 17.498700623700625, "grad_norm": 0.05012692138552666, "learning_rate": 2.3451814541866317e-06, "loss": 0.0945, "num_input_tokens_seen": 6063760, "step": 67335 }, { "epoch": 17.5, "grad_norm": 45.70290756225586, "learning_rate": 2.342784562575953e-06, "loss": 0.0575, "num_input_tokens_seen": 6064240, "step": 67340 }, { "epoch": 17.501299376299375, "grad_norm": 0.4827078878879547, "learning_rate": 2.3403888362475782e-06, "loss": 0.0032, "num_input_tokens_seen": 6064704, "step": 67345 }, { "epoch": 17.502598752598754, "grad_norm": 4.0108642578125, "learning_rate": 2.3379942753247118e-06, "loss": 0.0175, "num_input_tokens_seen": 6065168, "step": 67350 }, { "epoch": 17.50389812889813, "grad_norm": 0.04718944430351257, "learning_rate": 2.335600879930516e-06, "loss": 0.1806, "num_input_tokens_seen": 6065632, "step": 67355 }, { "epoch": 17.505197505197504, "grad_norm": 0.11584363877773285, "learning_rate": 2.3332086501880818e-06, "loss": 0.2095, "num_input_tokens_seen": 6066080, "step": 67360 }, { "epoch": 17.506496881496883, "grad_norm": 0.024698087945580482, "learning_rate": 2.3308175862204435e-06, "loss": 0.0097, "num_input_tokens_seen": 6066528, "step": 67365 }, { "epoch": 17.507796257796258, "grad_norm": 1.3653004169464111, "learning_rate": 2.3284276881505805e-06, "loss": 0.183, "num_input_tokens_seen": 6066960, "step": 67370 }, { "epoch": 17.509095634095633, "grad_norm": 0.00227985717356205, "learning_rate": 2.3260389561014035e-06, "loss": 0.0106, "num_input_tokens_seen": 6067408, "step": 67375 }, { "epoch": 17.510395010395012, "grad_norm": 0.16928578913211823, "learning_rate": 2.323651390195769e-06, "loss": 0.0646, "num_input_tokens_seen": 6067856, "step": 67380 }, { "epoch": 17.511694386694387, "grad_norm": 0.16096030175685883, "learning_rate": 2.321264990556468e-06, "loss": 0.0103, "num_input_tokens_seen": 6068304, "step": 67385 }, { "epoch": 17.512993762993762, "grad_norm": 47.348487854003906, "learning_rate": 2.3188797573062415e-06, "loss": 0.1071, "num_input_tokens_seen": 6068768, "step": 67390 }, { "epoch": 17.51429313929314, "grad_norm": 0.01671757362782955, "learning_rate": 2.3164956905677577e-06, "loss": 0.0072, "num_input_tokens_seen": 6069264, "step": 67395 }, { "epoch": 17.515592515592516, "grad_norm": 0.34551143646240234, "learning_rate": 2.314112790463635e-06, "loss": 0.3551, "num_input_tokens_seen": 6069744, "step": 67400 }, { "epoch": 17.51689189189189, "grad_norm": 0.001103950315155089, "learning_rate": 2.3117310571164286e-06, "loss": 0.3339, "num_input_tokens_seen": 6070192, "step": 67405 }, { "epoch": 17.518191268191266, "grad_norm": 0.029772235080599785, "learning_rate": 2.3093504906486346e-06, "loss": 0.0087, "num_input_tokens_seen": 6070656, "step": 67410 }, { "epoch": 17.519490644490645, "grad_norm": 0.09974957257509232, "learning_rate": 2.306971091182686e-06, "loss": 0.5009, "num_input_tokens_seen": 6071104, "step": 67415 }, { "epoch": 17.52079002079002, "grad_norm": 38.129981994628906, "learning_rate": 2.3045928588409594e-06, "loss": 0.2149, "num_input_tokens_seen": 6071600, "step": 67420 }, { "epoch": 17.522089397089395, "grad_norm": 0.40337634086608887, "learning_rate": 2.3022157937457627e-06, "loss": 0.01, "num_input_tokens_seen": 6072032, "step": 67425 }, { "epoch": 17.523388773388774, "grad_norm": 77.2115249633789, "learning_rate": 2.2998398960193597e-06, "loss": 0.4353, "num_input_tokens_seen": 6072448, "step": 67430 }, { "epoch": 17.52468814968815, "grad_norm": 10.292171478271484, "learning_rate": 2.297465165783938e-06, "loss": 0.3605, "num_input_tokens_seen": 6072928, "step": 67435 }, { "epoch": 17.525987525987524, "grad_norm": 0.4251589775085449, "learning_rate": 2.2950916031616338e-06, "loss": 0.0009, "num_input_tokens_seen": 6073392, "step": 67440 }, { "epoch": 17.527286902286903, "grad_norm": 0.17793741822242737, "learning_rate": 2.2927192082745296e-06, "loss": 0.0079, "num_input_tokens_seen": 6073872, "step": 67445 }, { "epoch": 17.52858627858628, "grad_norm": 0.002863219939172268, "learning_rate": 2.2903479812446284e-06, "loss": 0.0715, "num_input_tokens_seen": 6074320, "step": 67450 }, { "epoch": 17.529885654885653, "grad_norm": 0.029241308569908142, "learning_rate": 2.2879779221938957e-06, "loss": 0.0002, "num_input_tokens_seen": 6074752, "step": 67455 }, { "epoch": 17.531185031185032, "grad_norm": 85.05103302001953, "learning_rate": 2.2856090312442123e-06, "loss": 0.4782, "num_input_tokens_seen": 6075232, "step": 67460 }, { "epoch": 17.532484407484407, "grad_norm": 0.41367122530937195, "learning_rate": 2.2832413085174282e-06, "loss": 0.0313, "num_input_tokens_seen": 6075696, "step": 67465 }, { "epoch": 17.533783783783782, "grad_norm": 0.014465429820120335, "learning_rate": 2.2808747541353035e-06, "loss": 0.207, "num_input_tokens_seen": 6076144, "step": 67470 }, { "epoch": 17.53508316008316, "grad_norm": 3.5121097564697266, "learning_rate": 2.2785093682195585e-06, "loss": 0.0105, "num_input_tokens_seen": 6076544, "step": 67475 }, { "epoch": 17.536382536382536, "grad_norm": 0.18221880495548248, "learning_rate": 2.276145150891848e-06, "loss": 0.0245, "num_input_tokens_seen": 6076992, "step": 67480 }, { "epoch": 17.53768191268191, "grad_norm": 2.2258644104003906, "learning_rate": 2.2737821022737693e-06, "loss": 0.177, "num_input_tokens_seen": 6077424, "step": 67485 }, { "epoch": 17.53898128898129, "grad_norm": 0.010349515825510025, "learning_rate": 2.2714202224868477e-06, "loss": 0.0301, "num_input_tokens_seen": 6077872, "step": 67490 }, { "epoch": 17.540280665280665, "grad_norm": 42.753578186035156, "learning_rate": 2.2690595116525688e-06, "loss": 0.1027, "num_input_tokens_seen": 6078304, "step": 67495 }, { "epoch": 17.54158004158004, "grad_norm": 0.3432725667953491, "learning_rate": 2.2666999698923357e-06, "loss": 0.1361, "num_input_tokens_seen": 6078736, "step": 67500 }, { "epoch": 17.54287941787942, "grad_norm": 29.0028018951416, "learning_rate": 2.2643415973275016e-06, "loss": 0.0801, "num_input_tokens_seen": 6079184, "step": 67505 }, { "epoch": 17.544178794178794, "grad_norm": 0.25472891330718994, "learning_rate": 2.261984394079364e-06, "loss": 0.002, "num_input_tokens_seen": 6079600, "step": 67510 }, { "epoch": 17.54547817047817, "grad_norm": 0.04929046332836151, "learning_rate": 2.259628360269153e-06, "loss": 0.4299, "num_input_tokens_seen": 6080048, "step": 67515 }, { "epoch": 17.546777546777548, "grad_norm": 0.1958322376012802, "learning_rate": 2.2572734960180502e-06, "loss": 0.0726, "num_input_tokens_seen": 6080464, "step": 67520 }, { "epoch": 17.548076923076923, "grad_norm": 0.18144725263118744, "learning_rate": 2.2549198014471584e-06, "loss": 0.0224, "num_input_tokens_seen": 6080896, "step": 67525 }, { "epoch": 17.5493762993763, "grad_norm": 0.4354482591152191, "learning_rate": 2.2525672766775392e-06, "loss": 0.0033, "num_input_tokens_seen": 6081344, "step": 67530 }, { "epoch": 17.550675675675677, "grad_norm": 2.1047778129577637, "learning_rate": 2.250215921830176e-06, "loss": 0.0118, "num_input_tokens_seen": 6081792, "step": 67535 }, { "epoch": 17.551975051975052, "grad_norm": 0.04640989378094673, "learning_rate": 2.247865737026006e-06, "loss": 0.004, "num_input_tokens_seen": 6082240, "step": 67540 }, { "epoch": 17.553274428274428, "grad_norm": 0.8954058289527893, "learning_rate": 2.2455167223859012e-06, "loss": 0.0009, "num_input_tokens_seen": 6082720, "step": 67545 }, { "epoch": 17.554573804573806, "grad_norm": 6.926110744476318, "learning_rate": 2.243168878030677e-06, "loss": 0.2265, "num_input_tokens_seen": 6083136, "step": 67550 }, { "epoch": 17.55587318087318, "grad_norm": 3.2028725147247314, "learning_rate": 2.240822204081078e-06, "loss": 0.0166, "num_input_tokens_seen": 6083568, "step": 67555 }, { "epoch": 17.557172557172557, "grad_norm": 0.0638825073838234, "learning_rate": 2.2384767006578045e-06, "loss": 0.3695, "num_input_tokens_seen": 6084048, "step": 67560 }, { "epoch": 17.558471933471935, "grad_norm": 0.5876691341400146, "learning_rate": 2.236132367881483e-06, "loss": 0.1795, "num_input_tokens_seen": 6084512, "step": 67565 }, { "epoch": 17.55977130977131, "grad_norm": 0.5132446885108948, "learning_rate": 2.2337892058726826e-06, "loss": 0.0102, "num_input_tokens_seen": 6084992, "step": 67570 }, { "epoch": 17.561070686070686, "grad_norm": 0.29758161306381226, "learning_rate": 2.231447214751914e-06, "loss": 0.4288, "num_input_tokens_seen": 6085440, "step": 67575 }, { "epoch": 17.56237006237006, "grad_norm": 0.3299753963947296, "learning_rate": 2.2291063946396324e-06, "loss": 0.0166, "num_input_tokens_seen": 6085872, "step": 67580 }, { "epoch": 17.56366943866944, "grad_norm": 1.6648449897766113, "learning_rate": 2.2267667456562307e-06, "loss": 0.0045, "num_input_tokens_seen": 6086336, "step": 67585 }, { "epoch": 17.564968814968815, "grad_norm": 1.5889893770217896, "learning_rate": 2.2244282679220326e-06, "loss": 0.0691, "num_input_tokens_seen": 6086768, "step": 67590 }, { "epoch": 17.56626819126819, "grad_norm": 0.1303994208574295, "learning_rate": 2.2220909615573134e-06, "loss": 0.1136, "num_input_tokens_seen": 6087232, "step": 67595 }, { "epoch": 17.56756756756757, "grad_norm": 27.57500648498535, "learning_rate": 2.219754826682277e-06, "loss": 0.0256, "num_input_tokens_seen": 6087680, "step": 67600 }, { "epoch": 17.568866943866944, "grad_norm": 0.09074666351079941, "learning_rate": 2.2174198634170836e-06, "loss": 0.0009, "num_input_tokens_seen": 6088096, "step": 67605 }, { "epoch": 17.57016632016632, "grad_norm": 0.8496191501617432, "learning_rate": 2.2150860718818085e-06, "loss": 0.3331, "num_input_tokens_seen": 6088544, "step": 67610 }, { "epoch": 17.571465696465697, "grad_norm": 0.014549000188708305, "learning_rate": 2.212753452196489e-06, "loss": 0.4045, "num_input_tokens_seen": 6088992, "step": 67615 }, { "epoch": 17.572765072765073, "grad_norm": 0.06307241320610046, "learning_rate": 2.210422004481094e-06, "loss": 0.2728, "num_input_tokens_seen": 6089456, "step": 67620 }, { "epoch": 17.574064449064448, "grad_norm": 0.1267291009426117, "learning_rate": 2.208091728855535e-06, "loss": 0.068, "num_input_tokens_seen": 6089904, "step": 67625 }, { "epoch": 17.575363825363826, "grad_norm": 11.273030281066895, "learning_rate": 2.205762625439656e-06, "loss": 0.2122, "num_input_tokens_seen": 6090320, "step": 67630 }, { "epoch": 17.5766632016632, "grad_norm": 0.08942218124866486, "learning_rate": 2.203434694353243e-06, "loss": 0.0348, "num_input_tokens_seen": 6090768, "step": 67635 }, { "epoch": 17.577962577962577, "grad_norm": 24.7279109954834, "learning_rate": 2.201107935716029e-06, "loss": 0.0801, "num_input_tokens_seen": 6091248, "step": 67640 }, { "epoch": 17.579261954261955, "grad_norm": 0.06327956169843674, "learning_rate": 2.198782349647674e-06, "loss": 0.0484, "num_input_tokens_seen": 6091664, "step": 67645 }, { "epoch": 17.58056133056133, "grad_norm": 0.011988270096480846, "learning_rate": 2.1964579362677903e-06, "loss": 0.0044, "num_input_tokens_seen": 6092128, "step": 67650 }, { "epoch": 17.581860706860706, "grad_norm": 30.677101135253906, "learning_rate": 2.194134695695926e-06, "loss": 0.2538, "num_input_tokens_seen": 6092576, "step": 67655 }, { "epoch": 17.583160083160084, "grad_norm": 0.0017726466758176684, "learning_rate": 2.1918126280515678e-06, "loss": 0.0491, "num_input_tokens_seen": 6093040, "step": 67660 }, { "epoch": 17.58445945945946, "grad_norm": 0.03288314864039421, "learning_rate": 2.1894917334541354e-06, "loss": 0.1935, "num_input_tokens_seen": 6093472, "step": 67665 }, { "epoch": 17.585758835758835, "grad_norm": 20.78582763671875, "learning_rate": 2.187172012023006e-06, "loss": 0.1366, "num_input_tokens_seen": 6093904, "step": 67670 }, { "epoch": 17.587058212058214, "grad_norm": 2.3255348205566406, "learning_rate": 2.184853463877473e-06, "loss": 0.2611, "num_input_tokens_seen": 6094320, "step": 67675 }, { "epoch": 17.58835758835759, "grad_norm": 69.33177947998047, "learning_rate": 2.1825360891367884e-06, "loss": 0.4198, "num_input_tokens_seen": 6094752, "step": 67680 }, { "epoch": 17.589656964656964, "grad_norm": 2.626281261444092, "learning_rate": 2.1802198879201346e-06, "loss": 0.004, "num_input_tokens_seen": 6095200, "step": 67685 }, { "epoch": 17.590956340956343, "grad_norm": 0.7123983502388, "learning_rate": 2.1779048603466357e-06, "loss": 0.0202, "num_input_tokens_seen": 6095632, "step": 67690 }, { "epoch": 17.592255717255718, "grad_norm": 0.02770603820681572, "learning_rate": 2.17559100653536e-06, "loss": 0.3011, "num_input_tokens_seen": 6096064, "step": 67695 }, { "epoch": 17.593555093555093, "grad_norm": 0.13273915648460388, "learning_rate": 2.1732783266053045e-06, "loss": 0.2668, "num_input_tokens_seen": 6096512, "step": 67700 }, { "epoch": 17.59485446985447, "grad_norm": 2.046198844909668, "learning_rate": 2.1709668206754215e-06, "loss": 0.1882, "num_input_tokens_seen": 6096960, "step": 67705 }, { "epoch": 17.596153846153847, "grad_norm": 0.0017379082273691893, "learning_rate": 2.1686564888645816e-06, "loss": 0.0421, "num_input_tokens_seen": 6097408, "step": 67710 }, { "epoch": 17.597453222453222, "grad_norm": 50.77317428588867, "learning_rate": 2.166347331291621e-06, "loss": 0.4744, "num_input_tokens_seen": 6097872, "step": 67715 }, { "epoch": 17.598752598752597, "grad_norm": 0.03652529418468475, "learning_rate": 2.164039348075292e-06, "loss": 0.0008, "num_input_tokens_seen": 6098336, "step": 67720 }, { "epoch": 17.600051975051976, "grad_norm": 0.3193370997905731, "learning_rate": 2.161732539334299e-06, "loss": 0.0435, "num_input_tokens_seen": 6098816, "step": 67725 }, { "epoch": 17.60135135135135, "grad_norm": 0.010685883462429047, "learning_rate": 2.1594269051872812e-06, "loss": 0.0002, "num_input_tokens_seen": 6099248, "step": 67730 }, { "epoch": 17.602650727650726, "grad_norm": 0.003790011629462242, "learning_rate": 2.1571224457528294e-06, "loss": 0.004, "num_input_tokens_seen": 6099664, "step": 67735 }, { "epoch": 17.603950103950105, "grad_norm": 0.03610963001847267, "learning_rate": 2.154819161149452e-06, "loss": 0.0068, "num_input_tokens_seen": 6100096, "step": 67740 }, { "epoch": 17.60524948024948, "grad_norm": 0.1049560159444809, "learning_rate": 2.15251705149562e-06, "loss": 0.0043, "num_input_tokens_seen": 6100512, "step": 67745 }, { "epoch": 17.606548856548855, "grad_norm": 0.03099440038204193, "learning_rate": 2.150216116909723e-06, "loss": 0.1468, "num_input_tokens_seen": 6100976, "step": 67750 }, { "epoch": 17.607848232848234, "grad_norm": 1.4138445854187012, "learning_rate": 2.1479163575101075e-06, "loss": 0.0893, "num_input_tokens_seen": 6101408, "step": 67755 }, { "epoch": 17.60914760914761, "grad_norm": 1.4962334632873535, "learning_rate": 2.1456177734150455e-06, "loss": 0.0021, "num_input_tokens_seen": 6101888, "step": 67760 }, { "epoch": 17.610446985446984, "grad_norm": 0.013078033924102783, "learning_rate": 2.14332036474276e-06, "loss": 0.2911, "num_input_tokens_seen": 6102368, "step": 67765 }, { "epoch": 17.611746361746363, "grad_norm": 0.2270326465368271, "learning_rate": 2.1410241316114133e-06, "loss": 0.0031, "num_input_tokens_seen": 6102800, "step": 67770 }, { "epoch": 17.613045738045738, "grad_norm": 0.023120716214179993, "learning_rate": 2.1387290741390923e-06, "loss": 0.0002, "num_input_tokens_seen": 6103264, "step": 67775 }, { "epoch": 17.614345114345113, "grad_norm": 0.018369805067777634, "learning_rate": 2.1364351924438446e-06, "loss": 0.0158, "num_input_tokens_seen": 6103712, "step": 67780 }, { "epoch": 17.615644490644492, "grad_norm": 0.11883246153593063, "learning_rate": 2.1341424866436364e-06, "loss": 0.1905, "num_input_tokens_seen": 6104144, "step": 67785 }, { "epoch": 17.616943866943867, "grad_norm": 0.3687242865562439, "learning_rate": 2.1318509568563904e-06, "loss": 0.0441, "num_input_tokens_seen": 6104608, "step": 67790 }, { "epoch": 17.618243243243242, "grad_norm": 0.006707748398184776, "learning_rate": 2.1295606031999623e-06, "loss": 0.2126, "num_input_tokens_seen": 6105056, "step": 67795 }, { "epoch": 17.61954261954262, "grad_norm": 1.0778601169586182, "learning_rate": 2.127271425792146e-06, "loss": 0.0865, "num_input_tokens_seen": 6105504, "step": 67800 }, { "epoch": 17.620841995841996, "grad_norm": 0.0005755769088864326, "learning_rate": 2.124983424750676e-06, "loss": 0.0007, "num_input_tokens_seen": 6106016, "step": 67805 }, { "epoch": 17.62214137214137, "grad_norm": 68.12334442138672, "learning_rate": 2.1226966001932277e-06, "loss": 0.2495, "num_input_tokens_seen": 6106448, "step": 67810 }, { "epoch": 17.62344074844075, "grad_norm": 7.665431976318359, "learning_rate": 2.120410952237409e-06, "loss": 0.0419, "num_input_tokens_seen": 6106880, "step": 67815 }, { "epoch": 17.624740124740125, "grad_norm": 41.059810638427734, "learning_rate": 2.1181264810007817e-06, "loss": 0.2645, "num_input_tokens_seen": 6107296, "step": 67820 }, { "epoch": 17.6260395010395, "grad_norm": 0.5149469375610352, "learning_rate": 2.11584318660083e-06, "loss": 0.3521, "num_input_tokens_seen": 6107744, "step": 67825 }, { "epoch": 17.62733887733888, "grad_norm": 0.042104437947273254, "learning_rate": 2.11356106915499e-06, "loss": 0.0024, "num_input_tokens_seen": 6108224, "step": 67830 }, { "epoch": 17.628638253638254, "grad_norm": 0.08746044337749481, "learning_rate": 2.1112801287806378e-06, "loss": 0.4303, "num_input_tokens_seen": 6108640, "step": 67835 }, { "epoch": 17.62993762993763, "grad_norm": 86.29537963867188, "learning_rate": 2.1090003655950737e-06, "loss": 0.2358, "num_input_tokens_seen": 6109120, "step": 67840 }, { "epoch": 17.631237006237008, "grad_norm": 25.609121322631836, "learning_rate": 2.106721779715559e-06, "loss": 0.0662, "num_input_tokens_seen": 6109568, "step": 67845 }, { "epoch": 17.632536382536383, "grad_norm": 7.712369441986084, "learning_rate": 2.1044443712592758e-06, "loss": 0.5471, "num_input_tokens_seen": 6110000, "step": 67850 }, { "epoch": 17.633835758835758, "grad_norm": 83.41804504394531, "learning_rate": 2.102168140343358e-06, "loss": 0.1945, "num_input_tokens_seen": 6110464, "step": 67855 }, { "epoch": 17.635135135135137, "grad_norm": 0.07257691025733948, "learning_rate": 2.0998930870848698e-06, "loss": 0.0512, "num_input_tokens_seen": 6110944, "step": 67860 }, { "epoch": 17.636434511434512, "grad_norm": 41.31818771362305, "learning_rate": 2.0976192116008208e-06, "loss": 0.5648, "num_input_tokens_seen": 6111360, "step": 67865 }, { "epoch": 17.637733887733887, "grad_norm": 0.0145048126578331, "learning_rate": 2.095346514008159e-06, "loss": 0.0162, "num_input_tokens_seen": 6111776, "step": 67870 }, { "epoch": 17.639033264033262, "grad_norm": 0.001587852486409247, "learning_rate": 2.093074994423777e-06, "loss": 0.2229, "num_input_tokens_seen": 6112224, "step": 67875 }, { "epoch": 17.64033264033264, "grad_norm": 0.5902262926101685, "learning_rate": 2.0908046529644926e-06, "loss": 0.4533, "num_input_tokens_seen": 6112704, "step": 67880 }, { "epoch": 17.641632016632016, "grad_norm": 0.005167732480913401, "learning_rate": 2.0885354897470787e-06, "loss": 0.0303, "num_input_tokens_seen": 6113216, "step": 67885 }, { "epoch": 17.64293139293139, "grad_norm": 0.08037939667701721, "learning_rate": 2.0862675048882395e-06, "loss": 0.1827, "num_input_tokens_seen": 6113712, "step": 67890 }, { "epoch": 17.64423076923077, "grad_norm": 0.04457029327750206, "learning_rate": 2.0840006985046094e-06, "loss": 0.0644, "num_input_tokens_seen": 6114160, "step": 67895 }, { "epoch": 17.645530145530145, "grad_norm": 36.560760498046875, "learning_rate": 2.081735070712784e-06, "loss": 0.0483, "num_input_tokens_seen": 6114656, "step": 67900 }, { "epoch": 17.64682952182952, "grad_norm": 2.244159698486328, "learning_rate": 2.0794706216292813e-06, "loss": 0.0188, "num_input_tokens_seen": 6115120, "step": 67905 }, { "epoch": 17.6481288981289, "grad_norm": 1.4489812850952148, "learning_rate": 2.077207351370572e-06, "loss": 0.1448, "num_input_tokens_seen": 6115552, "step": 67910 }, { "epoch": 17.649428274428274, "grad_norm": 0.8849015235900879, "learning_rate": 2.074945260053046e-06, "loss": 0.2319, "num_input_tokens_seen": 6116016, "step": 67915 }, { "epoch": 17.65072765072765, "grad_norm": 0.07731364667415619, "learning_rate": 2.072684347793055e-06, "loss": 0.0499, "num_input_tokens_seen": 6116480, "step": 67920 }, { "epoch": 17.652027027027028, "grad_norm": 16.348453521728516, "learning_rate": 2.070424614706873e-06, "loss": 0.0602, "num_input_tokens_seen": 6116944, "step": 67925 }, { "epoch": 17.653326403326403, "grad_norm": 51.86083221435547, "learning_rate": 2.068166060910728e-06, "loss": 0.2635, "num_input_tokens_seen": 6117376, "step": 67930 }, { "epoch": 17.65462577962578, "grad_norm": 32.70808792114258, "learning_rate": 2.0659086865207677e-06, "loss": 0.4934, "num_input_tokens_seen": 6117824, "step": 67935 }, { "epoch": 17.655925155925157, "grad_norm": 0.008036043494939804, "learning_rate": 2.063652491653101e-06, "loss": 0.0361, "num_input_tokens_seen": 6118272, "step": 67940 }, { "epoch": 17.657224532224532, "grad_norm": 63.77527618408203, "learning_rate": 2.0613974764237626e-06, "loss": 0.1817, "num_input_tokens_seen": 6118720, "step": 67945 }, { "epoch": 17.658523908523907, "grad_norm": 22.42171859741211, "learning_rate": 2.0591436409487326e-06, "loss": 0.4993, "num_input_tokens_seen": 6119152, "step": 67950 }, { "epoch": 17.659823284823286, "grad_norm": 41.3237190246582, "learning_rate": 2.056890985343929e-06, "loss": 0.5144, "num_input_tokens_seen": 6119600, "step": 67955 }, { "epoch": 17.66112266112266, "grad_norm": 0.007697305176407099, "learning_rate": 2.054639509725198e-06, "loss": 0.2621, "num_input_tokens_seen": 6120080, "step": 67960 }, { "epoch": 17.662422037422036, "grad_norm": 14.429794311523438, "learning_rate": 2.05238921420835e-06, "loss": 0.1271, "num_input_tokens_seen": 6120512, "step": 67965 }, { "epoch": 17.663721413721415, "grad_norm": 0.23906709253787994, "learning_rate": 2.0501400989091036e-06, "loss": 0.0099, "num_input_tokens_seen": 6120944, "step": 67970 }, { "epoch": 17.66502079002079, "grad_norm": 24.930234909057617, "learning_rate": 2.0478921639431436e-06, "loss": 0.4363, "num_input_tokens_seen": 6121408, "step": 67975 }, { "epoch": 17.666320166320165, "grad_norm": 40.76183319091797, "learning_rate": 2.0456454094260807e-06, "loss": 0.1449, "num_input_tokens_seen": 6121872, "step": 67980 }, { "epoch": 17.667619542619544, "grad_norm": 0.8003342151641846, "learning_rate": 2.043399835473475e-06, "loss": 0.1232, "num_input_tokens_seen": 6122368, "step": 67985 }, { "epoch": 17.66891891891892, "grad_norm": 0.008035753853619099, "learning_rate": 2.041155442200804e-06, "loss": 0.3748, "num_input_tokens_seen": 6122816, "step": 67990 }, { "epoch": 17.670218295218294, "grad_norm": 0.0017723412020131946, "learning_rate": 2.0389122297235134e-06, "loss": 0.0026, "num_input_tokens_seen": 6123216, "step": 67995 }, { "epoch": 17.671517671517673, "grad_norm": 13.392213821411133, "learning_rate": 2.036670198156962e-06, "loss": 0.1324, "num_input_tokens_seen": 6123664, "step": 68000 }, { "epoch": 17.67281704781705, "grad_norm": 0.0014107031747698784, "learning_rate": 2.034429347616468e-06, "loss": 0.0616, "num_input_tokens_seen": 6124112, "step": 68005 }, { "epoch": 17.674116424116423, "grad_norm": 0.025094367563724518, "learning_rate": 2.0321896782172693e-06, "loss": 0.1692, "num_input_tokens_seen": 6124544, "step": 68010 }, { "epoch": 17.6754158004158, "grad_norm": 0.10650341957807541, "learning_rate": 2.029951190074572e-06, "loss": 0.0015, "num_input_tokens_seen": 6124992, "step": 68015 }, { "epoch": 17.676715176715177, "grad_norm": 0.004484143573790789, "learning_rate": 2.027713883303492e-06, "loss": 0.0006, "num_input_tokens_seen": 6125488, "step": 68020 }, { "epoch": 17.678014553014552, "grad_norm": 4.468160152435303, "learning_rate": 2.0254777580190954e-06, "loss": 0.0084, "num_input_tokens_seen": 6125952, "step": 68025 }, { "epoch": 17.679313929313928, "grad_norm": 0.00037768628681078553, "learning_rate": 2.023242814336393e-06, "loss": 0.1514, "num_input_tokens_seen": 6126416, "step": 68030 }, { "epoch": 17.680613305613306, "grad_norm": 76.5673599243164, "learning_rate": 2.0210090523703232e-06, "loss": 0.4193, "num_input_tokens_seen": 6126864, "step": 68035 }, { "epoch": 17.68191268191268, "grad_norm": 5.584236145019531, "learning_rate": 2.0187764722357782e-06, "loss": 0.0336, "num_input_tokens_seen": 6127280, "step": 68040 }, { "epoch": 17.683212058212057, "grad_norm": 0.644973635673523, "learning_rate": 2.0165450740475765e-06, "loss": 0.008, "num_input_tokens_seen": 6127680, "step": 68045 }, { "epoch": 17.684511434511435, "grad_norm": 0.052290741354227066, "learning_rate": 2.014314857920488e-06, "loss": 0.0143, "num_input_tokens_seen": 6128096, "step": 68050 }, { "epoch": 17.68581081081081, "grad_norm": 0.47706982493400574, "learning_rate": 2.0120858239692065e-06, "loss": 0.4539, "num_input_tokens_seen": 6128512, "step": 68055 }, { "epoch": 17.687110187110186, "grad_norm": 0.08690454065799713, "learning_rate": 2.0098579723083823e-06, "loss": 0.025, "num_input_tokens_seen": 6128944, "step": 68060 }, { "epoch": 17.688409563409564, "grad_norm": 9.398651123046875, "learning_rate": 2.0076313030525844e-06, "loss": 0.0148, "num_input_tokens_seen": 6129344, "step": 68065 }, { "epoch": 17.68970893970894, "grad_norm": 0.8999364376068115, "learning_rate": 2.005405816316344e-06, "loss": 0.4371, "num_input_tokens_seen": 6129808, "step": 68070 }, { "epoch": 17.691008316008315, "grad_norm": 0.0014657359570264816, "learning_rate": 2.0031815122141113e-06, "loss": 0.1164, "num_input_tokens_seen": 6130224, "step": 68075 }, { "epoch": 17.692307692307693, "grad_norm": 27.315717697143555, "learning_rate": 2.0009583908602857e-06, "loss": 0.0416, "num_input_tokens_seen": 6130720, "step": 68080 }, { "epoch": 17.69360706860707, "grad_norm": 31.310190200805664, "learning_rate": 1.9987364523692125e-06, "loss": 0.0381, "num_input_tokens_seen": 6131200, "step": 68085 }, { "epoch": 17.694906444906444, "grad_norm": 36.09074401855469, "learning_rate": 1.9965156968551586e-06, "loss": 0.2108, "num_input_tokens_seen": 6131632, "step": 68090 }, { "epoch": 17.696205821205822, "grad_norm": 0.20300975441932678, "learning_rate": 1.994296124432346e-06, "loss": 0.1181, "num_input_tokens_seen": 6132064, "step": 68095 }, { "epoch": 17.697505197505198, "grad_norm": 0.004438143689185381, "learning_rate": 1.9920777352149255e-06, "loss": 0.2646, "num_input_tokens_seen": 6132544, "step": 68100 }, { "epoch": 17.698804573804573, "grad_norm": 19.800992965698242, "learning_rate": 1.989860529316992e-06, "loss": 0.0218, "num_input_tokens_seen": 6132992, "step": 68105 }, { "epoch": 17.70010395010395, "grad_norm": 0.045665036886930466, "learning_rate": 1.987644506852579e-06, "loss": 0.1018, "num_input_tokens_seen": 6133440, "step": 68110 }, { "epoch": 17.701403326403327, "grad_norm": 0.16772885620594025, "learning_rate": 1.9854296679356536e-06, "loss": 0.1086, "num_input_tokens_seen": 6133920, "step": 68115 }, { "epoch": 17.7027027027027, "grad_norm": 0.017424479126930237, "learning_rate": 1.9832160126801365e-06, "loss": 0.0682, "num_input_tokens_seen": 6134384, "step": 68120 }, { "epoch": 17.70400207900208, "grad_norm": 0.2686099410057068, "learning_rate": 1.9810035411998746e-06, "loss": 0.0037, "num_input_tokens_seen": 6134832, "step": 68125 }, { "epoch": 17.705301455301456, "grad_norm": 0.02706989087164402, "learning_rate": 1.9787922536086522e-06, "loss": 0.0026, "num_input_tokens_seen": 6135312, "step": 68130 }, { "epoch": 17.70660083160083, "grad_norm": 7.329216480255127, "learning_rate": 1.976582150020209e-06, "loss": 0.4396, "num_input_tokens_seen": 6135760, "step": 68135 }, { "epoch": 17.70790020790021, "grad_norm": 8.621125221252441, "learning_rate": 1.9743732305481986e-06, "loss": 0.0482, "num_input_tokens_seen": 6136208, "step": 68140 }, { "epoch": 17.709199584199585, "grad_norm": 12.371623039245605, "learning_rate": 1.972165495306241e-06, "loss": 0.0752, "num_input_tokens_seen": 6136640, "step": 68145 }, { "epoch": 17.71049896049896, "grad_norm": 3.070431709289551, "learning_rate": 1.969958944407871e-06, "loss": 0.134, "num_input_tokens_seen": 6137104, "step": 68150 }, { "epoch": 17.71179833679834, "grad_norm": 4.915381908416748, "learning_rate": 1.9677535779665803e-06, "loss": 0.1996, "num_input_tokens_seen": 6137616, "step": 68155 }, { "epoch": 17.713097713097714, "grad_norm": 0.009308183565735817, "learning_rate": 1.965549396095795e-06, "loss": 0.2174, "num_input_tokens_seen": 6138080, "step": 68160 }, { "epoch": 17.71439708939709, "grad_norm": 0.011449206620454788, "learning_rate": 1.963346398908869e-06, "loss": 0.1688, "num_input_tokens_seen": 6138528, "step": 68165 }, { "epoch": 17.715696465696467, "grad_norm": 49.98352813720703, "learning_rate": 1.9611445865191146e-06, "loss": 0.1708, "num_input_tokens_seen": 6138976, "step": 68170 }, { "epoch": 17.716995841995843, "grad_norm": 0.2853148877620697, "learning_rate": 1.9589439590397664e-06, "loss": 0.0944, "num_input_tokens_seen": 6139424, "step": 68175 }, { "epoch": 17.718295218295218, "grad_norm": 0.5389328598976135, "learning_rate": 1.956744516584011e-06, "loss": 0.2387, "num_input_tokens_seen": 6139856, "step": 68180 }, { "epoch": 17.719594594594593, "grad_norm": 0.14192570745944977, "learning_rate": 1.954546259264961e-06, "loss": 0.0695, "num_input_tokens_seen": 6140304, "step": 68185 }, { "epoch": 17.72089397089397, "grad_norm": 0.1577262282371521, "learning_rate": 1.952349187195676e-06, "loss": 0.0833, "num_input_tokens_seen": 6140752, "step": 68190 }, { "epoch": 17.722193347193347, "grad_norm": 0.0033619108144193888, "learning_rate": 1.9501533004891577e-06, "loss": 0.1424, "num_input_tokens_seen": 6141200, "step": 68195 }, { "epoch": 17.723492723492722, "grad_norm": 0.18208560347557068, "learning_rate": 1.9479585992583426e-06, "loss": 0.3993, "num_input_tokens_seen": 6141616, "step": 68200 }, { "epoch": 17.7247920997921, "grad_norm": 0.24572637677192688, "learning_rate": 1.9457650836160997e-06, "loss": 0.2959, "num_input_tokens_seen": 6142016, "step": 68205 }, { "epoch": 17.726091476091476, "grad_norm": 0.5737617015838623, "learning_rate": 1.9435727536752547e-06, "loss": 0.0011, "num_input_tokens_seen": 6142480, "step": 68210 }, { "epoch": 17.72739085239085, "grad_norm": 0.0078502856194973, "learning_rate": 1.941381609548551e-06, "loss": 0.0071, "num_input_tokens_seen": 6142896, "step": 68215 }, { "epoch": 17.72869022869023, "grad_norm": 0.06316927075386047, "learning_rate": 1.939191651348685e-06, "loss": 0.0104, "num_input_tokens_seen": 6143392, "step": 68220 }, { "epoch": 17.729989604989605, "grad_norm": 0.9234034419059753, "learning_rate": 1.937002879188285e-06, "loss": 0.0341, "num_input_tokens_seen": 6143840, "step": 68225 }, { "epoch": 17.73128898128898, "grad_norm": 0.002801591996103525, "learning_rate": 1.934815293179926e-06, "loss": 0.0292, "num_input_tokens_seen": 6144320, "step": 68230 }, { "epoch": 17.73258835758836, "grad_norm": 0.048035938292741776, "learning_rate": 1.93262889343612e-06, "loss": 0.6385, "num_input_tokens_seen": 6144784, "step": 68235 }, { "epoch": 17.733887733887734, "grad_norm": 23.917001724243164, "learning_rate": 1.9304436800693074e-06, "loss": 0.0624, "num_input_tokens_seen": 6145232, "step": 68240 }, { "epoch": 17.73518711018711, "grad_norm": 0.1671549528837204, "learning_rate": 1.9282596531918855e-06, "loss": 0.0006, "num_input_tokens_seen": 6145680, "step": 68245 }, { "epoch": 17.736486486486488, "grad_norm": 1.6553822755813599, "learning_rate": 1.9260768129161723e-06, "loss": 0.1499, "num_input_tokens_seen": 6146128, "step": 68250 }, { "epoch": 17.737785862785863, "grad_norm": 0.3343895971775055, "learning_rate": 1.923895159354441e-06, "loss": 0.0211, "num_input_tokens_seen": 6146544, "step": 68255 }, { "epoch": 17.739085239085238, "grad_norm": 6.8555588722229, "learning_rate": 1.9217146926188813e-06, "loss": 0.071, "num_input_tokens_seen": 6146992, "step": 68260 }, { "epoch": 17.740384615384617, "grad_norm": 1.3056246042251587, "learning_rate": 1.91953541282166e-06, "loss": 0.301, "num_input_tokens_seen": 6147488, "step": 68265 }, { "epoch": 17.741683991683992, "grad_norm": 12.844986915588379, "learning_rate": 1.917357320074839e-06, "loss": 0.1025, "num_input_tokens_seen": 6147904, "step": 68270 }, { "epoch": 17.742983367983367, "grad_norm": 0.020084787160158157, "learning_rate": 1.9151804144904556e-06, "loss": 0.2956, "num_input_tokens_seen": 6148352, "step": 68275 }, { "epoch": 17.744282744282746, "grad_norm": 4.030298709869385, "learning_rate": 1.9130046961804593e-06, "loss": 0.2538, "num_input_tokens_seen": 6148832, "step": 68280 }, { "epoch": 17.74558212058212, "grad_norm": 0.020346688106656075, "learning_rate": 1.9108301652567497e-06, "loss": 0.0056, "num_input_tokens_seen": 6149280, "step": 68285 }, { "epoch": 17.746881496881496, "grad_norm": 0.007427823729813099, "learning_rate": 1.908656821831167e-06, "loss": 0.0033, "num_input_tokens_seen": 6149696, "step": 68290 }, { "epoch": 17.748180873180875, "grad_norm": 0.09393107146024704, "learning_rate": 1.90648466601549e-06, "loss": 0.1522, "num_input_tokens_seen": 6150160, "step": 68295 }, { "epoch": 17.74948024948025, "grad_norm": 0.017961643636226654, "learning_rate": 1.9043136979214365e-06, "loss": 0.003, "num_input_tokens_seen": 6150592, "step": 68300 }, { "epoch": 17.750779625779625, "grad_norm": 0.02181330695748329, "learning_rate": 1.9021439176606564e-06, "loss": 0.0015, "num_input_tokens_seen": 6151056, "step": 68305 }, { "epoch": 17.752079002079, "grad_norm": 2.702261447906494, "learning_rate": 1.8999753253447466e-06, "loss": 0.189, "num_input_tokens_seen": 6151488, "step": 68310 }, { "epoch": 17.75337837837838, "grad_norm": 0.09033340215682983, "learning_rate": 1.89780792108524e-06, "loss": 0.0048, "num_input_tokens_seen": 6151920, "step": 68315 }, { "epoch": 17.754677754677754, "grad_norm": 8.408366203308105, "learning_rate": 1.8956417049936088e-06, "loss": 0.043, "num_input_tokens_seen": 6152352, "step": 68320 }, { "epoch": 17.75597713097713, "grad_norm": 0.0025288844481110573, "learning_rate": 1.8934766771812578e-06, "loss": 0.0001, "num_input_tokens_seen": 6152784, "step": 68325 }, { "epoch": 17.757276507276508, "grad_norm": 0.09047462046146393, "learning_rate": 1.8913128377595402e-06, "loss": 0.0983, "num_input_tokens_seen": 6153184, "step": 68330 }, { "epoch": 17.758575883575883, "grad_norm": 37.417415618896484, "learning_rate": 1.8891501868397472e-06, "loss": 0.1932, "num_input_tokens_seen": 6153648, "step": 68335 }, { "epoch": 17.75987525987526, "grad_norm": 0.013134581968188286, "learning_rate": 1.8869887245331063e-06, "loss": 0.0011, "num_input_tokens_seen": 6154064, "step": 68340 }, { "epoch": 17.761174636174637, "grad_norm": 0.17011316120624542, "learning_rate": 1.8848284509507818e-06, "loss": 0.2038, "num_input_tokens_seen": 6154560, "step": 68345 }, { "epoch": 17.762474012474012, "grad_norm": 1.6367177963256836, "learning_rate": 1.8826693662038737e-06, "loss": 0.0039, "num_input_tokens_seen": 6155040, "step": 68350 }, { "epoch": 17.763773388773387, "grad_norm": 0.0013079220661893487, "learning_rate": 1.8805114704034343e-06, "loss": 0.3724, "num_input_tokens_seen": 6155456, "step": 68355 }, { "epoch": 17.765072765072766, "grad_norm": 0.004364095628261566, "learning_rate": 1.8783547636604392e-06, "loss": 0.0151, "num_input_tokens_seen": 6155936, "step": 68360 }, { "epoch": 17.76637214137214, "grad_norm": 0.12729963660240173, "learning_rate": 1.8761992460858107e-06, "loss": 0.0007, "num_input_tokens_seen": 6156368, "step": 68365 }, { "epoch": 17.767671517671516, "grad_norm": 0.0015339814126491547, "learning_rate": 1.874044917790413e-06, "loss": 0.0062, "num_input_tokens_seen": 6156784, "step": 68370 }, { "epoch": 17.768970893970895, "grad_norm": 0.0955178290605545, "learning_rate": 1.871891778885046e-06, "loss": 0.1867, "num_input_tokens_seen": 6157216, "step": 68375 }, { "epoch": 17.77027027027027, "grad_norm": 0.705957293510437, "learning_rate": 1.8697398294804407e-06, "loss": 0.3894, "num_input_tokens_seen": 6157680, "step": 68380 }, { "epoch": 17.771569646569645, "grad_norm": 0.015106662176549435, "learning_rate": 1.8675890696872838e-06, "loss": 0.0009, "num_input_tokens_seen": 6158096, "step": 68385 }, { "epoch": 17.772869022869024, "grad_norm": 0.0024180395994335413, "learning_rate": 1.865439499616181e-06, "loss": 0.2704, "num_input_tokens_seen": 6158560, "step": 68390 }, { "epoch": 17.7741683991684, "grad_norm": 0.07181448489427567, "learning_rate": 1.8632911193776942e-06, "loss": 0.1933, "num_input_tokens_seen": 6159024, "step": 68395 }, { "epoch": 17.775467775467774, "grad_norm": 55.59211730957031, "learning_rate": 1.8611439290823096e-06, "loss": 0.05, "num_input_tokens_seen": 6159488, "step": 68400 }, { "epoch": 17.776767151767153, "grad_norm": 0.09875916689634323, "learning_rate": 1.858997928840464e-06, "loss": 0.0626, "num_input_tokens_seen": 6159968, "step": 68405 }, { "epoch": 17.778066528066528, "grad_norm": 0.08333983272314072, "learning_rate": 1.8568531187625333e-06, "loss": 0.0318, "num_input_tokens_seen": 6160432, "step": 68410 }, { "epoch": 17.779365904365903, "grad_norm": 0.0007978714420460165, "learning_rate": 1.8547094989588148e-06, "loss": 0.028, "num_input_tokens_seen": 6160864, "step": 68415 }, { "epoch": 17.780665280665282, "grad_norm": 0.35544949769973755, "learning_rate": 1.852567069539568e-06, "loss": 0.2012, "num_input_tokens_seen": 6161312, "step": 68420 }, { "epoch": 17.781964656964657, "grad_norm": 6.369683742523193, "learning_rate": 1.8504258306149713e-06, "loss": 0.0474, "num_input_tokens_seen": 6161744, "step": 68425 }, { "epoch": 17.783264033264032, "grad_norm": 0.7717801928520203, "learning_rate": 1.8482857822951616e-06, "loss": 0.0138, "num_input_tokens_seen": 6162208, "step": 68430 }, { "epoch": 17.78456340956341, "grad_norm": 0.06741820275783539, "learning_rate": 1.8461469246901897e-06, "loss": 0.0007, "num_input_tokens_seen": 6162688, "step": 68435 }, { "epoch": 17.785862785862786, "grad_norm": 10.333697319030762, "learning_rate": 1.8440092579100677e-06, "loss": 0.0091, "num_input_tokens_seen": 6163136, "step": 68440 }, { "epoch": 17.78716216216216, "grad_norm": 35.22105026245117, "learning_rate": 1.841872782064738e-06, "loss": 0.4983, "num_input_tokens_seen": 6163600, "step": 68445 }, { "epoch": 17.78846153846154, "grad_norm": 0.3400247395038605, "learning_rate": 1.8397374972640823e-06, "loss": 0.0004, "num_input_tokens_seen": 6164064, "step": 68450 }, { "epoch": 17.789760914760915, "grad_norm": 0.37685224413871765, "learning_rate": 1.8376034036179158e-06, "loss": 0.0009, "num_input_tokens_seen": 6164528, "step": 68455 }, { "epoch": 17.79106029106029, "grad_norm": 0.401836633682251, "learning_rate": 1.8354705012360002e-06, "loss": 0.0004, "num_input_tokens_seen": 6164944, "step": 68460 }, { "epoch": 17.79235966735967, "grad_norm": 0.01588405855000019, "learning_rate": 1.8333387902280314e-06, "loss": 0.0056, "num_input_tokens_seen": 6165456, "step": 68465 }, { "epoch": 17.793659043659044, "grad_norm": 0.01453996729105711, "learning_rate": 1.8312082707036494e-06, "loss": 0.0882, "num_input_tokens_seen": 6165952, "step": 68470 }, { "epoch": 17.79495841995842, "grad_norm": 0.06206880509853363, "learning_rate": 1.8290789427724191e-06, "loss": 0.2111, "num_input_tokens_seen": 6166448, "step": 68475 }, { "epoch": 17.796257796257795, "grad_norm": 0.42552119493484497, "learning_rate": 1.8269508065438617e-06, "loss": 0.0047, "num_input_tokens_seen": 6166864, "step": 68480 }, { "epoch": 17.797557172557173, "grad_norm": 40.6944465637207, "learning_rate": 1.824823862127431e-06, "loss": 0.1225, "num_input_tokens_seen": 6167328, "step": 68485 }, { "epoch": 17.79885654885655, "grad_norm": 2.6910905838012695, "learning_rate": 1.8226981096325119e-06, "loss": 0.022, "num_input_tokens_seen": 6167792, "step": 68490 }, { "epoch": 17.800155925155924, "grad_norm": 2.0945582389831543, "learning_rate": 1.820573549168439e-06, "loss": 0.2801, "num_input_tokens_seen": 6168288, "step": 68495 }, { "epoch": 17.801455301455302, "grad_norm": 0.003027117345482111, "learning_rate": 1.8184501808444749e-06, "loss": 0.001, "num_input_tokens_seen": 6168720, "step": 68500 }, { "epoch": 17.802754677754677, "grad_norm": 0.12663070857524872, "learning_rate": 1.8163280047698323e-06, "loss": 0.0004, "num_input_tokens_seen": 6169200, "step": 68505 }, { "epoch": 17.804054054054053, "grad_norm": 2.264834403991699, "learning_rate": 1.814207021053646e-06, "loss": 0.0531, "num_input_tokens_seen": 6169616, "step": 68510 }, { "epoch": 17.80535343035343, "grad_norm": 0.0029870211146771908, "learning_rate": 1.812087229805018e-06, "loss": 0.2557, "num_input_tokens_seen": 6170080, "step": 68515 }, { "epoch": 17.806652806652806, "grad_norm": 0.005573865491896868, "learning_rate": 1.809968631132955e-06, "loss": 0.0027, "num_input_tokens_seen": 6170560, "step": 68520 }, { "epoch": 17.80795218295218, "grad_norm": 40.69501876831055, "learning_rate": 1.8078512251464286e-06, "loss": 0.3225, "num_input_tokens_seen": 6170992, "step": 68525 }, { "epoch": 17.80925155925156, "grad_norm": 0.0634152963757515, "learning_rate": 1.8057350119543348e-06, "loss": 0.0005, "num_input_tokens_seen": 6171472, "step": 68530 }, { "epoch": 17.810550935550935, "grad_norm": 22.874069213867188, "learning_rate": 1.8036199916655144e-06, "loss": 0.0267, "num_input_tokens_seen": 6171920, "step": 68535 }, { "epoch": 17.81185031185031, "grad_norm": 0.027933089062571526, "learning_rate": 1.8015061643887387e-06, "loss": 0.0046, "num_input_tokens_seen": 6172336, "step": 68540 }, { "epoch": 17.81314968814969, "grad_norm": 77.95751953125, "learning_rate": 1.7993935302327292e-06, "loss": 0.1573, "num_input_tokens_seen": 6172784, "step": 68545 }, { "epoch": 17.814449064449065, "grad_norm": 0.022788435220718384, "learning_rate": 1.797282089306146e-06, "loss": 0.0063, "num_input_tokens_seen": 6173200, "step": 68550 }, { "epoch": 17.81574844074844, "grad_norm": 0.11272764950990677, "learning_rate": 1.7951718417175716e-06, "loss": 0.016, "num_input_tokens_seen": 6173632, "step": 68555 }, { "epoch": 17.81704781704782, "grad_norm": 0.014803343452513218, "learning_rate": 1.7930627875755473e-06, "loss": 0.0061, "num_input_tokens_seen": 6174096, "step": 68560 }, { "epoch": 17.818347193347194, "grad_norm": 0.4011012017726898, "learning_rate": 1.7909549269885362e-06, "loss": 0.0043, "num_input_tokens_seen": 6174544, "step": 68565 }, { "epoch": 17.81964656964657, "grad_norm": 0.11963275820016861, "learning_rate": 1.7888482600649542e-06, "loss": 0.0012, "num_input_tokens_seen": 6174976, "step": 68570 }, { "epoch": 17.820945945945947, "grad_norm": 20.245140075683594, "learning_rate": 1.7867427869131426e-06, "loss": 0.0309, "num_input_tokens_seen": 6175440, "step": 68575 }, { "epoch": 17.822245322245323, "grad_norm": 0.26774686574935913, "learning_rate": 1.7846385076413897e-06, "loss": 0.342, "num_input_tokens_seen": 6175888, "step": 68580 }, { "epoch": 17.823544698544698, "grad_norm": 0.323301762342453, "learning_rate": 1.7825354223579255e-06, "loss": 0.3566, "num_input_tokens_seen": 6176336, "step": 68585 }, { "epoch": 17.824844074844076, "grad_norm": 0.16122807562351227, "learning_rate": 1.7804335311709108e-06, "loss": 0.002, "num_input_tokens_seen": 6176768, "step": 68590 }, { "epoch": 17.82614345114345, "grad_norm": 3.0420103073120117, "learning_rate": 1.7783328341884452e-06, "loss": 0.0097, "num_input_tokens_seen": 6177216, "step": 68595 }, { "epoch": 17.827442827442827, "grad_norm": 36.83094024658203, "learning_rate": 1.776233331518576e-06, "loss": 0.2237, "num_input_tokens_seen": 6177712, "step": 68600 }, { "epoch": 17.828742203742205, "grad_norm": 0.01819351688027382, "learning_rate": 1.77413502326928e-06, "loss": 0.0044, "num_input_tokens_seen": 6178144, "step": 68605 }, { "epoch": 17.83004158004158, "grad_norm": 30.89908218383789, "learning_rate": 1.7720379095484686e-06, "loss": 0.0433, "num_input_tokens_seen": 6178640, "step": 68610 }, { "epoch": 17.831340956340956, "grad_norm": 0.4641466736793518, "learning_rate": 1.7699419904640024e-06, "loss": 0.0063, "num_input_tokens_seen": 6179040, "step": 68615 }, { "epoch": 17.83264033264033, "grad_norm": 0.3412506878376007, "learning_rate": 1.7678472661236789e-06, "loss": 0.0012, "num_input_tokens_seen": 6179472, "step": 68620 }, { "epoch": 17.83393970893971, "grad_norm": 0.5003809332847595, "learning_rate": 1.7657537366352338e-06, "loss": 0.3945, "num_input_tokens_seen": 6179936, "step": 68625 }, { "epoch": 17.835239085239085, "grad_norm": 0.16842682659626007, "learning_rate": 1.7636614021063313e-06, "loss": 0.0005, "num_input_tokens_seen": 6180400, "step": 68630 }, { "epoch": 17.83653846153846, "grad_norm": 0.001625705393962562, "learning_rate": 1.7615702626445907e-06, "loss": 0.0194, "num_input_tokens_seen": 6180848, "step": 68635 }, { "epoch": 17.83783783783784, "grad_norm": 1.6312609910964966, "learning_rate": 1.7594803183575536e-06, "loss": 0.0093, "num_input_tokens_seen": 6181280, "step": 68640 }, { "epoch": 17.839137214137214, "grad_norm": 0.07762053608894348, "learning_rate": 1.7573915693527176e-06, "loss": 0.0702, "num_input_tokens_seen": 6181760, "step": 68645 }, { "epoch": 17.84043659043659, "grad_norm": 0.057843346148729324, "learning_rate": 1.7553040157374968e-06, "loss": 0.0002, "num_input_tokens_seen": 6182224, "step": 68650 }, { "epoch": 17.841735966735968, "grad_norm": 0.20499911904335022, "learning_rate": 1.753217657619266e-06, "loss": 0.0002, "num_input_tokens_seen": 6182656, "step": 68655 }, { "epoch": 17.843035343035343, "grad_norm": 64.22576141357422, "learning_rate": 1.7511324951053204e-06, "loss": 0.0646, "num_input_tokens_seen": 6183120, "step": 68660 }, { "epoch": 17.844334719334718, "grad_norm": 0.08063361793756485, "learning_rate": 1.749048528302913e-06, "loss": 0.2868, "num_input_tokens_seen": 6183584, "step": 68665 }, { "epoch": 17.845634095634097, "grad_norm": 35.763668060302734, "learning_rate": 1.746965757319219e-06, "loss": 0.1338, "num_input_tokens_seen": 6184048, "step": 68670 }, { "epoch": 17.846933471933472, "grad_norm": 57.028038024902344, "learning_rate": 1.74488418226135e-06, "loss": 0.1082, "num_input_tokens_seen": 6184496, "step": 68675 }, { "epoch": 17.848232848232847, "grad_norm": 0.9158661365509033, "learning_rate": 1.7428038032363758e-06, "loss": 0.0045, "num_input_tokens_seen": 6184944, "step": 68680 }, { "epoch": 17.849532224532226, "grad_norm": 0.6872715353965759, "learning_rate": 1.7407246203512805e-06, "loss": 0.0194, "num_input_tokens_seen": 6185440, "step": 68685 }, { "epoch": 17.8508316008316, "grad_norm": 0.12782053649425507, "learning_rate": 1.7386466337130036e-06, "loss": 0.2015, "num_input_tokens_seen": 6185856, "step": 68690 }, { "epoch": 17.852130977130976, "grad_norm": 97.19049072265625, "learning_rate": 1.7365698434284178e-06, "loss": 0.2687, "num_input_tokens_seen": 6186304, "step": 68695 }, { "epoch": 17.853430353430355, "grad_norm": 25.0355281829834, "learning_rate": 1.7344942496043403e-06, "loss": 0.0356, "num_input_tokens_seen": 6186736, "step": 68700 }, { "epoch": 17.85472972972973, "grad_norm": 11.546985626220703, "learning_rate": 1.732419852347511e-06, "loss": 0.0845, "num_input_tokens_seen": 6187184, "step": 68705 }, { "epoch": 17.856029106029105, "grad_norm": 0.016783224418759346, "learning_rate": 1.730346651764625e-06, "loss": 0.4741, "num_input_tokens_seen": 6187632, "step": 68710 }, { "epoch": 17.857328482328484, "grad_norm": 0.052018195390701294, "learning_rate": 1.728274647962305e-06, "loss": 0.1715, "num_input_tokens_seen": 6188096, "step": 68715 }, { "epoch": 17.85862785862786, "grad_norm": 0.05618111416697502, "learning_rate": 1.7262038410471193e-06, "loss": 0.0675, "num_input_tokens_seen": 6188528, "step": 68720 }, { "epoch": 17.859927234927234, "grad_norm": 0.691842257976532, "learning_rate": 1.724134231125568e-06, "loss": 0.2131, "num_input_tokens_seen": 6189008, "step": 68725 }, { "epoch": 17.861226611226613, "grad_norm": 0.0036559170112013817, "learning_rate": 1.7220658183040944e-06, "loss": 0.0179, "num_input_tokens_seen": 6189456, "step": 68730 }, { "epoch": 17.862525987525988, "grad_norm": 0.0433504693210125, "learning_rate": 1.7199986026890853e-06, "loss": 0.0074, "num_input_tokens_seen": 6189888, "step": 68735 }, { "epoch": 17.863825363825363, "grad_norm": 0.029718607664108276, "learning_rate": 1.717932584386847e-06, "loss": 0.1769, "num_input_tokens_seen": 6190336, "step": 68740 }, { "epoch": 17.86512474012474, "grad_norm": 0.7468001246452332, "learning_rate": 1.715867763503648e-06, "loss": 0.0076, "num_input_tokens_seen": 6190816, "step": 68745 }, { "epoch": 17.866424116424117, "grad_norm": 24.227087020874023, "learning_rate": 1.7138041401456778e-06, "loss": 0.1637, "num_input_tokens_seen": 6191232, "step": 68750 }, { "epoch": 17.867723492723492, "grad_norm": 1.311752200126648, "learning_rate": 1.711741714419074e-06, "loss": 0.2705, "num_input_tokens_seen": 6191664, "step": 68755 }, { "epoch": 17.86902286902287, "grad_norm": 0.7148280143737793, "learning_rate": 1.7096804864299015e-06, "loss": 0.0273, "num_input_tokens_seen": 6192128, "step": 68760 }, { "epoch": 17.870322245322246, "grad_norm": 0.020693108439445496, "learning_rate": 1.707620456284184e-06, "loss": 0.0003, "num_input_tokens_seen": 6192592, "step": 68765 }, { "epoch": 17.87162162162162, "grad_norm": 0.01721423678100109, "learning_rate": 1.7055616240878618e-06, "loss": 0.1161, "num_input_tokens_seen": 6193040, "step": 68770 }, { "epoch": 17.872920997921, "grad_norm": 0.045278407633304596, "learning_rate": 1.703503989946828e-06, "loss": 0.057, "num_input_tokens_seen": 6193488, "step": 68775 }, { "epoch": 17.874220374220375, "grad_norm": 0.4713335931301117, "learning_rate": 1.7014475539669005e-06, "loss": 0.1459, "num_input_tokens_seen": 6193952, "step": 68780 }, { "epoch": 17.87551975051975, "grad_norm": 0.5995322465896606, "learning_rate": 1.699392316253856e-06, "loss": 0.0035, "num_input_tokens_seen": 6194400, "step": 68785 }, { "epoch": 17.876819126819125, "grad_norm": 2.5788581371307373, "learning_rate": 1.6973382769133823e-06, "loss": 0.208, "num_input_tokens_seen": 6194832, "step": 68790 }, { "epoch": 17.878118503118504, "grad_norm": 0.01741739548742771, "learning_rate": 1.6952854360511305e-06, "loss": 0.2351, "num_input_tokens_seen": 6195248, "step": 68795 }, { "epoch": 17.87941787941788, "grad_norm": 39.228973388671875, "learning_rate": 1.6932337937726834e-06, "loss": 0.2619, "num_input_tokens_seen": 6195712, "step": 68800 }, { "epoch": 17.880717255717254, "grad_norm": 0.35381510853767395, "learning_rate": 1.6911833501835478e-06, "loss": 0.0027, "num_input_tokens_seen": 6196160, "step": 68805 }, { "epoch": 17.882016632016633, "grad_norm": 57.394371032714844, "learning_rate": 1.6891341053891896e-06, "loss": 0.2926, "num_input_tokens_seen": 6196624, "step": 68810 }, { "epoch": 17.883316008316008, "grad_norm": 21.591745376586914, "learning_rate": 1.6870860594949966e-06, "loss": 0.4479, "num_input_tokens_seen": 6197056, "step": 68815 }, { "epoch": 17.884615384615383, "grad_norm": 0.6677764058113098, "learning_rate": 1.6850392126063097e-06, "loss": 0.1409, "num_input_tokens_seen": 6197488, "step": 68820 }, { "epoch": 17.885914760914762, "grad_norm": 31.627382278442383, "learning_rate": 1.6829935648283918e-06, "loss": 0.2754, "num_input_tokens_seen": 6197936, "step": 68825 }, { "epoch": 17.887214137214137, "grad_norm": 0.01437623891979456, "learning_rate": 1.680949116266456e-06, "loss": 0.1238, "num_input_tokens_seen": 6198368, "step": 68830 }, { "epoch": 17.888513513513512, "grad_norm": 33.73960876464844, "learning_rate": 1.6789058670256485e-06, "loss": 0.2724, "num_input_tokens_seen": 6198800, "step": 68835 }, { "epoch": 17.88981288981289, "grad_norm": 0.003401647787541151, "learning_rate": 1.6768638172110606e-06, "loss": 0.013, "num_input_tokens_seen": 6199280, "step": 68840 }, { "epoch": 17.891112266112266, "grad_norm": 0.012021497823297977, "learning_rate": 1.6748229669277138e-06, "loss": 0.0004, "num_input_tokens_seen": 6199712, "step": 68845 }, { "epoch": 17.89241164241164, "grad_norm": 0.014139620587229729, "learning_rate": 1.6727833162805711e-06, "loss": 0.0109, "num_input_tokens_seen": 6200112, "step": 68850 }, { "epoch": 17.89371101871102, "grad_norm": 0.6432904005050659, "learning_rate": 1.6707448653745323e-06, "loss": 0.0016, "num_input_tokens_seen": 6200560, "step": 68855 }, { "epoch": 17.895010395010395, "grad_norm": 18.13536834716797, "learning_rate": 1.668707614314441e-06, "loss": 0.0171, "num_input_tokens_seen": 6200992, "step": 68860 }, { "epoch": 17.89630977130977, "grad_norm": 84.6454849243164, "learning_rate": 1.666671563205069e-06, "loss": 0.1138, "num_input_tokens_seen": 6201440, "step": 68865 }, { "epoch": 17.89760914760915, "grad_norm": 1.1586815118789673, "learning_rate": 1.6646367121511326e-06, "loss": 0.2155, "num_input_tokens_seen": 6201936, "step": 68870 }, { "epoch": 17.898908523908524, "grad_norm": 0.13150793313980103, "learning_rate": 1.6626030612572951e-06, "loss": 0.1688, "num_input_tokens_seen": 6202384, "step": 68875 }, { "epoch": 17.9002079002079, "grad_norm": 0.2046506106853485, "learning_rate": 1.6605706106281399e-06, "loss": 0.0052, "num_input_tokens_seen": 6202848, "step": 68880 }, { "epoch": 17.901507276507278, "grad_norm": 3.8927805423736572, "learning_rate": 1.6585393603682026e-06, "loss": 0.0564, "num_input_tokens_seen": 6203280, "step": 68885 }, { "epoch": 17.902806652806653, "grad_norm": 43.044036865234375, "learning_rate": 1.6565093105819468e-06, "loss": 0.1719, "num_input_tokens_seen": 6203760, "step": 68890 }, { "epoch": 17.90410602910603, "grad_norm": 0.08653341978788376, "learning_rate": 1.654480461373789e-06, "loss": 0.0044, "num_input_tokens_seen": 6204224, "step": 68895 }, { "epoch": 17.905405405405407, "grad_norm": 0.004833632614463568, "learning_rate": 1.6524528128480655e-06, "loss": 0.3965, "num_input_tokens_seen": 6204672, "step": 68900 }, { "epoch": 17.906704781704782, "grad_norm": 64.6514892578125, "learning_rate": 1.6504263651090645e-06, "loss": 0.3186, "num_input_tokens_seen": 6205088, "step": 68905 }, { "epoch": 17.908004158004157, "grad_norm": 14.46940803527832, "learning_rate": 1.6484011182610088e-06, "loss": 0.1099, "num_input_tokens_seen": 6205536, "step": 68910 }, { "epoch": 17.909303534303533, "grad_norm": 0.0016648129094392061, "learning_rate": 1.6463770724080618e-06, "loss": 0.0024, "num_input_tokens_seen": 6206000, "step": 68915 }, { "epoch": 17.91060291060291, "grad_norm": 0.1821879744529724, "learning_rate": 1.6443542276543128e-06, "loss": 0.0262, "num_input_tokens_seen": 6206432, "step": 68920 }, { "epoch": 17.911902286902286, "grad_norm": 54.62872314453125, "learning_rate": 1.642332584103809e-06, "loss": 0.2275, "num_input_tokens_seen": 6206880, "step": 68925 }, { "epoch": 17.91320166320166, "grad_norm": 0.3701441287994385, "learning_rate": 1.6403121418605172e-06, "loss": 0.0873, "num_input_tokens_seen": 6207296, "step": 68930 }, { "epoch": 17.91450103950104, "grad_norm": 6.259897232055664, "learning_rate": 1.638292901028357e-06, "loss": 0.0121, "num_input_tokens_seen": 6207728, "step": 68935 }, { "epoch": 17.915800415800415, "grad_norm": 0.7382977604866028, "learning_rate": 1.6362748617111734e-06, "loss": 0.0011, "num_input_tokens_seen": 6208192, "step": 68940 }, { "epoch": 17.91709979209979, "grad_norm": 0.5864609479904175, "learning_rate": 1.6342580240127582e-06, "loss": 0.1155, "num_input_tokens_seen": 6208640, "step": 68945 }, { "epoch": 17.91839916839917, "grad_norm": 4.5995612144470215, "learning_rate": 1.6322423880368477e-06, "loss": 0.027, "num_input_tokens_seen": 6209104, "step": 68950 }, { "epoch": 17.919698544698544, "grad_norm": 0.004841419868171215, "learning_rate": 1.6302279538870952e-06, "loss": 0.0937, "num_input_tokens_seen": 6209536, "step": 68955 }, { "epoch": 17.92099792099792, "grad_norm": 0.36995139718055725, "learning_rate": 1.6282147216671152e-06, "loss": 0.0009, "num_input_tokens_seen": 6209984, "step": 68960 }, { "epoch": 17.9222972972973, "grad_norm": 0.20920687913894653, "learning_rate": 1.626202691480444e-06, "loss": 0.0068, "num_input_tokens_seen": 6210416, "step": 68965 }, { "epoch": 17.923596673596673, "grad_norm": 101.39810943603516, "learning_rate": 1.6241918634305658e-06, "loss": 0.3442, "num_input_tokens_seen": 6210896, "step": 68970 }, { "epoch": 17.92489604989605, "grad_norm": 0.1233743205666542, "learning_rate": 1.6221822376208922e-06, "loss": 0.2848, "num_input_tokens_seen": 6211312, "step": 68975 }, { "epoch": 17.926195426195427, "grad_norm": 0.008854186162352562, "learning_rate": 1.620173814154788e-06, "loss": 0.1948, "num_input_tokens_seen": 6211744, "step": 68980 }, { "epoch": 17.927494802494802, "grad_norm": 78.46680450439453, "learning_rate": 1.6181665931355478e-06, "loss": 0.4566, "num_input_tokens_seen": 6212192, "step": 68985 }, { "epoch": 17.928794178794178, "grad_norm": 24.715408325195312, "learning_rate": 1.6161605746664032e-06, "loss": 0.0243, "num_input_tokens_seen": 6212672, "step": 68990 }, { "epoch": 17.930093555093556, "grad_norm": 1.8384917974472046, "learning_rate": 1.6141557588505246e-06, "loss": 0.0012, "num_input_tokens_seen": 6213120, "step": 68995 }, { "epoch": 17.93139293139293, "grad_norm": 33.512908935546875, "learning_rate": 1.6121521457910239e-06, "loss": 0.3782, "num_input_tokens_seen": 6213536, "step": 69000 }, { "epoch": 17.932692307692307, "grad_norm": 53.164466857910156, "learning_rate": 1.610149735590949e-06, "loss": 0.073, "num_input_tokens_seen": 6214032, "step": 69005 }, { "epoch": 17.933991683991685, "grad_norm": 74.7623062133789, "learning_rate": 1.608148528353276e-06, "loss": 0.1672, "num_input_tokens_seen": 6214480, "step": 69010 }, { "epoch": 17.93529106029106, "grad_norm": 0.0491584911942482, "learning_rate": 1.6061485241809449e-06, "loss": 0.4313, "num_input_tokens_seen": 6214944, "step": 69015 }, { "epoch": 17.936590436590436, "grad_norm": 0.005275137256830931, "learning_rate": 1.6041497231768066e-06, "loss": 0.0003, "num_input_tokens_seen": 6215376, "step": 69020 }, { "epoch": 17.937889812889814, "grad_norm": 0.008532474748790264, "learning_rate": 1.6021521254436678e-06, "loss": 0.1179, "num_input_tokens_seen": 6215840, "step": 69025 }, { "epoch": 17.93918918918919, "grad_norm": 0.0008446210413239896, "learning_rate": 1.600155731084263e-06, "loss": 0.0008, "num_input_tokens_seen": 6216304, "step": 69030 }, { "epoch": 17.940488565488565, "grad_norm": 0.11540185660123825, "learning_rate": 1.5981605402012711e-06, "loss": 0.0572, "num_input_tokens_seen": 6216752, "step": 69035 }, { "epoch": 17.941787941787943, "grad_norm": 0.034428227692842484, "learning_rate": 1.5961665528973019e-06, "loss": 0.0062, "num_input_tokens_seen": 6217200, "step": 69040 }, { "epoch": 17.94308731808732, "grad_norm": 10.555401802062988, "learning_rate": 1.5941737692749091e-06, "loss": 0.2335, "num_input_tokens_seen": 6217664, "step": 69045 }, { "epoch": 17.944386694386694, "grad_norm": 1.8676420450210571, "learning_rate": 1.592182189436589e-06, "loss": 0.0422, "num_input_tokens_seen": 6218128, "step": 69050 }, { "epoch": 17.945686070686072, "grad_norm": 0.03407730534672737, "learning_rate": 1.5901918134847676e-06, "loss": 0.0061, "num_input_tokens_seen": 6218576, "step": 69055 }, { "epoch": 17.946985446985448, "grad_norm": 52.98422622680664, "learning_rate": 1.5882026415218105e-06, "loss": 0.1322, "num_input_tokens_seen": 6218992, "step": 69060 }, { "epoch": 17.948284823284823, "grad_norm": 0.0019913692958652973, "learning_rate": 1.5862146736500272e-06, "loss": 0.1962, "num_input_tokens_seen": 6219440, "step": 69065 }, { "epoch": 17.9495841995842, "grad_norm": 29.70819854736328, "learning_rate": 1.5842279099716556e-06, "loss": 0.1373, "num_input_tokens_seen": 6219856, "step": 69070 }, { "epoch": 17.950883575883577, "grad_norm": 39.32666015625, "learning_rate": 1.5822423505888778e-06, "loss": 0.4722, "num_input_tokens_seen": 6220304, "step": 69075 }, { "epoch": 17.95218295218295, "grad_norm": 5.701070308685303, "learning_rate": 1.5802579956038095e-06, "loss": 0.0075, "num_input_tokens_seen": 6220704, "step": 69080 }, { "epoch": 17.953482328482327, "grad_norm": 0.5942175388336182, "learning_rate": 1.578274845118516e-06, "loss": 0.0758, "num_input_tokens_seen": 6221184, "step": 69085 }, { "epoch": 17.954781704781706, "grad_norm": 0.9845561981201172, "learning_rate": 1.5762928992349907e-06, "loss": 0.1328, "num_input_tokens_seen": 6221632, "step": 69090 }, { "epoch": 17.95608108108108, "grad_norm": 0.0010384432971477509, "learning_rate": 1.5743121580551607e-06, "loss": 0.001, "num_input_tokens_seen": 6222112, "step": 69095 }, { "epoch": 17.957380457380456, "grad_norm": 47.20725631713867, "learning_rate": 1.5723326216809082e-06, "loss": 0.2161, "num_input_tokens_seen": 6222592, "step": 69100 }, { "epoch": 17.958679833679835, "grad_norm": 0.7706270217895508, "learning_rate": 1.5703542902140294e-06, "loss": 0.0556, "num_input_tokens_seen": 6223056, "step": 69105 }, { "epoch": 17.95997920997921, "grad_norm": 0.030133413150906563, "learning_rate": 1.568377163756285e-06, "loss": 0.0011, "num_input_tokens_seen": 6223504, "step": 69110 }, { "epoch": 17.961278586278585, "grad_norm": 47.292213439941406, "learning_rate": 1.5664012424093488e-06, "loss": 0.294, "num_input_tokens_seen": 6223952, "step": 69115 }, { "epoch": 17.962577962577964, "grad_norm": 0.003989432938396931, "learning_rate": 1.5644265262748508e-06, "loss": 0.0009, "num_input_tokens_seen": 6224432, "step": 69120 }, { "epoch": 17.96387733887734, "grad_norm": 0.007207346614450216, "learning_rate": 1.5624530154543487e-06, "loss": 0.262, "num_input_tokens_seen": 6224880, "step": 69125 }, { "epoch": 17.965176715176714, "grad_norm": 0.3691195249557495, "learning_rate": 1.5604807100493528e-06, "loss": 0.2479, "num_input_tokens_seen": 6225296, "step": 69130 }, { "epoch": 17.966476091476093, "grad_norm": 0.21398572623729706, "learning_rate": 1.5585096101612906e-06, "loss": 0.0581, "num_input_tokens_seen": 6225792, "step": 69135 }, { "epoch": 17.967775467775468, "grad_norm": 3.118173599243164, "learning_rate": 1.5565397158915363e-06, "loss": 0.0021, "num_input_tokens_seen": 6226208, "step": 69140 }, { "epoch": 17.969074844074843, "grad_norm": 0.01664159819483757, "learning_rate": 1.5545710273414089e-06, "loss": 0.019, "num_input_tokens_seen": 6226656, "step": 69145 }, { "epoch": 17.97037422037422, "grad_norm": 0.0115983746945858, "learning_rate": 1.552603544612155e-06, "loss": 0.4775, "num_input_tokens_seen": 6227088, "step": 69150 }, { "epoch": 17.971673596673597, "grad_norm": 0.01554136723279953, "learning_rate": 1.550637267804969e-06, "loss": 0.0029, "num_input_tokens_seen": 6227552, "step": 69155 }, { "epoch": 17.972972972972972, "grad_norm": 0.009498825296759605, "learning_rate": 1.5486721970209727e-06, "loss": 0.0381, "num_input_tokens_seen": 6227984, "step": 69160 }, { "epoch": 17.97427234927235, "grad_norm": 4.285215854644775, "learning_rate": 1.5467083323612408e-06, "loss": 0.0795, "num_input_tokens_seen": 6228448, "step": 69165 }, { "epoch": 17.975571725571726, "grad_norm": 78.49095916748047, "learning_rate": 1.54474567392677e-06, "loss": 0.1629, "num_input_tokens_seen": 6228880, "step": 69170 }, { "epoch": 17.9768711018711, "grad_norm": 0.48740720748901367, "learning_rate": 1.5427842218185025e-06, "loss": 0.5177, "num_input_tokens_seen": 6229360, "step": 69175 }, { "epoch": 17.97817047817048, "grad_norm": 6.024306297302246, "learning_rate": 1.5408239761373177e-06, "loss": 0.1297, "num_input_tokens_seen": 6229808, "step": 69180 }, { "epoch": 17.979469854469855, "grad_norm": 0.0026862225495278835, "learning_rate": 1.5388649369840357e-06, "loss": 0.2143, "num_input_tokens_seen": 6230240, "step": 69185 }, { "epoch": 17.98076923076923, "grad_norm": 0.12298476696014404, "learning_rate": 1.5369071044594063e-06, "loss": 0.0907, "num_input_tokens_seen": 6230704, "step": 69190 }, { "epoch": 17.98206860706861, "grad_norm": 58.6884651184082, "learning_rate": 1.5349504786641239e-06, "loss": 0.1895, "num_input_tokens_seen": 6231120, "step": 69195 }, { "epoch": 17.983367983367984, "grad_norm": 1.5405783653259277, "learning_rate": 1.5329950596988273e-06, "loss": 0.0028, "num_input_tokens_seen": 6231568, "step": 69200 }, { "epoch": 17.98466735966736, "grad_norm": 0.016229946166276932, "learning_rate": 1.5310408476640754e-06, "loss": 0.147, "num_input_tokens_seen": 6232016, "step": 69205 }, { "epoch": 17.985966735966738, "grad_norm": 10.500816345214844, "learning_rate": 1.5290878426603817e-06, "loss": 0.0087, "num_input_tokens_seen": 6232464, "step": 69210 }, { "epoch": 17.987266112266113, "grad_norm": 69.19254302978516, "learning_rate": 1.5271360447881883e-06, "loss": 0.1365, "num_input_tokens_seen": 6232896, "step": 69215 }, { "epoch": 17.988565488565488, "grad_norm": 0.12062128633260727, "learning_rate": 1.5251854541478788e-06, "loss": 0.3481, "num_input_tokens_seen": 6233312, "step": 69220 }, { "epoch": 17.989864864864863, "grad_norm": 0.02107458934187889, "learning_rate": 1.523236070839773e-06, "loss": 0.0791, "num_input_tokens_seen": 6233744, "step": 69225 }, { "epoch": 17.991164241164242, "grad_norm": 56.32880783081055, "learning_rate": 1.5212878949641296e-06, "loss": 0.1906, "num_input_tokens_seen": 6234176, "step": 69230 }, { "epoch": 17.992463617463617, "grad_norm": 0.003910496365278959, "learning_rate": 1.5193409266211462e-06, "loss": 0.0002, "num_input_tokens_seen": 6234688, "step": 69235 }, { "epoch": 17.993762993762992, "grad_norm": 3.1586427688598633, "learning_rate": 1.5173951659109592e-06, "loss": 0.0945, "num_input_tokens_seen": 6235104, "step": 69240 }, { "epoch": 17.99506237006237, "grad_norm": 40.410125732421875, "learning_rate": 1.5154506129336388e-06, "loss": 0.2616, "num_input_tokens_seen": 6235536, "step": 69245 }, { "epoch": 17.996361746361746, "grad_norm": 19.058761596679688, "learning_rate": 1.5135072677891965e-06, "loss": 0.2545, "num_input_tokens_seen": 6236000, "step": 69250 }, { "epoch": 17.99766112266112, "grad_norm": 25.087308883666992, "learning_rate": 1.5115651305775746e-06, "loss": 0.2349, "num_input_tokens_seen": 6236432, "step": 69255 }, { "epoch": 17.9989604989605, "grad_norm": 22.280414581298828, "learning_rate": 1.5096242013986628e-06, "loss": 0.2789, "num_input_tokens_seen": 6236880, "step": 69260 }, { "epoch": 18.0, "eval_loss": 0.7981295585632324, "eval_runtime": 13.166, "eval_samples_per_second": 65.016, "eval_steps_per_second": 32.508, "num_input_tokens_seen": 6237200, "step": 69264 }, { "epoch": 18.000259875259875, "grad_norm": 0.7863827347755432, "learning_rate": 1.5076844803522922e-06, "loss": 0.0052, "num_input_tokens_seen": 6237280, "step": 69265 }, { "epoch": 18.00155925155925, "grad_norm": 0.009387638419866562, "learning_rate": 1.5057459675382134e-06, "loss": 0.1424, "num_input_tokens_seen": 6237728, "step": 69270 }, { "epoch": 18.00285862785863, "grad_norm": 0.10658450424671173, "learning_rate": 1.5038086630561332e-06, "loss": 0.0736, "num_input_tokens_seen": 6238240, "step": 69275 }, { "epoch": 18.004158004158004, "grad_norm": 0.196865975856781, "learning_rate": 1.5018725670056826e-06, "loss": 0.0068, "num_input_tokens_seen": 6238688, "step": 69280 }, { "epoch": 18.00545738045738, "grad_norm": 35.91985321044922, "learning_rate": 1.4999376794864462e-06, "loss": 0.0697, "num_input_tokens_seen": 6239184, "step": 69285 }, { "epoch": 18.006756756756758, "grad_norm": 8.291183471679688, "learning_rate": 1.4980040005979273e-06, "loss": 0.0234, "num_input_tokens_seen": 6239616, "step": 69290 }, { "epoch": 18.008056133056133, "grad_norm": 47.76097869873047, "learning_rate": 1.49607153043958e-06, "loss": 0.2157, "num_input_tokens_seen": 6240064, "step": 69295 }, { "epoch": 18.009355509355508, "grad_norm": 0.05844094604253769, "learning_rate": 1.4941402691107969e-06, "loss": 0.008, "num_input_tokens_seen": 6240560, "step": 69300 }, { "epoch": 18.010654885654887, "grad_norm": 37.6545295715332, "learning_rate": 1.4922102167109015e-06, "loss": 0.0958, "num_input_tokens_seen": 6241024, "step": 69305 }, { "epoch": 18.011954261954262, "grad_norm": 0.10186003148555756, "learning_rate": 1.4902813733391586e-06, "loss": 0.0202, "num_input_tokens_seen": 6241440, "step": 69310 }, { "epoch": 18.013253638253637, "grad_norm": 85.6926040649414, "learning_rate": 1.4883537390947722e-06, "loss": 0.1325, "num_input_tokens_seen": 6241904, "step": 69315 }, { "epoch": 18.014553014553016, "grad_norm": 0.005030001047998667, "learning_rate": 1.4864273140768797e-06, "loss": 0.1273, "num_input_tokens_seen": 6242336, "step": 69320 }, { "epoch": 18.01585239085239, "grad_norm": 0.03349945321679115, "learning_rate": 1.4845020983845603e-06, "loss": 0.0197, "num_input_tokens_seen": 6242784, "step": 69325 }, { "epoch": 18.017151767151766, "grad_norm": 2.000058650970459, "learning_rate": 1.4825780921168293e-06, "loss": 0.0012, "num_input_tokens_seen": 6243264, "step": 69330 }, { "epoch": 18.018451143451145, "grad_norm": 1.3572592735290527, "learning_rate": 1.4806552953726377e-06, "loss": 0.0027, "num_input_tokens_seen": 6243680, "step": 69335 }, { "epoch": 18.01975051975052, "grad_norm": 0.9496338367462158, "learning_rate": 1.4787337082508846e-06, "loss": 0.0041, "num_input_tokens_seen": 6244144, "step": 69340 }, { "epoch": 18.021049896049895, "grad_norm": 0.010159224271774292, "learning_rate": 1.476813330850388e-06, "loss": 0.0649, "num_input_tokens_seen": 6244592, "step": 69345 }, { "epoch": 18.022349272349274, "grad_norm": 0.4832803010940552, "learning_rate": 1.4748941632699271e-06, "loss": 0.0142, "num_input_tokens_seen": 6245056, "step": 69350 }, { "epoch": 18.02364864864865, "grad_norm": 0.008502621203660965, "learning_rate": 1.4729762056081952e-06, "loss": 0.0115, "num_input_tokens_seen": 6245504, "step": 69355 }, { "epoch": 18.024948024948024, "grad_norm": 0.024281790480017662, "learning_rate": 1.4710594579638443e-06, "loss": 0.0018, "num_input_tokens_seen": 6245952, "step": 69360 }, { "epoch": 18.026247401247403, "grad_norm": 0.013934459537267685, "learning_rate": 1.4691439204354424e-06, "loss": 0.1706, "num_input_tokens_seen": 6246384, "step": 69365 }, { "epoch": 18.027546777546778, "grad_norm": 0.1591397225856781, "learning_rate": 1.4672295931215192e-06, "loss": 0.001, "num_input_tokens_seen": 6246800, "step": 69370 }, { "epoch": 18.028846153846153, "grad_norm": 2.054222345352173, "learning_rate": 1.4653164761205235e-06, "loss": 0.0554, "num_input_tokens_seen": 6247216, "step": 69375 }, { "epoch": 18.03014553014553, "grad_norm": 1.6461161375045776, "learning_rate": 1.4634045695308574e-06, "loss": 0.0013, "num_input_tokens_seen": 6247648, "step": 69380 }, { "epoch": 18.031444906444907, "grad_norm": 17.9758243560791, "learning_rate": 1.4614938734508392e-06, "loss": 0.0149, "num_input_tokens_seen": 6248096, "step": 69385 }, { "epoch": 18.032744282744282, "grad_norm": 0.007750075310468674, "learning_rate": 1.459584387978749e-06, "loss": 0.0154, "num_input_tokens_seen": 6248528, "step": 69390 }, { "epoch": 18.034043659043657, "grad_norm": 2.920208692550659, "learning_rate": 1.4576761132127915e-06, "loss": 0.0105, "num_input_tokens_seen": 6248944, "step": 69395 }, { "epoch": 18.035343035343036, "grad_norm": 0.06088406220078468, "learning_rate": 1.4557690492511016e-06, "loss": 0.3879, "num_input_tokens_seen": 6249392, "step": 69400 }, { "epoch": 18.03664241164241, "grad_norm": 0.29323065280914307, "learning_rate": 1.4538631961917709e-06, "loss": 0.0132, "num_input_tokens_seen": 6249840, "step": 69405 }, { "epoch": 18.037941787941786, "grad_norm": 0.3142942190170288, "learning_rate": 1.451958554132815e-06, "loss": 0.0058, "num_input_tokens_seen": 6250320, "step": 69410 }, { "epoch": 18.039241164241165, "grad_norm": 0.005025065969675779, "learning_rate": 1.4500551231722004e-06, "loss": 0.2512, "num_input_tokens_seen": 6250752, "step": 69415 }, { "epoch": 18.04054054054054, "grad_norm": 0.7855678200721741, "learning_rate": 1.4481529034078067e-06, "loss": 0.0368, "num_input_tokens_seen": 6251264, "step": 69420 }, { "epoch": 18.041839916839916, "grad_norm": 0.08017290383577347, "learning_rate": 1.4462518949374838e-06, "loss": 0.0315, "num_input_tokens_seen": 6251680, "step": 69425 }, { "epoch": 18.043139293139294, "grad_norm": 8.608721733093262, "learning_rate": 1.4443520978589864e-06, "loss": 0.0637, "num_input_tokens_seen": 6252096, "step": 69430 }, { "epoch": 18.04443866943867, "grad_norm": 0.04537945240736008, "learning_rate": 1.4424535122700367e-06, "loss": 0.0449, "num_input_tokens_seen": 6252592, "step": 69435 }, { "epoch": 18.045738045738045, "grad_norm": 0.24954958260059357, "learning_rate": 1.4405561382682703e-06, "loss": 0.0282, "num_input_tokens_seen": 6253024, "step": 69440 }, { "epoch": 18.047037422037423, "grad_norm": 0.9790030121803284, "learning_rate": 1.4386599759512759e-06, "loss": 0.0936, "num_input_tokens_seen": 6253456, "step": 69445 }, { "epoch": 18.0483367983368, "grad_norm": 0.022638166323304176, "learning_rate": 1.4367650254165727e-06, "loss": 0.4387, "num_input_tokens_seen": 6253920, "step": 69450 }, { "epoch": 18.049636174636174, "grad_norm": 0.08081086724996567, "learning_rate": 1.434871286761627e-06, "loss": 0.0002, "num_input_tokens_seen": 6254352, "step": 69455 }, { "epoch": 18.050935550935552, "grad_norm": 0.9458815455436707, "learning_rate": 1.4329787600838308e-06, "loss": 0.0205, "num_input_tokens_seen": 6254816, "step": 69460 }, { "epoch": 18.052234927234927, "grad_norm": 0.018243785947561264, "learning_rate": 1.4310874454805112e-06, "loss": 0.037, "num_input_tokens_seen": 6255264, "step": 69465 }, { "epoch": 18.053534303534303, "grad_norm": 0.0016518209595233202, "learning_rate": 1.429197343048952e-06, "loss": 0.052, "num_input_tokens_seen": 6255728, "step": 69470 }, { "epoch": 18.05483367983368, "grad_norm": 0.2492019534111023, "learning_rate": 1.4273084528863529e-06, "loss": 0.1197, "num_input_tokens_seen": 6256192, "step": 69475 }, { "epoch": 18.056133056133056, "grad_norm": 61.27916717529297, "learning_rate": 1.4254207750898696e-06, "loss": 0.3774, "num_input_tokens_seen": 6256624, "step": 69480 }, { "epoch": 18.05743243243243, "grad_norm": 60.19069290161133, "learning_rate": 1.42353430975658e-06, "loss": 0.1141, "num_input_tokens_seen": 6257136, "step": 69485 }, { "epoch": 18.05873180873181, "grad_norm": 0.0004950676811859012, "learning_rate": 1.4216490569835178e-06, "loss": 0.0031, "num_input_tokens_seen": 6257600, "step": 69490 }, { "epoch": 18.060031185031185, "grad_norm": 0.00863298773765564, "learning_rate": 1.4197650168676301e-06, "loss": 0.0001, "num_input_tokens_seen": 6258048, "step": 69495 }, { "epoch": 18.06133056133056, "grad_norm": 0.14450295269489288, "learning_rate": 1.417882189505826e-06, "loss": 0.0002, "num_input_tokens_seen": 6258528, "step": 69500 }, { "epoch": 18.06262993762994, "grad_norm": 2.1670079231262207, "learning_rate": 1.4160005749949328e-06, "loss": 0.0515, "num_input_tokens_seen": 6259008, "step": 69505 }, { "epoch": 18.063929313929314, "grad_norm": 56.8441047668457, "learning_rate": 1.4141201734317267e-06, "loss": 0.1545, "num_input_tokens_seen": 6259440, "step": 69510 }, { "epoch": 18.06522869022869, "grad_norm": 1.3707869052886963, "learning_rate": 1.4122409849129186e-06, "loss": 0.0191, "num_input_tokens_seen": 6259888, "step": 69515 }, { "epoch": 18.066528066528065, "grad_norm": 0.23220449686050415, "learning_rate": 1.4103630095351622e-06, "loss": 0.0586, "num_input_tokens_seen": 6260304, "step": 69520 }, { "epoch": 18.067827442827443, "grad_norm": 0.5386196374893188, "learning_rate": 1.4084862473950383e-06, "loss": 0.0008, "num_input_tokens_seen": 6260736, "step": 69525 }, { "epoch": 18.06912681912682, "grad_norm": 0.5248870849609375, "learning_rate": 1.406610698589067e-06, "loss": 0.1369, "num_input_tokens_seen": 6261232, "step": 69530 }, { "epoch": 18.070426195426194, "grad_norm": 64.38833618164062, "learning_rate": 1.4047363632137157e-06, "loss": 0.2086, "num_input_tokens_seen": 6261664, "step": 69535 }, { "epoch": 18.071725571725572, "grad_norm": 0.0017285379581153393, "learning_rate": 1.4028632413653792e-06, "loss": 0.0008, "num_input_tokens_seen": 6262160, "step": 69540 }, { "epoch": 18.073024948024948, "grad_norm": 0.8304489850997925, "learning_rate": 1.4009913331403945e-06, "loss": 0.0051, "num_input_tokens_seen": 6262576, "step": 69545 }, { "epoch": 18.074324324324323, "grad_norm": 0.05739198252558708, "learning_rate": 1.3991206386350403e-06, "loss": 0.0047, "num_input_tokens_seen": 6263024, "step": 69550 }, { "epoch": 18.0756237006237, "grad_norm": 5.887348175048828, "learning_rate": 1.3972511579455254e-06, "loss": 0.3761, "num_input_tokens_seen": 6263488, "step": 69555 }, { "epoch": 18.076923076923077, "grad_norm": 0.2614986002445221, "learning_rate": 1.3953828911679955e-06, "loss": 0.0078, "num_input_tokens_seen": 6263920, "step": 69560 }, { "epoch": 18.078222453222452, "grad_norm": 0.03506544232368469, "learning_rate": 1.3935158383985431e-06, "loss": 0.1414, "num_input_tokens_seen": 6264336, "step": 69565 }, { "epoch": 18.07952182952183, "grad_norm": 0.03364519774913788, "learning_rate": 1.3916499997331883e-06, "loss": 0.1511, "num_input_tokens_seen": 6264768, "step": 69570 }, { "epoch": 18.080821205821206, "grad_norm": 0.15985538065433502, "learning_rate": 1.3897853752678964e-06, "loss": 0.2205, "num_input_tokens_seen": 6265232, "step": 69575 }, { "epoch": 18.08212058212058, "grad_norm": 61.67548751831055, "learning_rate": 1.3879219650985625e-06, "loss": 0.1612, "num_input_tokens_seen": 6265680, "step": 69580 }, { "epoch": 18.08341995841996, "grad_norm": 0.006191825494170189, "learning_rate": 1.386059769321027e-06, "loss": 0.0016, "num_input_tokens_seen": 6266128, "step": 69585 }, { "epoch": 18.084719334719335, "grad_norm": 0.4206960201263428, "learning_rate": 1.384198788031063e-06, "loss": 0.0717, "num_input_tokens_seen": 6266592, "step": 69590 }, { "epoch": 18.08601871101871, "grad_norm": 0.01753809303045273, "learning_rate": 1.382339021324383e-06, "loss": 0.0107, "num_input_tokens_seen": 6267072, "step": 69595 }, { "epoch": 18.08731808731809, "grad_norm": 0.26990747451782227, "learning_rate": 1.3804804692966383e-06, "loss": 0.0018, "num_input_tokens_seen": 6267472, "step": 69600 }, { "epoch": 18.088617463617464, "grad_norm": 0.1475623995065689, "learning_rate": 1.3786231320434107e-06, "loss": 0.0058, "num_input_tokens_seen": 6267904, "step": 69605 }, { "epoch": 18.08991683991684, "grad_norm": 0.06971398741006851, "learning_rate": 1.376767009660232e-06, "loss": 0.0001, "num_input_tokens_seen": 6268352, "step": 69610 }, { "epoch": 18.091216216216218, "grad_norm": 0.1413431018590927, "learning_rate": 1.3749121022425593e-06, "loss": 0.1419, "num_input_tokens_seen": 6268800, "step": 69615 }, { "epoch": 18.092515592515593, "grad_norm": 0.016634328290820122, "learning_rate": 1.3730584098857913e-06, "loss": 0.0021, "num_input_tokens_seen": 6269232, "step": 69620 }, { "epoch": 18.093814968814968, "grad_norm": 0.0028006630018353462, "learning_rate": 1.3712059326852683e-06, "loss": 0.0514, "num_input_tokens_seen": 6269648, "step": 69625 }, { "epoch": 18.095114345114347, "grad_norm": 62.49297332763672, "learning_rate": 1.369354670736267e-06, "loss": 0.1489, "num_input_tokens_seen": 6270112, "step": 69630 }, { "epoch": 18.09641372141372, "grad_norm": 0.1908833533525467, "learning_rate": 1.3675046241339918e-06, "loss": 0.022, "num_input_tokens_seen": 6270544, "step": 69635 }, { "epoch": 18.097713097713097, "grad_norm": 27.78299903869629, "learning_rate": 1.3656557929736053e-06, "loss": 0.4692, "num_input_tokens_seen": 6270992, "step": 69640 }, { "epoch": 18.099012474012476, "grad_norm": 0.6963773965835571, "learning_rate": 1.3638081773501788e-06, "loss": 0.0517, "num_input_tokens_seen": 6271440, "step": 69645 }, { "epoch": 18.10031185031185, "grad_norm": 0.0059942579828202724, "learning_rate": 1.36196177735875e-06, "loss": 0.001, "num_input_tokens_seen": 6271888, "step": 69650 }, { "epoch": 18.101611226611226, "grad_norm": 48.47459411621094, "learning_rate": 1.3601165930942738e-06, "loss": 0.313, "num_input_tokens_seen": 6272320, "step": 69655 }, { "epoch": 18.102910602910605, "grad_norm": 56.09260559082031, "learning_rate": 1.3582726246516491e-06, "loss": 0.1928, "num_input_tokens_seen": 6272784, "step": 69660 }, { "epoch": 18.10420997920998, "grad_norm": 0.016124138608574867, "learning_rate": 1.3564298721257223e-06, "loss": 0.0241, "num_input_tokens_seen": 6273264, "step": 69665 }, { "epoch": 18.105509355509355, "grad_norm": 0.0005215846467763186, "learning_rate": 1.3545883356112565e-06, "loss": 0.0775, "num_input_tokens_seen": 6273712, "step": 69670 }, { "epoch": 18.10680873180873, "grad_norm": 0.0015078959986567497, "learning_rate": 1.3527480152029703e-06, "loss": 0.0361, "num_input_tokens_seen": 6274144, "step": 69675 }, { "epoch": 18.10810810810811, "grad_norm": 0.15644600987434387, "learning_rate": 1.3509089109955104e-06, "loss": 0.312, "num_input_tokens_seen": 6274576, "step": 69680 }, { "epoch": 18.109407484407484, "grad_norm": 0.010356614366173744, "learning_rate": 1.3490710230834674e-06, "loss": 0.007, "num_input_tokens_seen": 6275072, "step": 69685 }, { "epoch": 18.11070686070686, "grad_norm": 0.061457883566617966, "learning_rate": 1.3472343515613577e-06, "loss": 0.0031, "num_input_tokens_seen": 6275520, "step": 69690 }, { "epoch": 18.112006237006238, "grad_norm": 5.044044494628906, "learning_rate": 1.3453988965236503e-06, "loss": 0.1496, "num_input_tokens_seen": 6275968, "step": 69695 }, { "epoch": 18.113305613305613, "grad_norm": 1.0676120519638062, "learning_rate": 1.3435646580647415e-06, "loss": 0.0014, "num_input_tokens_seen": 6276480, "step": 69700 }, { "epoch": 18.114604989604988, "grad_norm": 0.06912614405155182, "learning_rate": 1.3417316362789728e-06, "loss": 0.2578, "num_input_tokens_seen": 6276944, "step": 69705 }, { "epoch": 18.115904365904367, "grad_norm": 0.02689843252301216, "learning_rate": 1.3398998312606104e-06, "loss": 0.0008, "num_input_tokens_seen": 6277376, "step": 69710 }, { "epoch": 18.117203742203742, "grad_norm": 0.9535702466964722, "learning_rate": 1.3380692431038732e-06, "loss": 0.0073, "num_input_tokens_seen": 6277808, "step": 69715 }, { "epoch": 18.118503118503117, "grad_norm": 1.1009799242019653, "learning_rate": 1.3362398719029084e-06, "loss": 0.0014, "num_input_tokens_seen": 6278304, "step": 69720 }, { "epoch": 18.119802494802496, "grad_norm": 0.007290659938007593, "learning_rate": 1.3344117177517933e-06, "loss": 0.0123, "num_input_tokens_seen": 6278784, "step": 69725 }, { "epoch": 18.12110187110187, "grad_norm": 4.442997932434082, "learning_rate": 1.3325847807445612e-06, "loss": 0.0076, "num_input_tokens_seen": 6279200, "step": 69730 }, { "epoch": 18.122401247401246, "grad_norm": 0.020130334421992302, "learning_rate": 1.330759060975173e-06, "loss": 0.0628, "num_input_tokens_seen": 6279648, "step": 69735 }, { "epoch": 18.123700623700625, "grad_norm": 0.503629744052887, "learning_rate": 1.3289345585375257e-06, "loss": 0.1726, "num_input_tokens_seen": 6280096, "step": 69740 }, { "epoch": 18.125, "grad_norm": 0.3050929009914398, "learning_rate": 1.3271112735254498e-06, "loss": 0.0383, "num_input_tokens_seen": 6280560, "step": 69745 }, { "epoch": 18.126299376299375, "grad_norm": 0.005075287073850632, "learning_rate": 1.3252892060327288e-06, "loss": 0.1368, "num_input_tokens_seen": 6281040, "step": 69750 }, { "epoch": 18.127598752598754, "grad_norm": 0.0033536646515130997, "learning_rate": 1.3234683561530653e-06, "loss": 0.2447, "num_input_tokens_seen": 6281504, "step": 69755 }, { "epoch": 18.12889812889813, "grad_norm": 0.002963976003229618, "learning_rate": 1.3216487239801095e-06, "loss": 0.0008, "num_input_tokens_seen": 6281936, "step": 69760 }, { "epoch": 18.130197505197504, "grad_norm": 0.04991722106933594, "learning_rate": 1.3198303096074478e-06, "loss": 0.0291, "num_input_tokens_seen": 6282384, "step": 69765 }, { "epoch": 18.131496881496883, "grad_norm": 34.140872955322266, "learning_rate": 1.318013113128605e-06, "loss": 0.2878, "num_input_tokens_seen": 6282832, "step": 69770 }, { "epoch": 18.132796257796258, "grad_norm": 0.12826207280158997, "learning_rate": 1.3161971346370344e-06, "loss": 0.0308, "num_input_tokens_seen": 6283248, "step": 69775 }, { "epoch": 18.134095634095633, "grad_norm": 2.584453582763672, "learning_rate": 1.3143823742261418e-06, "loss": 0.2245, "num_input_tokens_seen": 6283744, "step": 69780 }, { "epoch": 18.135395010395012, "grad_norm": 4.48956298828125, "learning_rate": 1.312568831989258e-06, "loss": 0.0054, "num_input_tokens_seen": 6284192, "step": 69785 }, { "epoch": 18.136694386694387, "grad_norm": 0.008909512311220169, "learning_rate": 1.3107565080196533e-06, "loss": 0.0011, "num_input_tokens_seen": 6284656, "step": 69790 }, { "epoch": 18.137993762993762, "grad_norm": 0.019447920843958855, "learning_rate": 1.3089454024105385e-06, "loss": 0.0001, "num_input_tokens_seen": 6285120, "step": 69795 }, { "epoch": 18.13929313929314, "grad_norm": 0.0005387808778323233, "learning_rate": 1.3071355152550619e-06, "loss": 0.0091, "num_input_tokens_seen": 6285584, "step": 69800 }, { "epoch": 18.140592515592516, "grad_norm": 0.11043766140937805, "learning_rate": 1.3053268466463099e-06, "loss": 0.1979, "num_input_tokens_seen": 6286096, "step": 69805 }, { "epoch": 18.14189189189189, "grad_norm": 1.1997634172439575, "learning_rate": 1.3035193966772996e-06, "loss": 0.0195, "num_input_tokens_seen": 6286528, "step": 69810 }, { "epoch": 18.14319126819127, "grad_norm": 73.2309799194336, "learning_rate": 1.301713165440993e-06, "loss": 0.3654, "num_input_tokens_seen": 6286928, "step": 69815 }, { "epoch": 18.144490644490645, "grad_norm": 0.013480739668011665, "learning_rate": 1.2999081530302826e-06, "loss": 0.3315, "num_input_tokens_seen": 6287376, "step": 69820 }, { "epoch": 18.14579002079002, "grad_norm": 0.008927715942263603, "learning_rate": 1.298104359538005e-06, "loss": 0.0164, "num_input_tokens_seen": 6287856, "step": 69825 }, { "epoch": 18.147089397089395, "grad_norm": 0.2896774113178253, "learning_rate": 1.2963017850569304e-06, "loss": 0.0026, "num_input_tokens_seen": 6288336, "step": 69830 }, { "epoch": 18.148388773388774, "grad_norm": 46.58247756958008, "learning_rate": 1.2945004296797654e-06, "loss": 0.1696, "num_input_tokens_seen": 6288784, "step": 69835 }, { "epoch": 18.14968814968815, "grad_norm": 0.20380789041519165, "learning_rate": 1.292700293499155e-06, "loss": 0.0015, "num_input_tokens_seen": 6289216, "step": 69840 }, { "epoch": 18.150987525987524, "grad_norm": 0.18449901044368744, "learning_rate": 1.2909013766076893e-06, "loss": 0.071, "num_input_tokens_seen": 6289648, "step": 69845 }, { "epoch": 18.152286902286903, "grad_norm": 20.71103286743164, "learning_rate": 1.2891036790978833e-06, "loss": 0.2238, "num_input_tokens_seen": 6290112, "step": 69850 }, { "epoch": 18.15358627858628, "grad_norm": 0.051948294043540955, "learning_rate": 1.2873072010621878e-06, "loss": 0.0006, "num_input_tokens_seen": 6290544, "step": 69855 }, { "epoch": 18.154885654885653, "grad_norm": 1.2682697772979736, "learning_rate": 1.2855119425930096e-06, "loss": 0.2395, "num_input_tokens_seen": 6290992, "step": 69860 }, { "epoch": 18.156185031185032, "grad_norm": 1.0529128313064575, "learning_rate": 1.2837179037826692e-06, "loss": 0.0009, "num_input_tokens_seen": 6291408, "step": 69865 }, { "epoch": 18.157484407484407, "grad_norm": 0.037306394428014755, "learning_rate": 1.2819250847234427e-06, "loss": 0.0348, "num_input_tokens_seen": 6291824, "step": 69870 }, { "epoch": 18.158783783783782, "grad_norm": 21.545515060424805, "learning_rate": 1.2801334855075314e-06, "loss": 0.0165, "num_input_tokens_seen": 6292272, "step": 69875 }, { "epoch": 18.16008316008316, "grad_norm": 17.32509422302246, "learning_rate": 1.2783431062270895e-06, "loss": 0.0114, "num_input_tokens_seen": 6292720, "step": 69880 }, { "epoch": 18.161382536382536, "grad_norm": 0.0895804837346077, "learning_rate": 1.2765539469741849e-06, "loss": 0.0038, "num_input_tokens_seen": 6293216, "step": 69885 }, { "epoch": 18.16268191268191, "grad_norm": 17.765993118286133, "learning_rate": 1.2747660078408442e-06, "loss": 0.1303, "num_input_tokens_seen": 6293664, "step": 69890 }, { "epoch": 18.16398128898129, "grad_norm": 0.14085493981838226, "learning_rate": 1.2729792889190184e-06, "loss": 0.3172, "num_input_tokens_seen": 6294112, "step": 69895 }, { "epoch": 18.165280665280665, "grad_norm": 0.6955633759498596, "learning_rate": 1.2711937903006039e-06, "loss": 0.0191, "num_input_tokens_seen": 6294528, "step": 69900 }, { "epoch": 18.16658004158004, "grad_norm": 0.04021543636918068, "learning_rate": 1.269409512077427e-06, "loss": 0.0008, "num_input_tokens_seen": 6294976, "step": 69905 }, { "epoch": 18.16787941787942, "grad_norm": 0.24632492661476135, "learning_rate": 1.2676264543412558e-06, "loss": 0.0016, "num_input_tokens_seen": 6295408, "step": 69910 }, { "epoch": 18.169178794178794, "grad_norm": 0.07685283571481705, "learning_rate": 1.2658446171837979e-06, "loss": 0.2054, "num_input_tokens_seen": 6295872, "step": 69915 }, { "epoch": 18.17047817047817, "grad_norm": 0.6473953723907471, "learning_rate": 1.2640640006966882e-06, "loss": 0.001, "num_input_tokens_seen": 6296288, "step": 69920 }, { "epoch": 18.171777546777548, "grad_norm": 0.013877661898732185, "learning_rate": 1.2622846049715142e-06, "loss": 0.0019, "num_input_tokens_seen": 6296720, "step": 69925 }, { "epoch": 18.173076923076923, "grad_norm": 0.7055416703224182, "learning_rate": 1.2605064300997837e-06, "loss": 0.0024, "num_input_tokens_seen": 6297216, "step": 69930 }, { "epoch": 18.1743762993763, "grad_norm": 45.409515380859375, "learning_rate": 1.2587294761729596e-06, "loss": 0.2462, "num_input_tokens_seen": 6297664, "step": 69935 }, { "epoch": 18.175675675675677, "grad_norm": 0.1607220470905304, "learning_rate": 1.2569537432824186e-06, "loss": 0.0019, "num_input_tokens_seen": 6298144, "step": 69940 }, { "epoch": 18.176975051975052, "grad_norm": 0.6707534790039062, "learning_rate": 1.2551792315194989e-06, "loss": 0.0292, "num_input_tokens_seen": 6298624, "step": 69945 }, { "epoch": 18.178274428274428, "grad_norm": 0.04613937810063362, "learning_rate": 1.2534059409754606e-06, "loss": 0.0022, "num_input_tokens_seen": 6299056, "step": 69950 }, { "epoch": 18.179573804573806, "grad_norm": 0.08523980528116226, "learning_rate": 1.2516338717415144e-06, "loss": 0.0142, "num_input_tokens_seen": 6299488, "step": 69955 }, { "epoch": 18.18087318087318, "grad_norm": 0.001215635915286839, "learning_rate": 1.2498630239087873e-06, "loss": 0.0016, "num_input_tokens_seen": 6299936, "step": 69960 }, { "epoch": 18.182172557172557, "grad_norm": 0.40056121349334717, "learning_rate": 1.2480933975683617e-06, "loss": 0.0194, "num_input_tokens_seen": 6300400, "step": 69965 }, { "epoch": 18.183471933471935, "grad_norm": 0.029002677649259567, "learning_rate": 1.2463249928112514e-06, "loss": 0.0428, "num_input_tokens_seen": 6300880, "step": 69970 }, { "epoch": 18.18477130977131, "grad_norm": 19.028099060058594, "learning_rate": 1.2445578097284084e-06, "loss": 0.3067, "num_input_tokens_seen": 6301312, "step": 69975 }, { "epoch": 18.186070686070686, "grad_norm": 0.0868784710764885, "learning_rate": 1.2427918484107153e-06, "loss": 0.0058, "num_input_tokens_seen": 6301744, "step": 69980 }, { "epoch": 18.18737006237006, "grad_norm": 0.3843950629234314, "learning_rate": 1.241027108949e-06, "loss": 0.0005, "num_input_tokens_seen": 6302240, "step": 69985 }, { "epoch": 18.18866943866944, "grad_norm": 83.57795715332031, "learning_rate": 1.2392635914340285e-06, "loss": 0.339, "num_input_tokens_seen": 6302672, "step": 69990 }, { "epoch": 18.189968814968815, "grad_norm": 0.5396495461463928, "learning_rate": 1.2375012959564947e-06, "loss": 0.0539, "num_input_tokens_seen": 6303120, "step": 69995 }, { "epoch": 18.19126819126819, "grad_norm": 49.97761154174805, "learning_rate": 1.2357402226070402e-06, "loss": 0.3659, "num_input_tokens_seen": 6303568, "step": 70000 }, { "epoch": 18.19256756756757, "grad_norm": 33.068458557128906, "learning_rate": 1.2339803714762316e-06, "loss": 0.0188, "num_input_tokens_seen": 6304032, "step": 70005 }, { "epoch": 18.193866943866944, "grad_norm": 46.2510871887207, "learning_rate": 1.232221742654585e-06, "loss": 0.2627, "num_input_tokens_seen": 6304480, "step": 70010 }, { "epoch": 18.19516632016632, "grad_norm": 0.03778636455535889, "learning_rate": 1.2304643362325452e-06, "loss": 0.0001, "num_input_tokens_seen": 6304960, "step": 70015 }, { "epoch": 18.196465696465697, "grad_norm": 1.6782113313674927, "learning_rate": 1.2287081523005062e-06, "loss": 0.0368, "num_input_tokens_seen": 6305376, "step": 70020 }, { "epoch": 18.197765072765073, "grad_norm": 0.6222802996635437, "learning_rate": 1.226953190948779e-06, "loss": 0.0006, "num_input_tokens_seen": 6305808, "step": 70025 }, { "epoch": 18.199064449064448, "grad_norm": 0.2134031355381012, "learning_rate": 1.2251994522676307e-06, "loss": 0.0011, "num_input_tokens_seen": 6306256, "step": 70030 }, { "epoch": 18.200363825363826, "grad_norm": 0.5406412482261658, "learning_rate": 1.22344693634725e-06, "loss": 0.0048, "num_input_tokens_seen": 6306704, "step": 70035 }, { "epoch": 18.2016632016632, "grad_norm": 0.015673339366912842, "learning_rate": 1.2216956432777788e-06, "loss": 0.006, "num_input_tokens_seen": 6307136, "step": 70040 }, { "epoch": 18.202962577962577, "grad_norm": 0.20921090245246887, "learning_rate": 1.2199455731492809e-06, "loss": 0.1326, "num_input_tokens_seen": 6307568, "step": 70045 }, { "epoch": 18.204261954261955, "grad_norm": 1.7355461120605469, "learning_rate": 1.218196726051768e-06, "loss": 0.16, "num_input_tokens_seen": 6307984, "step": 70050 }, { "epoch": 18.20556133056133, "grad_norm": 0.2254030555486679, "learning_rate": 1.2164491020751872e-06, "loss": 0.0007, "num_input_tokens_seen": 6308448, "step": 70055 }, { "epoch": 18.206860706860706, "grad_norm": 0.02721722424030304, "learning_rate": 1.2147027013094113e-06, "loss": 0.5243, "num_input_tokens_seen": 6308912, "step": 70060 }, { "epoch": 18.208160083160084, "grad_norm": 0.0038629481568932533, "learning_rate": 1.2129575238442715e-06, "loss": 0.0612, "num_input_tokens_seen": 6309328, "step": 70065 }, { "epoch": 18.20945945945946, "grad_norm": 3.7611303329467773, "learning_rate": 1.2112135697695147e-06, "loss": 0.2224, "num_input_tokens_seen": 6309776, "step": 70070 }, { "epoch": 18.210758835758835, "grad_norm": 3.328760862350464, "learning_rate": 1.2094708391748395e-06, "loss": 0.0334, "num_input_tokens_seen": 6310224, "step": 70075 }, { "epoch": 18.212058212058214, "grad_norm": 123.5582504272461, "learning_rate": 1.2077293321498706e-06, "loss": 0.601, "num_input_tokens_seen": 6310688, "step": 70080 }, { "epoch": 18.21335758835759, "grad_norm": 0.05344264954328537, "learning_rate": 1.2059890487841813e-06, "loss": 0.019, "num_input_tokens_seen": 6311104, "step": 70085 }, { "epoch": 18.214656964656964, "grad_norm": 0.048460882157087326, "learning_rate": 1.2042499891672693e-06, "loss": 0.0643, "num_input_tokens_seen": 6311504, "step": 70090 }, { "epoch": 18.215956340956343, "grad_norm": 4.175328254699707, "learning_rate": 1.202512153388588e-06, "loss": 0.0031, "num_input_tokens_seen": 6311952, "step": 70095 }, { "epoch": 18.217255717255718, "grad_norm": 5.91684627532959, "learning_rate": 1.2007755415375022e-06, "loss": 0.0066, "num_input_tokens_seen": 6312416, "step": 70100 }, { "epoch": 18.218555093555093, "grad_norm": 0.1927555501461029, "learning_rate": 1.1990401537033373e-06, "loss": 0.0014, "num_input_tokens_seen": 6312896, "step": 70105 }, { "epoch": 18.21985446985447, "grad_norm": 0.004940082784742117, "learning_rate": 1.197305989975342e-06, "loss": 0.0003, "num_input_tokens_seen": 6313328, "step": 70110 }, { "epoch": 18.221153846153847, "grad_norm": 0.22041215002536774, "learning_rate": 1.1955730504427055e-06, "loss": 0.1248, "num_input_tokens_seen": 6313760, "step": 70115 }, { "epoch": 18.222453222453222, "grad_norm": 0.11528147011995316, "learning_rate": 1.193841335194551e-06, "loss": 0.0006, "num_input_tokens_seen": 6314256, "step": 70120 }, { "epoch": 18.223752598752597, "grad_norm": 41.81913757324219, "learning_rate": 1.192110844319949e-06, "loss": 0.0233, "num_input_tokens_seen": 6314736, "step": 70125 }, { "epoch": 18.225051975051976, "grad_norm": 0.14494508504867554, "learning_rate": 1.1903815779079035e-06, "loss": 0.0038, "num_input_tokens_seen": 6315152, "step": 70130 }, { "epoch": 18.22635135135135, "grad_norm": 1.2866275310516357, "learning_rate": 1.1886535360473406e-06, "loss": 0.0121, "num_input_tokens_seen": 6315568, "step": 70135 }, { "epoch": 18.227650727650726, "grad_norm": 0.22551997005939484, "learning_rate": 1.1869267188271444e-06, "loss": 0.1103, "num_input_tokens_seen": 6316000, "step": 70140 }, { "epoch": 18.228950103950105, "grad_norm": 0.10298305749893188, "learning_rate": 1.185201126336122e-06, "loss": 0.1418, "num_input_tokens_seen": 6316464, "step": 70145 }, { "epoch": 18.23024948024948, "grad_norm": 62.534568786621094, "learning_rate": 1.183476758663024e-06, "loss": 0.2205, "num_input_tokens_seen": 6316944, "step": 70150 }, { "epoch": 18.231548856548855, "grad_norm": 0.08834351599216461, "learning_rate": 1.181753615896536e-06, "loss": 0.0034, "num_input_tokens_seen": 6317392, "step": 70155 }, { "epoch": 18.232848232848234, "grad_norm": 0.07451734691858292, "learning_rate": 1.1800316981252808e-06, "loss": 0.1742, "num_input_tokens_seen": 6317840, "step": 70160 }, { "epoch": 18.23414760914761, "grad_norm": 50.18865203857422, "learning_rate": 1.1783110054378182e-06, "loss": 0.0743, "num_input_tokens_seen": 6318272, "step": 70165 }, { "epoch": 18.235446985446984, "grad_norm": 0.37151023745536804, "learning_rate": 1.1765915379226471e-06, "loss": 0.1025, "num_input_tokens_seen": 6318704, "step": 70170 }, { "epoch": 18.236746361746363, "grad_norm": 0.004292211029678583, "learning_rate": 1.1748732956682025e-06, "loss": 0.308, "num_input_tokens_seen": 6319168, "step": 70175 }, { "epoch": 18.238045738045738, "grad_norm": 0.028291521593928337, "learning_rate": 1.1731562787628464e-06, "loss": 0.0005, "num_input_tokens_seen": 6319584, "step": 70180 }, { "epoch": 18.239345114345113, "grad_norm": 0.008480893447995186, "learning_rate": 1.1714404872948976e-06, "loss": 0.0127, "num_input_tokens_seen": 6320000, "step": 70185 }, { "epoch": 18.240644490644492, "grad_norm": 0.0016885771183297038, "learning_rate": 1.1697259213525935e-06, "loss": 0.1446, "num_input_tokens_seen": 6320480, "step": 70190 }, { "epoch": 18.241943866943867, "grad_norm": 0.0041724261827766895, "learning_rate": 1.1680125810241166e-06, "loss": 0.3552, "num_input_tokens_seen": 6320960, "step": 70195 }, { "epoch": 18.243243243243242, "grad_norm": 14.116950988769531, "learning_rate": 1.166300466397588e-06, "loss": 0.0115, "num_input_tokens_seen": 6321424, "step": 70200 }, { "epoch": 18.24454261954262, "grad_norm": 0.035704534500837326, "learning_rate": 1.1645895775610677e-06, "loss": 0.4704, "num_input_tokens_seen": 6321872, "step": 70205 }, { "epoch": 18.245841995841996, "grad_norm": 4.475457191467285, "learning_rate": 1.1628799146025383e-06, "loss": 0.3071, "num_input_tokens_seen": 6322320, "step": 70210 }, { "epoch": 18.24714137214137, "grad_norm": 0.036674655973911285, "learning_rate": 1.1611714776099376e-06, "loss": 0.0076, "num_input_tokens_seen": 6322736, "step": 70215 }, { "epoch": 18.24844074844075, "grad_norm": 0.19337183237075806, "learning_rate": 1.1594642666711259e-06, "loss": 0.0343, "num_input_tokens_seen": 6323184, "step": 70220 }, { "epoch": 18.249740124740125, "grad_norm": 33.681793212890625, "learning_rate": 1.1577582818739135e-06, "loss": 0.0259, "num_input_tokens_seen": 6323600, "step": 70225 }, { "epoch": 18.2510395010395, "grad_norm": 0.034161437302827835, "learning_rate": 1.1560535233060304e-06, "loss": 0.0327, "num_input_tokens_seen": 6324064, "step": 70230 }, { "epoch": 18.25233887733888, "grad_norm": 0.005916397087275982, "learning_rate": 1.1543499910551676e-06, "loss": 0.0006, "num_input_tokens_seen": 6324512, "step": 70235 }, { "epoch": 18.253638253638254, "grad_norm": 0.040461599826812744, "learning_rate": 1.1526476852089324e-06, "loss": 0.032, "num_input_tokens_seen": 6324944, "step": 70240 }, { "epoch": 18.25493762993763, "grad_norm": 0.009636208415031433, "learning_rate": 1.1509466058548719e-06, "loss": 0.001, "num_input_tokens_seen": 6325424, "step": 70245 }, { "epoch": 18.256237006237008, "grad_norm": 1.6208837032318115, "learning_rate": 1.1492467530804823e-06, "loss": 0.0025, "num_input_tokens_seen": 6325856, "step": 70250 }, { "epoch": 18.257536382536383, "grad_norm": 0.0852499008178711, "learning_rate": 1.147548126973183e-06, "loss": 0.1453, "num_input_tokens_seen": 6326304, "step": 70255 }, { "epoch": 18.258835758835758, "grad_norm": 0.0016915634041652083, "learning_rate": 1.1458507276203373e-06, "loss": 0.0025, "num_input_tokens_seen": 6326784, "step": 70260 }, { "epoch": 18.260135135135137, "grad_norm": 0.005536708515137434, "learning_rate": 1.1441545551092448e-06, "loss": 0.0002, "num_input_tokens_seen": 6327232, "step": 70265 }, { "epoch": 18.261434511434512, "grad_norm": 0.488339900970459, "learning_rate": 1.142459609527144e-06, "loss": 0.0072, "num_input_tokens_seen": 6327680, "step": 70270 }, { "epoch": 18.262733887733887, "grad_norm": 0.01161368377506733, "learning_rate": 1.1407658909612012e-06, "loss": 0.1993, "num_input_tokens_seen": 6328128, "step": 70275 }, { "epoch": 18.264033264033262, "grad_norm": 0.0095570869743824, "learning_rate": 1.139073399498533e-06, "loss": 0.3501, "num_input_tokens_seen": 6328560, "step": 70280 }, { "epoch": 18.26533264033264, "grad_norm": 51.9534797668457, "learning_rate": 1.1373821352261781e-06, "loss": 0.1448, "num_input_tokens_seen": 6329024, "step": 70285 }, { "epoch": 18.266632016632016, "grad_norm": 0.0018778885714709759, "learning_rate": 1.135692098231128e-06, "loss": 0.0017, "num_input_tokens_seen": 6329488, "step": 70290 }, { "epoch": 18.26793139293139, "grad_norm": 35.64105987548828, "learning_rate": 1.1340032886002966e-06, "loss": 0.0704, "num_input_tokens_seen": 6329936, "step": 70295 }, { "epoch": 18.26923076923077, "grad_norm": 0.01622244156897068, "learning_rate": 1.132315706420542e-06, "loss": 0.0774, "num_input_tokens_seen": 6330352, "step": 70300 }, { "epoch": 18.270530145530145, "grad_norm": 12.521025657653809, "learning_rate": 1.1306293517786614e-06, "loss": 0.043, "num_input_tokens_seen": 6330784, "step": 70305 }, { "epoch": 18.27182952182952, "grad_norm": 0.13560429215431213, "learning_rate": 1.128944224761383e-06, "loss": 0.5393, "num_input_tokens_seen": 6331264, "step": 70310 }, { "epoch": 18.2731288981289, "grad_norm": 1.2790037393569946, "learning_rate": 1.1272603254553786e-06, "loss": 0.0672, "num_input_tokens_seen": 6331712, "step": 70315 }, { "epoch": 18.274428274428274, "grad_norm": 0.02593366429209709, "learning_rate": 1.1255776539472463e-06, "loss": 0.2975, "num_input_tokens_seen": 6332128, "step": 70320 }, { "epoch": 18.27572765072765, "grad_norm": 0.11929723620414734, "learning_rate": 1.1238962103235329e-06, "loss": 0.1249, "num_input_tokens_seen": 6332592, "step": 70325 }, { "epoch": 18.277027027027028, "grad_norm": 38.00364685058594, "learning_rate": 1.1222159946707112e-06, "loss": 0.0231, "num_input_tokens_seen": 6333056, "step": 70330 }, { "epoch": 18.278326403326403, "grad_norm": 0.06206923723220825, "learning_rate": 1.120537007075198e-06, "loss": 0.0008, "num_input_tokens_seen": 6333488, "step": 70335 }, { "epoch": 18.27962577962578, "grad_norm": 71.25473022460938, "learning_rate": 1.1188592476233494e-06, "loss": 0.1928, "num_input_tokens_seen": 6333936, "step": 70340 }, { "epoch": 18.280925155925157, "grad_norm": 0.0012899689609184861, "learning_rate": 1.117182716401452e-06, "loss": 0.4375, "num_input_tokens_seen": 6334384, "step": 70345 }, { "epoch": 18.282224532224532, "grad_norm": 47.49533462524414, "learning_rate": 1.1155074134957312e-06, "loss": 0.0411, "num_input_tokens_seen": 6334816, "step": 70350 }, { "epoch": 18.283523908523907, "grad_norm": 0.28043463826179504, "learning_rate": 1.113833338992351e-06, "loss": 0.0073, "num_input_tokens_seen": 6335232, "step": 70355 }, { "epoch": 18.284823284823286, "grad_norm": 27.717384338378906, "learning_rate": 1.1121604929774044e-06, "loss": 0.0574, "num_input_tokens_seen": 6335664, "step": 70360 }, { "epoch": 18.28612266112266, "grad_norm": 0.005248602479696274, "learning_rate": 1.1104888755369359e-06, "loss": 0.0005, "num_input_tokens_seen": 6336144, "step": 70365 }, { "epoch": 18.287422037422036, "grad_norm": 0.40867552161216736, "learning_rate": 1.1088184867569101e-06, "loss": 0.0077, "num_input_tokens_seen": 6336560, "step": 70370 }, { "epoch": 18.288721413721415, "grad_norm": 58.38132095336914, "learning_rate": 1.107149326723242e-06, "loss": 0.283, "num_input_tokens_seen": 6337056, "step": 70375 }, { "epoch": 18.29002079002079, "grad_norm": 0.06073083356022835, "learning_rate": 1.105481395521779e-06, "loss": 0.0626, "num_input_tokens_seen": 6337488, "step": 70380 }, { "epoch": 18.291320166320165, "grad_norm": 49.41952133178711, "learning_rate": 1.1038146932383004e-06, "loss": 0.0668, "num_input_tokens_seen": 6337920, "step": 70385 }, { "epoch": 18.292619542619544, "grad_norm": 6.736331939697266, "learning_rate": 1.1021492199585314e-06, "loss": 0.0042, "num_input_tokens_seen": 6338368, "step": 70390 }, { "epoch": 18.29391891891892, "grad_norm": 0.040627237409353256, "learning_rate": 1.1004849757681235e-06, "loss": 0.001, "num_input_tokens_seen": 6338816, "step": 70395 }, { "epoch": 18.295218295218294, "grad_norm": 58.73670959472656, "learning_rate": 1.0988219607526745e-06, "loss": 0.0366, "num_input_tokens_seen": 6339280, "step": 70400 }, { "epoch": 18.296517671517673, "grad_norm": 1.0604218244552612, "learning_rate": 1.0971601749977106e-06, "loss": 0.2224, "num_input_tokens_seen": 6339744, "step": 70405 }, { "epoch": 18.29781704781705, "grad_norm": 0.014527623541653156, "learning_rate": 1.0954996185887023e-06, "loss": 0.0507, "num_input_tokens_seen": 6340208, "step": 70410 }, { "epoch": 18.299116424116423, "grad_norm": 4.67802619934082, "learning_rate": 1.0938402916110508e-06, "loss": 0.0772, "num_input_tokens_seen": 6340688, "step": 70415 }, { "epoch": 18.3004158004158, "grad_norm": 0.06770116835832596, "learning_rate": 1.0921821941501043e-06, "loss": 0.0458, "num_input_tokens_seen": 6341152, "step": 70420 }, { "epoch": 18.301715176715177, "grad_norm": 1.9500653743743896, "learning_rate": 1.0905253262911309e-06, "loss": 0.0027, "num_input_tokens_seen": 6341584, "step": 70425 }, { "epoch": 18.303014553014552, "grad_norm": 0.002079951809719205, "learning_rate": 1.088869688119351e-06, "loss": 0.0332, "num_input_tokens_seen": 6342048, "step": 70430 }, { "epoch": 18.304313929313928, "grad_norm": 0.028795091435313225, "learning_rate": 1.0872152797199136e-06, "loss": 0.1127, "num_input_tokens_seen": 6342496, "step": 70435 }, { "epoch": 18.305613305613306, "grad_norm": 0.2556990683078766, "learning_rate": 1.085562101177906e-06, "loss": 0.0078, "num_input_tokens_seen": 6342912, "step": 70440 }, { "epoch": 18.30691268191268, "grad_norm": 0.13180230557918549, "learning_rate": 1.0839101525783491e-06, "loss": 0.0281, "num_input_tokens_seen": 6343360, "step": 70445 }, { "epoch": 18.308212058212057, "grad_norm": 0.07757651805877686, "learning_rate": 1.0822594340062109e-06, "loss": 0.0049, "num_input_tokens_seen": 6343856, "step": 70450 }, { "epoch": 18.309511434511435, "grad_norm": 0.016819484531879425, "learning_rate": 1.0806099455463903e-06, "loss": 0.0005, "num_input_tokens_seen": 6344336, "step": 70455 }, { "epoch": 18.31081081081081, "grad_norm": 1.830656886100769, "learning_rate": 1.0789616872837167e-06, "loss": 0.2529, "num_input_tokens_seen": 6344784, "step": 70460 }, { "epoch": 18.312110187110186, "grad_norm": 0.002187857637181878, "learning_rate": 1.0773146593029637e-06, "loss": 0.2958, "num_input_tokens_seen": 6345248, "step": 70465 }, { "epoch": 18.313409563409564, "grad_norm": 0.1386796087026596, "learning_rate": 1.0756688616888361e-06, "loss": 0.0002, "num_input_tokens_seen": 6345744, "step": 70470 }, { "epoch": 18.31470893970894, "grad_norm": 0.000798584776930511, "learning_rate": 1.0740242945259855e-06, "loss": 0.1951, "num_input_tokens_seen": 6346160, "step": 70475 }, { "epoch": 18.316008316008315, "grad_norm": 0.3458400070667267, "learning_rate": 1.0723809578989884e-06, "loss": 0.0367, "num_input_tokens_seen": 6346592, "step": 70480 }, { "epoch": 18.317307692307693, "grad_norm": 1.0767549276351929, "learning_rate": 1.0707388518923668e-06, "loss": 0.0108, "num_input_tokens_seen": 6346992, "step": 70485 }, { "epoch": 18.31860706860707, "grad_norm": 17.461942672729492, "learning_rate": 1.0690979765905718e-06, "loss": 0.0232, "num_input_tokens_seen": 6347440, "step": 70490 }, { "epoch": 18.319906444906444, "grad_norm": 22.317291259765625, "learning_rate": 1.067458332078e-06, "loss": 0.095, "num_input_tokens_seen": 6347888, "step": 70495 }, { "epoch": 18.321205821205822, "grad_norm": 6.909704685211182, "learning_rate": 1.0658199184389762e-06, "loss": 0.0068, "num_input_tokens_seen": 6348352, "step": 70500 }, { "epoch": 18.322505197505198, "grad_norm": 2.7637109756469727, "learning_rate": 1.0641827357577682e-06, "loss": 0.025, "num_input_tokens_seen": 6348800, "step": 70505 }, { "epoch": 18.323804573804573, "grad_norm": 6.551817417144775, "learning_rate": 1.0625467841185733e-06, "loss": 0.3117, "num_input_tokens_seen": 6349232, "step": 70510 }, { "epoch": 18.32510395010395, "grad_norm": 3.5654826164245605, "learning_rate": 1.060912063605532e-06, "loss": 0.0575, "num_input_tokens_seen": 6349680, "step": 70515 }, { "epoch": 18.326403326403327, "grad_norm": 121.63075256347656, "learning_rate": 1.0592785743027244e-06, "loss": 0.1474, "num_input_tokens_seen": 6350112, "step": 70520 }, { "epoch": 18.3277027027027, "grad_norm": 0.06326597183942795, "learning_rate": 1.0576463162941558e-06, "loss": 0.0252, "num_input_tokens_seen": 6350560, "step": 70525 }, { "epoch": 18.32900207900208, "grad_norm": 0.004476779606193304, "learning_rate": 1.056015289663781e-06, "loss": 0.0007, "num_input_tokens_seen": 6351024, "step": 70530 }, { "epoch": 18.330301455301456, "grad_norm": 0.4707145094871521, "learning_rate": 1.05438549449548e-06, "loss": 0.0234, "num_input_tokens_seen": 6351440, "step": 70535 }, { "epoch": 18.33160083160083, "grad_norm": 1.32179594039917, "learning_rate": 1.0527569308730779e-06, "loss": 0.0013, "num_input_tokens_seen": 6351856, "step": 70540 }, { "epoch": 18.33290020790021, "grad_norm": 0.37831631302833557, "learning_rate": 1.0511295988803294e-06, "loss": 0.0834, "num_input_tokens_seen": 6352304, "step": 70545 }, { "epoch": 18.334199584199585, "grad_norm": 0.14639607071876526, "learning_rate": 1.0495034986009316e-06, "loss": 0.0864, "num_input_tokens_seen": 6352816, "step": 70550 }, { "epoch": 18.33549896049896, "grad_norm": 0.6963945627212524, "learning_rate": 1.0478786301185178e-06, "loss": 0.0054, "num_input_tokens_seen": 6353248, "step": 70555 }, { "epoch": 18.33679833679834, "grad_norm": 0.0064707533456385136, "learning_rate": 1.0462549935166572e-06, "loss": 0.0022, "num_input_tokens_seen": 6353696, "step": 70560 }, { "epoch": 18.338097713097714, "grad_norm": 35.157100677490234, "learning_rate": 1.0446325888788521e-06, "loss": 0.2378, "num_input_tokens_seen": 6354128, "step": 70565 }, { "epoch": 18.33939708939709, "grad_norm": 2.0101852416992188, "learning_rate": 1.0430114162885502e-06, "loss": 0.12, "num_input_tokens_seen": 6354608, "step": 70570 }, { "epoch": 18.340696465696467, "grad_norm": 2.2046005725860596, "learning_rate": 1.0413914758291233e-06, "loss": 0.0073, "num_input_tokens_seen": 6355040, "step": 70575 }, { "epoch": 18.341995841995843, "grad_norm": 0.19856618344783783, "learning_rate": 1.0397727675838882e-06, "loss": 0.0096, "num_input_tokens_seen": 6355504, "step": 70580 }, { "epoch": 18.343295218295218, "grad_norm": 2.242126226425171, "learning_rate": 1.0381552916360948e-06, "loss": 0.3141, "num_input_tokens_seen": 6355952, "step": 70585 }, { "epoch": 18.344594594594593, "grad_norm": 0.003205699846148491, "learning_rate": 1.0365390480689353e-06, "loss": 0.0864, "num_input_tokens_seen": 6356432, "step": 70590 }, { "epoch": 18.34589397089397, "grad_norm": 0.0021393620409071445, "learning_rate": 1.0349240369655373e-06, "loss": 0.1055, "num_input_tokens_seen": 6356864, "step": 70595 }, { "epoch": 18.347193347193347, "grad_norm": 52.035125732421875, "learning_rate": 1.0333102584089537e-06, "loss": 0.2875, "num_input_tokens_seen": 6357296, "step": 70600 }, { "epoch": 18.348492723492722, "grad_norm": 0.02508104406297207, "learning_rate": 1.0316977124821908e-06, "loss": 0.1421, "num_input_tokens_seen": 6357760, "step": 70605 }, { "epoch": 18.3497920997921, "grad_norm": 57.87839889526367, "learning_rate": 1.0300863992681763e-06, "loss": 0.1661, "num_input_tokens_seen": 6358208, "step": 70610 }, { "epoch": 18.351091476091476, "grad_norm": 0.6788372993469238, "learning_rate": 1.0284763188497886e-06, "loss": 0.0009, "num_input_tokens_seen": 6358640, "step": 70615 }, { "epoch": 18.35239085239085, "grad_norm": 0.055337317287921906, "learning_rate": 1.026867471309828e-06, "loss": 0.011, "num_input_tokens_seen": 6359104, "step": 70620 }, { "epoch": 18.35369022869023, "grad_norm": 0.07282332330942154, "learning_rate": 1.0252598567310451e-06, "loss": 0.0003, "num_input_tokens_seen": 6359552, "step": 70625 }, { "epoch": 18.354989604989605, "grad_norm": 32.228092193603516, "learning_rate": 1.023653475196118e-06, "loss": 0.037, "num_input_tokens_seen": 6360032, "step": 70630 }, { "epoch": 18.35628898128898, "grad_norm": 0.42761287093162537, "learning_rate": 1.0220483267876696e-06, "loss": 0.1948, "num_input_tokens_seen": 6360528, "step": 70635 }, { "epoch": 18.35758835758836, "grad_norm": 10.688556671142578, "learning_rate": 1.0204444115882505e-06, "loss": 0.0218, "num_input_tokens_seen": 6360992, "step": 70640 }, { "epoch": 18.358887733887734, "grad_norm": 0.3149932026863098, "learning_rate": 1.0188417296803476e-06, "loss": 0.0009, "num_input_tokens_seen": 6361440, "step": 70645 }, { "epoch": 18.36018711018711, "grad_norm": 0.10788790136575699, "learning_rate": 1.017240281146395e-06, "loss": 0.0037, "num_input_tokens_seen": 6361888, "step": 70650 }, { "epoch": 18.361486486486488, "grad_norm": 0.02664564549922943, "learning_rate": 1.0156400660687515e-06, "loss": 0.0005, "num_input_tokens_seen": 6362320, "step": 70655 }, { "epoch": 18.362785862785863, "grad_norm": 0.6852413415908813, "learning_rate": 1.014041084529721e-06, "loss": 0.2261, "num_input_tokens_seen": 6362768, "step": 70660 }, { "epoch": 18.364085239085238, "grad_norm": 0.0017198032001033425, "learning_rate": 1.0124433366115376e-06, "loss": 0.0003, "num_input_tokens_seen": 6363216, "step": 70665 }, { "epoch": 18.365384615384617, "grad_norm": 0.008311675861477852, "learning_rate": 1.010846822396383e-06, "loss": 0.0001, "num_input_tokens_seen": 6363648, "step": 70670 }, { "epoch": 18.366683991683992, "grad_norm": 0.005873821675777435, "learning_rate": 1.0092515419663578e-06, "loss": 0.0036, "num_input_tokens_seen": 6364112, "step": 70675 }, { "epoch": 18.367983367983367, "grad_norm": 0.03655174374580383, "learning_rate": 1.0076574954035133e-06, "loss": 0.0002, "num_input_tokens_seen": 6364608, "step": 70680 }, { "epoch": 18.369282744282746, "grad_norm": 1.5923007726669312, "learning_rate": 1.0060646827898313e-06, "loss": 0.0665, "num_input_tokens_seen": 6365072, "step": 70685 }, { "epoch": 18.37058212058212, "grad_norm": 11.693238258361816, "learning_rate": 1.0044731042072348e-06, "loss": 0.0258, "num_input_tokens_seen": 6365552, "step": 70690 }, { "epoch": 18.371881496881496, "grad_norm": 0.001327589270658791, "learning_rate": 1.0028827597375751e-06, "loss": 0.0016, "num_input_tokens_seen": 6365984, "step": 70695 }, { "epoch": 18.373180873180875, "grad_norm": 0.006659309845417738, "learning_rate": 1.001293649462648e-06, "loss": 0.0005, "num_input_tokens_seen": 6366448, "step": 70700 }, { "epoch": 18.37448024948025, "grad_norm": 4.8014607429504395, "learning_rate": 9.99705773464185e-07, "loss": 0.0051, "num_input_tokens_seen": 6366912, "step": 70705 }, { "epoch": 18.375779625779625, "grad_norm": 2.8881618976593018, "learning_rate": 9.98119131823849e-07, "loss": 0.0067, "num_input_tokens_seen": 6367376, "step": 70710 }, { "epoch": 18.377079002079004, "grad_norm": 0.0045175873674452305, "learning_rate": 9.965337246232441e-07, "loss": 0.6427, "num_input_tokens_seen": 6367824, "step": 70715 }, { "epoch": 18.37837837837838, "grad_norm": 0.0011404972756281495, "learning_rate": 9.949495519439077e-07, "loss": 0.0025, "num_input_tokens_seen": 6368304, "step": 70720 }, { "epoch": 18.379677754677754, "grad_norm": 78.584228515625, "learning_rate": 9.933666138673164e-07, "loss": 0.2592, "num_input_tokens_seen": 6368736, "step": 70725 }, { "epoch": 18.38097713097713, "grad_norm": 69.2276611328125, "learning_rate": 9.917849104748827e-07, "loss": 0.2021, "num_input_tokens_seen": 6369184, "step": 70730 }, { "epoch": 18.382276507276508, "grad_norm": 16.157230377197266, "learning_rate": 9.902044418479556e-07, "loss": 0.0131, "num_input_tokens_seen": 6369680, "step": 70735 }, { "epoch": 18.383575883575883, "grad_norm": 0.006075077690184116, "learning_rate": 9.886252080678172e-07, "loss": 0.2835, "num_input_tokens_seen": 6370080, "step": 70740 }, { "epoch": 18.38487525987526, "grad_norm": 1.2650350332260132, "learning_rate": 9.87047209215694e-07, "loss": 0.2029, "num_input_tokens_seen": 6370528, "step": 70745 }, { "epoch": 18.386174636174637, "grad_norm": 46.574981689453125, "learning_rate": 9.85470445372738e-07, "loss": 0.2119, "num_input_tokens_seen": 6371008, "step": 70750 }, { "epoch": 18.387474012474012, "grad_norm": 2.4372265338897705, "learning_rate": 9.838949166200484e-07, "loss": 0.0023, "num_input_tokens_seen": 6371456, "step": 70755 }, { "epoch": 18.388773388773387, "grad_norm": 1.1762580871582031, "learning_rate": 9.823206230386517e-07, "loss": 0.0894, "num_input_tokens_seen": 6371904, "step": 70760 }, { "epoch": 18.390072765072766, "grad_norm": 0.07413649559020996, "learning_rate": 9.807475647095194e-07, "loss": 0.3245, "num_input_tokens_seen": 6372352, "step": 70765 }, { "epoch": 18.39137214137214, "grad_norm": 4.810540199279785, "learning_rate": 9.79175741713556e-07, "loss": 0.0116, "num_input_tokens_seen": 6372832, "step": 70770 }, { "epoch": 18.392671517671516, "grad_norm": 55.08564376831055, "learning_rate": 9.776051541315972e-07, "loss": 0.161, "num_input_tokens_seen": 6373280, "step": 70775 }, { "epoch": 18.393970893970895, "grad_norm": 65.84687805175781, "learning_rate": 9.760358020444255e-07, "loss": 0.2516, "num_input_tokens_seen": 6373696, "step": 70780 }, { "epoch": 18.39527027027027, "grad_norm": 0.028590761125087738, "learning_rate": 9.744676855327483e-07, "loss": 0.0093, "num_input_tokens_seen": 6374160, "step": 70785 }, { "epoch": 18.396569646569645, "grad_norm": 0.2888234257698059, "learning_rate": 9.729008046772208e-07, "loss": 0.0005, "num_input_tokens_seen": 6374592, "step": 70790 }, { "epoch": 18.397869022869024, "grad_norm": 0.17759273946285248, "learning_rate": 9.71335159558423e-07, "loss": 0.2119, "num_input_tokens_seen": 6375056, "step": 70795 }, { "epoch": 18.3991683991684, "grad_norm": 0.48071882128715515, "learning_rate": 9.69770750256882e-07, "loss": 0.0105, "num_input_tokens_seen": 6375488, "step": 70800 }, { "epoch": 18.400467775467774, "grad_norm": 71.13553619384766, "learning_rate": 9.682075768530558e-07, "loss": 0.3382, "num_input_tokens_seen": 6375952, "step": 70805 }, { "epoch": 18.401767151767153, "grad_norm": 0.007081173826009035, "learning_rate": 9.666456394273438e-07, "loss": 0.2138, "num_input_tokens_seen": 6376400, "step": 70810 }, { "epoch": 18.403066528066528, "grad_norm": 2.22737717628479, "learning_rate": 9.650849380600708e-07, "loss": 0.015, "num_input_tokens_seen": 6376848, "step": 70815 }, { "epoch": 18.404365904365903, "grad_norm": 0.0015942627796903253, "learning_rate": 9.635254728315113e-07, "loss": 0.0003, "num_input_tokens_seen": 6377296, "step": 70820 }, { "epoch": 18.405665280665282, "grad_norm": 0.004246928729116917, "learning_rate": 9.619672438218624e-07, "loss": 0.0561, "num_input_tokens_seen": 6377744, "step": 70825 }, { "epoch": 18.406964656964657, "grad_norm": 0.018730217590928078, "learning_rate": 9.604102511112766e-07, "loss": 0.0005, "num_input_tokens_seen": 6378208, "step": 70830 }, { "epoch": 18.408264033264032, "grad_norm": 53.13871765136719, "learning_rate": 9.588544947798206e-07, "loss": 0.0352, "num_input_tokens_seen": 6378656, "step": 70835 }, { "epoch": 18.40956340956341, "grad_norm": 1.2289378643035889, "learning_rate": 9.572999749075135e-07, "loss": 0.0272, "num_input_tokens_seen": 6379072, "step": 70840 }, { "epoch": 18.410862785862786, "grad_norm": 0.19222615659236908, "learning_rate": 9.55746691574308e-07, "loss": 0.0343, "num_input_tokens_seen": 6379520, "step": 70845 }, { "epoch": 18.41216216216216, "grad_norm": 1.1693811416625977, "learning_rate": 9.541946448600846e-07, "loss": 0.0029, "num_input_tokens_seen": 6379968, "step": 70850 }, { "epoch": 18.41346153846154, "grad_norm": 68.36740112304688, "learning_rate": 9.526438348446742e-07, "loss": 0.0447, "num_input_tokens_seen": 6380416, "step": 70855 }, { "epoch": 18.414760914760915, "grad_norm": 0.0752691924571991, "learning_rate": 9.510942616078294e-07, "loss": 0.0008, "num_input_tokens_seen": 6380864, "step": 70860 }, { "epoch": 18.41606029106029, "grad_norm": 2.329423427581787, "learning_rate": 9.495459252292504e-07, "loss": 0.2471, "num_input_tokens_seen": 6381312, "step": 70865 }, { "epoch": 18.41735966735967, "grad_norm": 0.6310350894927979, "learning_rate": 9.47998825788568e-07, "loss": 0.2158, "num_input_tokens_seen": 6381744, "step": 70870 }, { "epoch": 18.418659043659044, "grad_norm": 4.130654811859131, "learning_rate": 9.46452963365349e-07, "loss": 0.0048, "num_input_tokens_seen": 6382192, "step": 70875 }, { "epoch": 18.41995841995842, "grad_norm": 1.404046654701233, "learning_rate": 9.44908338039105e-07, "loss": 0.1715, "num_input_tokens_seen": 6382672, "step": 70880 }, { "epoch": 18.421257796257795, "grad_norm": 45.33441925048828, "learning_rate": 9.433649498892721e-07, "loss": 0.333, "num_input_tokens_seen": 6383152, "step": 70885 }, { "epoch": 18.422557172557173, "grad_norm": 4.256516933441162, "learning_rate": 9.418227989952288e-07, "loss": 0.41, "num_input_tokens_seen": 6383616, "step": 70890 }, { "epoch": 18.42385654885655, "grad_norm": 0.7326321601867676, "learning_rate": 9.402818854362949e-07, "loss": 0.0006, "num_input_tokens_seen": 6384080, "step": 70895 }, { "epoch": 18.425155925155924, "grad_norm": 0.011886716820299625, "learning_rate": 9.387422092917153e-07, "loss": 0.1908, "num_input_tokens_seen": 6384512, "step": 70900 }, { "epoch": 18.426455301455302, "grad_norm": 0.7415727376937866, "learning_rate": 9.372037706406739e-07, "loss": 0.005, "num_input_tokens_seen": 6385008, "step": 70905 }, { "epoch": 18.427754677754677, "grad_norm": 7.7499799728393555, "learning_rate": 9.356665695623018e-07, "loss": 0.0187, "num_input_tokens_seen": 6385440, "step": 70910 }, { "epoch": 18.429054054054053, "grad_norm": 0.25324076414108276, "learning_rate": 9.341306061356525e-07, "loss": 0.072, "num_input_tokens_seen": 6385856, "step": 70915 }, { "epoch": 18.43035343035343, "grad_norm": 62.66370391845703, "learning_rate": 9.325958804397295e-07, "loss": 0.1141, "num_input_tokens_seen": 6386304, "step": 70920 }, { "epoch": 18.431652806652806, "grad_norm": 0.005016066133975983, "learning_rate": 9.310623925534556e-07, "loss": 0.0005, "num_input_tokens_seen": 6386752, "step": 70925 }, { "epoch": 18.43295218295218, "grad_norm": 0.0068556759506464005, "learning_rate": 9.295301425557095e-07, "loss": 0.0059, "num_input_tokens_seen": 6387216, "step": 70930 }, { "epoch": 18.43425155925156, "grad_norm": 12.775571823120117, "learning_rate": 9.279991305252866e-07, "loss": 0.2171, "num_input_tokens_seen": 6387680, "step": 70935 }, { "epoch": 18.435550935550935, "grad_norm": 9.154572486877441, "learning_rate": 9.264693565409377e-07, "loss": 0.1918, "num_input_tokens_seen": 6388176, "step": 70940 }, { "epoch": 18.43685031185031, "grad_norm": 0.0022504497319459915, "learning_rate": 9.249408206813332e-07, "loss": 0.0642, "num_input_tokens_seen": 6388592, "step": 70945 }, { "epoch": 18.43814968814969, "grad_norm": 0.005419282708317041, "learning_rate": 9.234135230250879e-07, "loss": 0.0016, "num_input_tokens_seen": 6389040, "step": 70950 }, { "epoch": 18.439449064449065, "grad_norm": 0.05383562296628952, "learning_rate": 9.218874636507558e-07, "loss": 0.0038, "num_input_tokens_seen": 6389504, "step": 70955 }, { "epoch": 18.44074844074844, "grad_norm": 77.98206329345703, "learning_rate": 9.20362642636824e-07, "loss": 0.0786, "num_input_tokens_seen": 6389984, "step": 70960 }, { "epoch": 18.44204781704782, "grad_norm": 0.06914496421813965, "learning_rate": 9.188390600617158e-07, "loss": 0.0428, "num_input_tokens_seen": 6390416, "step": 70965 }, { "epoch": 18.443347193347194, "grad_norm": 0.048674486577510834, "learning_rate": 9.173167160037827e-07, "loss": 0.025, "num_input_tokens_seen": 6390848, "step": 70970 }, { "epoch": 18.44464656964657, "grad_norm": 3.122164011001587, "learning_rate": 9.157956105413257e-07, "loss": 0.0784, "num_input_tokens_seen": 6391280, "step": 70975 }, { "epoch": 18.445945945945947, "grad_norm": 5.961615562438965, "learning_rate": 9.142757437525795e-07, "loss": 0.0049, "num_input_tokens_seen": 6391728, "step": 70980 }, { "epoch": 18.447245322245323, "grad_norm": 0.0011907829903066158, "learning_rate": 9.127571157157094e-07, "loss": 0.1855, "num_input_tokens_seen": 6392192, "step": 70985 }, { "epoch": 18.448544698544698, "grad_norm": 0.06501064449548721, "learning_rate": 9.112397265088196e-07, "loss": 0.0026, "num_input_tokens_seen": 6392656, "step": 70990 }, { "epoch": 18.449844074844076, "grad_norm": 70.25933074951172, "learning_rate": 9.097235762099532e-07, "loss": 0.3468, "num_input_tokens_seen": 6393072, "step": 70995 }, { "epoch": 18.45114345114345, "grad_norm": 0.004258079454302788, "learning_rate": 9.082086648970811e-07, "loss": 0.0013, "num_input_tokens_seen": 6393520, "step": 71000 }, { "epoch": 18.452442827442827, "grad_norm": 0.14096814393997192, "learning_rate": 9.066949926481271e-07, "loss": 0.2184, "num_input_tokens_seen": 6393968, "step": 71005 }, { "epoch": 18.453742203742205, "grad_norm": 0.1436227709054947, "learning_rate": 9.051825595409291e-07, "loss": 0.0612, "num_input_tokens_seen": 6394416, "step": 71010 }, { "epoch": 18.45504158004158, "grad_norm": 0.8098892569541931, "learning_rate": 9.036713656532802e-07, "loss": 0.423, "num_input_tokens_seen": 6394880, "step": 71015 }, { "epoch": 18.456340956340956, "grad_norm": 0.0031807743944227695, "learning_rate": 9.02161411062899e-07, "loss": 0.0059, "num_input_tokens_seen": 6395328, "step": 71020 }, { "epoch": 18.45764033264033, "grad_norm": 0.38142552971839905, "learning_rate": 9.006526958474509e-07, "loss": 0.1615, "num_input_tokens_seen": 6395808, "step": 71025 }, { "epoch": 18.45893970893971, "grad_norm": 0.04048150032758713, "learning_rate": 8.991452200845268e-07, "loss": 0.2051, "num_input_tokens_seen": 6396256, "step": 71030 }, { "epoch": 18.460239085239085, "grad_norm": 0.05777010694146156, "learning_rate": 8.976389838516508e-07, "loss": 0.0924, "num_input_tokens_seen": 6396720, "step": 71035 }, { "epoch": 18.46153846153846, "grad_norm": 0.001600300776772201, "learning_rate": 8.961339872262997e-07, "loss": 0.0021, "num_input_tokens_seen": 6397152, "step": 71040 }, { "epoch": 18.46283783783784, "grad_norm": 0.1542394459247589, "learning_rate": 8.946302302858672e-07, "loss": 0.0031, "num_input_tokens_seen": 6397552, "step": 71045 }, { "epoch": 18.464137214137214, "grad_norm": 63.954837799072266, "learning_rate": 8.931277131077026e-07, "loss": 0.4659, "num_input_tokens_seen": 6398000, "step": 71050 }, { "epoch": 18.46543659043659, "grad_norm": 0.12542836368083954, "learning_rate": 8.916264357690746e-07, "loss": 0.0533, "num_input_tokens_seen": 6398432, "step": 71055 }, { "epoch": 18.466735966735968, "grad_norm": 0.010244663804769516, "learning_rate": 8.901263983472047e-07, "loss": 0.0027, "num_input_tokens_seen": 6398912, "step": 71060 }, { "epoch": 18.468035343035343, "grad_norm": 0.0020004939287900925, "learning_rate": 8.886276009192284e-07, "loss": 0.0798, "num_input_tokens_seen": 6399360, "step": 71065 }, { "epoch": 18.469334719334718, "grad_norm": 0.24813823401927948, "learning_rate": 8.871300435622427e-07, "loss": 0.0124, "num_input_tokens_seen": 6399840, "step": 71070 }, { "epoch": 18.470634095634097, "grad_norm": 0.01151161827147007, "learning_rate": 8.856337263532605e-07, "loss": 0.0101, "num_input_tokens_seen": 6400288, "step": 71075 }, { "epoch": 18.471933471933472, "grad_norm": 0.8641865849494934, "learning_rate": 8.841386493692428e-07, "loss": 0.3132, "num_input_tokens_seen": 6400720, "step": 71080 }, { "epoch": 18.473232848232847, "grad_norm": 7.039710998535156, "learning_rate": 8.826448126870779e-07, "loss": 0.0082, "num_input_tokens_seen": 6401184, "step": 71085 }, { "epoch": 18.474532224532226, "grad_norm": 46.48173141479492, "learning_rate": 8.811522163835961e-07, "loss": 0.1417, "num_input_tokens_seen": 6401632, "step": 71090 }, { "epoch": 18.4758316008316, "grad_norm": 0.5219446420669556, "learning_rate": 8.796608605355722e-07, "loss": 0.0012, "num_input_tokens_seen": 6402048, "step": 71095 }, { "epoch": 18.477130977130976, "grad_norm": 0.24783669412136078, "learning_rate": 8.781707452196947e-07, "loss": 0.0144, "num_input_tokens_seen": 6402496, "step": 71100 }, { "epoch": 18.478430353430355, "grad_norm": 1.2896476984024048, "learning_rate": 8.766818705126134e-07, "loss": 0.0611, "num_input_tokens_seen": 6402944, "step": 71105 }, { "epoch": 18.47972972972973, "grad_norm": 0.013843035325407982, "learning_rate": 8.751942364908949e-07, "loss": 0.0871, "num_input_tokens_seen": 6403376, "step": 71110 }, { "epoch": 18.481029106029105, "grad_norm": 0.05140169709920883, "learning_rate": 8.737078432310531e-07, "loss": 0.0185, "num_input_tokens_seen": 6403808, "step": 71115 }, { "epoch": 18.482328482328484, "grad_norm": 24.072185516357422, "learning_rate": 8.722226908095321e-07, "loss": 0.1198, "num_input_tokens_seen": 6404256, "step": 71120 }, { "epoch": 18.48362785862786, "grad_norm": 0.04162648692727089, "learning_rate": 8.707387793027155e-07, "loss": 0.0669, "num_input_tokens_seen": 6404688, "step": 71125 }, { "epoch": 18.484927234927234, "grad_norm": 0.008868691511452198, "learning_rate": 8.692561087869256e-07, "loss": 0.05, "num_input_tokens_seen": 6405168, "step": 71130 }, { "epoch": 18.486226611226613, "grad_norm": 0.004824243951588869, "learning_rate": 8.677746793384151e-07, "loss": 0.1826, "num_input_tokens_seen": 6405600, "step": 71135 }, { "epoch": 18.487525987525988, "grad_norm": 0.0019925215747207403, "learning_rate": 8.662944910333731e-07, "loss": 0.4008, "num_input_tokens_seen": 6406032, "step": 71140 }, { "epoch": 18.488825363825363, "grad_norm": 1.8962823152542114, "learning_rate": 8.648155439479305e-07, "loss": 0.0196, "num_input_tokens_seen": 6406496, "step": 71145 }, { "epoch": 18.49012474012474, "grad_norm": 28.451435089111328, "learning_rate": 8.633378381581487e-07, "loss": 0.022, "num_input_tokens_seen": 6406944, "step": 71150 }, { "epoch": 18.491424116424117, "grad_norm": 0.0914309099316597, "learning_rate": 8.618613737400305e-07, "loss": 0.003, "num_input_tokens_seen": 6407376, "step": 71155 }, { "epoch": 18.492723492723492, "grad_norm": 11.798108100891113, "learning_rate": 8.6038615076951e-07, "loss": 0.0112, "num_input_tokens_seen": 6407792, "step": 71160 }, { "epoch": 18.49402286902287, "grad_norm": 0.008522682823240757, "learning_rate": 8.589121693224567e-07, "loss": 0.0049, "num_input_tokens_seen": 6408224, "step": 71165 }, { "epoch": 18.495322245322246, "grad_norm": 0.008085537701845169, "learning_rate": 8.574394294746824e-07, "loss": 0.0571, "num_input_tokens_seen": 6408656, "step": 71170 }, { "epoch": 18.49662162162162, "grad_norm": 8.098100662231445, "learning_rate": 8.559679313019292e-07, "loss": 0.0045, "num_input_tokens_seen": 6409104, "step": 71175 }, { "epoch": 18.497920997920996, "grad_norm": 0.2519460916519165, "learning_rate": 8.544976748798838e-07, "loss": 0.5227, "num_input_tokens_seen": 6409552, "step": 71180 }, { "epoch": 18.499220374220375, "grad_norm": 0.5158796310424805, "learning_rate": 8.530286602841525e-07, "loss": 0.0143, "num_input_tokens_seen": 6410016, "step": 71185 }, { "epoch": 18.50051975051975, "grad_norm": 0.07119707763195038, "learning_rate": 8.515608875902997e-07, "loss": 0.0005, "num_input_tokens_seen": 6410448, "step": 71190 }, { "epoch": 18.501819126819125, "grad_norm": 0.011608180589973927, "learning_rate": 8.500943568737984e-07, "loss": 0.1381, "num_input_tokens_seen": 6410912, "step": 71195 }, { "epoch": 18.503118503118504, "grad_norm": 46.35358810424805, "learning_rate": 8.486290682100939e-07, "loss": 0.1805, "num_input_tokens_seen": 6411360, "step": 71200 }, { "epoch": 18.50441787941788, "grad_norm": 20.267549514770508, "learning_rate": 8.471650216745314e-07, "loss": 0.0109, "num_input_tokens_seen": 6411824, "step": 71205 }, { "epoch": 18.505717255717254, "grad_norm": 0.004730201326310635, "learning_rate": 8.457022173424173e-07, "loss": 0.0541, "num_input_tokens_seen": 6412256, "step": 71210 }, { "epoch": 18.507016632016633, "grad_norm": 0.663939356803894, "learning_rate": 8.442406552889776e-07, "loss": 0.0008, "num_input_tokens_seen": 6412704, "step": 71215 }, { "epoch": 18.508316008316008, "grad_norm": 0.05667337775230408, "learning_rate": 8.42780335589391e-07, "loss": 0.109, "num_input_tokens_seen": 6413136, "step": 71220 }, { "epoch": 18.509615384615383, "grad_norm": 3.656661033630371, "learning_rate": 8.413212583187558e-07, "loss": 0.0829, "num_input_tokens_seen": 6413568, "step": 71225 }, { "epoch": 18.510914760914762, "grad_norm": 7.161932945251465, "learning_rate": 8.398634235521147e-07, "loss": 0.0088, "num_input_tokens_seen": 6414000, "step": 71230 }, { "epoch": 18.512214137214137, "grad_norm": 3.410964012145996, "learning_rate": 8.384068313644494e-07, "loss": 0.0127, "num_input_tokens_seen": 6414432, "step": 71235 }, { "epoch": 18.513513513513512, "grad_norm": 0.011030678637325764, "learning_rate": 8.369514818306667e-07, "loss": 0.2007, "num_input_tokens_seen": 6414896, "step": 71240 }, { "epoch": 18.51481288981289, "grad_norm": 0.13629195094108582, "learning_rate": 8.354973750256262e-07, "loss": 0.0664, "num_input_tokens_seen": 6415376, "step": 71245 }, { "epoch": 18.516112266112266, "grad_norm": 0.010855629108846188, "learning_rate": 8.340445110241069e-07, "loss": 0.0003, "num_input_tokens_seen": 6415872, "step": 71250 }, { "epoch": 18.51741164241164, "grad_norm": 0.17045068740844727, "learning_rate": 8.32592889900835e-07, "loss": 0.1041, "num_input_tokens_seen": 6416352, "step": 71255 }, { "epoch": 18.51871101871102, "grad_norm": 79.86307525634766, "learning_rate": 8.311425117304649e-07, "loss": 0.1603, "num_input_tokens_seen": 6416816, "step": 71260 }, { "epoch": 18.520010395010395, "grad_norm": 64.08960723876953, "learning_rate": 8.296933765875897e-07, "loss": 0.2846, "num_input_tokens_seen": 6417248, "step": 71265 }, { "epoch": 18.52130977130977, "grad_norm": 55.41666793823242, "learning_rate": 8.282454845467468e-07, "loss": 0.2464, "num_input_tokens_seen": 6417696, "step": 71270 }, { "epoch": 18.52260914760915, "grad_norm": 0.024725841358304024, "learning_rate": 8.267988356823992e-07, "loss": 0.2526, "num_input_tokens_seen": 6418128, "step": 71275 }, { "epoch": 18.523908523908524, "grad_norm": 67.76478576660156, "learning_rate": 8.253534300689481e-07, "loss": 0.0497, "num_input_tokens_seen": 6418592, "step": 71280 }, { "epoch": 18.5252079002079, "grad_norm": 82.26382446289062, "learning_rate": 8.239092677807342e-07, "loss": 0.3619, "num_input_tokens_seen": 6419040, "step": 71285 }, { "epoch": 18.526507276507278, "grad_norm": 1.4062870740890503, "learning_rate": 8.224663488920342e-07, "loss": 0.01, "num_input_tokens_seen": 6419472, "step": 71290 }, { "epoch": 18.527806652806653, "grad_norm": 0.019792212173342705, "learning_rate": 8.210246734770499e-07, "loss": 0.0489, "num_input_tokens_seen": 6419936, "step": 71295 }, { "epoch": 18.52910602910603, "grad_norm": 9.68960189819336, "learning_rate": 8.195842416099359e-07, "loss": 0.0095, "num_input_tokens_seen": 6420416, "step": 71300 }, { "epoch": 18.530405405405407, "grad_norm": 0.002259644214063883, "learning_rate": 8.181450533647717e-07, "loss": 0.0005, "num_input_tokens_seen": 6420864, "step": 71305 }, { "epoch": 18.531704781704782, "grad_norm": 2.763698101043701, "learning_rate": 8.167071088155787e-07, "loss": 0.0853, "num_input_tokens_seen": 6421280, "step": 71310 }, { "epoch": 18.533004158004157, "grad_norm": 8.022553443908691, "learning_rate": 8.152704080363089e-07, "loss": 0.0246, "num_input_tokens_seen": 6421728, "step": 71315 }, { "epoch": 18.534303534303533, "grad_norm": 0.7358372807502747, "learning_rate": 8.138349511008586e-07, "loss": 0.0887, "num_input_tokens_seen": 6422144, "step": 71320 }, { "epoch": 18.53560291060291, "grad_norm": 14.139294624328613, "learning_rate": 8.124007380830467e-07, "loss": 0.0202, "num_input_tokens_seen": 6422576, "step": 71325 }, { "epoch": 18.536902286902286, "grad_norm": 0.3962317407131195, "learning_rate": 8.10967769056642e-07, "loss": 0.0161, "num_input_tokens_seen": 6422992, "step": 71330 }, { "epoch": 18.53820166320166, "grad_norm": 0.10737685859203339, "learning_rate": 8.095360440953409e-07, "loss": 0.0048, "num_input_tokens_seen": 6423440, "step": 71335 }, { "epoch": 18.53950103950104, "grad_norm": 0.03839339315891266, "learning_rate": 8.081055632727791e-07, "loss": 0.1916, "num_input_tokens_seen": 6423888, "step": 71340 }, { "epoch": 18.540800415800415, "grad_norm": 16.609113693237305, "learning_rate": 8.066763266625282e-07, "loss": 0.0864, "num_input_tokens_seen": 6424320, "step": 71345 }, { "epoch": 18.54209979209979, "grad_norm": 81.74089050292969, "learning_rate": 8.052483343380962e-07, "loss": 0.3039, "num_input_tokens_seen": 6424752, "step": 71350 }, { "epoch": 18.54339916839917, "grad_norm": 0.003469527233392, "learning_rate": 8.038215863729242e-07, "loss": 0.434, "num_input_tokens_seen": 6425184, "step": 71355 }, { "epoch": 18.544698544698544, "grad_norm": 0.03987244889140129, "learning_rate": 8.023960828403898e-07, "loss": 0.0008, "num_input_tokens_seen": 6425616, "step": 71360 }, { "epoch": 18.54599792099792, "grad_norm": 0.005172623321413994, "learning_rate": 8.009718238138148e-07, "loss": 0.1088, "num_input_tokens_seen": 6426112, "step": 71365 }, { "epoch": 18.5472972972973, "grad_norm": 0.01594318263232708, "learning_rate": 7.995488093664405e-07, "loss": 0.0006, "num_input_tokens_seen": 6426592, "step": 71370 }, { "epoch": 18.548596673596673, "grad_norm": 124.79493713378906, "learning_rate": 7.981270395714585e-07, "loss": 0.3844, "num_input_tokens_seen": 6427040, "step": 71375 }, { "epoch": 18.54989604989605, "grad_norm": 0.0012133781565353274, "learning_rate": 7.967065145019908e-07, "loss": 0.0282, "num_input_tokens_seen": 6427488, "step": 71380 }, { "epoch": 18.551195426195427, "grad_norm": 1.074040412902832, "learning_rate": 7.952872342311013e-07, "loss": 0.0023, "num_input_tokens_seen": 6427904, "step": 71385 }, { "epoch": 18.552494802494802, "grad_norm": 33.33407211303711, "learning_rate": 7.938691988317787e-07, "loss": 0.1887, "num_input_tokens_seen": 6428384, "step": 71390 }, { "epoch": 18.553794178794178, "grad_norm": 0.3716579079627991, "learning_rate": 7.924524083769591e-07, "loss": 0.0019, "num_input_tokens_seen": 6428832, "step": 71395 }, { "epoch": 18.555093555093556, "grad_norm": 0.4278239607810974, "learning_rate": 7.910368629395038e-07, "loss": 0.0039, "num_input_tokens_seen": 6429280, "step": 71400 }, { "epoch": 18.55639293139293, "grad_norm": 0.03162777051329613, "learning_rate": 7.896225625922238e-07, "loss": 0.0454, "num_input_tokens_seen": 6429744, "step": 71405 }, { "epoch": 18.557692307692307, "grad_norm": 58.18596267700195, "learning_rate": 7.882095074078472e-07, "loss": 0.1475, "num_input_tokens_seen": 6430192, "step": 71410 }, { "epoch": 18.558991683991685, "grad_norm": 0.14379392564296722, "learning_rate": 7.867976974590546e-07, "loss": 0.0049, "num_input_tokens_seen": 6430608, "step": 71415 }, { "epoch": 18.56029106029106, "grad_norm": 0.004318814259022474, "learning_rate": 7.853871328184576e-07, "loss": 0.0004, "num_input_tokens_seen": 6431056, "step": 71420 }, { "epoch": 18.561590436590436, "grad_norm": 8.039773941040039, "learning_rate": 7.839778135586007e-07, "loss": 0.0383, "num_input_tokens_seen": 6431504, "step": 71425 }, { "epoch": 18.562889812889814, "grad_norm": 0.006148593034595251, "learning_rate": 7.825697397519705e-07, "loss": 0.0443, "num_input_tokens_seen": 6431936, "step": 71430 }, { "epoch": 18.56418918918919, "grad_norm": 0.007054498884826899, "learning_rate": 7.811629114709812e-07, "loss": 0.0034, "num_input_tokens_seen": 6432400, "step": 71435 }, { "epoch": 18.565488565488565, "grad_norm": 1.1592786312103271, "learning_rate": 7.797573287879889e-07, "loss": 0.0005, "num_input_tokens_seen": 6432880, "step": 71440 }, { "epoch": 18.566787941787943, "grad_norm": 0.06983933597803116, "learning_rate": 7.783529917752802e-07, "loss": 0.1034, "num_input_tokens_seen": 6433280, "step": 71445 }, { "epoch": 18.56808731808732, "grad_norm": 0.11784971505403519, "learning_rate": 7.76949900505089e-07, "loss": 0.001, "num_input_tokens_seen": 6433744, "step": 71450 }, { "epoch": 18.569386694386694, "grad_norm": 0.012610243633389473, "learning_rate": 7.755480550495714e-07, "loss": 0.0135, "num_input_tokens_seen": 6434208, "step": 71455 }, { "epoch": 18.570686070686072, "grad_norm": 0.06662692129611969, "learning_rate": 7.741474554808309e-07, "loss": 0.1497, "num_input_tokens_seen": 6434656, "step": 71460 }, { "epoch": 18.571985446985448, "grad_norm": 18.102869033813477, "learning_rate": 7.72748101870896e-07, "loss": 0.0349, "num_input_tokens_seen": 6435104, "step": 71465 }, { "epoch": 18.573284823284823, "grad_norm": 40.49265670776367, "learning_rate": 7.713499942917452e-07, "loss": 0.0881, "num_input_tokens_seen": 6435520, "step": 71470 }, { "epoch": 18.5745841995842, "grad_norm": 0.010971608571708202, "learning_rate": 7.699531328152737e-07, "loss": 0.2022, "num_input_tokens_seen": 6435936, "step": 71475 }, { "epoch": 18.575883575883577, "grad_norm": 0.08143887668848038, "learning_rate": 7.685575175133269e-07, "loss": 0.0989, "num_input_tokens_seen": 6436400, "step": 71480 }, { "epoch": 18.57718295218295, "grad_norm": 26.103919982910156, "learning_rate": 7.671631484576892e-07, "loss": 0.0145, "num_input_tokens_seen": 6436848, "step": 71485 }, { "epoch": 18.578482328482327, "grad_norm": 0.02341819368302822, "learning_rate": 7.657700257200668e-07, "loss": 0.0169, "num_input_tokens_seen": 6437328, "step": 71490 }, { "epoch": 18.579781704781706, "grad_norm": 0.01915629953145981, "learning_rate": 7.643781493721164e-07, "loss": 0.0012, "num_input_tokens_seen": 6437760, "step": 71495 }, { "epoch": 18.58108108108108, "grad_norm": 0.4441162049770355, "learning_rate": 7.629875194854141e-07, "loss": 0.0098, "num_input_tokens_seen": 6438192, "step": 71500 }, { "epoch": 18.582380457380456, "grad_norm": 2.422687530517578, "learning_rate": 7.615981361314889e-07, "loss": 0.1017, "num_input_tokens_seen": 6438640, "step": 71505 }, { "epoch": 18.583679833679835, "grad_norm": 87.29119873046875, "learning_rate": 7.602099993817946e-07, "loss": 0.2906, "num_input_tokens_seen": 6439120, "step": 71510 }, { "epoch": 18.58497920997921, "grad_norm": 0.019694432616233826, "learning_rate": 7.588231093077241e-07, "loss": 0.0618, "num_input_tokens_seen": 6439552, "step": 71515 }, { "epoch": 18.586278586278585, "grad_norm": 1.4630109071731567, "learning_rate": 7.574374659806094e-07, "loss": 0.0736, "num_input_tokens_seen": 6440016, "step": 71520 }, { "epoch": 18.587577962577964, "grad_norm": 5.836017608642578, "learning_rate": 7.560530694717155e-07, "loss": 0.0086, "num_input_tokens_seen": 6440464, "step": 71525 }, { "epoch": 18.58887733887734, "grad_norm": 1.4779207706451416, "learning_rate": 7.546699198522411e-07, "loss": 0.0011, "num_input_tokens_seen": 6440944, "step": 71530 }, { "epoch": 18.590176715176714, "grad_norm": 0.24600911140441895, "learning_rate": 7.532880171933237e-07, "loss": 0.1794, "num_input_tokens_seen": 6441440, "step": 71535 }, { "epoch": 18.591476091476093, "grad_norm": 0.00623075757175684, "learning_rate": 7.519073615660344e-07, "loss": 0.1444, "num_input_tokens_seen": 6441904, "step": 71540 }, { "epoch": 18.592775467775468, "grad_norm": 0.3974779546260834, "learning_rate": 7.505279530413855e-07, "loss": 0.0961, "num_input_tokens_seen": 6442336, "step": 71545 }, { "epoch": 18.594074844074843, "grad_norm": 55.98453140258789, "learning_rate": 7.491497916903151e-07, "loss": 0.1264, "num_input_tokens_seen": 6442768, "step": 71550 }, { "epoch": 18.59537422037422, "grad_norm": 0.014328657649457455, "learning_rate": 7.477728775837078e-07, "loss": 0.0182, "num_input_tokens_seen": 6443184, "step": 71555 }, { "epoch": 18.596673596673597, "grad_norm": 0.0400124229490757, "learning_rate": 7.463972107923794e-07, "loss": 0.0468, "num_input_tokens_seen": 6443632, "step": 71560 }, { "epoch": 18.597972972972972, "grad_norm": 1.2056750059127808, "learning_rate": 7.450227913870816e-07, "loss": 0.235, "num_input_tokens_seen": 6444080, "step": 71565 }, { "epoch": 18.59927234927235, "grad_norm": 0.2264677733182907, "learning_rate": 7.436496194385024e-07, "loss": 0.0004, "num_input_tokens_seen": 6444560, "step": 71570 }, { "epoch": 18.600571725571726, "grad_norm": 0.002920709550380707, "learning_rate": 7.422776950172599e-07, "loss": 0.1605, "num_input_tokens_seen": 6445024, "step": 71575 }, { "epoch": 18.6018711018711, "grad_norm": 0.24497263133525848, "learning_rate": 7.409070181939231e-07, "loss": 0.1237, "num_input_tokens_seen": 6445504, "step": 71580 }, { "epoch": 18.60317047817048, "grad_norm": 1.0029594898223877, "learning_rate": 7.3953758903898e-07, "loss": 0.0049, "num_input_tokens_seen": 6445968, "step": 71585 }, { "epoch": 18.604469854469855, "grad_norm": 7.5375237464904785, "learning_rate": 7.381694076228656e-07, "loss": 0.2937, "num_input_tokens_seen": 6446384, "step": 71590 }, { "epoch": 18.60576923076923, "grad_norm": 0.02061493508517742, "learning_rate": 7.368024740159434e-07, "loss": 0.2946, "num_input_tokens_seen": 6446864, "step": 71595 }, { "epoch": 18.60706860706861, "grad_norm": 0.04960319772362709, "learning_rate": 7.354367882885183e-07, "loss": 0.084, "num_input_tokens_seen": 6447312, "step": 71600 }, { "epoch": 18.608367983367984, "grad_norm": 1.20013427734375, "learning_rate": 7.340723505108283e-07, "loss": 0.3863, "num_input_tokens_seen": 6447792, "step": 71605 }, { "epoch": 18.60966735966736, "grad_norm": 0.24928931891918182, "learning_rate": 7.327091607530506e-07, "loss": 0.0692, "num_input_tokens_seen": 6448256, "step": 71610 }, { "epoch": 18.610966735966738, "grad_norm": 0.1397000402212143, "learning_rate": 7.313472190852905e-07, "loss": 0.0438, "num_input_tokens_seen": 6448736, "step": 71615 }, { "epoch": 18.612266112266113, "grad_norm": 2.6315577030181885, "learning_rate": 7.299865255775946e-07, "loss": 0.0125, "num_input_tokens_seen": 6449232, "step": 71620 }, { "epoch": 18.613565488565488, "grad_norm": 77.41931915283203, "learning_rate": 7.286270802999457e-07, "loss": 0.2502, "num_input_tokens_seen": 6449680, "step": 71625 }, { "epoch": 18.614864864864863, "grad_norm": 0.0008122642175294459, "learning_rate": 7.27268883322263e-07, "loss": 0.0007, "num_input_tokens_seen": 6450112, "step": 71630 }, { "epoch": 18.616164241164242, "grad_norm": 2.6950817108154297, "learning_rate": 7.259119347144017e-07, "loss": 0.6772, "num_input_tokens_seen": 6450512, "step": 71635 }, { "epoch": 18.617463617463617, "grad_norm": 1.178641676902771, "learning_rate": 7.24556234546142e-07, "loss": 0.2042, "num_input_tokens_seen": 6450976, "step": 71640 }, { "epoch": 18.618762993762992, "grad_norm": 0.1388503462076187, "learning_rate": 7.2320178288722e-07, "loss": 0.0007, "num_input_tokens_seen": 6451408, "step": 71645 }, { "epoch": 18.62006237006237, "grad_norm": 0.2002592831850052, "learning_rate": 7.218485798072883e-07, "loss": 0.1408, "num_input_tokens_seen": 6451872, "step": 71650 }, { "epoch": 18.621361746361746, "grad_norm": 20.387792587280273, "learning_rate": 7.204966253759493e-07, "loss": 0.0191, "num_input_tokens_seen": 6452336, "step": 71655 }, { "epoch": 18.62266112266112, "grad_norm": 0.017556998878717422, "learning_rate": 7.191459196627282e-07, "loss": 0.0002, "num_input_tokens_seen": 6452768, "step": 71660 }, { "epoch": 18.6239604989605, "grad_norm": 0.17258523404598236, "learning_rate": 7.177964627370997e-07, "loss": 0.1264, "num_input_tokens_seen": 6453184, "step": 71665 }, { "epoch": 18.625259875259875, "grad_norm": 0.3614066541194916, "learning_rate": 7.164482546684642e-07, "loss": 0.3163, "num_input_tokens_seen": 6453632, "step": 71670 }, { "epoch": 18.62655925155925, "grad_norm": 0.013862636871635914, "learning_rate": 7.151012955261632e-07, "loss": 0.3197, "num_input_tokens_seen": 6454096, "step": 71675 }, { "epoch": 18.62785862785863, "grad_norm": 49.63149642944336, "learning_rate": 7.13755585379472e-07, "loss": 0.05, "num_input_tokens_seen": 6454560, "step": 71680 }, { "epoch": 18.629158004158004, "grad_norm": 2.000676393508911, "learning_rate": 7.12411124297599e-07, "loss": 0.21, "num_input_tokens_seen": 6455072, "step": 71685 }, { "epoch": 18.63045738045738, "grad_norm": 0.17939606308937073, "learning_rate": 7.110679123496972e-07, "loss": 0.0026, "num_input_tokens_seen": 6455520, "step": 71690 }, { "epoch": 18.631756756756758, "grad_norm": 0.4694914221763611, "learning_rate": 7.097259496048364e-07, "loss": 0.0287, "num_input_tokens_seen": 6455952, "step": 71695 }, { "epoch": 18.633056133056133, "grad_norm": 49.9847526550293, "learning_rate": 7.083852361320531e-07, "loss": 0.3698, "num_input_tokens_seen": 6456368, "step": 71700 }, { "epoch": 18.634355509355508, "grad_norm": 0.8196209669113159, "learning_rate": 7.070457720002865e-07, "loss": 0.0116, "num_input_tokens_seen": 6456832, "step": 71705 }, { "epoch": 18.635654885654887, "grad_norm": 34.66427230834961, "learning_rate": 7.057075572784372e-07, "loss": 0.0238, "num_input_tokens_seen": 6457264, "step": 71710 }, { "epoch": 18.636954261954262, "grad_norm": 0.003280687378719449, "learning_rate": 7.043705920353222e-07, "loss": 0.0815, "num_input_tokens_seen": 6457728, "step": 71715 }, { "epoch": 18.638253638253637, "grad_norm": 0.027544526383280754, "learning_rate": 7.030348763397088e-07, "loss": 0.0031, "num_input_tokens_seen": 6458208, "step": 71720 }, { "epoch": 18.639553014553016, "grad_norm": 5.830595970153809, "learning_rate": 7.017004102602892e-07, "loss": 0.0064, "num_input_tokens_seen": 6458656, "step": 71725 }, { "epoch": 18.64085239085239, "grad_norm": 0.015470573678612709, "learning_rate": 7.003671938656975e-07, "loss": 0.1273, "num_input_tokens_seen": 6459056, "step": 71730 }, { "epoch": 18.642151767151766, "grad_norm": 4.923624038696289, "learning_rate": 6.990352272245065e-07, "loss": 0.006, "num_input_tokens_seen": 6459536, "step": 71735 }, { "epoch": 18.643451143451145, "grad_norm": 1.8230611085891724, "learning_rate": 6.977045104052199e-07, "loss": 0.4183, "num_input_tokens_seen": 6460016, "step": 71740 }, { "epoch": 18.64475051975052, "grad_norm": 0.010846447199583054, "learning_rate": 6.963750434762745e-07, "loss": 0.0714, "num_input_tokens_seen": 6460528, "step": 71745 }, { "epoch": 18.646049896049895, "grad_norm": 0.08204314857721329, "learning_rate": 6.950468265060461e-07, "loss": 0.0003, "num_input_tokens_seen": 6460992, "step": 71750 }, { "epoch": 18.647349272349274, "grad_norm": 1.564361810684204, "learning_rate": 6.937198595628496e-07, "loss": 0.4827, "num_input_tokens_seen": 6461392, "step": 71755 }, { "epoch": 18.64864864864865, "grad_norm": 0.1841566115617752, "learning_rate": 6.923941427149277e-07, "loss": 0.0263, "num_input_tokens_seen": 6461840, "step": 71760 }, { "epoch": 18.649948024948024, "grad_norm": 0.21337772905826569, "learning_rate": 6.910696760304646e-07, "loss": 0.2628, "num_input_tokens_seen": 6462320, "step": 71765 }, { "epoch": 18.651247401247403, "grad_norm": 34.533935546875, "learning_rate": 6.897464595775782e-07, "loss": 0.039, "num_input_tokens_seen": 6462800, "step": 71770 }, { "epoch": 18.652546777546778, "grad_norm": 0.3017192780971527, "learning_rate": 6.884244934243305e-07, "loss": 0.0508, "num_input_tokens_seen": 6463264, "step": 71775 }, { "epoch": 18.653846153846153, "grad_norm": 78.53841400146484, "learning_rate": 6.871037776387007e-07, "loss": 0.1549, "num_input_tokens_seen": 6463696, "step": 71780 }, { "epoch": 18.65514553014553, "grad_norm": 0.6620572209358215, "learning_rate": 6.857843122886204e-07, "loss": 0.0265, "num_input_tokens_seen": 6464096, "step": 71785 }, { "epoch": 18.656444906444907, "grad_norm": 19.19585418701172, "learning_rate": 6.844660974419492e-07, "loss": 0.2373, "num_input_tokens_seen": 6464528, "step": 71790 }, { "epoch": 18.657744282744282, "grad_norm": 0.8824236392974854, "learning_rate": 6.831491331664858e-07, "loss": 0.0443, "num_input_tokens_seen": 6464944, "step": 71795 }, { "epoch": 18.659043659043657, "grad_norm": 0.03395431488752365, "learning_rate": 6.818334195299592e-07, "loss": 0.0903, "num_input_tokens_seen": 6465376, "step": 71800 }, { "epoch": 18.660343035343036, "grad_norm": 0.04103875160217285, "learning_rate": 6.80518956600043e-07, "loss": 0.0005, "num_input_tokens_seen": 6465792, "step": 71805 }, { "epoch": 18.66164241164241, "grad_norm": 0.3141402304172516, "learning_rate": 6.79205744444339e-07, "loss": 0.1268, "num_input_tokens_seen": 6466240, "step": 71810 }, { "epoch": 18.662941787941786, "grad_norm": 5.869576930999756, "learning_rate": 6.778937831303844e-07, "loss": 0.0085, "num_input_tokens_seen": 6466672, "step": 71815 }, { "epoch": 18.664241164241165, "grad_norm": 0.003691053716465831, "learning_rate": 6.765830727256589e-07, "loss": 0.0005, "num_input_tokens_seen": 6467136, "step": 71820 }, { "epoch": 18.66554054054054, "grad_norm": 0.02243930660188198, "learning_rate": 6.752736132975696e-07, "loss": 0.0029, "num_input_tokens_seen": 6467632, "step": 71825 }, { "epoch": 18.666839916839916, "grad_norm": 14.573981285095215, "learning_rate": 6.739654049134681e-07, "loss": 0.0122, "num_input_tokens_seen": 6468128, "step": 71830 }, { "epoch": 18.668139293139294, "grad_norm": 0.14690104126930237, "learning_rate": 6.726584476406311e-07, "loss": 0.2502, "num_input_tokens_seen": 6468560, "step": 71835 }, { "epoch": 18.66943866943867, "grad_norm": 10.933365821838379, "learning_rate": 6.713527415462772e-07, "loss": 0.098, "num_input_tokens_seen": 6469024, "step": 71840 }, { "epoch": 18.670738045738045, "grad_norm": 0.0010797708528116345, "learning_rate": 6.700482866975665e-07, "loss": 0.4641, "num_input_tokens_seen": 6469440, "step": 71845 }, { "epoch": 18.672037422037423, "grad_norm": 0.4628421366214752, "learning_rate": 6.687450831615844e-07, "loss": 0.4313, "num_input_tokens_seen": 6469936, "step": 71850 }, { "epoch": 18.6733367983368, "grad_norm": 15.011011123657227, "learning_rate": 6.674431310053519e-07, "loss": 0.2124, "num_input_tokens_seen": 6470384, "step": 71855 }, { "epoch": 18.674636174636174, "grad_norm": 100.17680358886719, "learning_rate": 6.66142430295838e-07, "loss": 0.2059, "num_input_tokens_seen": 6470848, "step": 71860 }, { "epoch": 18.675935550935552, "grad_norm": 3.5268454551696777, "learning_rate": 6.648429810999335e-07, "loss": 0.0055, "num_input_tokens_seen": 6471344, "step": 71865 }, { "epoch": 18.677234927234927, "grad_norm": 0.0014395442558452487, "learning_rate": 6.63544783484471e-07, "loss": 0.0101, "num_input_tokens_seen": 6471792, "step": 71870 }, { "epoch": 18.678534303534303, "grad_norm": 16.308128356933594, "learning_rate": 6.622478375162167e-07, "loss": 0.0222, "num_input_tokens_seen": 6472240, "step": 71875 }, { "epoch": 18.67983367983368, "grad_norm": 34.400333404541016, "learning_rate": 6.609521432618754e-07, "loss": 0.0282, "num_input_tokens_seen": 6472672, "step": 71880 }, { "epoch": 18.681133056133056, "grad_norm": 0.20648349821567535, "learning_rate": 6.596577007880883e-07, "loss": 0.0008, "num_input_tokens_seen": 6473104, "step": 71885 }, { "epoch": 18.68243243243243, "grad_norm": 1.3726420402526855, "learning_rate": 6.583645101614271e-07, "loss": 0.0241, "num_input_tokens_seen": 6473520, "step": 71890 }, { "epoch": 18.68373180873181, "grad_norm": 0.03156646713614464, "learning_rate": 6.570725714484028e-07, "loss": 0.0029, "num_input_tokens_seen": 6473984, "step": 71895 }, { "epoch": 18.685031185031185, "grad_norm": 21.245437622070312, "learning_rate": 6.557818847154562e-07, "loss": 0.2104, "num_input_tokens_seen": 6474448, "step": 71900 }, { "epoch": 18.68633056133056, "grad_norm": 0.017242759466171265, "learning_rate": 6.54492450028979e-07, "loss": 0.0016, "num_input_tokens_seen": 6474912, "step": 71905 }, { "epoch": 18.68762993762994, "grad_norm": 13.107316017150879, "learning_rate": 6.532042674552763e-07, "loss": 0.0118, "num_input_tokens_seen": 6475328, "step": 71910 }, { "epoch": 18.688929313929314, "grad_norm": 2.7280945777893066, "learning_rate": 6.519173370606063e-07, "loss": 0.0103, "num_input_tokens_seen": 6475744, "step": 71915 }, { "epoch": 18.69022869022869, "grad_norm": 0.005708414129912853, "learning_rate": 6.506316589111577e-07, "loss": 0.1785, "num_input_tokens_seen": 6476224, "step": 71920 }, { "epoch": 18.691528066528065, "grad_norm": 0.05812571570277214, "learning_rate": 6.493472330730554e-07, "loss": 0.0053, "num_input_tokens_seen": 6476688, "step": 71925 }, { "epoch": 18.692827442827443, "grad_norm": 0.42271825671195984, "learning_rate": 6.480640596123549e-07, "loss": 0.0004, "num_input_tokens_seen": 6477152, "step": 71930 }, { "epoch": 18.69412681912682, "grad_norm": 2.4074032306671143, "learning_rate": 6.467821385950562e-07, "loss": 0.5071, "num_input_tokens_seen": 6477584, "step": 71935 }, { "epoch": 18.695426195426194, "grad_norm": 0.10008023679256439, "learning_rate": 6.455014700870842e-07, "loss": 0.0004, "num_input_tokens_seen": 6478000, "step": 71940 }, { "epoch": 18.696725571725572, "grad_norm": 0.21403706073760986, "learning_rate": 6.442220541543031e-07, "loss": 0.0656, "num_input_tokens_seen": 6478432, "step": 71945 }, { "epoch": 18.698024948024948, "grad_norm": 61.60123825073242, "learning_rate": 6.429438908625213e-07, "loss": 0.3071, "num_input_tokens_seen": 6478896, "step": 71950 }, { "epoch": 18.699324324324323, "grad_norm": 0.49766942858695984, "learning_rate": 6.416669802774722e-07, "loss": 0.1132, "num_input_tokens_seen": 6479328, "step": 71955 }, { "epoch": 18.7006237006237, "grad_norm": 0.43495500087738037, "learning_rate": 6.403913224648312e-07, "loss": 0.0291, "num_input_tokens_seen": 6479776, "step": 71960 }, { "epoch": 18.701923076923077, "grad_norm": 0.0028546510729938745, "learning_rate": 6.39116917490204e-07, "loss": 0.2696, "num_input_tokens_seen": 6480192, "step": 71965 }, { "epoch": 18.703222453222452, "grad_norm": 63.575592041015625, "learning_rate": 6.378437654191355e-07, "loss": 0.2079, "num_input_tokens_seen": 6480640, "step": 71970 }, { "epoch": 18.70452182952183, "grad_norm": 3.2590317726135254, "learning_rate": 6.365718663171038e-07, "loss": 0.1977, "num_input_tokens_seen": 6481072, "step": 71975 }, { "epoch": 18.705821205821206, "grad_norm": 60.33661651611328, "learning_rate": 6.353012202495235e-07, "loss": 0.3157, "num_input_tokens_seen": 6481504, "step": 71980 }, { "epoch": 18.70712058212058, "grad_norm": 4.018299579620361, "learning_rate": 6.340318272817474e-07, "loss": 0.0025, "num_input_tokens_seen": 6481952, "step": 71985 }, { "epoch": 18.70841995841996, "grad_norm": 0.0014463858678936958, "learning_rate": 6.327636874790654e-07, "loss": 0.0007, "num_input_tokens_seen": 6482400, "step": 71990 }, { "epoch": 18.709719334719335, "grad_norm": 0.0473305881023407, "learning_rate": 6.314968009066891e-07, "loss": 0.0017, "num_input_tokens_seen": 6482832, "step": 71995 }, { "epoch": 18.71101871101871, "grad_norm": 1.7371069192886353, "learning_rate": 6.302311676297829e-07, "loss": 0.0868, "num_input_tokens_seen": 6483312, "step": 72000 }, { "epoch": 18.71231808731809, "grad_norm": 0.0015066413907334208, "learning_rate": 6.289667877134392e-07, "loss": 0.0017, "num_input_tokens_seen": 6483760, "step": 72005 }, { "epoch": 18.713617463617464, "grad_norm": 1.58206045627594, "learning_rate": 6.277036612226839e-07, "loss": 0.1852, "num_input_tokens_seen": 6484160, "step": 72010 }, { "epoch": 18.71491683991684, "grad_norm": 61.02308654785156, "learning_rate": 6.264417882224788e-07, "loss": 0.0633, "num_input_tokens_seen": 6484592, "step": 72015 }, { "epoch": 18.716216216216218, "grad_norm": 3.988600969314575, "learning_rate": 6.251811687777276e-07, "loss": 0.1232, "num_input_tokens_seen": 6484992, "step": 72020 }, { "epoch": 18.717515592515593, "grad_norm": 2.3617970943450928, "learning_rate": 6.239218029532673e-07, "loss": 0.0123, "num_input_tokens_seen": 6485440, "step": 72025 }, { "epoch": 18.718814968814968, "grad_norm": 0.006069459952414036, "learning_rate": 6.2266369081386e-07, "loss": 0.0004, "num_input_tokens_seen": 6485920, "step": 72030 }, { "epoch": 18.720114345114347, "grad_norm": 0.00428031524643302, "learning_rate": 6.214068324242206e-07, "loss": 0.0173, "num_input_tokens_seen": 6486416, "step": 72035 }, { "epoch": 18.72141372141372, "grad_norm": 0.18022911250591278, "learning_rate": 6.201512278489835e-07, "loss": 0.0076, "num_input_tokens_seen": 6486864, "step": 72040 }, { "epoch": 18.722713097713097, "grad_norm": 0.26282602548599243, "learning_rate": 6.188968771527303e-07, "loss": 0.0215, "num_input_tokens_seen": 6487344, "step": 72045 }, { "epoch": 18.724012474012476, "grad_norm": 1.600702166557312, "learning_rate": 6.176437803999679e-07, "loss": 0.0159, "num_input_tokens_seen": 6487760, "step": 72050 }, { "epoch": 18.72531185031185, "grad_norm": 0.46153122186660767, "learning_rate": 6.163919376551502e-07, "loss": 0.0016, "num_input_tokens_seen": 6488208, "step": 72055 }, { "epoch": 18.726611226611226, "grad_norm": 3.7877721786499023, "learning_rate": 6.151413489826563e-07, "loss": 0.003, "num_input_tokens_seen": 6488656, "step": 72060 }, { "epoch": 18.727910602910605, "grad_norm": 0.010892802849411964, "learning_rate": 6.138920144468124e-07, "loss": 0.0062, "num_input_tokens_seen": 6489152, "step": 72065 }, { "epoch": 18.72920997920998, "grad_norm": 0.006866474635899067, "learning_rate": 6.126439341118645e-07, "loss": 0.0003, "num_input_tokens_seen": 6489584, "step": 72070 }, { "epoch": 18.730509355509355, "grad_norm": 47.466331481933594, "learning_rate": 6.113971080420056e-07, "loss": 0.061, "num_input_tokens_seen": 6490016, "step": 72075 }, { "epoch": 18.731808731808734, "grad_norm": 0.05341726168990135, "learning_rate": 6.101515363013649e-07, "loss": 0.001, "num_input_tokens_seen": 6490448, "step": 72080 }, { "epoch": 18.73310810810811, "grad_norm": 2.5438437461853027, "learning_rate": 6.089072189539968e-07, "loss": 0.0029, "num_input_tokens_seen": 6490880, "step": 72085 }, { "epoch": 18.734407484407484, "grad_norm": 0.6735795140266418, "learning_rate": 6.076641560639002e-07, "loss": 0.0859, "num_input_tokens_seen": 6491344, "step": 72090 }, { "epoch": 18.73570686070686, "grad_norm": 0.8735893368721008, "learning_rate": 6.06422347695007e-07, "loss": 0.4382, "num_input_tokens_seen": 6491808, "step": 72095 }, { "epoch": 18.737006237006238, "grad_norm": 0.08356909453868866, "learning_rate": 6.051817939111887e-07, "loss": 0.0777, "num_input_tokens_seen": 6492240, "step": 72100 }, { "epoch": 18.738305613305613, "grad_norm": 0.011882497929036617, "learning_rate": 6.039424947762413e-07, "loss": 0.0087, "num_input_tokens_seen": 6492656, "step": 72105 }, { "epoch": 18.739604989604988, "grad_norm": 0.03160407766699791, "learning_rate": 6.027044503539081e-07, "loss": 0.0023, "num_input_tokens_seen": 6493088, "step": 72110 }, { "epoch": 18.740904365904367, "grad_norm": 5.104318141937256, "learning_rate": 6.014676607078607e-07, "loss": 0.049, "num_input_tokens_seen": 6493552, "step": 72115 }, { "epoch": 18.742203742203742, "grad_norm": 0.0268786009401083, "learning_rate": 6.002321259017118e-07, "loss": 0.0007, "num_input_tokens_seen": 6493968, "step": 72120 }, { "epoch": 18.743503118503117, "grad_norm": 0.0021676879841834307, "learning_rate": 5.989978459989998e-07, "loss": 0.1537, "num_input_tokens_seen": 6494416, "step": 72125 }, { "epoch": 18.744802494802496, "grad_norm": 0.005720884073525667, "learning_rate": 5.977648210632097e-07, "loss": 0.0003, "num_input_tokens_seen": 6494880, "step": 72130 }, { "epoch": 18.74610187110187, "grad_norm": 0.13344626128673553, "learning_rate": 5.96533051157755e-07, "loss": 0.0534, "num_input_tokens_seen": 6495360, "step": 72135 }, { "epoch": 18.747401247401246, "grad_norm": 0.06216158717870712, "learning_rate": 5.953025363459902e-07, "loss": 0.4511, "num_input_tokens_seen": 6495824, "step": 72140 }, { "epoch": 18.748700623700625, "grad_norm": 0.29680928587913513, "learning_rate": 5.94073276691201e-07, "loss": 0.0162, "num_input_tokens_seen": 6496272, "step": 72145 }, { "epoch": 18.75, "grad_norm": 41.73478317260742, "learning_rate": 5.928452722566036e-07, "loss": 0.0677, "num_input_tokens_seen": 6496688, "step": 72150 }, { "epoch": 18.751299376299375, "grad_norm": 12.773984909057617, "learning_rate": 5.916185231053611e-07, "loss": 0.01, "num_input_tokens_seen": 6497136, "step": 72155 }, { "epoch": 18.752598752598754, "grad_norm": 1.5739372968673706, "learning_rate": 5.90393029300565e-07, "loss": 0.1674, "num_input_tokens_seen": 6497552, "step": 72160 }, { "epoch": 18.75389812889813, "grad_norm": 0.002631730865687132, "learning_rate": 5.891687909052423e-07, "loss": 0.0131, "num_input_tokens_seen": 6498000, "step": 72165 }, { "epoch": 18.755197505197504, "grad_norm": 0.8555406928062439, "learning_rate": 5.879458079823596e-07, "loss": 0.0007, "num_input_tokens_seen": 6498464, "step": 72170 }, { "epoch": 18.756496881496883, "grad_norm": 0.09953302145004272, "learning_rate": 5.867240805948138e-07, "loss": 0.0991, "num_input_tokens_seen": 6498944, "step": 72175 }, { "epoch": 18.757796257796258, "grad_norm": 0.5059183239936829, "learning_rate": 5.855036088054406e-07, "loss": 0.0163, "num_input_tokens_seen": 6499376, "step": 72180 }, { "epoch": 18.759095634095633, "grad_norm": 0.038394372910261154, "learning_rate": 5.842843926770119e-07, "loss": 0.159, "num_input_tokens_seen": 6499808, "step": 72185 }, { "epoch": 18.760395010395012, "grad_norm": 0.009370489045977592, "learning_rate": 5.830664322722279e-07, "loss": 0.5801, "num_input_tokens_seen": 6500240, "step": 72190 }, { "epoch": 18.761694386694387, "grad_norm": 1.124611496925354, "learning_rate": 5.818497276537299e-07, "loss": 0.0012, "num_input_tokens_seen": 6500704, "step": 72195 }, { "epoch": 18.762993762993762, "grad_norm": 0.0027000661939382553, "learning_rate": 5.806342788841013e-07, "loss": 0.2013, "num_input_tokens_seen": 6501136, "step": 72200 }, { "epoch": 18.76429313929314, "grad_norm": 49.469913482666016, "learning_rate": 5.794200860258476e-07, "loss": 0.4763, "num_input_tokens_seen": 6501584, "step": 72205 }, { "epoch": 18.765592515592516, "grad_norm": 9.165032386779785, "learning_rate": 5.782071491414187e-07, "loss": 0.0424, "num_input_tokens_seen": 6502048, "step": 72210 }, { "epoch": 18.76689189189189, "grad_norm": 0.014150520786643028, "learning_rate": 5.769954682931955e-07, "loss": 0.0014, "num_input_tokens_seen": 6502464, "step": 72215 }, { "epoch": 18.768191268191266, "grad_norm": 0.01895751804113388, "learning_rate": 5.757850435434975e-07, "loss": 0.0006, "num_input_tokens_seen": 6502928, "step": 72220 }, { "epoch": 18.769490644490645, "grad_norm": 0.9193532466888428, "learning_rate": 5.745758749545749e-07, "loss": 0.0017, "num_input_tokens_seen": 6503440, "step": 72225 }, { "epoch": 18.77079002079002, "grad_norm": 0.059423837810754776, "learning_rate": 5.733679625886168e-07, "loss": 0.0124, "num_input_tokens_seen": 6503872, "step": 72230 }, { "epoch": 18.772089397089395, "grad_norm": 6.961244583129883, "learning_rate": 5.721613065077514e-07, "loss": 0.1943, "num_input_tokens_seen": 6504288, "step": 72235 }, { "epoch": 18.773388773388774, "grad_norm": 43.70463180541992, "learning_rate": 5.709559067740344e-07, "loss": 0.1499, "num_input_tokens_seen": 6504720, "step": 72240 }, { "epoch": 18.77468814968815, "grad_norm": 0.33568912744522095, "learning_rate": 5.697517634494637e-07, "loss": 0.0801, "num_input_tokens_seen": 6505168, "step": 72245 }, { "epoch": 18.775987525987524, "grad_norm": 1.6681898832321167, "learning_rate": 5.685488765959673e-07, "loss": 0.0366, "num_input_tokens_seen": 6505616, "step": 72250 }, { "epoch": 18.777286902286903, "grad_norm": 2.7067716121673584, "learning_rate": 5.673472462754098e-07, "loss": 0.0314, "num_input_tokens_seen": 6506064, "step": 72255 }, { "epoch": 18.77858627858628, "grad_norm": 0.08336639404296875, "learning_rate": 5.661468725495944e-07, "loss": 0.0465, "num_input_tokens_seen": 6506512, "step": 72260 }, { "epoch": 18.779885654885653, "grad_norm": 64.01968383789062, "learning_rate": 5.649477554802579e-07, "loss": 0.165, "num_input_tokens_seen": 6506944, "step": 72265 }, { "epoch": 18.781185031185032, "grad_norm": 0.020153149962425232, "learning_rate": 5.637498951290676e-07, "loss": 0.0905, "num_input_tokens_seen": 6507440, "step": 72270 }, { "epoch": 18.782484407484407, "grad_norm": 1.7200407981872559, "learning_rate": 5.625532915576381e-07, "loss": 0.0011, "num_input_tokens_seen": 6507872, "step": 72275 }, { "epoch": 18.783783783783782, "grad_norm": 0.012364453636109829, "learning_rate": 5.613579448275036e-07, "loss": 0.1754, "num_input_tokens_seen": 6508352, "step": 72280 }, { "epoch": 18.78508316008316, "grad_norm": 0.24704822897911072, "learning_rate": 5.601638550001509e-07, "loss": 0.236, "num_input_tokens_seen": 6508800, "step": 72285 }, { "epoch": 18.786382536382536, "grad_norm": 0.015713324770331383, "learning_rate": 5.589710221369837e-07, "loss": 0.1829, "num_input_tokens_seen": 6509280, "step": 72290 }, { "epoch": 18.78768191268191, "grad_norm": 0.10832569748163223, "learning_rate": 5.577794462993586e-07, "loss": 0.0494, "num_input_tokens_seen": 6509744, "step": 72295 }, { "epoch": 18.78898128898129, "grad_norm": 0.8815662860870361, "learning_rate": 5.56589127548554e-07, "loss": 0.1505, "num_input_tokens_seen": 6510176, "step": 72300 }, { "epoch": 18.790280665280665, "grad_norm": 0.004075570032000542, "learning_rate": 5.554000659457881e-07, "loss": 0.0011, "num_input_tokens_seen": 6510640, "step": 72305 }, { "epoch": 18.79158004158004, "grad_norm": 0.00699630007147789, "learning_rate": 5.542122615522227e-07, "loss": 0.1593, "num_input_tokens_seen": 6511120, "step": 72310 }, { "epoch": 18.79287941787942, "grad_norm": 0.0066415416076779366, "learning_rate": 5.530257144289425e-07, "loss": 0.0149, "num_input_tokens_seen": 6511568, "step": 72315 }, { "epoch": 18.794178794178794, "grad_norm": 0.042921282351017, "learning_rate": 5.518404246369735e-07, "loss": 0.0008, "num_input_tokens_seen": 6512000, "step": 72320 }, { "epoch": 18.79547817047817, "grad_norm": 0.5658978223800659, "learning_rate": 5.506563922372781e-07, "loss": 0.1892, "num_input_tokens_seen": 6512448, "step": 72325 }, { "epoch": 18.796777546777548, "grad_norm": 0.21620877087116241, "learning_rate": 5.494736172907495e-07, "loss": 0.0517, "num_input_tokens_seen": 6512928, "step": 72330 }, { "epoch": 18.798076923076923, "grad_norm": 0.03486869856715202, "learning_rate": 5.482920998582191e-07, "loss": 0.0016, "num_input_tokens_seen": 6513376, "step": 72335 }, { "epoch": 18.7993762993763, "grad_norm": 0.0024423974100500345, "learning_rate": 5.471118400004555e-07, "loss": 0.0003, "num_input_tokens_seen": 6513840, "step": 72340 }, { "epoch": 18.800675675675677, "grad_norm": 0.7692724466323853, "learning_rate": 5.459328377781598e-07, "loss": 0.0246, "num_input_tokens_seen": 6514288, "step": 72345 }, { "epoch": 18.801975051975052, "grad_norm": 0.004918810911476612, "learning_rate": 5.447550932519696e-07, "loss": 0.0313, "num_input_tokens_seen": 6514704, "step": 72350 }, { "epoch": 18.803274428274428, "grad_norm": 0.08684483170509338, "learning_rate": 5.435786064824533e-07, "loss": 0.0008, "num_input_tokens_seen": 6515184, "step": 72355 }, { "epoch": 18.804573804573806, "grad_norm": 0.12486062943935394, "learning_rate": 5.424033775301262e-07, "loss": 0.0005, "num_input_tokens_seen": 6515632, "step": 72360 }, { "epoch": 18.80587318087318, "grad_norm": 36.65745544433594, "learning_rate": 5.412294064554263e-07, "loss": 0.0373, "num_input_tokens_seen": 6516080, "step": 72365 }, { "epoch": 18.807172557172557, "grad_norm": 2.653505563735962, "learning_rate": 5.400566933187357e-07, "loss": 0.0019, "num_input_tokens_seen": 6516528, "step": 72370 }, { "epoch": 18.808471933471935, "grad_norm": 0.003141401568427682, "learning_rate": 5.388852381803617e-07, "loss": 0.0027, "num_input_tokens_seen": 6516960, "step": 72375 }, { "epoch": 18.80977130977131, "grad_norm": 0.010304421186447144, "learning_rate": 5.377150411005588e-07, "loss": 0.4942, "num_input_tokens_seen": 6517408, "step": 72380 }, { "epoch": 18.811070686070686, "grad_norm": 0.01025367807596922, "learning_rate": 5.365461021395096e-07, "loss": 0.0002, "num_input_tokens_seen": 6517872, "step": 72385 }, { "epoch": 18.81237006237006, "grad_norm": 59.39424514770508, "learning_rate": 5.353784213573354e-07, "loss": 0.2195, "num_input_tokens_seen": 6518320, "step": 72390 }, { "epoch": 18.81366943866944, "grad_norm": 61.72728729248047, "learning_rate": 5.342119988140881e-07, "loss": 0.2369, "num_input_tokens_seen": 6518736, "step": 72395 }, { "epoch": 18.814968814968815, "grad_norm": 18.541072845458984, "learning_rate": 5.330468345697615e-07, "loss": 0.0138, "num_input_tokens_seen": 6519216, "step": 72400 }, { "epoch": 18.81626819126819, "grad_norm": 0.9317647814750671, "learning_rate": 5.318829286842796e-07, "loss": 0.0016, "num_input_tokens_seen": 6519664, "step": 72405 }, { "epoch": 18.81756756756757, "grad_norm": 0.14679810404777527, "learning_rate": 5.307202812175005e-07, "loss": 0.0004, "num_input_tokens_seen": 6520112, "step": 72410 }, { "epoch": 18.818866943866944, "grad_norm": 0.0004043384687975049, "learning_rate": 5.295588922292233e-07, "loss": 0.5062, "num_input_tokens_seen": 6520528, "step": 72415 }, { "epoch": 18.82016632016632, "grad_norm": 0.2655552327632904, "learning_rate": 5.28398761779178e-07, "loss": 0.0188, "num_input_tokens_seen": 6520976, "step": 72420 }, { "epoch": 18.821465696465697, "grad_norm": 0.14874589443206787, "learning_rate": 5.272398899270364e-07, "loss": 0.0319, "num_input_tokens_seen": 6521424, "step": 72425 }, { "epoch": 18.822765072765073, "grad_norm": 0.08580213040113449, "learning_rate": 5.260822767323926e-07, "loss": 0.0767, "num_input_tokens_seen": 6521872, "step": 72430 }, { "epoch": 18.824064449064448, "grad_norm": 44.300262451171875, "learning_rate": 5.249259222547876e-07, "loss": 0.6267, "num_input_tokens_seen": 6522304, "step": 72435 }, { "epoch": 18.825363825363826, "grad_norm": 85.82279205322266, "learning_rate": 5.237708265536934e-07, "loss": 0.1858, "num_input_tokens_seen": 6522736, "step": 72440 }, { "epoch": 18.8266632016632, "grad_norm": 64.40573120117188, "learning_rate": 5.226169896885153e-07, "loss": 0.2544, "num_input_tokens_seen": 6523200, "step": 72445 }, { "epoch": 18.827962577962577, "grad_norm": 0.5142905712127686, "learning_rate": 5.214644117186002e-07, "loss": 0.0821, "num_input_tokens_seen": 6523648, "step": 72450 }, { "epoch": 18.829261954261955, "grad_norm": 11.634238243103027, "learning_rate": 5.203130927032257e-07, "loss": 0.0083, "num_input_tokens_seen": 6524160, "step": 72455 }, { "epoch": 18.83056133056133, "grad_norm": 0.005950676277279854, "learning_rate": 5.191630327016028e-07, "loss": 0.0145, "num_input_tokens_seen": 6524592, "step": 72460 }, { "epoch": 18.831860706860706, "grad_norm": 0.00490257004275918, "learning_rate": 5.180142317728815e-07, "loss": 0.0029, "num_input_tokens_seen": 6525024, "step": 72465 }, { "epoch": 18.833160083160084, "grad_norm": 15.415223121643066, "learning_rate": 5.168666899761476e-07, "loss": 0.0135, "num_input_tokens_seen": 6525504, "step": 72470 }, { "epoch": 18.83445945945946, "grad_norm": 8.935946464538574, "learning_rate": 5.15720407370418e-07, "loss": 0.1008, "num_input_tokens_seen": 6525936, "step": 72475 }, { "epoch": 18.835758835758835, "grad_norm": 0.002207621466368437, "learning_rate": 5.145753840146456e-07, "loss": 0.0021, "num_input_tokens_seen": 6526416, "step": 72480 }, { "epoch": 18.837058212058214, "grad_norm": 29.188806533813477, "learning_rate": 5.13431619967722e-07, "loss": 0.0336, "num_input_tokens_seen": 6526848, "step": 72485 }, { "epoch": 18.83835758835759, "grad_norm": 0.48622098565101624, "learning_rate": 5.122891152884751e-07, "loss": 0.0005, "num_input_tokens_seen": 6527312, "step": 72490 }, { "epoch": 18.839656964656964, "grad_norm": 0.28147396445274353, "learning_rate": 5.111478700356582e-07, "loss": 0.0058, "num_input_tokens_seen": 6527776, "step": 72495 }, { "epoch": 18.840956340956343, "grad_norm": 3.802921772003174, "learning_rate": 5.10007884267974e-07, "loss": 0.0028, "num_input_tokens_seen": 6528240, "step": 72500 }, { "epoch": 18.842255717255718, "grad_norm": 22.99901008605957, "learning_rate": 5.08869158044048e-07, "loss": 0.024, "num_input_tokens_seen": 6528688, "step": 72505 }, { "epoch": 18.843555093555093, "grad_norm": 0.038777612149715424, "learning_rate": 5.077316914224472e-07, "loss": 0.0002, "num_input_tokens_seen": 6529168, "step": 72510 }, { "epoch": 18.84485446985447, "grad_norm": 2.6722209453582764, "learning_rate": 5.065954844616721e-07, "loss": 0.3366, "num_input_tokens_seen": 6529616, "step": 72515 }, { "epoch": 18.846153846153847, "grad_norm": 0.027124205604195595, "learning_rate": 5.054605372201593e-07, "loss": 0.0024, "num_input_tokens_seen": 6530096, "step": 72520 }, { "epoch": 18.847453222453222, "grad_norm": 0.008814308792352676, "learning_rate": 5.043268497562814e-07, "loss": 0.0152, "num_input_tokens_seen": 6530544, "step": 72525 }, { "epoch": 18.848752598752597, "grad_norm": 0.010097295977175236, "learning_rate": 5.031944221283474e-07, "loss": 0.0011, "num_input_tokens_seen": 6530976, "step": 72530 }, { "epoch": 18.850051975051976, "grad_norm": 2.158535957336426, "learning_rate": 5.020632543945941e-07, "loss": 0.0019, "num_input_tokens_seen": 6531424, "step": 72535 }, { "epoch": 18.85135135135135, "grad_norm": 7.985045433044434, "learning_rate": 5.009333466131971e-07, "loss": 0.0059, "num_input_tokens_seen": 6531888, "step": 72540 }, { "epoch": 18.852650727650726, "grad_norm": 0.02879149466753006, "learning_rate": 4.998046988422766e-07, "loss": 0.0001, "num_input_tokens_seen": 6532336, "step": 72545 }, { "epoch": 18.853950103950105, "grad_norm": 134.49835205078125, "learning_rate": 4.986773111398724e-07, "loss": 0.3778, "num_input_tokens_seen": 6532800, "step": 72550 }, { "epoch": 18.85524948024948, "grad_norm": 0.05664451792836189, "learning_rate": 4.975511835639712e-07, "loss": 0.0002, "num_input_tokens_seen": 6533232, "step": 72555 }, { "epoch": 18.856548856548855, "grad_norm": 1.5891135931015015, "learning_rate": 4.964263161724881e-07, "loss": 0.0258, "num_input_tokens_seen": 6533712, "step": 72560 }, { "epoch": 18.857848232848234, "grad_norm": 0.05556368827819824, "learning_rate": 4.953027090232792e-07, "loss": 0.0005, "num_input_tokens_seen": 6534160, "step": 72565 }, { "epoch": 18.85914760914761, "grad_norm": 0.002186352154240012, "learning_rate": 4.94180362174132e-07, "loss": 0.1185, "num_input_tokens_seen": 6534608, "step": 72570 }, { "epoch": 18.860446985446984, "grad_norm": 0.004316024947911501, "learning_rate": 4.930592756827695e-07, "loss": 0.0151, "num_input_tokens_seen": 6535024, "step": 72575 }, { "epoch": 18.861746361746363, "grad_norm": 2.2273917198181152, "learning_rate": 4.919394496068486e-07, "loss": 0.0035, "num_input_tokens_seen": 6535472, "step": 72580 }, { "epoch": 18.863045738045738, "grad_norm": 0.15182100236415863, "learning_rate": 4.908208840039646e-07, "loss": 0.0004, "num_input_tokens_seen": 6535904, "step": 72585 }, { "epoch": 18.864345114345113, "grad_norm": 0.002713114023208618, "learning_rate": 4.89703578931644e-07, "loss": 0.3351, "num_input_tokens_seen": 6536368, "step": 72590 }, { "epoch": 18.865644490644492, "grad_norm": 75.01823425292969, "learning_rate": 4.885875344473545e-07, "loss": 0.603, "num_input_tokens_seen": 6536832, "step": 72595 }, { "epoch": 18.866943866943867, "grad_norm": 0.0005552282673306763, "learning_rate": 4.874727506084947e-07, "loss": 0.0229, "num_input_tokens_seen": 6537264, "step": 72600 }, { "epoch": 18.868243243243242, "grad_norm": 54.078487396240234, "learning_rate": 4.863592274723965e-07, "loss": 0.3622, "num_input_tokens_seen": 6537712, "step": 72605 }, { "epoch": 18.86954261954262, "grad_norm": 0.0008570586214773357, "learning_rate": 4.852469650963337e-07, "loss": 0.2552, "num_input_tokens_seen": 6538160, "step": 72610 }, { "epoch": 18.870841995841996, "grad_norm": 23.72119903564453, "learning_rate": 4.841359635375076e-07, "loss": 0.1321, "num_input_tokens_seen": 6538608, "step": 72615 }, { "epoch": 18.87214137214137, "grad_norm": 1.8789689540863037, "learning_rate": 4.830262228530585e-07, "loss": 0.0144, "num_input_tokens_seen": 6539040, "step": 72620 }, { "epoch": 18.87344074844075, "grad_norm": 0.018648425117135048, "learning_rate": 4.819177431000604e-07, "loss": 0.0005, "num_input_tokens_seen": 6539488, "step": 72625 }, { "epoch": 18.874740124740125, "grad_norm": 0.13025586307048798, "learning_rate": 4.80810524335526e-07, "loss": 0.0003, "num_input_tokens_seen": 6539952, "step": 72630 }, { "epoch": 18.8760395010395, "grad_norm": 0.17858155071735382, "learning_rate": 4.797045666163986e-07, "loss": 0.1396, "num_input_tokens_seen": 6540384, "step": 72635 }, { "epoch": 18.87733887733888, "grad_norm": 0.01859150640666485, "learning_rate": 4.785998699995603e-07, "loss": 0.1795, "num_input_tokens_seen": 6540880, "step": 72640 }, { "epoch": 18.878638253638254, "grad_norm": 0.012429427355527878, "learning_rate": 4.77496434541827e-07, "loss": 0.0042, "num_input_tokens_seen": 6541376, "step": 72645 }, { "epoch": 18.87993762993763, "grad_norm": 0.6800950169563293, "learning_rate": 4.763942602999477e-07, "loss": 0.1373, "num_input_tokens_seen": 6541824, "step": 72650 }, { "epoch": 18.881237006237008, "grad_norm": 0.009611649438738823, "learning_rate": 4.752933473306076e-07, "loss": 0.0015, "num_input_tokens_seen": 6542256, "step": 72655 }, { "epoch": 18.882536382536383, "grad_norm": 61.06852722167969, "learning_rate": 4.741936956904308e-07, "loss": 0.197, "num_input_tokens_seen": 6542736, "step": 72660 }, { "epoch": 18.883835758835758, "grad_norm": 0.027287056669592857, "learning_rate": 4.730953054359694e-07, "loss": 0.0364, "num_input_tokens_seen": 6543200, "step": 72665 }, { "epoch": 18.885135135135137, "grad_norm": 0.006379059050232172, "learning_rate": 4.719981766237197e-07, "loss": 0.0267, "num_input_tokens_seen": 6543632, "step": 72670 }, { "epoch": 18.886434511434512, "grad_norm": 13.818933486938477, "learning_rate": 4.7090230931010615e-07, "loss": 0.0263, "num_input_tokens_seen": 6544096, "step": 72675 }, { "epoch": 18.887733887733887, "grad_norm": 0.1001172810792923, "learning_rate": 4.698077035514864e-07, "loss": 0.0002, "num_input_tokens_seen": 6544512, "step": 72680 }, { "epoch": 18.889033264033262, "grad_norm": 0.01418248750269413, "learning_rate": 4.687143594041626e-07, "loss": 0.0305, "num_input_tokens_seen": 6544928, "step": 72685 }, { "epoch": 18.89033264033264, "grad_norm": 1.6347583532333374, "learning_rate": 4.6762227692436213e-07, "loss": 0.041, "num_input_tokens_seen": 6545360, "step": 72690 }, { "epoch": 18.891632016632016, "grad_norm": 0.004797479137778282, "learning_rate": 4.665314561682538e-07, "loss": 0.0276, "num_input_tokens_seen": 6545856, "step": 72695 }, { "epoch": 18.89293139293139, "grad_norm": 0.009828724898397923, "learning_rate": 4.654418971919372e-07, "loss": 0.0001, "num_input_tokens_seen": 6546336, "step": 72700 }, { "epoch": 18.89423076923077, "grad_norm": 0.012137328274548054, "learning_rate": 4.6435360005145644e-07, "loss": 0.3564, "num_input_tokens_seen": 6546816, "step": 72705 }, { "epoch": 18.895530145530145, "grad_norm": 0.1064252033829689, "learning_rate": 4.632665648027779e-07, "loss": 0.0005, "num_input_tokens_seen": 6547248, "step": 72710 }, { "epoch": 18.89682952182952, "grad_norm": 0.009100972674787045, "learning_rate": 4.6218079150180946e-07, "loss": 0.0172, "num_input_tokens_seen": 6547696, "step": 72715 }, { "epoch": 18.8981288981289, "grad_norm": 33.5389518737793, "learning_rate": 4.610962802043928e-07, "loss": 0.1406, "num_input_tokens_seen": 6548112, "step": 72720 }, { "epoch": 18.899428274428274, "grad_norm": 6.411097049713135, "learning_rate": 4.600130309663081e-07, "loss": 0.0038, "num_input_tokens_seen": 6548576, "step": 72725 }, { "epoch": 18.90072765072765, "grad_norm": 0.040500570088624954, "learning_rate": 4.5893104384326367e-07, "loss": 0.0016, "num_input_tokens_seen": 6548976, "step": 72730 }, { "epoch": 18.902027027027028, "grad_norm": 0.05867984890937805, "learning_rate": 4.5785031889091225e-07, "loss": 0.1599, "num_input_tokens_seen": 6549456, "step": 72735 }, { "epoch": 18.903326403326403, "grad_norm": 0.0023749915417283773, "learning_rate": 4.5677085616483427e-07, "loss": 0.1402, "num_input_tokens_seen": 6549904, "step": 72740 }, { "epoch": 18.90462577962578, "grad_norm": 0.22548061609268188, "learning_rate": 4.5569265572054655e-07, "loss": 0.001, "num_input_tokens_seen": 6550352, "step": 72745 }, { "epoch": 18.905925155925157, "grad_norm": 1.5765609741210938, "learning_rate": 4.5461571761350465e-07, "loss": 0.0031, "num_input_tokens_seen": 6550768, "step": 72750 }, { "epoch": 18.907224532224532, "grad_norm": 40.125030517578125, "learning_rate": 4.5354004189909203e-07, "loss": 0.1577, "num_input_tokens_seen": 6551200, "step": 72755 }, { "epoch": 18.908523908523907, "grad_norm": 0.24869148433208466, "learning_rate": 4.52465628632634e-07, "loss": 0.0004, "num_input_tokens_seen": 6551648, "step": 72760 }, { "epoch": 18.909823284823286, "grad_norm": 3.5936832427978516, "learning_rate": 4.51392477869389e-07, "loss": 0.0407, "num_input_tokens_seen": 6552080, "step": 72765 }, { "epoch": 18.91112266112266, "grad_norm": 0.16329240798950195, "learning_rate": 4.503205896645518e-07, "loss": 0.0007, "num_input_tokens_seen": 6552560, "step": 72770 }, { "epoch": 18.912422037422036, "grad_norm": 0.02205575257539749, "learning_rate": 4.492499640732478e-07, "loss": 0.0338, "num_input_tokens_seen": 6553024, "step": 72775 }, { "epoch": 18.913721413721415, "grad_norm": 0.0008385093533433974, "learning_rate": 4.4818060115054406e-07, "loss": 0.0003, "num_input_tokens_seen": 6553472, "step": 72780 }, { "epoch": 18.91502079002079, "grad_norm": 0.20455646514892578, "learning_rate": 4.4711250095143267e-07, "loss": 0.2525, "num_input_tokens_seen": 6553920, "step": 72785 }, { "epoch": 18.916320166320165, "grad_norm": 0.00342776020988822, "learning_rate": 4.4604566353085296e-07, "loss": 0.0004, "num_input_tokens_seen": 6554352, "step": 72790 }, { "epoch": 18.917619542619544, "grad_norm": 0.005047073122113943, "learning_rate": 4.4498008894367227e-07, "loss": 0.2452, "num_input_tokens_seen": 6554800, "step": 72795 }, { "epoch": 18.91891891891892, "grad_norm": 0.007085287943482399, "learning_rate": 4.4391577724469114e-07, "loss": 0.0012, "num_input_tokens_seen": 6555248, "step": 72800 }, { "epoch": 18.920218295218294, "grad_norm": 0.02136324718594551, "learning_rate": 4.428527284886519e-07, "loss": 0.0015, "num_input_tokens_seen": 6555680, "step": 72805 }, { "epoch": 18.921517671517673, "grad_norm": 0.01149732992053032, "learning_rate": 4.417909427302247e-07, "loss": 0.1076, "num_input_tokens_seen": 6556160, "step": 72810 }, { "epoch": 18.92281704781705, "grad_norm": 0.006090066395699978, "learning_rate": 4.407304200240214e-07, "loss": 0.0018, "num_input_tokens_seen": 6556592, "step": 72815 }, { "epoch": 18.924116424116423, "grad_norm": 0.009586267173290253, "learning_rate": 4.3967116042458177e-07, "loss": 0.0001, "num_input_tokens_seen": 6557056, "step": 72820 }, { "epoch": 18.9254158004158, "grad_norm": 65.06705474853516, "learning_rate": 4.3861316398638995e-07, "loss": 0.1748, "num_input_tokens_seen": 6557488, "step": 72825 }, { "epoch": 18.926715176715177, "grad_norm": 8.619494438171387, "learning_rate": 4.3755643076385243e-07, "loss": 0.1649, "num_input_tokens_seen": 6557952, "step": 72830 }, { "epoch": 18.928014553014552, "grad_norm": 0.02113008312880993, "learning_rate": 4.365009608113285e-07, "loss": 0.0003, "num_input_tokens_seen": 6558400, "step": 72835 }, { "epoch": 18.929313929313928, "grad_norm": 24.48946762084961, "learning_rate": 4.3544675418309144e-07, "loss": 0.1875, "num_input_tokens_seen": 6558832, "step": 72840 }, { "epoch": 18.930613305613306, "grad_norm": 0.3202703297138214, "learning_rate": 4.343938109333645e-07, "loss": 0.1568, "num_input_tokens_seen": 6559264, "step": 72845 }, { "epoch": 18.93191268191268, "grad_norm": 0.012687393464148045, "learning_rate": 4.3334213111629883e-07, "loss": 0.038, "num_input_tokens_seen": 6559744, "step": 72850 }, { "epoch": 18.933212058212057, "grad_norm": 0.0010809648083522916, "learning_rate": 4.3229171478599283e-07, "loss": 0.0541, "num_input_tokens_seen": 6560208, "step": 72855 }, { "epoch": 18.934511434511435, "grad_norm": 1.5782575607299805, "learning_rate": 4.3124256199645884e-07, "loss": 0.1658, "num_input_tokens_seen": 6560640, "step": 72860 }, { "epoch": 18.93581081081081, "grad_norm": 0.4993017017841339, "learning_rate": 4.30194672801662e-07, "loss": 0.0021, "num_input_tokens_seen": 6561088, "step": 72865 }, { "epoch": 18.937110187110186, "grad_norm": 0.0494360588490963, "learning_rate": 4.291480472554954e-07, "loss": 0.1235, "num_input_tokens_seen": 6561536, "step": 72870 }, { "epoch": 18.938409563409564, "grad_norm": 0.006027237046509981, "learning_rate": 4.281026854117853e-07, "loss": 0.111, "num_input_tokens_seen": 6561968, "step": 72875 }, { "epoch": 18.93970893970894, "grad_norm": 0.008895305916666985, "learning_rate": 4.2705858732429993e-07, "loss": 0.2155, "num_input_tokens_seen": 6562448, "step": 72880 }, { "epoch": 18.941008316008315, "grad_norm": 0.30703696608543396, "learning_rate": 4.2601575304673234e-07, "loss": 0.0088, "num_input_tokens_seen": 6562896, "step": 72885 }, { "epoch": 18.942307692307693, "grad_norm": 10.848581314086914, "learning_rate": 4.2497418263272583e-07, "loss": 0.0345, "num_input_tokens_seen": 6563344, "step": 72890 }, { "epoch": 18.94360706860707, "grad_norm": 0.20019838213920593, "learning_rate": 4.2393387613584025e-07, "loss": 0.0131, "num_input_tokens_seen": 6563840, "step": 72895 }, { "epoch": 18.944906444906444, "grad_norm": 0.026208333671092987, "learning_rate": 4.228948336095856e-07, "loss": 0.0002, "num_input_tokens_seen": 6564288, "step": 72900 }, { "epoch": 18.946205821205822, "grad_norm": 1.792523980140686, "learning_rate": 4.2185705510739415e-07, "loss": 0.0659, "num_input_tokens_seen": 6564736, "step": 72905 }, { "epoch": 18.947505197505198, "grad_norm": 11.802630424499512, "learning_rate": 4.208205406826482e-07, "loss": 0.0095, "num_input_tokens_seen": 6565200, "step": 72910 }, { "epoch": 18.948804573804573, "grad_norm": 2.285295009613037, "learning_rate": 4.1978529038864676e-07, "loss": 0.0799, "num_input_tokens_seen": 6565680, "step": 72915 }, { "epoch": 18.95010395010395, "grad_norm": 0.5001276135444641, "learning_rate": 4.187513042786445e-07, "loss": 0.0031, "num_input_tokens_seen": 6566112, "step": 72920 }, { "epoch": 18.951403326403327, "grad_norm": 0.002283309120684862, "learning_rate": 4.177185824058155e-07, "loss": 0.0006, "num_input_tokens_seen": 6566576, "step": 72925 }, { "epoch": 18.9527027027027, "grad_norm": 0.23218512535095215, "learning_rate": 4.166871248232729e-07, "loss": 0.0004, "num_input_tokens_seen": 6567024, "step": 72930 }, { "epoch": 18.95400207900208, "grad_norm": 0.0011159124551340938, "learning_rate": 4.1565693158406584e-07, "loss": 0.0002, "num_input_tokens_seen": 6567472, "step": 72935 }, { "epoch": 18.955301455301456, "grad_norm": 0.0010449910769239068, "learning_rate": 4.1462800274117697e-07, "loss": 0.081, "num_input_tokens_seen": 6567952, "step": 72940 }, { "epoch": 18.95660083160083, "grad_norm": 0.05740928649902344, "learning_rate": 4.136003383475251e-07, "loss": 0.0013, "num_input_tokens_seen": 6568352, "step": 72945 }, { "epoch": 18.95790020790021, "grad_norm": 0.01801370456814766, "learning_rate": 4.1257393845596793e-07, "loss": 0.0145, "num_input_tokens_seen": 6568800, "step": 72950 }, { "epoch": 18.959199584199585, "grad_norm": 0.043273452669382095, "learning_rate": 4.11548803119291e-07, "loss": 0.1673, "num_input_tokens_seen": 6569232, "step": 72955 }, { "epoch": 18.96049896049896, "grad_norm": 0.0074255988001823425, "learning_rate": 4.105249323902188e-07, "loss": 0.2948, "num_input_tokens_seen": 6569680, "step": 72960 }, { "epoch": 18.96179833679834, "grad_norm": 0.008489991538226604, "learning_rate": 4.095023263214121e-07, "loss": 0.0049, "num_input_tokens_seen": 6570144, "step": 72965 }, { "epoch": 18.963097713097714, "grad_norm": 0.00837243627756834, "learning_rate": 4.0848098496545915e-07, "loss": 0.0001, "num_input_tokens_seen": 6570592, "step": 72970 }, { "epoch": 18.96439708939709, "grad_norm": 0.02053418569266796, "learning_rate": 4.0746090837489316e-07, "loss": 0.0033, "num_input_tokens_seen": 6571024, "step": 72975 }, { "epoch": 18.965696465696467, "grad_norm": 0.02160356380045414, "learning_rate": 4.064420966021748e-07, "loss": 0.0023, "num_input_tokens_seen": 6571504, "step": 72980 }, { "epoch": 18.966995841995843, "grad_norm": 45.359291076660156, "learning_rate": 4.0542454969970387e-07, "loss": 0.2672, "num_input_tokens_seen": 6571936, "step": 72985 }, { "epoch": 18.968295218295218, "grad_norm": 0.08360521495342255, "learning_rate": 4.0440826771981354e-07, "loss": 0.0008, "num_input_tokens_seen": 6572400, "step": 72990 }, { "epoch": 18.969594594594593, "grad_norm": 0.000700654461979866, "learning_rate": 4.033932507147731e-07, "loss": 0.0289, "num_input_tokens_seen": 6572912, "step": 72995 }, { "epoch": 18.97089397089397, "grad_norm": 0.0015904353931546211, "learning_rate": 4.0237949873678516e-07, "loss": 0.0013, "num_input_tokens_seen": 6573360, "step": 73000 }, { "epoch": 18.972193347193347, "grad_norm": 0.15315338969230652, "learning_rate": 4.0136701183798866e-07, "loss": 0.1313, "num_input_tokens_seen": 6573824, "step": 73005 }, { "epoch": 18.973492723492722, "grad_norm": 0.004081788007169962, "learning_rate": 4.0035579007045577e-07, "loss": 0.0682, "num_input_tokens_seen": 6574304, "step": 73010 }, { "epoch": 18.9747920997921, "grad_norm": 87.35199737548828, "learning_rate": 3.993458334861949e-07, "loss": 0.3422, "num_input_tokens_seen": 6574752, "step": 73015 }, { "epoch": 18.976091476091476, "grad_norm": 0.05543221905827522, "learning_rate": 3.9833714213714513e-07, "loss": 0.0139, "num_input_tokens_seen": 6575200, "step": 73020 }, { "epoch": 18.97739085239085, "grad_norm": 0.08747672289609909, "learning_rate": 3.9732971607519265e-07, "loss": 0.0924, "num_input_tokens_seen": 6575648, "step": 73025 }, { "epoch": 18.97869022869023, "grad_norm": 0.23939085006713867, "learning_rate": 3.9632355535214603e-07, "loss": 0.001, "num_input_tokens_seen": 6576080, "step": 73030 }, { "epoch": 18.979989604989605, "grad_norm": 34.491329193115234, "learning_rate": 3.9531866001975003e-07, "loss": 0.0906, "num_input_tokens_seen": 6576512, "step": 73035 }, { "epoch": 18.98128898128898, "grad_norm": 0.010028728283941746, "learning_rate": 3.9431503012969384e-07, "loss": 0.0028, "num_input_tokens_seen": 6576944, "step": 73040 }, { "epoch": 18.98258835758836, "grad_norm": 2.282048463821411, "learning_rate": 3.933126657335889e-07, "loss": 0.299, "num_input_tokens_seen": 6577360, "step": 73045 }, { "epoch": 18.983887733887734, "grad_norm": 0.3307265043258667, "learning_rate": 3.9231156688299406e-07, "loss": 0.0198, "num_input_tokens_seen": 6577792, "step": 73050 }, { "epoch": 18.98518711018711, "grad_norm": 65.97535705566406, "learning_rate": 3.9131173362939033e-07, "loss": 0.4137, "num_input_tokens_seen": 6578192, "step": 73055 }, { "epoch": 18.986486486486488, "grad_norm": 0.002353448886424303, "learning_rate": 3.9031316602420323e-07, "loss": 0.3114, "num_input_tokens_seen": 6578640, "step": 73060 }, { "epoch": 18.987785862785863, "grad_norm": 0.007467319257557392, "learning_rate": 3.8931586411879163e-07, "loss": 0.0019, "num_input_tokens_seen": 6579104, "step": 73065 }, { "epoch": 18.989085239085238, "grad_norm": 0.0009443306480534375, "learning_rate": 3.8831982796444233e-07, "loss": 0.0033, "num_input_tokens_seen": 6579600, "step": 73070 }, { "epoch": 18.990384615384617, "grad_norm": 0.1341378092765808, "learning_rate": 3.8732505761239215e-07, "loss": 0.0577, "num_input_tokens_seen": 6580032, "step": 73075 }, { "epoch": 18.991683991683992, "grad_norm": 4.7723307609558105, "learning_rate": 3.8633155311379174e-07, "loss": 0.2154, "num_input_tokens_seen": 6580496, "step": 73080 }, { "epoch": 18.992983367983367, "grad_norm": 75.59659576416016, "learning_rate": 3.853393145197448e-07, "loss": 0.4811, "num_input_tokens_seen": 6580960, "step": 73085 }, { "epoch": 18.994282744282746, "grad_norm": 0.010178862139582634, "learning_rate": 3.8434834188128266e-07, "loss": 0.0326, "num_input_tokens_seen": 6581424, "step": 73090 }, { "epoch": 18.99558212058212, "grad_norm": 0.30453377962112427, "learning_rate": 3.8335863524936733e-07, "loss": 0.0011, "num_input_tokens_seen": 6581872, "step": 73095 }, { "epoch": 18.996881496881496, "grad_norm": 0.39087745547294617, "learning_rate": 3.8237019467490533e-07, "loss": 0.4712, "num_input_tokens_seen": 6582352, "step": 73100 }, { "epoch": 18.998180873180875, "grad_norm": 0.0256006121635437, "learning_rate": 3.8138302020873373e-07, "loss": 0.0948, "num_input_tokens_seen": 6582816, "step": 73105 }, { "epoch": 18.99948024948025, "grad_norm": 0.019375551491975784, "learning_rate": 3.803971119016203e-07, "loss": 0.3151, "num_input_tokens_seen": 6583264, "step": 73110 }, { "epoch": 19.0, "eval_loss": 0.9201962947845459, "eval_runtime": 13.1471, "eval_samples_per_second": 65.11, "eval_steps_per_second": 32.555, "num_input_tokens_seen": 6583408, "step": 73112 }, { "epoch": 19.000779625779625, "grad_norm": 0.5709923505783081, "learning_rate": 3.7941246980427445e-07, "loss": 0.181, "num_input_tokens_seen": 6583664, "step": 73115 }, { "epoch": 19.002079002079004, "grad_norm": 0.8985303640365601, "learning_rate": 3.7842909396733627e-07, "loss": 0.0023, "num_input_tokens_seen": 6584112, "step": 73120 }, { "epoch": 19.00337837837838, "grad_norm": 0.8823784589767456, "learning_rate": 3.774469844413792e-07, "loss": 0.0042, "num_input_tokens_seen": 6584560, "step": 73125 }, { "epoch": 19.004677754677754, "grad_norm": 0.01011077780276537, "learning_rate": 3.7646614127691546e-07, "loss": 0.0029, "num_input_tokens_seen": 6584960, "step": 73130 }, { "epoch": 19.00597713097713, "grad_norm": 0.5953271985054016, "learning_rate": 3.7548656452438826e-07, "loss": 0.0245, "num_input_tokens_seen": 6585392, "step": 73135 }, { "epoch": 19.007276507276508, "grad_norm": 71.99679565429688, "learning_rate": 3.7450825423418767e-07, "loss": 0.2734, "num_input_tokens_seen": 6585856, "step": 73140 }, { "epoch": 19.008575883575883, "grad_norm": 73.96968078613281, "learning_rate": 3.7353121045661797e-07, "loss": 0.144, "num_input_tokens_seen": 6586336, "step": 73145 }, { "epoch": 19.00987525987526, "grad_norm": 0.0007401621551252902, "learning_rate": 3.725554332419362e-07, "loss": 0.0002, "num_input_tokens_seen": 6586752, "step": 73150 }, { "epoch": 19.011174636174637, "grad_norm": 0.41696375608444214, "learning_rate": 3.715809226403244e-07, "loss": 0.0164, "num_input_tokens_seen": 6587216, "step": 73155 }, { "epoch": 19.012474012474012, "grad_norm": 2.280534267425537, "learning_rate": 3.706076787019036e-07, "loss": 0.0013, "num_input_tokens_seen": 6587680, "step": 73160 }, { "epoch": 19.013773388773387, "grad_norm": 0.01866532303392887, "learning_rate": 3.6963570147672824e-07, "loss": 0.0023, "num_input_tokens_seen": 6588176, "step": 73165 }, { "epoch": 19.015072765072766, "grad_norm": 12.286248207092285, "learning_rate": 3.686649910147888e-07, "loss": 0.0202, "num_input_tokens_seen": 6588624, "step": 73170 }, { "epoch": 19.01637214137214, "grad_norm": 45.24934005737305, "learning_rate": 3.676955473660093e-07, "loss": 0.0429, "num_input_tokens_seen": 6589040, "step": 73175 }, { "epoch": 19.017671517671516, "grad_norm": 48.44340896606445, "learning_rate": 3.6672737058025266e-07, "loss": 0.1748, "num_input_tokens_seen": 6589456, "step": 73180 }, { "epoch": 19.018970893970895, "grad_norm": 0.017064008861780167, "learning_rate": 3.6576046070730675e-07, "loss": 0.0015, "num_input_tokens_seen": 6589872, "step": 73185 }, { "epoch": 19.02027027027027, "grad_norm": 0.0012084590271115303, "learning_rate": 3.6479481779690403e-07, "loss": 0.002, "num_input_tokens_seen": 6590304, "step": 73190 }, { "epoch": 19.021569646569645, "grad_norm": 8.4082670211792, "learning_rate": 3.6383044189870763e-07, "loss": 0.0088, "num_input_tokens_seen": 6590720, "step": 73195 }, { "epoch": 19.022869022869024, "grad_norm": 0.0021843761205673218, "learning_rate": 3.628673330623139e-07, "loss": 0.0055, "num_input_tokens_seen": 6591200, "step": 73200 }, { "epoch": 19.0241683991684, "grad_norm": 22.259246826171875, "learning_rate": 3.619054913372638e-07, "loss": 0.0221, "num_input_tokens_seen": 6591632, "step": 73205 }, { "epoch": 19.025467775467774, "grad_norm": 0.36311423778533936, "learning_rate": 3.609449167730206e-07, "loss": 0.0323, "num_input_tokens_seen": 6592080, "step": 73210 }, { "epoch": 19.026767151767153, "grad_norm": 0.021427327767014503, "learning_rate": 3.5998560941898907e-07, "loss": 0.0015, "num_input_tokens_seen": 6592560, "step": 73215 }, { "epoch": 19.028066528066528, "grad_norm": 0.22756347060203552, "learning_rate": 3.5902756932450486e-07, "loss": 0.0006, "num_input_tokens_seen": 6593024, "step": 73220 }, { "epoch": 19.029365904365903, "grad_norm": 0.08147229254245758, "learning_rate": 3.5807079653884526e-07, "loss": 0.0094, "num_input_tokens_seen": 6593472, "step": 73225 }, { "epoch": 19.030665280665282, "grad_norm": 0.13544616103172302, "learning_rate": 3.5711529111121244e-07, "loss": 0.0335, "num_input_tokens_seen": 6593952, "step": 73230 }, { "epoch": 19.031964656964657, "grad_norm": 0.0020910368766635656, "learning_rate": 3.561610530907505e-07, "loss": 0.2378, "num_input_tokens_seen": 6594384, "step": 73235 }, { "epoch": 19.033264033264032, "grad_norm": 0.5513428449630737, "learning_rate": 3.552080825265397e-07, "loss": 0.1014, "num_input_tokens_seen": 6594800, "step": 73240 }, { "epoch": 19.03456340956341, "grad_norm": 42.00167465209961, "learning_rate": 3.5425637946759347e-07, "loss": 0.0577, "num_input_tokens_seen": 6595264, "step": 73245 }, { "epoch": 19.035862785862786, "grad_norm": 53.11301040649414, "learning_rate": 3.533059439628561e-07, "loss": 0.1905, "num_input_tokens_seen": 6595712, "step": 73250 }, { "epoch": 19.03716216216216, "grad_norm": 0.024298274889588356, "learning_rate": 3.523567760612051e-07, "loss": 0.0002, "num_input_tokens_seen": 6596176, "step": 73255 }, { "epoch": 19.03846153846154, "grad_norm": 86.43136596679688, "learning_rate": 3.5140887581146534e-07, "loss": 0.1467, "num_input_tokens_seen": 6596592, "step": 73260 }, { "epoch": 19.039760914760915, "grad_norm": 3.188858985900879, "learning_rate": 3.5046224326238107e-07, "loss": 0.1366, "num_input_tokens_seen": 6597040, "step": 73265 }, { "epoch": 19.04106029106029, "grad_norm": 0.20950748026371002, "learning_rate": 3.49516878462644e-07, "loss": 0.002, "num_input_tokens_seen": 6597520, "step": 73270 }, { "epoch": 19.04235966735967, "grad_norm": 96.45437622070312, "learning_rate": 3.485727814608708e-07, "loss": 0.1164, "num_input_tokens_seen": 6597968, "step": 73275 }, { "epoch": 19.043659043659044, "grad_norm": 0.0012066798517480493, "learning_rate": 3.476299523056198e-07, "loss": 0.0009, "num_input_tokens_seen": 6598400, "step": 73280 }, { "epoch": 19.04495841995842, "grad_norm": 0.007021081633865833, "learning_rate": 3.4668839104538273e-07, "loss": 0.0007, "num_input_tokens_seen": 6598848, "step": 73285 }, { "epoch": 19.046257796257795, "grad_norm": 0.16992245614528656, "learning_rate": 3.457480977285821e-07, "loss": 0.008, "num_input_tokens_seen": 6599280, "step": 73290 }, { "epoch": 19.047557172557173, "grad_norm": 25.25705337524414, "learning_rate": 3.4480907240357906e-07, "loss": 0.0138, "num_input_tokens_seen": 6599728, "step": 73295 }, { "epoch": 19.04885654885655, "grad_norm": 0.5209349989891052, "learning_rate": 3.438713151186712e-07, "loss": 0.4383, "num_input_tokens_seen": 6600176, "step": 73300 }, { "epoch": 19.050155925155924, "grad_norm": 0.27060818672180176, "learning_rate": 3.429348259220838e-07, "loss": 0.091, "num_input_tokens_seen": 6600608, "step": 73305 }, { "epoch": 19.051455301455302, "grad_norm": 0.0036456340458244085, "learning_rate": 3.419996048619839e-07, "loss": 0.0368, "num_input_tokens_seen": 6601040, "step": 73310 }, { "epoch": 19.052754677754677, "grad_norm": 0.8267706036567688, "learning_rate": 3.410656519864719e-07, "loss": 0.245, "num_input_tokens_seen": 6601472, "step": 73315 }, { "epoch": 19.054054054054053, "grad_norm": 20.45001983642578, "learning_rate": 3.401329673435788e-07, "loss": 0.3608, "num_input_tokens_seen": 6601952, "step": 73320 }, { "epoch": 19.05535343035343, "grad_norm": 1.7003616094589233, "learning_rate": 3.3920155098127457e-07, "loss": 0.017, "num_input_tokens_seen": 6602416, "step": 73325 }, { "epoch": 19.056652806652806, "grad_norm": 0.6582050919532776, "learning_rate": 3.382714029474654e-07, "loss": 0.0056, "num_input_tokens_seen": 6602864, "step": 73330 }, { "epoch": 19.05795218295218, "grad_norm": 0.008920452557504177, "learning_rate": 3.3734252328998795e-07, "loss": 0.186, "num_input_tokens_seen": 6603296, "step": 73335 }, { "epoch": 19.05925155925156, "grad_norm": 0.040415845811367035, "learning_rate": 3.3641491205661236e-07, "loss": 0.0641, "num_input_tokens_seen": 6603728, "step": 73340 }, { "epoch": 19.060550935550935, "grad_norm": 0.630355715751648, "learning_rate": 3.3548856929505047e-07, "loss": 0.0021, "num_input_tokens_seen": 6604176, "step": 73345 }, { "epoch": 19.06185031185031, "grad_norm": 0.2403770536184311, "learning_rate": 3.345634950529419e-07, "loss": 0.1468, "num_input_tokens_seen": 6604624, "step": 73350 }, { "epoch": 19.06314968814969, "grad_norm": 0.006677126511931419, "learning_rate": 3.336396893778709e-07, "loss": 0.0018, "num_input_tokens_seen": 6605120, "step": 73355 }, { "epoch": 19.064449064449065, "grad_norm": 0.00727069890126586, "learning_rate": 3.3271715231734113e-07, "loss": 0.0011, "num_input_tokens_seen": 6605536, "step": 73360 }, { "epoch": 19.06574844074844, "grad_norm": 0.5528377890586853, "learning_rate": 3.317958839188062e-07, "loss": 0.0086, "num_input_tokens_seen": 6606000, "step": 73365 }, { "epoch": 19.06704781704782, "grad_norm": 0.20296531915664673, "learning_rate": 3.3087588422964223e-07, "loss": 0.1659, "num_input_tokens_seen": 6606480, "step": 73370 }, { "epoch": 19.068347193347194, "grad_norm": 0.006535154301673174, "learning_rate": 3.2995715329716957e-07, "loss": 0.0435, "num_input_tokens_seen": 6606912, "step": 73375 }, { "epoch": 19.06964656964657, "grad_norm": 3.4588520526885986, "learning_rate": 3.2903969116863667e-07, "loss": 0.002, "num_input_tokens_seen": 6607328, "step": 73380 }, { "epoch": 19.070945945945947, "grad_norm": 82.18548583984375, "learning_rate": 3.2812349789123063e-07, "loss": 0.1614, "num_input_tokens_seen": 6607760, "step": 73385 }, { "epoch": 19.072245322245323, "grad_norm": 0.0005259964382275939, "learning_rate": 3.272085735120778e-07, "loss": 0.002, "num_input_tokens_seen": 6608208, "step": 73390 }, { "epoch": 19.073544698544698, "grad_norm": 0.0030528302304446697, "learning_rate": 3.2629491807822375e-07, "loss": 0.0986, "num_input_tokens_seen": 6608656, "step": 73395 }, { "epoch": 19.074844074844076, "grad_norm": 0.22811301052570343, "learning_rate": 3.253825316366643e-07, "loss": 0.0027, "num_input_tokens_seen": 6609088, "step": 73400 }, { "epoch": 19.07614345114345, "grad_norm": 0.04747813940048218, "learning_rate": 3.24471414234323e-07, "loss": 0.0001, "num_input_tokens_seen": 6609568, "step": 73405 }, { "epoch": 19.077442827442827, "grad_norm": 0.017287852242588997, "learning_rate": 3.2356156591805966e-07, "loss": 0.0001, "num_input_tokens_seen": 6610000, "step": 73410 }, { "epoch": 19.078742203742205, "grad_norm": 0.8949039578437805, "learning_rate": 3.226529867346673e-07, "loss": 0.0014, "num_input_tokens_seen": 6610448, "step": 73415 }, { "epoch": 19.08004158004158, "grad_norm": 0.13021604716777802, "learning_rate": 3.2174567673088077e-07, "loss": 0.0002, "num_input_tokens_seen": 6610928, "step": 73420 }, { "epoch": 19.081340956340956, "grad_norm": 0.0187725480645895, "learning_rate": 3.208396359533572e-07, "loss": 0.1347, "num_input_tokens_seen": 6611376, "step": 73425 }, { "epoch": 19.08264033264033, "grad_norm": 0.4544132947921753, "learning_rate": 3.1993486444869823e-07, "loss": 0.0173, "num_input_tokens_seen": 6611824, "step": 73430 }, { "epoch": 19.08393970893971, "grad_norm": 0.005700364243239164, "learning_rate": 3.190313622634333e-07, "loss": 0.0116, "num_input_tokens_seen": 6612256, "step": 73435 }, { "epoch": 19.085239085239085, "grad_norm": 0.07056230306625366, "learning_rate": 3.1812912944403915e-07, "loss": 0.1345, "num_input_tokens_seen": 6612704, "step": 73440 }, { "epoch": 19.08653846153846, "grad_norm": 0.6718626022338867, "learning_rate": 3.172281660369092e-07, "loss": 0.0006, "num_input_tokens_seen": 6613152, "step": 73445 }, { "epoch": 19.08783783783784, "grad_norm": 0.03370799496769905, "learning_rate": 3.163284720883841e-07, "loss": 0.3105, "num_input_tokens_seen": 6613616, "step": 73450 }, { "epoch": 19.089137214137214, "grad_norm": 0.12357795238494873, "learning_rate": 3.1543004764473805e-07, "loss": 0.0005, "num_input_tokens_seen": 6614080, "step": 73455 }, { "epoch": 19.09043659043659, "grad_norm": 0.007415059022605419, "learning_rate": 3.145328927521757e-07, "loss": 0.0759, "num_input_tokens_seen": 6614560, "step": 73460 }, { "epoch": 19.091735966735968, "grad_norm": 101.170654296875, "learning_rate": 3.1363700745684065e-07, "loss": 0.1129, "num_input_tokens_seen": 6615008, "step": 73465 }, { "epoch": 19.093035343035343, "grad_norm": 0.0677856057882309, "learning_rate": 3.1274239180480446e-07, "loss": 0.2572, "num_input_tokens_seen": 6615440, "step": 73470 }, { "epoch": 19.094334719334718, "grad_norm": 0.5314732193946838, "learning_rate": 3.1184904584208586e-07, "loss": 0.3657, "num_input_tokens_seen": 6615888, "step": 73475 }, { "epoch": 19.095634095634097, "grad_norm": 0.8929476737976074, "learning_rate": 3.109569696146231e-07, "loss": 0.2061, "num_input_tokens_seen": 6616352, "step": 73480 }, { "epoch": 19.096933471933472, "grad_norm": 0.03478556126356125, "learning_rate": 3.1006616316829886e-07, "loss": 0.0186, "num_input_tokens_seen": 6616784, "step": 73485 }, { "epoch": 19.098232848232847, "grad_norm": 0.005578994285315275, "learning_rate": 3.0917662654892654e-07, "loss": 0.1593, "num_input_tokens_seen": 6617248, "step": 73490 }, { "epoch": 19.099532224532226, "grad_norm": 1.5593671798706055, "learning_rate": 3.082883598022612e-07, "loss": 0.0017, "num_input_tokens_seen": 6617728, "step": 73495 }, { "epoch": 19.1008316008316, "grad_norm": 0.01038308721035719, "learning_rate": 3.0740136297398305e-07, "loss": 0.0014, "num_input_tokens_seen": 6618192, "step": 73500 }, { "epoch": 19.102130977130976, "grad_norm": 0.001483589643612504, "learning_rate": 3.065156361097138e-07, "loss": 0.0014, "num_input_tokens_seen": 6618624, "step": 73505 }, { "epoch": 19.103430353430355, "grad_norm": 14.467572212219238, "learning_rate": 3.0563117925500595e-07, "loss": 0.2685, "num_input_tokens_seen": 6619056, "step": 73510 }, { "epoch": 19.10472972972973, "grad_norm": 0.8016976118087769, "learning_rate": 3.0474799245534537e-07, "loss": 0.0198, "num_input_tokens_seen": 6619488, "step": 73515 }, { "epoch": 19.106029106029105, "grad_norm": 0.619965136051178, "learning_rate": 3.038660757561568e-07, "loss": 0.0005, "num_input_tokens_seen": 6619968, "step": 73520 }, { "epoch": 19.107328482328484, "grad_norm": 29.35837173461914, "learning_rate": 3.0298542920279835e-07, "loss": 0.0239, "num_input_tokens_seen": 6620400, "step": 73525 }, { "epoch": 19.10862785862786, "grad_norm": 1.3602505922317505, "learning_rate": 3.021060528405645e-07, "loss": 0.0039, "num_input_tokens_seen": 6620848, "step": 73530 }, { "epoch": 19.109927234927234, "grad_norm": 0.007757263258099556, "learning_rate": 3.0122794671468004e-07, "loss": 0.1192, "num_input_tokens_seen": 6621280, "step": 73535 }, { "epoch": 19.111226611226613, "grad_norm": 0.019413165748119354, "learning_rate": 3.0035111087030885e-07, "loss": 0.0001, "num_input_tokens_seen": 6621760, "step": 73540 }, { "epoch": 19.112525987525988, "grad_norm": 0.36361536383628845, "learning_rate": 2.9947554535254275e-07, "loss": 0.0063, "num_input_tokens_seen": 6622192, "step": 73545 }, { "epoch": 19.113825363825363, "grad_norm": 5.45585298538208, "learning_rate": 2.9860125020642063e-07, "loss": 0.0938, "num_input_tokens_seen": 6622624, "step": 73550 }, { "epoch": 19.11512474012474, "grad_norm": 7.745334148406982, "learning_rate": 2.977282254769009e-07, "loss": 0.0062, "num_input_tokens_seen": 6623072, "step": 73555 }, { "epoch": 19.116424116424117, "grad_norm": 0.010724090971052647, "learning_rate": 2.968564712088867e-07, "loss": 0.2992, "num_input_tokens_seen": 6623488, "step": 73560 }, { "epoch": 19.117723492723492, "grad_norm": 0.09899508953094482, "learning_rate": 2.959859874472143e-07, "loss": 0.3634, "num_input_tokens_seen": 6623952, "step": 73565 }, { "epoch": 19.11902286902287, "grad_norm": 0.007691269274801016, "learning_rate": 2.951167742366534e-07, "loss": 0.0005, "num_input_tokens_seen": 6624416, "step": 73570 }, { "epoch": 19.120322245322246, "grad_norm": 0.13288645446300507, "learning_rate": 2.9424883162191e-07, "loss": 0.0006, "num_input_tokens_seen": 6624848, "step": 73575 }, { "epoch": 19.12162162162162, "grad_norm": 0.2483239322900772, "learning_rate": 2.933821596476177e-07, "loss": 0.0194, "num_input_tokens_seen": 6625280, "step": 73580 }, { "epoch": 19.122920997920996, "grad_norm": 0.021754451096057892, "learning_rate": 2.925167583583577e-07, "loss": 0.0308, "num_input_tokens_seen": 6625696, "step": 73585 }, { "epoch": 19.124220374220375, "grad_norm": 0.7964653372764587, "learning_rate": 2.9165262779863036e-07, "loss": 0.1281, "num_input_tokens_seen": 6626112, "step": 73590 }, { "epoch": 19.12551975051975, "grad_norm": 0.2105976790189743, "learning_rate": 2.907897680128835e-07, "loss": 0.0815, "num_input_tokens_seen": 6626528, "step": 73595 }, { "epoch": 19.126819126819125, "grad_norm": 0.36789363622665405, "learning_rate": 2.899281790454927e-07, "loss": 0.1224, "num_input_tokens_seen": 6626976, "step": 73600 }, { "epoch": 19.128118503118504, "grad_norm": 0.00882481038570404, "learning_rate": 2.890678609407754e-07, "loss": 0.0003, "num_input_tokens_seen": 6627440, "step": 73605 }, { "epoch": 19.12941787941788, "grad_norm": 1.7322688102722168, "learning_rate": 2.8820881374297113e-07, "loss": 0.0971, "num_input_tokens_seen": 6627872, "step": 73610 }, { "epoch": 19.130717255717254, "grad_norm": 0.6166471242904663, "learning_rate": 2.8735103749626955e-07, "loss": 0.0007, "num_input_tokens_seen": 6628368, "step": 73615 }, { "epoch": 19.132016632016633, "grad_norm": 0.25291380286216736, "learning_rate": 2.8649453224477986e-07, "loss": 0.0004, "num_input_tokens_seen": 6628848, "step": 73620 }, { "epoch": 19.133316008316008, "grad_norm": 125.7652359008789, "learning_rate": 2.856392980325556e-07, "loss": 0.5107, "num_input_tokens_seen": 6629280, "step": 73625 }, { "epoch": 19.134615384615383, "grad_norm": 10.559093475341797, "learning_rate": 2.8478533490358395e-07, "loss": 0.0039, "num_input_tokens_seen": 6629792, "step": 73630 }, { "epoch": 19.135914760914762, "grad_norm": 1.6487071514129639, "learning_rate": 2.8393264290178243e-07, "loss": 0.0057, "num_input_tokens_seen": 6630240, "step": 73635 }, { "epoch": 19.137214137214137, "grad_norm": 8.543852806091309, "learning_rate": 2.8308122207100773e-07, "loss": 0.0065, "num_input_tokens_seen": 6630688, "step": 73640 }, { "epoch": 19.138513513513512, "grad_norm": 0.17596057057380676, "learning_rate": 2.82231072455047e-07, "loss": 0.1871, "num_input_tokens_seen": 6631136, "step": 73645 }, { "epoch": 19.13981288981289, "grad_norm": 0.022767161950469017, "learning_rate": 2.8138219409762633e-07, "loss": 0.1032, "num_input_tokens_seen": 6631552, "step": 73650 }, { "epoch": 19.141112266112266, "grad_norm": 0.0020520833786576986, "learning_rate": 2.805345870424025e-07, "loss": 0.0162, "num_input_tokens_seen": 6631984, "step": 73655 }, { "epoch": 19.14241164241164, "grad_norm": 44.21857833862305, "learning_rate": 2.796882513329713e-07, "loss": 0.0685, "num_input_tokens_seen": 6632432, "step": 73660 }, { "epoch": 19.14371101871102, "grad_norm": 0.05644074082374573, "learning_rate": 2.7884318701285885e-07, "loss": 0.0066, "num_input_tokens_seen": 6632864, "step": 73665 }, { "epoch": 19.145010395010395, "grad_norm": 0.006371928378939629, "learning_rate": 2.779993941255277e-07, "loss": 0.2459, "num_input_tokens_seen": 6633296, "step": 73670 }, { "epoch": 19.14630977130977, "grad_norm": 0.7430940270423889, "learning_rate": 2.771568727143736e-07, "loss": 0.0173, "num_input_tokens_seen": 6633760, "step": 73675 }, { "epoch": 19.14760914760915, "grad_norm": 0.0034034245181828737, "learning_rate": 2.7631562282273425e-07, "loss": 0.0003, "num_input_tokens_seen": 6634272, "step": 73680 }, { "epoch": 19.148908523908524, "grad_norm": 0.12124892324209213, "learning_rate": 2.7547564449386664e-07, "loss": 0.2333, "num_input_tokens_seen": 6634704, "step": 73685 }, { "epoch": 19.1502079002079, "grad_norm": 0.1193549856543541, "learning_rate": 2.7463693777098065e-07, "loss": 0.0026, "num_input_tokens_seen": 6635232, "step": 73690 }, { "epoch": 19.151507276507278, "grad_norm": 0.0547344945371151, "learning_rate": 2.7379950269720565e-07, "loss": 0.0221, "num_input_tokens_seen": 6635648, "step": 73695 }, { "epoch": 19.152806652806653, "grad_norm": 2.028846263885498, "learning_rate": 2.729633393156128e-07, "loss": 0.0022, "num_input_tokens_seen": 6636096, "step": 73700 }, { "epoch": 19.15410602910603, "grad_norm": 0.0041887485422194, "learning_rate": 2.721284476692093e-07, "loss": 0.0086, "num_input_tokens_seen": 6636576, "step": 73705 }, { "epoch": 19.155405405405407, "grad_norm": 0.007721026428043842, "learning_rate": 2.7129482780093305e-07, "loss": 0.0395, "num_input_tokens_seen": 6637024, "step": 73710 }, { "epoch": 19.156704781704782, "grad_norm": 0.04124201461672783, "learning_rate": 2.7046247975365815e-07, "loss": 0.022, "num_input_tokens_seen": 6637472, "step": 73715 }, { "epoch": 19.158004158004157, "grad_norm": 104.29889678955078, "learning_rate": 2.6963140357018914e-07, "loss": 0.1497, "num_input_tokens_seen": 6637904, "step": 73720 }, { "epoch": 19.159303534303536, "grad_norm": 0.1538306474685669, "learning_rate": 2.6880159929327796e-07, "loss": 0.2852, "num_input_tokens_seen": 6638384, "step": 73725 }, { "epoch": 19.16060291060291, "grad_norm": 0.07416056096553802, "learning_rate": 2.679730669655933e-07, "loss": 0.094, "num_input_tokens_seen": 6638848, "step": 73730 }, { "epoch": 19.161902286902286, "grad_norm": 46.1646842956543, "learning_rate": 2.671458066297511e-07, "loss": 0.0205, "num_input_tokens_seen": 6639296, "step": 73735 }, { "epoch": 19.16320166320166, "grad_norm": 0.15274465084075928, "learning_rate": 2.66319818328295e-07, "loss": 0.0113, "num_input_tokens_seen": 6639760, "step": 73740 }, { "epoch": 19.16450103950104, "grad_norm": 0.08540447801351547, "learning_rate": 2.6549510210371607e-07, "loss": 0.0069, "num_input_tokens_seen": 6640192, "step": 73745 }, { "epoch": 19.165800415800415, "grad_norm": 0.019464876502752304, "learning_rate": 2.646716579984193e-07, "loss": 0.0141, "num_input_tokens_seen": 6640656, "step": 73750 }, { "epoch": 19.16709979209979, "grad_norm": 0.031092682853341103, "learning_rate": 2.638494860547597e-07, "loss": 0.004, "num_input_tokens_seen": 6641104, "step": 73755 }, { "epoch": 19.16839916839917, "grad_norm": 0.47452032566070557, "learning_rate": 2.6302858631502283e-07, "loss": 0.0015, "num_input_tokens_seen": 6641536, "step": 73760 }, { "epoch": 19.169698544698544, "grad_norm": 4.210328578948975, "learning_rate": 2.622089588214277e-07, "loss": 0.0195, "num_input_tokens_seen": 6642000, "step": 73765 }, { "epoch": 19.17099792099792, "grad_norm": 0.1857072412967682, "learning_rate": 2.613906036161268e-07, "loss": 0.1285, "num_input_tokens_seen": 6642480, "step": 73770 }, { "epoch": 19.1722972972973, "grad_norm": 0.06491146981716156, "learning_rate": 2.6057352074121134e-07, "loss": 0.1825, "num_input_tokens_seen": 6642912, "step": 73775 }, { "epoch": 19.173596673596673, "grad_norm": 26.65721893310547, "learning_rate": 2.597577102387061e-07, "loss": 0.055, "num_input_tokens_seen": 6643344, "step": 73780 }, { "epoch": 19.17489604989605, "grad_norm": 0.04619790241122246, "learning_rate": 2.5894317215056363e-07, "loss": 0.0002, "num_input_tokens_seen": 6643776, "step": 73785 }, { "epoch": 19.176195426195427, "grad_norm": 0.07469922304153442, "learning_rate": 2.5812990651868097e-07, "loss": 0.3333, "num_input_tokens_seen": 6644208, "step": 73790 }, { "epoch": 19.177494802494802, "grad_norm": 0.01821148209273815, "learning_rate": 2.5731791338488296e-07, "loss": 0.2218, "num_input_tokens_seen": 6644640, "step": 73795 }, { "epoch": 19.178794178794178, "grad_norm": 0.01653696596622467, "learning_rate": 2.5650719279093347e-07, "loss": 0.0042, "num_input_tokens_seen": 6645056, "step": 73800 }, { "epoch": 19.180093555093556, "grad_norm": 12.065557479858398, "learning_rate": 2.5569774477852695e-07, "loss": 0.2077, "num_input_tokens_seen": 6645504, "step": 73805 }, { "epoch": 19.18139293139293, "grad_norm": 0.13608887791633606, "learning_rate": 2.548895693892911e-07, "loss": 0.1037, "num_input_tokens_seen": 6645952, "step": 73810 }, { "epoch": 19.182692307692307, "grad_norm": 2.5288991928100586, "learning_rate": 2.540826666647955e-07, "loss": 0.012, "num_input_tokens_seen": 6646400, "step": 73815 }, { "epoch": 19.183991683991685, "grad_norm": 1.6563665866851807, "learning_rate": 2.532770366465431e-07, "loss": 0.3664, "num_input_tokens_seen": 6646864, "step": 73820 }, { "epoch": 19.18529106029106, "grad_norm": 8.149123191833496, "learning_rate": 2.524726793759591e-07, "loss": 0.3938, "num_input_tokens_seen": 6647296, "step": 73825 }, { "epoch": 19.186590436590436, "grad_norm": 2.6262423992156982, "learning_rate": 2.5166959489441866e-07, "loss": 0.0023, "num_input_tokens_seen": 6647696, "step": 73830 }, { "epoch": 19.187889812889814, "grad_norm": 0.6429327130317688, "learning_rate": 2.508677832432249e-07, "loss": 0.0016, "num_input_tokens_seen": 6648144, "step": 73835 }, { "epoch": 19.18918918918919, "grad_norm": 0.07549212127923965, "learning_rate": 2.500672444636143e-07, "loss": 0.0002, "num_input_tokens_seen": 6648576, "step": 73840 }, { "epoch": 19.190488565488565, "grad_norm": 0.0009301406098529696, "learning_rate": 2.4926797859675666e-07, "loss": 0.2185, "num_input_tokens_seen": 6648992, "step": 73845 }, { "epoch": 19.191787941787943, "grad_norm": 4.095149517059326, "learning_rate": 2.484699856837636e-07, "loss": 0.025, "num_input_tokens_seen": 6649408, "step": 73850 }, { "epoch": 19.19308731808732, "grad_norm": 13.978388786315918, "learning_rate": 2.4767326576567716e-07, "loss": 0.0157, "num_input_tokens_seen": 6649856, "step": 73855 }, { "epoch": 19.194386694386694, "grad_norm": 0.005331441294401884, "learning_rate": 2.468778188834675e-07, "loss": 0.0006, "num_input_tokens_seen": 6650352, "step": 73860 }, { "epoch": 19.195686070686072, "grad_norm": 0.463096022605896, "learning_rate": 2.4608364507805184e-07, "loss": 0.0015, "num_input_tokens_seen": 6650800, "step": 73865 }, { "epoch": 19.196985446985448, "grad_norm": 0.0038148141466081142, "learning_rate": 2.4529074439027244e-07, "loss": 0.0181, "num_input_tokens_seen": 6651232, "step": 73870 }, { "epoch": 19.198284823284823, "grad_norm": 1.1001440286636353, "learning_rate": 2.444991168609079e-07, "loss": 0.0019, "num_input_tokens_seen": 6651728, "step": 73875 }, { "epoch": 19.1995841995842, "grad_norm": 52.03031539916992, "learning_rate": 2.4370876253067277e-07, "loss": 0.042, "num_input_tokens_seen": 6652176, "step": 73880 }, { "epoch": 19.200883575883577, "grad_norm": 0.2022201418876648, "learning_rate": 2.4291968144021516e-07, "loss": 0.0008, "num_input_tokens_seen": 6652704, "step": 73885 }, { "epoch": 19.20218295218295, "grad_norm": 0.4158134460449219, "learning_rate": 2.421318736301192e-07, "loss": 0.0331, "num_input_tokens_seen": 6653136, "step": 73890 }, { "epoch": 19.203482328482327, "grad_norm": 0.0002584323228802532, "learning_rate": 2.4134533914090817e-07, "loss": 0.0144, "num_input_tokens_seen": 6653552, "step": 73895 }, { "epoch": 19.204781704781706, "grad_norm": 0.0034098143223673105, "learning_rate": 2.405600780130246e-07, "loss": 0.0006, "num_input_tokens_seen": 6654000, "step": 73900 }, { "epoch": 19.20608108108108, "grad_norm": 0.013898365199565887, "learning_rate": 2.397760902868612e-07, "loss": 0.0837, "num_input_tokens_seen": 6654416, "step": 73905 }, { "epoch": 19.207380457380456, "grad_norm": 2.12943959236145, "learning_rate": 2.3899337600273577e-07, "loss": 0.0014, "num_input_tokens_seen": 6654864, "step": 73910 }, { "epoch": 19.208679833679835, "grad_norm": 0.20070600509643555, "learning_rate": 2.382119352009049e-07, "loss": 0.0012, "num_input_tokens_seen": 6655312, "step": 73915 }, { "epoch": 19.20997920997921, "grad_norm": 35.156593322753906, "learning_rate": 2.374317679215643e-07, "loss": 0.0303, "num_input_tokens_seen": 6655744, "step": 73920 }, { "epoch": 19.211278586278585, "grad_norm": 0.1730467826128006, "learning_rate": 2.3665287420482907e-07, "loss": 0.0006, "num_input_tokens_seen": 6656208, "step": 73925 }, { "epoch": 19.212577962577964, "grad_norm": 0.050884246826171875, "learning_rate": 2.3587525409076716e-07, "loss": 0.0093, "num_input_tokens_seen": 6656672, "step": 73930 }, { "epoch": 19.21387733887734, "grad_norm": 0.4095219671726227, "learning_rate": 2.3509890761936882e-07, "loss": 0.0003, "num_input_tokens_seen": 6657104, "step": 73935 }, { "epoch": 19.215176715176714, "grad_norm": 0.0010152801405638456, "learning_rate": 2.3432383483056041e-07, "loss": 0.0003, "num_input_tokens_seen": 6657552, "step": 73940 }, { "epoch": 19.216476091476093, "grad_norm": 0.23498521745204926, "learning_rate": 2.335500357642073e-07, "loss": 0.0941, "num_input_tokens_seen": 6658016, "step": 73945 }, { "epoch": 19.217775467775468, "grad_norm": 0.5093395113945007, "learning_rate": 2.3277751046010543e-07, "loss": 0.0949, "num_input_tokens_seen": 6658464, "step": 73950 }, { "epoch": 19.219074844074843, "grad_norm": 55.86861038208008, "learning_rate": 2.3200625895798688e-07, "loss": 0.3579, "num_input_tokens_seen": 6658912, "step": 73955 }, { "epoch": 19.22037422037422, "grad_norm": 0.028188593685626984, "learning_rate": 2.3123628129751996e-07, "loss": 0.0003, "num_input_tokens_seen": 6659392, "step": 73960 }, { "epoch": 19.221673596673597, "grad_norm": 0.18377622961997986, "learning_rate": 2.3046757751830073e-07, "loss": 0.3203, "num_input_tokens_seen": 6659840, "step": 73965 }, { "epoch": 19.222972972972972, "grad_norm": 0.00216222763992846, "learning_rate": 2.297001476598698e-07, "loss": 0.0054, "num_input_tokens_seen": 6660256, "step": 73970 }, { "epoch": 19.22427234927235, "grad_norm": 0.7152166962623596, "learning_rate": 2.2893399176169284e-07, "loss": 0.1865, "num_input_tokens_seen": 6660688, "step": 73975 }, { "epoch": 19.225571725571726, "grad_norm": 0.022155893966555595, "learning_rate": 2.281691098631744e-07, "loss": 0.0002, "num_input_tokens_seen": 6661136, "step": 73980 }, { "epoch": 19.2268711018711, "grad_norm": 0.14037248492240906, "learning_rate": 2.274055020036553e-07, "loss": 0.0458, "num_input_tokens_seen": 6661584, "step": 73985 }, { "epoch": 19.22817047817048, "grad_norm": 0.1457076072692871, "learning_rate": 2.2664316822240407e-07, "loss": 0.461, "num_input_tokens_seen": 6662032, "step": 73990 }, { "epoch": 19.229469854469855, "grad_norm": 85.77902221679688, "learning_rate": 2.258821085586338e-07, "loss": 0.4063, "num_input_tokens_seen": 6662528, "step": 73995 }, { "epoch": 19.23076923076923, "grad_norm": 0.49621063470840454, "learning_rate": 2.251223230514854e-07, "loss": 0.0015, "num_input_tokens_seen": 6663024, "step": 74000 }, { "epoch": 19.23206860706861, "grad_norm": 57.578861236572266, "learning_rate": 2.243638117400332e-07, "loss": 0.0553, "num_input_tokens_seen": 6663456, "step": 74005 }, { "epoch": 19.233367983367984, "grad_norm": 0.01769283041357994, "learning_rate": 2.2360657466328761e-07, "loss": 0.2757, "num_input_tokens_seen": 6663904, "step": 74010 }, { "epoch": 19.23466735966736, "grad_norm": 0.0014242042088881135, "learning_rate": 2.228506118601953e-07, "loss": 0.1017, "num_input_tokens_seen": 6664320, "step": 74015 }, { "epoch": 19.235966735966738, "grad_norm": 0.34026482701301575, "learning_rate": 2.2209592336963625e-07, "loss": 0.0003, "num_input_tokens_seen": 6664736, "step": 74020 }, { "epoch": 19.237266112266113, "grad_norm": 0.528529703617096, "learning_rate": 2.2134250923042665e-07, "loss": 0.0556, "num_input_tokens_seen": 6665152, "step": 74025 }, { "epoch": 19.238565488565488, "grad_norm": 0.022282399237155914, "learning_rate": 2.205903694813133e-07, "loss": 0.0003, "num_input_tokens_seen": 6665648, "step": 74030 }, { "epoch": 19.239864864864863, "grad_norm": 8.482831001281738, "learning_rate": 2.198395041609791e-07, "loss": 0.5437, "num_input_tokens_seen": 6666096, "step": 74035 }, { "epoch": 19.241164241164242, "grad_norm": 0.026434019207954407, "learning_rate": 2.1908991330804319e-07, "loss": 0.0809, "num_input_tokens_seen": 6666512, "step": 74040 }, { "epoch": 19.242463617463617, "grad_norm": 0.11819993704557419, "learning_rate": 2.1834159696105528e-07, "loss": 0.1067, "num_input_tokens_seen": 6666992, "step": 74045 }, { "epoch": 19.243762993762992, "grad_norm": 0.05765635520219803, "learning_rate": 2.175945551585068e-07, "loss": 0.0866, "num_input_tokens_seen": 6667424, "step": 74050 }, { "epoch": 19.24506237006237, "grad_norm": 108.20036315917969, "learning_rate": 2.1684878793881146e-07, "loss": 0.2138, "num_input_tokens_seen": 6667936, "step": 74055 }, { "epoch": 19.246361746361746, "grad_norm": 0.06585089862346649, "learning_rate": 2.1610429534033027e-07, "loss": 0.1322, "num_input_tokens_seen": 6668384, "step": 74060 }, { "epoch": 19.24766112266112, "grad_norm": 0.3365626633167267, "learning_rate": 2.1536107740135482e-07, "loss": 0.1325, "num_input_tokens_seen": 6668864, "step": 74065 }, { "epoch": 19.2489604989605, "grad_norm": 0.004208672326058149, "learning_rate": 2.1461913416010448e-07, "loss": 0.334, "num_input_tokens_seen": 6669328, "step": 74070 }, { "epoch": 19.250259875259875, "grad_norm": 93.64778137207031, "learning_rate": 2.1387846565474045e-07, "loss": 0.3307, "num_input_tokens_seen": 6669776, "step": 74075 }, { "epoch": 19.25155925155925, "grad_norm": 0.4596381187438965, "learning_rate": 2.1313907192335724e-07, "loss": 0.2115, "num_input_tokens_seen": 6670256, "step": 74080 }, { "epoch": 19.25285862785863, "grad_norm": 0.0070414491929113865, "learning_rate": 2.1240095300397723e-07, "loss": 0.0194, "num_input_tokens_seen": 6670704, "step": 74085 }, { "epoch": 19.254158004158004, "grad_norm": 0.17086997628211975, "learning_rate": 2.1166410893457e-07, "loss": 0.0012, "num_input_tokens_seen": 6671184, "step": 74090 }, { "epoch": 19.25545738045738, "grad_norm": 0.07796265929937363, "learning_rate": 2.109285397530275e-07, "loss": 0.0327, "num_input_tokens_seen": 6671584, "step": 74095 }, { "epoch": 19.256756756756758, "grad_norm": 77.0494155883789, "learning_rate": 2.1019424549718335e-07, "loss": 0.4546, "num_input_tokens_seen": 6672016, "step": 74100 }, { "epoch": 19.258056133056133, "grad_norm": 0.009965356439352036, "learning_rate": 2.09461226204799e-07, "loss": 0.0009, "num_input_tokens_seen": 6672448, "step": 74105 }, { "epoch": 19.259355509355508, "grad_norm": 0.016040625050663948, "learning_rate": 2.0872948191358045e-07, "loss": 0.0022, "num_input_tokens_seen": 6672848, "step": 74110 }, { "epoch": 19.260654885654887, "grad_norm": 0.050006065517663956, "learning_rate": 2.0799901266115585e-07, "loss": 0.0004, "num_input_tokens_seen": 6673296, "step": 74115 }, { "epoch": 19.261954261954262, "grad_norm": 0.5711989998817444, "learning_rate": 2.0726981848509797e-07, "loss": 0.0009, "num_input_tokens_seen": 6673712, "step": 74120 }, { "epoch": 19.263253638253637, "grad_norm": 0.3924022912979126, "learning_rate": 2.0654189942290735e-07, "loss": 0.0146, "num_input_tokens_seen": 6674160, "step": 74125 }, { "epoch": 19.264553014553016, "grad_norm": 1.3189828395843506, "learning_rate": 2.0581525551202352e-07, "loss": 0.0035, "num_input_tokens_seen": 6674592, "step": 74130 }, { "epoch": 19.26585239085239, "grad_norm": 0.4529391825199127, "learning_rate": 2.0508988678981656e-07, "loss": 0.001, "num_input_tokens_seen": 6675008, "step": 74135 }, { "epoch": 19.267151767151766, "grad_norm": 0.019707972183823586, "learning_rate": 2.043657932935955e-07, "loss": 0.0118, "num_input_tokens_seen": 6675440, "step": 74140 }, { "epoch": 19.268451143451145, "grad_norm": 1.415910005569458, "learning_rate": 2.0364297506060003e-07, "loss": 0.1041, "num_input_tokens_seen": 6675888, "step": 74145 }, { "epoch": 19.26975051975052, "grad_norm": 0.38097622990608215, "learning_rate": 2.029214321280032e-07, "loss": 0.1016, "num_input_tokens_seen": 6676320, "step": 74150 }, { "epoch": 19.271049896049895, "grad_norm": 0.27326953411102295, "learning_rate": 2.0220116453291693e-07, "loss": 0.0476, "num_input_tokens_seen": 6676752, "step": 74155 }, { "epoch": 19.272349272349274, "grad_norm": 0.28435224294662476, "learning_rate": 2.0148217231238664e-07, "loss": 0.005, "num_input_tokens_seen": 6677168, "step": 74160 }, { "epoch": 19.27364864864865, "grad_norm": 0.016787152737379074, "learning_rate": 2.007644555033855e-07, "loss": 0.002, "num_input_tokens_seen": 6677632, "step": 74165 }, { "epoch": 19.274948024948024, "grad_norm": 1.4609819650650024, "learning_rate": 2.0004801414283402e-07, "loss": 0.0021, "num_input_tokens_seen": 6678128, "step": 74170 }, { "epoch": 19.276247401247403, "grad_norm": 0.21759749948978424, "learning_rate": 1.9933284826757216e-07, "loss": 0.0198, "num_input_tokens_seen": 6678544, "step": 74175 }, { "epoch": 19.277546777546778, "grad_norm": 25.698434829711914, "learning_rate": 1.9861895791438712e-07, "loss": 0.1247, "num_input_tokens_seen": 6679024, "step": 74180 }, { "epoch": 19.278846153846153, "grad_norm": 0.07768237590789795, "learning_rate": 1.9790634311999124e-07, "loss": 0.1756, "num_input_tokens_seen": 6679504, "step": 74185 }, { "epoch": 19.28014553014553, "grad_norm": 0.8030422925949097, "learning_rate": 1.971950039210385e-07, "loss": 0.0008, "num_input_tokens_seen": 6679936, "step": 74190 }, { "epoch": 19.281444906444907, "grad_norm": 7.2498884201049805, "learning_rate": 1.964849403541108e-07, "loss": 0.0058, "num_input_tokens_seen": 6680368, "step": 74195 }, { "epoch": 19.282744282744282, "grad_norm": 0.46317198872566223, "learning_rate": 1.957761524557261e-07, "loss": 0.0004, "num_input_tokens_seen": 6680800, "step": 74200 }, { "epoch": 19.284043659043657, "grad_norm": 3.0920462608337402, "learning_rate": 1.950686402623414e-07, "loss": 0.0111, "num_input_tokens_seen": 6681264, "step": 74205 }, { "epoch": 19.285343035343036, "grad_norm": 0.00041348213562741876, "learning_rate": 1.943624038103442e-07, "loss": 0.0569, "num_input_tokens_seen": 6681696, "step": 74210 }, { "epoch": 19.28664241164241, "grad_norm": 0.016779879108071327, "learning_rate": 1.936574431360555e-07, "loss": 0.3067, "num_input_tokens_seen": 6682160, "step": 74215 }, { "epoch": 19.287941787941786, "grad_norm": 1.4258489608764648, "learning_rate": 1.9295375827573238e-07, "loss": 0.0071, "num_input_tokens_seen": 6682624, "step": 74220 }, { "epoch": 19.289241164241165, "grad_norm": 10.4505033493042, "learning_rate": 1.922513492655653e-07, "loss": 0.0448, "num_input_tokens_seen": 6683088, "step": 74225 }, { "epoch": 19.29054054054054, "grad_norm": 0.009194644168019295, "learning_rate": 1.915502161416838e-07, "loss": 0.1007, "num_input_tokens_seen": 6683536, "step": 74230 }, { "epoch": 19.291839916839916, "grad_norm": 0.04520152136683464, "learning_rate": 1.9085035894014224e-07, "loss": 0.0237, "num_input_tokens_seen": 6684000, "step": 74235 }, { "epoch": 19.293139293139294, "grad_norm": 36.63855743408203, "learning_rate": 1.901517776969397e-07, "loss": 0.0399, "num_input_tokens_seen": 6684432, "step": 74240 }, { "epoch": 19.29443866943867, "grad_norm": 0.0016310811042785645, "learning_rate": 1.8945447244800297e-07, "loss": 0.0003, "num_input_tokens_seen": 6684848, "step": 74245 }, { "epoch": 19.295738045738045, "grad_norm": 18.37295913696289, "learning_rate": 1.88758443229195e-07, "loss": 0.2293, "num_input_tokens_seen": 6685264, "step": 74250 }, { "epoch": 19.297037422037423, "grad_norm": 0.2812575697898865, "learning_rate": 1.8806369007631219e-07, "loss": 0.0084, "num_input_tokens_seen": 6685696, "step": 74255 }, { "epoch": 19.2983367983368, "grad_norm": 0.002190339844673872, "learning_rate": 1.8737021302508707e-07, "loss": 0.0008, "num_input_tokens_seen": 6686144, "step": 74260 }, { "epoch": 19.299636174636174, "grad_norm": 27.755474090576172, "learning_rate": 1.8667801211118828e-07, "loss": 0.1665, "num_input_tokens_seen": 6686592, "step": 74265 }, { "epoch": 19.300935550935552, "grad_norm": 0.8936366438865662, "learning_rate": 1.8598708737021243e-07, "loss": 0.0186, "num_input_tokens_seen": 6687040, "step": 74270 }, { "epoch": 19.302234927234927, "grad_norm": 36.195987701416016, "learning_rate": 1.852974388376977e-07, "loss": 0.0271, "num_input_tokens_seen": 6687472, "step": 74275 }, { "epoch": 19.303534303534303, "grad_norm": 0.0014871234307065606, "learning_rate": 1.8460906654911014e-07, "loss": 0.0012, "num_input_tokens_seen": 6687936, "step": 74280 }, { "epoch": 19.30483367983368, "grad_norm": 0.0017610376235097647, "learning_rate": 1.839219705398576e-07, "loss": 0.1129, "num_input_tokens_seen": 6688368, "step": 74285 }, { "epoch": 19.306133056133056, "grad_norm": 26.929231643676758, "learning_rate": 1.832361508452729e-07, "loss": 0.0132, "num_input_tokens_seen": 6688816, "step": 74290 }, { "epoch": 19.30743243243243, "grad_norm": 0.21528016030788422, "learning_rate": 1.8255160750063338e-07, "loss": 0.0019, "num_input_tokens_seen": 6689264, "step": 74295 }, { "epoch": 19.30873180873181, "grad_norm": 0.00549942534416914, "learning_rate": 1.8186834054114422e-07, "loss": 0.0002, "num_input_tokens_seen": 6689696, "step": 74300 }, { "epoch": 19.310031185031185, "grad_norm": 0.018813543021678925, "learning_rate": 1.8118635000194396e-07, "loss": 0.1327, "num_input_tokens_seen": 6690128, "step": 74305 }, { "epoch": 19.31133056133056, "grad_norm": 0.0017793604638427496, "learning_rate": 1.805056359181101e-07, "loss": 0.0009, "num_input_tokens_seen": 6690624, "step": 74310 }, { "epoch": 19.31262993762994, "grad_norm": 0.0065467157401144505, "learning_rate": 1.7982619832465353e-07, "loss": 0.0003, "num_input_tokens_seen": 6691072, "step": 74315 }, { "epoch": 19.313929313929314, "grad_norm": 0.6477242708206177, "learning_rate": 1.7914803725651573e-07, "loss": 0.022, "num_input_tokens_seen": 6691568, "step": 74320 }, { "epoch": 19.31522869022869, "grad_norm": 0.006648609880357981, "learning_rate": 1.7847115274857718e-07, "loss": 0.1351, "num_input_tokens_seen": 6692016, "step": 74325 }, { "epoch": 19.316528066528065, "grad_norm": 0.01147042028605938, "learning_rate": 1.7779554483565163e-07, "loss": 0.0868, "num_input_tokens_seen": 6692448, "step": 74330 }, { "epoch": 19.317827442827443, "grad_norm": 20.302961349487305, "learning_rate": 1.7712121355248356e-07, "loss": 0.1206, "num_input_tokens_seen": 6692896, "step": 74335 }, { "epoch": 19.31912681912682, "grad_norm": 65.04981994628906, "learning_rate": 1.7644815893375632e-07, "loss": 0.2277, "num_input_tokens_seen": 6693328, "step": 74340 }, { "epoch": 19.320426195426194, "grad_norm": 17.223236083984375, "learning_rate": 1.7577638101408389e-07, "loss": 0.2095, "num_input_tokens_seen": 6693792, "step": 74345 }, { "epoch": 19.321725571725572, "grad_norm": 0.05057328939437866, "learning_rate": 1.7510587982801641e-07, "loss": 0.23, "num_input_tokens_seen": 6694304, "step": 74350 }, { "epoch": 19.323024948024948, "grad_norm": 0.0009835761738941073, "learning_rate": 1.7443665541004016e-07, "loss": 0.0002, "num_input_tokens_seen": 6694752, "step": 74355 }, { "epoch": 19.324324324324323, "grad_norm": 0.4003724455833435, "learning_rate": 1.7376870779457487e-07, "loss": 0.0021, "num_input_tokens_seen": 6695216, "step": 74360 }, { "epoch": 19.3256237006237, "grad_norm": 0.2124975025653839, "learning_rate": 1.731020370159736e-07, "loss": 0.0093, "num_input_tokens_seen": 6695664, "step": 74365 }, { "epoch": 19.326923076923077, "grad_norm": 0.850025475025177, "learning_rate": 1.7243664310852003e-07, "loss": 0.0036, "num_input_tokens_seen": 6696112, "step": 74370 }, { "epoch": 19.328222453222452, "grad_norm": 0.02795243449509144, "learning_rate": 1.7177252610643958e-07, "loss": 0.0003, "num_input_tokens_seen": 6696576, "step": 74375 }, { "epoch": 19.32952182952183, "grad_norm": 0.008145633153617382, "learning_rate": 1.7110968604388544e-07, "loss": 0.0328, "num_input_tokens_seen": 6697024, "step": 74380 }, { "epoch": 19.330821205821206, "grad_norm": 4.297458171844482, "learning_rate": 1.704481229549526e-07, "loss": 0.1391, "num_input_tokens_seen": 6697504, "step": 74385 }, { "epoch": 19.33212058212058, "grad_norm": 0.001672919257543981, "learning_rate": 1.6978783687366107e-07, "loss": 0.1668, "num_input_tokens_seen": 6698000, "step": 74390 }, { "epoch": 19.33341995841996, "grad_norm": 0.7247357964515686, "learning_rate": 1.691288278339753e-07, "loss": 0.0037, "num_input_tokens_seen": 6698432, "step": 74395 }, { "epoch": 19.334719334719335, "grad_norm": 2.6367013454437256, "learning_rate": 1.6847109586978216e-07, "loss": 0.0018, "num_input_tokens_seen": 6698896, "step": 74400 }, { "epoch": 19.33601871101871, "grad_norm": 0.013401097618043423, "learning_rate": 1.678146410149156e-07, "loss": 0.0211, "num_input_tokens_seen": 6699360, "step": 74405 }, { "epoch": 19.33731808731809, "grad_norm": 0.002632672432810068, "learning_rate": 1.671594633031348e-07, "loss": 0.0525, "num_input_tokens_seen": 6699808, "step": 74410 }, { "epoch": 19.338617463617464, "grad_norm": 11.968585014343262, "learning_rate": 1.66505562768135e-07, "loss": 0.0083, "num_input_tokens_seen": 6700240, "step": 74415 }, { "epoch": 19.33991683991684, "grad_norm": 0.03235983848571777, "learning_rate": 1.6585293944354762e-07, "loss": 0.0005, "num_input_tokens_seen": 6700720, "step": 74420 }, { "epoch": 19.341216216216218, "grad_norm": 0.007627379614859819, "learning_rate": 1.6520159336294306e-07, "loss": 0.0007, "num_input_tokens_seen": 6701120, "step": 74425 }, { "epoch": 19.342515592515593, "grad_norm": 0.051981933414936066, "learning_rate": 1.6455152455981392e-07, "loss": 0.0014, "num_input_tokens_seen": 6701584, "step": 74430 }, { "epoch": 19.343814968814968, "grad_norm": 0.127094104886055, "learning_rate": 1.6390273306759463e-07, "loss": 0.0038, "num_input_tokens_seen": 6702032, "step": 74435 }, { "epoch": 19.345114345114347, "grad_norm": 0.0013990600127726793, "learning_rate": 1.6325521891965568e-07, "loss": 0.0569, "num_input_tokens_seen": 6702480, "step": 74440 }, { "epoch": 19.34641372141372, "grad_norm": 0.0010159717639908195, "learning_rate": 1.6260898214929542e-07, "loss": 0.0012, "num_input_tokens_seen": 6702944, "step": 74445 }, { "epoch": 19.347713097713097, "grad_norm": 0.26607608795166016, "learning_rate": 1.6196402278975675e-07, "loss": 0.0059, "num_input_tokens_seen": 6703376, "step": 74450 }, { "epoch": 19.349012474012476, "grad_norm": 0.03780199587345123, "learning_rate": 1.6132034087420477e-07, "loss": 0.1383, "num_input_tokens_seen": 6703792, "step": 74455 }, { "epoch": 19.35031185031185, "grad_norm": 0.20111091434955597, "learning_rate": 1.6067793643574912e-07, "loss": 0.0002, "num_input_tokens_seen": 6704256, "step": 74460 }, { "epoch": 19.351611226611226, "grad_norm": 0.009781427681446075, "learning_rate": 1.6003680950742728e-07, "loss": 0.0004, "num_input_tokens_seen": 6704720, "step": 74465 }, { "epoch": 19.352910602910605, "grad_norm": 21.708234786987305, "learning_rate": 1.5939696012221007e-07, "loss": 0.011, "num_input_tokens_seen": 6705136, "step": 74470 }, { "epoch": 19.35420997920998, "grad_norm": 0.0032444987446069717, "learning_rate": 1.5875838831301004e-07, "loss": 0.0025, "num_input_tokens_seen": 6705600, "step": 74475 }, { "epoch": 19.355509355509355, "grad_norm": 0.19618597626686096, "learning_rate": 1.5812109411266762e-07, "loss": 0.0011, "num_input_tokens_seen": 6706048, "step": 74480 }, { "epoch": 19.35680873180873, "grad_norm": 1.8654835224151611, "learning_rate": 1.5748507755395936e-07, "loss": 0.0671, "num_input_tokens_seen": 6706480, "step": 74485 }, { "epoch": 19.35810810810811, "grad_norm": 0.45789238810539246, "learning_rate": 1.5685033866959798e-07, "loss": 0.0207, "num_input_tokens_seen": 6706928, "step": 74490 }, { "epoch": 19.359407484407484, "grad_norm": 1.4880056381225586, "learning_rate": 1.5621687749222679e-07, "loss": 0.0015, "num_input_tokens_seen": 6707392, "step": 74495 }, { "epoch": 19.36070686070686, "grad_norm": 0.1803419142961502, "learning_rate": 1.5558469405442534e-07, "loss": 0.0182, "num_input_tokens_seen": 6707872, "step": 74500 }, { "epoch": 19.362006237006238, "grad_norm": 0.06418188661336899, "learning_rate": 1.5495378838870643e-07, "loss": 0.0328, "num_input_tokens_seen": 6708288, "step": 74505 }, { "epoch": 19.363305613305613, "grad_norm": 0.12958721816539764, "learning_rate": 1.5432416052752198e-07, "loss": 0.0005, "num_input_tokens_seen": 6708752, "step": 74510 }, { "epoch": 19.364604989604988, "grad_norm": 0.004831216763705015, "learning_rate": 1.5369581050325155e-07, "loss": 0.1366, "num_input_tokens_seen": 6709216, "step": 74515 }, { "epoch": 19.365904365904367, "grad_norm": 0.04034580662846565, "learning_rate": 1.5306873834821102e-07, "loss": 0.0001, "num_input_tokens_seen": 6709664, "step": 74520 }, { "epoch": 19.367203742203742, "grad_norm": 0.008815889246761799, "learning_rate": 1.5244294409465232e-07, "loss": 0.1629, "num_input_tokens_seen": 6710112, "step": 74525 }, { "epoch": 19.368503118503117, "grad_norm": 0.30968520045280457, "learning_rate": 1.5181842777476084e-07, "loss": 0.0286, "num_input_tokens_seen": 6710544, "step": 74530 }, { "epoch": 19.369802494802496, "grad_norm": 0.0012558283051475883, "learning_rate": 1.5119518942065535e-07, "loss": 0.0149, "num_input_tokens_seen": 6710992, "step": 74535 }, { "epoch": 19.37110187110187, "grad_norm": 0.14391306042671204, "learning_rate": 1.5057322906439075e-07, "loss": 0.0871, "num_input_tokens_seen": 6711440, "step": 74540 }, { "epoch": 19.372401247401246, "grad_norm": 0.006852084305137396, "learning_rate": 1.4995254673795812e-07, "loss": 0.0001, "num_input_tokens_seen": 6711904, "step": 74545 }, { "epoch": 19.373700623700625, "grad_norm": 0.17058202624320984, "learning_rate": 1.4933314247327078e-07, "loss": 0.0022, "num_input_tokens_seen": 6712384, "step": 74550 }, { "epoch": 19.375, "grad_norm": 0.1740819215774536, "learning_rate": 1.48715016302195e-07, "loss": 0.0576, "num_input_tokens_seen": 6712832, "step": 74555 }, { "epoch": 19.376299376299375, "grad_norm": 0.600490391254425, "learning_rate": 1.4809816825651356e-07, "loss": 0.0022, "num_input_tokens_seen": 6713264, "step": 74560 }, { "epoch": 19.377598752598754, "grad_norm": 0.22896595299243927, "learning_rate": 1.4748259836795675e-07, "loss": 0.0003, "num_input_tokens_seen": 6713760, "step": 74565 }, { "epoch": 19.37889812889813, "grad_norm": 19.305301666259766, "learning_rate": 1.468683066681853e-07, "loss": 0.4416, "num_input_tokens_seen": 6714224, "step": 74570 }, { "epoch": 19.380197505197504, "grad_norm": 0.009623446501791477, "learning_rate": 1.4625529318878505e-07, "loss": 0.0156, "num_input_tokens_seen": 6714656, "step": 74575 }, { "epoch": 19.381496881496883, "grad_norm": 47.11042022705078, "learning_rate": 1.4564355796129193e-07, "loss": 0.2713, "num_input_tokens_seen": 6715136, "step": 74580 }, { "epoch": 19.382796257796258, "grad_norm": 14.237558364868164, "learning_rate": 1.4503310101716406e-07, "loss": 0.0099, "num_input_tokens_seen": 6715584, "step": 74585 }, { "epoch": 19.384095634095633, "grad_norm": 0.7509160041809082, "learning_rate": 1.4442392238779856e-07, "loss": 0.0011, "num_input_tokens_seen": 6716032, "step": 74590 }, { "epoch": 19.385395010395012, "grad_norm": 1.4014369249343872, "learning_rate": 1.4381602210452593e-07, "loss": 0.0026, "num_input_tokens_seen": 6716464, "step": 74595 }, { "epoch": 19.386694386694387, "grad_norm": 0.0029830564744770527, "learning_rate": 1.4320940019861283e-07, "loss": 0.0476, "num_input_tokens_seen": 6716896, "step": 74600 }, { "epoch": 19.387993762993762, "grad_norm": 0.006508766673505306, "learning_rate": 1.4260405670125378e-07, "loss": 0.0507, "num_input_tokens_seen": 6717360, "step": 74605 }, { "epoch": 19.38929313929314, "grad_norm": 0.15565717220306396, "learning_rate": 1.4199999164359045e-07, "loss": 0.0004, "num_input_tokens_seen": 6717792, "step": 74610 }, { "epoch": 19.390592515592516, "grad_norm": 0.5692588090896606, "learning_rate": 1.4139720505668141e-07, "loss": 0.0027, "num_input_tokens_seen": 6718256, "step": 74615 }, { "epoch": 19.39189189189189, "grad_norm": 1.4628746509552002, "learning_rate": 1.4079569697153239e-07, "loss": 0.0186, "num_input_tokens_seen": 6718736, "step": 74620 }, { "epoch": 19.39319126819127, "grad_norm": 0.1297379583120346, "learning_rate": 1.4019546741908251e-07, "loss": 0.053, "num_input_tokens_seen": 6719200, "step": 74625 }, { "epoch": 19.394490644490645, "grad_norm": 25.249195098876953, "learning_rate": 1.39596516430196e-07, "loss": 0.0234, "num_input_tokens_seen": 6719632, "step": 74630 }, { "epoch": 19.39579002079002, "grad_norm": 0.24538740515708923, "learning_rate": 1.3899884403568153e-07, "loss": 0.1017, "num_input_tokens_seen": 6720112, "step": 74635 }, { "epoch": 19.397089397089395, "grad_norm": 0.7358759641647339, "learning_rate": 1.384024502662784e-07, "loss": 0.1185, "num_input_tokens_seen": 6720592, "step": 74640 }, { "epoch": 19.398388773388774, "grad_norm": 0.3478287160396576, "learning_rate": 1.378073351526593e-07, "loss": 0.2012, "num_input_tokens_seen": 6721040, "step": 74645 }, { "epoch": 19.39968814968815, "grad_norm": 6.441871643066406, "learning_rate": 1.3721349872542756e-07, "loss": 0.0714, "num_input_tokens_seen": 6721504, "step": 74650 }, { "epoch": 19.400987525987524, "grad_norm": 48.39885711669922, "learning_rate": 1.366209410151309e-07, "loss": 0.0764, "num_input_tokens_seen": 6721968, "step": 74655 }, { "epoch": 19.402286902286903, "grad_norm": 3.617666482925415, "learning_rate": 1.3602966205223943e-07, "loss": 0.0098, "num_input_tokens_seen": 6722432, "step": 74660 }, { "epoch": 19.40358627858628, "grad_norm": 6.324382305145264, "learning_rate": 1.3543966186716773e-07, "loss": 0.0032, "num_input_tokens_seen": 6722944, "step": 74665 }, { "epoch": 19.404885654885653, "grad_norm": 0.0004469669656828046, "learning_rate": 1.3485094049025816e-07, "loss": 0.0018, "num_input_tokens_seen": 6723376, "step": 74670 }, { "epoch": 19.406185031185032, "grad_norm": 2.275635004043579, "learning_rate": 1.3426349795178926e-07, "loss": 0.0067, "num_input_tokens_seen": 6723808, "step": 74675 }, { "epoch": 19.407484407484407, "grad_norm": 7.380412578582764, "learning_rate": 1.3367733428197304e-07, "loss": 0.0042, "num_input_tokens_seen": 6724256, "step": 74680 }, { "epoch": 19.408783783783782, "grad_norm": 0.02057887613773346, "learning_rate": 1.3309244951095756e-07, "loss": 0.0074, "num_input_tokens_seen": 6724720, "step": 74685 }, { "epoch": 19.41008316008316, "grad_norm": 0.3610995411872864, "learning_rate": 1.325088436688271e-07, "loss": 0.0716, "num_input_tokens_seen": 6725216, "step": 74690 }, { "epoch": 19.411382536382536, "grad_norm": 0.07679665088653564, "learning_rate": 1.319265167855882e-07, "loss": 0.0001, "num_input_tokens_seen": 6725664, "step": 74695 }, { "epoch": 19.41268191268191, "grad_norm": 1.8222885131835938, "learning_rate": 1.3134546889120026e-07, "loss": 0.0028, "num_input_tokens_seen": 6726160, "step": 74700 }, { "epoch": 19.41398128898129, "grad_norm": 78.46247100830078, "learning_rate": 1.3076570001553934e-07, "loss": 0.0982, "num_input_tokens_seen": 6726608, "step": 74705 }, { "epoch": 19.415280665280665, "grad_norm": 0.9610490202903748, "learning_rate": 1.3018721018842883e-07, "loss": 0.0072, "num_input_tokens_seen": 6727072, "step": 74710 }, { "epoch": 19.41658004158004, "grad_norm": 12.996941566467285, "learning_rate": 1.2960999943961992e-07, "loss": 0.0094, "num_input_tokens_seen": 6727536, "step": 74715 }, { "epoch": 19.41787941787942, "grad_norm": 1.0746718645095825, "learning_rate": 1.2903406779879722e-07, "loss": 0.0281, "num_input_tokens_seen": 6728016, "step": 74720 }, { "epoch": 19.419178794178794, "grad_norm": 53.04378128051758, "learning_rate": 1.2845941529558424e-07, "loss": 0.4705, "num_input_tokens_seen": 6728432, "step": 74725 }, { "epoch": 19.42047817047817, "grad_norm": 0.07297656685113907, "learning_rate": 1.2788604195953234e-07, "loss": 0.0003, "num_input_tokens_seen": 6728880, "step": 74730 }, { "epoch": 19.421777546777548, "grad_norm": 0.0038290072698146105, "learning_rate": 1.273139478201346e-07, "loss": 0.2007, "num_input_tokens_seen": 6729296, "step": 74735 }, { "epoch": 19.423076923076923, "grad_norm": 0.1505683958530426, "learning_rate": 1.2674313290680916e-07, "loss": 0.0246, "num_input_tokens_seen": 6729728, "step": 74740 }, { "epoch": 19.4243762993763, "grad_norm": 0.08159065246582031, "learning_rate": 1.2617359724891863e-07, "loss": 0.0007, "num_input_tokens_seen": 6730176, "step": 74745 }, { "epoch": 19.425675675675677, "grad_norm": 0.008044200018048286, "learning_rate": 1.2560534087575349e-07, "loss": 0.1564, "num_input_tokens_seen": 6730624, "step": 74750 }, { "epoch": 19.426975051975052, "grad_norm": 0.3493184745311737, "learning_rate": 1.2503836381654032e-07, "loss": 0.1978, "num_input_tokens_seen": 6731088, "step": 74755 }, { "epoch": 19.428274428274428, "grad_norm": 8.549199104309082, "learning_rate": 1.244726661004364e-07, "loss": 0.0062, "num_input_tokens_seen": 6731552, "step": 74760 }, { "epoch": 19.429573804573806, "grad_norm": 0.015139175578951836, "learning_rate": 1.2390824775653788e-07, "loss": 0.0072, "num_input_tokens_seen": 6731968, "step": 74765 }, { "epoch": 19.43087318087318, "grad_norm": 85.85411834716797, "learning_rate": 1.233451088138743e-07, "loss": 0.1766, "num_input_tokens_seen": 6732400, "step": 74770 }, { "epoch": 19.432172557172557, "grad_norm": 0.024893363937735558, "learning_rate": 1.2278324930140585e-07, "loss": 0.071, "num_input_tokens_seen": 6732816, "step": 74775 }, { "epoch": 19.433471933471935, "grad_norm": 0.1698496788740158, "learning_rate": 1.2222266924803161e-07, "loss": 0.1949, "num_input_tokens_seen": 6733264, "step": 74780 }, { "epoch": 19.43477130977131, "grad_norm": 72.35746002197266, "learning_rate": 1.216633686825841e-07, "loss": 0.0954, "num_input_tokens_seen": 6733696, "step": 74785 }, { "epoch": 19.436070686070686, "grad_norm": 8.762391090393066, "learning_rate": 1.2110534763382365e-07, "loss": 0.0085, "num_input_tokens_seen": 6734144, "step": 74790 }, { "epoch": 19.43737006237006, "grad_norm": 0.026255715638399124, "learning_rate": 1.2054860613045504e-07, "loss": 0.5306, "num_input_tokens_seen": 6734624, "step": 74795 }, { "epoch": 19.43866943866944, "grad_norm": 0.06398356705904007, "learning_rate": 1.1999314420111095e-07, "loss": 0.0254, "num_input_tokens_seen": 6735088, "step": 74800 }, { "epoch": 19.439968814968815, "grad_norm": 4.319192409515381, "learning_rate": 1.1943896187435744e-07, "loss": 0.3679, "num_input_tokens_seen": 6735552, "step": 74805 }, { "epoch": 19.44126819126819, "grad_norm": 0.4396870732307434, "learning_rate": 1.1888605917869666e-07, "loss": 0.0334, "num_input_tokens_seen": 6735984, "step": 74810 }, { "epoch": 19.44256756756757, "grad_norm": 0.015011822804808617, "learning_rate": 1.1833443614256423e-07, "loss": 0.0527, "num_input_tokens_seen": 6736496, "step": 74815 }, { "epoch": 19.443866943866944, "grad_norm": 0.11703866720199585, "learning_rate": 1.1778409279433467e-07, "loss": 0.0004, "num_input_tokens_seen": 6736976, "step": 74820 }, { "epoch": 19.44516632016632, "grad_norm": 0.2131747305393219, "learning_rate": 1.1723502916231032e-07, "loss": 0.001, "num_input_tokens_seen": 6737424, "step": 74825 }, { "epoch": 19.446465696465697, "grad_norm": 0.039957523345947266, "learning_rate": 1.1668724527472974e-07, "loss": 0.3934, "num_input_tokens_seen": 6737872, "step": 74830 }, { "epoch": 19.447765072765073, "grad_norm": 0.008946524001657963, "learning_rate": 1.1614074115976481e-07, "loss": 0.0004, "num_input_tokens_seen": 6738352, "step": 74835 }, { "epoch": 19.449064449064448, "grad_norm": 0.021355608478188515, "learning_rate": 1.1559551684552638e-07, "loss": 0.2381, "num_input_tokens_seen": 6738800, "step": 74840 }, { "epoch": 19.450363825363826, "grad_norm": 61.06625747680664, "learning_rate": 1.1505157236005037e-07, "loss": 0.4484, "num_input_tokens_seen": 6739248, "step": 74845 }, { "epoch": 19.4516632016632, "grad_norm": 0.0006085702916607261, "learning_rate": 1.1450890773131717e-07, "loss": 0.2536, "num_input_tokens_seen": 6739712, "step": 74850 }, { "epoch": 19.452962577962577, "grad_norm": 0.13811145722866058, "learning_rate": 1.1396752298723501e-07, "loss": 0.0864, "num_input_tokens_seen": 6740128, "step": 74855 }, { "epoch": 19.454261954261955, "grad_norm": 0.16704101860523224, "learning_rate": 1.1342741815564828e-07, "loss": 0.0566, "num_input_tokens_seen": 6740560, "step": 74860 }, { "epoch": 19.45556133056133, "grad_norm": 0.4727663993835449, "learning_rate": 1.1288859326433477e-07, "loss": 0.0494, "num_input_tokens_seen": 6740992, "step": 74865 }, { "epoch": 19.456860706860706, "grad_norm": 0.008157125674188137, "learning_rate": 1.1235104834100563e-07, "loss": 0.0105, "num_input_tokens_seen": 6741440, "step": 74870 }, { "epoch": 19.458160083160084, "grad_norm": 0.18633583188056946, "learning_rate": 1.118147834133082e-07, "loss": 0.01, "num_input_tokens_seen": 6741856, "step": 74875 }, { "epoch": 19.45945945945946, "grad_norm": 0.056941915303468704, "learning_rate": 1.1127979850882598e-07, "loss": 0.0005, "num_input_tokens_seen": 6742288, "step": 74880 }, { "epoch": 19.460758835758835, "grad_norm": 0.14130641520023346, "learning_rate": 1.107460936550675e-07, "loss": 0.171, "num_input_tokens_seen": 6742768, "step": 74885 }, { "epoch": 19.462058212058214, "grad_norm": 0.15208058059215546, "learning_rate": 1.1021366887948581e-07, "loss": 0.3031, "num_input_tokens_seen": 6743232, "step": 74890 }, { "epoch": 19.46335758835759, "grad_norm": 69.19793701171875, "learning_rate": 1.0968252420946456e-07, "loss": 0.121, "num_input_tokens_seen": 6743664, "step": 74895 }, { "epoch": 19.464656964656964, "grad_norm": 0.45718780159950256, "learning_rate": 1.0915265967231802e-07, "loss": 0.2179, "num_input_tokens_seen": 6744080, "step": 74900 }, { "epoch": 19.465956340956343, "grad_norm": 0.3582073748111725, "learning_rate": 1.0862407529530217e-07, "loss": 0.0397, "num_input_tokens_seen": 6744528, "step": 74905 }, { "epoch": 19.467255717255718, "grad_norm": 0.002428349805995822, "learning_rate": 1.0809677110559802e-07, "loss": 0.1178, "num_input_tokens_seen": 6744976, "step": 74910 }, { "epoch": 19.468555093555093, "grad_norm": 0.0791257843375206, "learning_rate": 1.0757074713032556e-07, "loss": 0.0093, "num_input_tokens_seen": 6745424, "step": 74915 }, { "epoch": 19.46985446985447, "grad_norm": 0.0036981389857828617, "learning_rate": 1.0704600339654369e-07, "loss": 0.0009, "num_input_tokens_seen": 6745904, "step": 74920 }, { "epoch": 19.471153846153847, "grad_norm": 0.25408804416656494, "learning_rate": 1.065225399312364e-07, "loss": 0.0004, "num_input_tokens_seen": 6746336, "step": 74925 }, { "epoch": 19.472453222453222, "grad_norm": 0.010081392712891102, "learning_rate": 1.060003567613238e-07, "loss": 0.285, "num_input_tokens_seen": 6746768, "step": 74930 }, { "epoch": 19.473752598752597, "grad_norm": 0.0022748890332877636, "learning_rate": 1.0547945391366776e-07, "loss": 0.0289, "num_input_tokens_seen": 6747216, "step": 74935 }, { "epoch": 19.475051975051976, "grad_norm": 0.012070423923432827, "learning_rate": 1.0495983141505794e-07, "loss": 0.0001, "num_input_tokens_seen": 6747648, "step": 74940 }, { "epoch": 19.47635135135135, "grad_norm": 0.059788621962070465, "learning_rate": 1.0444148929221464e-07, "loss": 0.1152, "num_input_tokens_seen": 6748096, "step": 74945 }, { "epoch": 19.477650727650726, "grad_norm": 8.132732391357422, "learning_rate": 1.0392442757179987e-07, "loss": 0.0081, "num_input_tokens_seen": 6748496, "step": 74950 }, { "epoch": 19.478950103950105, "grad_norm": 0.41617435216903687, "learning_rate": 1.0340864628040626e-07, "loss": 0.4189, "num_input_tokens_seen": 6748928, "step": 74955 }, { "epoch": 19.48024948024948, "grad_norm": 58.20866012573242, "learning_rate": 1.0289414544455978e-07, "loss": 0.4371, "num_input_tokens_seen": 6749344, "step": 74960 }, { "epoch": 19.481548856548855, "grad_norm": 0.04773309454321861, "learning_rate": 1.0238092509072262e-07, "loss": 0.001, "num_input_tokens_seen": 6749792, "step": 74965 }, { "epoch": 19.482848232848234, "grad_norm": 0.008975240401923656, "learning_rate": 1.0186898524529309e-07, "loss": 0.0074, "num_input_tokens_seen": 6750272, "step": 74970 }, { "epoch": 19.48414760914761, "grad_norm": 49.156822204589844, "learning_rate": 1.013583259345946e-07, "loss": 0.1604, "num_input_tokens_seen": 6750736, "step": 74975 }, { "epoch": 19.485446985446984, "grad_norm": 0.10438744723796844, "learning_rate": 1.0084894718489501e-07, "loss": 0.191, "num_input_tokens_seen": 6751168, "step": 74980 }, { "epoch": 19.486746361746363, "grad_norm": 0.01170639880001545, "learning_rate": 1.0034084902239282e-07, "loss": 0.0003, "num_input_tokens_seen": 6751616, "step": 74985 }, { "epoch": 19.488045738045738, "grad_norm": 5.926011562347412, "learning_rate": 9.983403147321712e-08, "loss": 0.0038, "num_input_tokens_seen": 6752032, "step": 74990 }, { "epoch": 19.489345114345113, "grad_norm": 0.004053717013448477, "learning_rate": 9.932849456343318e-08, "loss": 0.0001, "num_input_tokens_seen": 6752480, "step": 74995 }, { "epoch": 19.490644490644492, "grad_norm": 8.966233253479004, "learning_rate": 9.882423831904797e-08, "loss": 0.5291, "num_input_tokens_seen": 6752960, "step": 75000 }, { "epoch": 19.491943866943867, "grad_norm": 4.982540130615234, "learning_rate": 9.832126276598797e-08, "loss": 0.2203, "num_input_tokens_seen": 6753392, "step": 75005 }, { "epoch": 19.493243243243242, "grad_norm": 0.005149777978658676, "learning_rate": 9.781956793012692e-08, "loss": 0.0528, "num_input_tokens_seen": 6753824, "step": 75010 }, { "epoch": 19.49454261954262, "grad_norm": 0.13772255182266235, "learning_rate": 9.731915383726364e-08, "loss": 0.0226, "num_input_tokens_seen": 6754304, "step": 75015 }, { "epoch": 19.495841995841996, "grad_norm": 1.5880907773971558, "learning_rate": 9.682002051313866e-08, "loss": 0.102, "num_input_tokens_seen": 6754800, "step": 75020 }, { "epoch": 19.49714137214137, "grad_norm": 0.49993783235549927, "learning_rate": 9.63221679834203e-08, "loss": 0.1213, "num_input_tokens_seen": 6755280, "step": 75025 }, { "epoch": 19.49844074844075, "grad_norm": 45.82029724121094, "learning_rate": 9.582559627371313e-08, "loss": 0.2315, "num_input_tokens_seen": 6755744, "step": 75030 }, { "epoch": 19.499740124740125, "grad_norm": 0.04229605942964554, "learning_rate": 9.533030540956056e-08, "loss": 0.078, "num_input_tokens_seen": 6756192, "step": 75035 }, { "epoch": 19.5010395010395, "grad_norm": 0.00909197237342596, "learning_rate": 9.483629541643114e-08, "loss": 0.0009, "num_input_tokens_seen": 6756656, "step": 75040 }, { "epoch": 19.50233887733888, "grad_norm": 0.685577929019928, "learning_rate": 9.434356631973506e-08, "loss": 0.009, "num_input_tokens_seen": 6757136, "step": 75045 }, { "epoch": 19.503638253638254, "grad_norm": 0.25531572103500366, "learning_rate": 9.38521181448132e-08, "loss": 0.0002, "num_input_tokens_seen": 6757584, "step": 75050 }, { "epoch": 19.50493762993763, "grad_norm": 0.0008801218937151134, "learning_rate": 9.336195091693978e-08, "loss": 0.0017, "num_input_tokens_seen": 6758016, "step": 75055 }, { "epoch": 19.506237006237008, "grad_norm": 0.007298257667571306, "learning_rate": 9.287306466132518e-08, "loss": 0.0959, "num_input_tokens_seen": 6758480, "step": 75060 }, { "epoch": 19.507536382536383, "grad_norm": 0.013039336539804935, "learning_rate": 9.238545940311316e-08, "loss": 0.0563, "num_input_tokens_seen": 6758944, "step": 75065 }, { "epoch": 19.508835758835758, "grad_norm": 0.014554459601640701, "learning_rate": 9.189913516738368e-08, "loss": 0.0161, "num_input_tokens_seen": 6759376, "step": 75070 }, { "epoch": 19.510135135135137, "grad_norm": 0.21246673166751862, "learning_rate": 9.14140919791473e-08, "loss": 0.0002, "num_input_tokens_seen": 6759856, "step": 75075 }, { "epoch": 19.511434511434512, "grad_norm": 0.00320444256067276, "learning_rate": 9.09303298633507e-08, "loss": 0.0003, "num_input_tokens_seen": 6760336, "step": 75080 }, { "epoch": 19.512733887733887, "grad_norm": 0.1853584498167038, "learning_rate": 9.044784884487123e-08, "loss": 0.0003, "num_input_tokens_seen": 6760784, "step": 75085 }, { "epoch": 19.514033264033262, "grad_norm": 0.32997414469718933, "learning_rate": 8.996664894853069e-08, "loss": 0.2153, "num_input_tokens_seen": 6761232, "step": 75090 }, { "epoch": 19.51533264033264, "grad_norm": 2.389699935913086, "learning_rate": 8.948673019906762e-08, "loss": 0.0022, "num_input_tokens_seen": 6761680, "step": 75095 }, { "epoch": 19.516632016632016, "grad_norm": 17.423358917236328, "learning_rate": 8.900809262117337e-08, "loss": 0.1041, "num_input_tokens_seen": 6762128, "step": 75100 }, { "epoch": 19.51793139293139, "grad_norm": 0.03373518958687782, "learning_rate": 8.853073623946162e-08, "loss": 0.0015, "num_input_tokens_seen": 6762592, "step": 75105 }, { "epoch": 19.51923076923077, "grad_norm": 0.010046404786407948, "learning_rate": 8.805466107848215e-08, "loss": 0.002, "num_input_tokens_seen": 6763072, "step": 75110 }, { "epoch": 19.520530145530145, "grad_norm": 0.04126368835568428, "learning_rate": 8.757986716272093e-08, "loss": 0.0412, "num_input_tokens_seen": 6763584, "step": 75115 }, { "epoch": 19.52182952182952, "grad_norm": 0.001220186473801732, "learning_rate": 8.71063545166001e-08, "loss": 0.0001, "num_input_tokens_seen": 6764032, "step": 75120 }, { "epoch": 19.5231288981289, "grad_norm": 5.781580924987793, "learning_rate": 8.663412316446684e-08, "loss": 0.0283, "num_input_tokens_seen": 6764464, "step": 75125 }, { "epoch": 19.524428274428274, "grad_norm": 9.330334663391113, "learning_rate": 8.616317313061285e-08, "loss": 0.007, "num_input_tokens_seen": 6764896, "step": 75130 }, { "epoch": 19.52572765072765, "grad_norm": 10.627628326416016, "learning_rate": 8.569350443925484e-08, "loss": 0.0102, "num_input_tokens_seen": 6765360, "step": 75135 }, { "epoch": 19.527027027027028, "grad_norm": 0.038515396416187286, "learning_rate": 8.522511711455406e-08, "loss": 0.0681, "num_input_tokens_seen": 6765776, "step": 75140 }, { "epoch": 19.528326403326403, "grad_norm": 0.022281648591160774, "learning_rate": 8.475801118059956e-08, "loss": 0.0005, "num_input_tokens_seen": 6766240, "step": 75145 }, { "epoch": 19.52962577962578, "grad_norm": 1.7691857814788818, "learning_rate": 8.429218666141103e-08, "loss": 0.0013, "num_input_tokens_seen": 6766688, "step": 75150 }, { "epoch": 19.530925155925157, "grad_norm": 34.14216613769531, "learning_rate": 8.382764358094708e-08, "loss": 0.0373, "num_input_tokens_seen": 6767136, "step": 75155 }, { "epoch": 19.532224532224532, "grad_norm": 0.06623101979494095, "learning_rate": 8.336438196310248e-08, "loss": 0.067, "num_input_tokens_seen": 6767584, "step": 75160 }, { "epoch": 19.533523908523907, "grad_norm": 0.008243482559919357, "learning_rate": 8.290240183170261e-08, "loss": 0.0823, "num_input_tokens_seen": 6768016, "step": 75165 }, { "epoch": 19.534823284823286, "grad_norm": 0.004699781537055969, "learning_rate": 8.244170321050626e-08, "loss": 0.0758, "num_input_tokens_seen": 6768448, "step": 75170 }, { "epoch": 19.53612266112266, "grad_norm": 0.12343724071979523, "learning_rate": 8.198228612320558e-08, "loss": 0.0603, "num_input_tokens_seen": 6768912, "step": 75175 }, { "epoch": 19.537422037422036, "grad_norm": 0.13611862063407898, "learning_rate": 8.152415059343443e-08, "loss": 0.0131, "num_input_tokens_seen": 6769344, "step": 75180 }, { "epoch": 19.538721413721415, "grad_norm": 0.004730669315904379, "learning_rate": 8.106729664475176e-08, "loss": 0.0003, "num_input_tokens_seen": 6769760, "step": 75185 }, { "epoch": 19.54002079002079, "grad_norm": 0.08794621378183365, "learning_rate": 8.061172430065266e-08, "loss": 0.0006, "num_input_tokens_seen": 6770224, "step": 75190 }, { "epoch": 19.541320166320165, "grad_norm": 71.48152160644531, "learning_rate": 8.01574335845684e-08, "loss": 0.0735, "num_input_tokens_seen": 6770656, "step": 75195 }, { "epoch": 19.542619542619544, "grad_norm": 22.95802879333496, "learning_rate": 7.970442451986638e-08, "loss": 0.0867, "num_input_tokens_seen": 6771104, "step": 75200 }, { "epoch": 19.54391891891892, "grad_norm": 0.11880506575107574, "learning_rate": 7.925269712984184e-08, "loss": 0.0004, "num_input_tokens_seen": 6771552, "step": 75205 }, { "epoch": 19.545218295218294, "grad_norm": 0.0016660330584272742, "learning_rate": 7.880225143772902e-08, "loss": 0.1321, "num_input_tokens_seen": 6772000, "step": 75210 }, { "epoch": 19.546517671517673, "grad_norm": 2.759598970413208, "learning_rate": 7.835308746669545e-08, "loss": 0.0032, "num_input_tokens_seen": 6772432, "step": 75215 }, { "epoch": 19.54781704781705, "grad_norm": 0.8532277345657349, "learning_rate": 7.790520523984213e-08, "loss": 0.6433, "num_input_tokens_seen": 6772848, "step": 75220 }, { "epoch": 19.549116424116423, "grad_norm": 7.858173370361328, "learning_rate": 7.74586047802034e-08, "loss": 0.0678, "num_input_tokens_seen": 6773296, "step": 75225 }, { "epoch": 19.5504158004158, "grad_norm": 95.51441955566406, "learning_rate": 7.701328611074698e-08, "loss": 0.09, "num_input_tokens_seen": 6773712, "step": 75230 }, { "epoch": 19.551715176715177, "grad_norm": 0.9249685406684875, "learning_rate": 7.656924925437681e-08, "loss": 0.0925, "num_input_tokens_seen": 6774176, "step": 75235 }, { "epoch": 19.553014553014552, "grad_norm": 2.871117115020752, "learning_rate": 7.612649423393014e-08, "loss": 0.3901, "num_input_tokens_seen": 6774592, "step": 75240 }, { "epoch": 19.554313929313928, "grad_norm": 5.7815327644348145, "learning_rate": 7.568502107218044e-08, "loss": 0.2096, "num_input_tokens_seen": 6775072, "step": 75245 }, { "epoch": 19.555613305613306, "grad_norm": 117.11721801757812, "learning_rate": 7.524482979183178e-08, "loss": 0.1357, "num_input_tokens_seen": 6775536, "step": 75250 }, { "epoch": 19.55691268191268, "grad_norm": 0.02633899636566639, "learning_rate": 7.48059204155216e-08, "loss": 0.0035, "num_input_tokens_seen": 6775984, "step": 75255 }, { "epoch": 19.558212058212057, "grad_norm": 0.1373712718486786, "learning_rate": 7.436829296582626e-08, "loss": 0.0013, "num_input_tokens_seen": 6776432, "step": 75260 }, { "epoch": 19.559511434511435, "grad_norm": 0.23970799148082733, "learning_rate": 7.393194746525279e-08, "loss": 0.3756, "num_input_tokens_seen": 6776912, "step": 75265 }, { "epoch": 19.56081081081081, "grad_norm": 0.006574805360287428, "learning_rate": 7.349688393624154e-08, "loss": 0.0116, "num_input_tokens_seen": 6777360, "step": 75270 }, { "epoch": 19.562110187110186, "grad_norm": 0.08784011006355286, "learning_rate": 7.30631024011691e-08, "loss": 0.016, "num_input_tokens_seen": 6777808, "step": 75275 }, { "epoch": 19.563409563409564, "grad_norm": 0.023763593286275864, "learning_rate": 7.263060288234535e-08, "loss": 0.0627, "num_input_tokens_seen": 6778224, "step": 75280 }, { "epoch": 19.56470893970894, "grad_norm": 5.265789985656738, "learning_rate": 7.219938540201366e-08, "loss": 0.0051, "num_input_tokens_seen": 6778672, "step": 75285 }, { "epoch": 19.566008316008315, "grad_norm": 80.03418731689453, "learning_rate": 7.17694499823507e-08, "loss": 0.1542, "num_input_tokens_seen": 6779136, "step": 75290 }, { "epoch": 19.567307692307693, "grad_norm": 0.010233482345938683, "learning_rate": 7.13407966454721e-08, "loss": 0.0002, "num_input_tokens_seen": 6779568, "step": 75295 }, { "epoch": 19.56860706860707, "grad_norm": 0.35540786385536194, "learning_rate": 7.091342541342139e-08, "loss": 0.0031, "num_input_tokens_seen": 6780000, "step": 75300 }, { "epoch": 19.569906444906444, "grad_norm": 0.4364847242832184, "learning_rate": 7.048733630817817e-08, "loss": 0.0118, "num_input_tokens_seen": 6780432, "step": 75305 }, { "epoch": 19.571205821205822, "grad_norm": 1.8331466913223267, "learning_rate": 7.006252935165824e-08, "loss": 0.2386, "num_input_tokens_seen": 6780880, "step": 75310 }, { "epoch": 19.572505197505198, "grad_norm": 0.05563792958855629, "learning_rate": 6.963900456571081e-08, "loss": 0.0008, "num_input_tokens_seen": 6781344, "step": 75315 }, { "epoch": 19.573804573804573, "grad_norm": 0.1123267412185669, "learning_rate": 6.921676197211291e-08, "loss": 0.1828, "num_input_tokens_seen": 6781776, "step": 75320 }, { "epoch": 19.57510395010395, "grad_norm": 0.0067846765741705894, "learning_rate": 6.87958015925888e-08, "loss": 0.0005, "num_input_tokens_seen": 6782240, "step": 75325 }, { "epoch": 19.576403326403327, "grad_norm": 100.97069549560547, "learning_rate": 6.837612344877952e-08, "loss": 0.1866, "num_input_tokens_seen": 6782688, "step": 75330 }, { "epoch": 19.5777027027027, "grad_norm": 0.05695485323667526, "learning_rate": 6.795772756227891e-08, "loss": 0.0041, "num_input_tokens_seen": 6783136, "step": 75335 }, { "epoch": 19.57900207900208, "grad_norm": 0.3902432322502136, "learning_rate": 6.754061395460032e-08, "loss": 0.0881, "num_input_tokens_seen": 6783568, "step": 75340 }, { "epoch": 19.580301455301456, "grad_norm": 0.008318813517689705, "learning_rate": 6.712478264719601e-08, "loss": 0.0133, "num_input_tokens_seen": 6784000, "step": 75345 }, { "epoch": 19.58160083160083, "grad_norm": 0.5686217546463013, "learning_rate": 6.671023366145169e-08, "loss": 0.0004, "num_input_tokens_seen": 6784448, "step": 75350 }, { "epoch": 19.58290020790021, "grad_norm": 83.69066619873047, "learning_rate": 6.629696701869193e-08, "loss": 0.1137, "num_input_tokens_seen": 6784896, "step": 75355 }, { "epoch": 19.584199584199585, "grad_norm": 0.7309305667877197, "learning_rate": 6.588498274017196e-08, "loss": 0.0079, "num_input_tokens_seen": 6785376, "step": 75360 }, { "epoch": 19.58549896049896, "grad_norm": 0.1594959944486618, "learning_rate": 6.547428084707485e-08, "loss": 0.0008, "num_input_tokens_seen": 6785808, "step": 75365 }, { "epoch": 19.58679833679834, "grad_norm": 1.8121371269226074, "learning_rate": 6.506486136052814e-08, "loss": 0.0495, "num_input_tokens_seen": 6786304, "step": 75370 }, { "epoch": 19.588097713097714, "grad_norm": 5.826635837554932, "learning_rate": 6.465672430158443e-08, "loss": 0.0165, "num_input_tokens_seen": 6786736, "step": 75375 }, { "epoch": 19.58939708939709, "grad_norm": 1.520161747932434, "learning_rate": 6.424986969124081e-08, "loss": 0.0218, "num_input_tokens_seen": 6787200, "step": 75380 }, { "epoch": 19.590696465696467, "grad_norm": 0.3225625157356262, "learning_rate": 6.384429755041665e-08, "loss": 0.0071, "num_input_tokens_seen": 6787632, "step": 75385 }, { "epoch": 19.591995841995843, "grad_norm": 0.08379609137773514, "learning_rate": 6.344000789997307e-08, "loss": 0.0014, "num_input_tokens_seen": 6788096, "step": 75390 }, { "epoch": 19.593295218295218, "grad_norm": 0.11181659996509552, "learning_rate": 6.303700076070173e-08, "loss": 0.0002, "num_input_tokens_seen": 6788544, "step": 75395 }, { "epoch": 19.594594594594593, "grad_norm": 0.09534186124801636, "learning_rate": 6.26352761533333e-08, "loss": 0.086, "num_input_tokens_seen": 6789024, "step": 75400 }, { "epoch": 19.59589397089397, "grad_norm": 0.007793752010911703, "learning_rate": 6.223483409852626e-08, "loss": 0.1483, "num_input_tokens_seen": 6789440, "step": 75405 }, { "epoch": 19.597193347193347, "grad_norm": 71.25291442871094, "learning_rate": 6.183567461687523e-08, "loss": 0.2518, "num_input_tokens_seen": 6789952, "step": 75410 }, { "epoch": 19.598492723492722, "grad_norm": 0.061081662774086, "learning_rate": 6.1437797728911e-08, "loss": 0.0756, "num_input_tokens_seen": 6790416, "step": 75415 }, { "epoch": 19.5997920997921, "grad_norm": 0.02215503342449665, "learning_rate": 6.1041203455095e-08, "loss": 0.1465, "num_input_tokens_seen": 6790912, "step": 75420 }, { "epoch": 19.601091476091476, "grad_norm": 36.35179901123047, "learning_rate": 6.064589181582481e-08, "loss": 0.0605, "num_input_tokens_seen": 6791360, "step": 75425 }, { "epoch": 19.60239085239085, "grad_norm": 82.17854309082031, "learning_rate": 6.025186283143136e-08, "loss": 0.2378, "num_input_tokens_seen": 6791792, "step": 75430 }, { "epoch": 19.60369022869023, "grad_norm": 0.0022998552303761244, "learning_rate": 5.985911652218179e-08, "loss": 0.182, "num_input_tokens_seen": 6792256, "step": 75435 }, { "epoch": 19.604989604989605, "grad_norm": 0.005404547788202763, "learning_rate": 5.946765290827383e-08, "loss": 0.0131, "num_input_tokens_seen": 6792704, "step": 75440 }, { "epoch": 19.60628898128898, "grad_norm": 0.13727104663848877, "learning_rate": 5.907747200984415e-08, "loss": 0.001, "num_input_tokens_seen": 6793136, "step": 75445 }, { "epoch": 19.60758835758836, "grad_norm": 0.6555811166763306, "learning_rate": 5.8688573846954474e-08, "loss": 0.001, "num_input_tokens_seen": 6793552, "step": 75450 }, { "epoch": 19.608887733887734, "grad_norm": 0.0017310174880549312, "learning_rate": 5.8300958439608254e-08, "loss": 0.0011, "num_input_tokens_seen": 6794016, "step": 75455 }, { "epoch": 19.61018711018711, "grad_norm": 0.25063496828079224, "learning_rate": 5.791462580774232e-08, "loss": 0.1093, "num_input_tokens_seen": 6794464, "step": 75460 }, { "epoch": 19.611486486486488, "grad_norm": 0.02047322504222393, "learning_rate": 5.7529575971226877e-08, "loss": 0.0077, "num_input_tokens_seen": 6794944, "step": 75465 }, { "epoch": 19.612785862785863, "grad_norm": 0.019945865496993065, "learning_rate": 5.7145808949865546e-08, "loss": 0.1612, "num_input_tokens_seen": 6795392, "step": 75470 }, { "epoch": 19.614085239085238, "grad_norm": 134.63380432128906, "learning_rate": 5.676332476339252e-08, "loss": 0.5882, "num_input_tokens_seen": 6795824, "step": 75475 }, { "epoch": 19.615384615384617, "grad_norm": 0.007088142447173595, "learning_rate": 5.638212343148097e-08, "loss": 0.0006, "num_input_tokens_seen": 6796288, "step": 75480 }, { "epoch": 19.616683991683992, "grad_norm": 0.36141952872276306, "learning_rate": 5.600220497373465e-08, "loss": 0.1714, "num_input_tokens_seen": 6796720, "step": 75485 }, { "epoch": 19.617983367983367, "grad_norm": 0.0479997843503952, "learning_rate": 5.562356940969904e-08, "loss": 0.0352, "num_input_tokens_seen": 6797216, "step": 75490 }, { "epoch": 19.619282744282746, "grad_norm": 109.01690673828125, "learning_rate": 5.5246216758841895e-08, "loss": 0.1328, "num_input_tokens_seen": 6797664, "step": 75495 }, { "epoch": 19.62058212058212, "grad_norm": 71.075927734375, "learning_rate": 5.48701470405727e-08, "loss": 0.2572, "num_input_tokens_seen": 6798112, "step": 75500 }, { "epoch": 19.621881496881496, "grad_norm": 0.20063376426696777, "learning_rate": 5.4495360274231524e-08, "loss": 0.0026, "num_input_tokens_seen": 6798544, "step": 75505 }, { "epoch": 19.623180873180875, "grad_norm": 0.0636635571718216, "learning_rate": 5.41218564790974e-08, "loss": 0.035, "num_input_tokens_seen": 6799008, "step": 75510 }, { "epoch": 19.62448024948025, "grad_norm": 0.02698618918657303, "learning_rate": 5.374963567437719e-08, "loss": 0.0008, "num_input_tokens_seen": 6799456, "step": 75515 }, { "epoch": 19.625779625779625, "grad_norm": 0.02131001278758049, "learning_rate": 5.3378697879216676e-08, "loss": 0.0046, "num_input_tokens_seen": 6799904, "step": 75520 }, { "epoch": 19.627079002079, "grad_norm": 0.042007412761449814, "learning_rate": 5.300904311269228e-08, "loss": 0.0103, "num_input_tokens_seen": 6800336, "step": 75525 }, { "epoch": 19.62837837837838, "grad_norm": 48.96070861816406, "learning_rate": 5.2640671393816566e-08, "loss": 0.1673, "num_input_tokens_seen": 6800816, "step": 75530 }, { "epoch": 19.629677754677754, "grad_norm": 0.022254912182688713, "learning_rate": 5.227358274153271e-08, "loss": 0.2626, "num_input_tokens_seen": 6801248, "step": 75535 }, { "epoch": 19.63097713097713, "grad_norm": 0.0008607202325947583, "learning_rate": 5.1907777174722835e-08, "loss": 0.0003, "num_input_tokens_seen": 6801728, "step": 75540 }, { "epoch": 19.632276507276508, "grad_norm": 0.003573288442566991, "learning_rate": 5.154325471220245e-08, "loss": 0.0016, "num_input_tokens_seen": 6802176, "step": 75545 }, { "epoch": 19.633575883575883, "grad_norm": 48.75678634643555, "learning_rate": 5.1180015372714886e-08, "loss": 0.283, "num_input_tokens_seen": 6802608, "step": 75550 }, { "epoch": 19.63487525987526, "grad_norm": 0.14296528697013855, "learning_rate": 5.081805917494242e-08, "loss": 0.0003, "num_input_tokens_seen": 6803072, "step": 75555 }, { "epoch": 19.636174636174637, "grad_norm": 0.36393269896507263, "learning_rate": 5.045738613750628e-08, "loss": 0.2331, "num_input_tokens_seen": 6803504, "step": 75560 }, { "epoch": 19.637474012474012, "grad_norm": 0.11486843973398209, "learning_rate": 5.009799627894718e-08, "loss": 0.1971, "num_input_tokens_seen": 6803936, "step": 75565 }, { "epoch": 19.638773388773387, "grad_norm": 42.711456298828125, "learning_rate": 4.973988961775866e-08, "loss": 0.0465, "num_input_tokens_seen": 6804368, "step": 75570 }, { "epoch": 19.640072765072766, "grad_norm": 0.05185222625732422, "learning_rate": 4.9383066172351e-08, "loss": 0.0292, "num_input_tokens_seen": 6804832, "step": 75575 }, { "epoch": 19.64137214137214, "grad_norm": 1.198215126991272, "learning_rate": 4.902752596107896e-08, "loss": 0.0761, "num_input_tokens_seen": 6805280, "step": 75580 }, { "epoch": 19.642671517671516, "grad_norm": 0.2144167572259903, "learning_rate": 4.867326900223068e-08, "loss": 0.0009, "num_input_tokens_seen": 6805776, "step": 75585 }, { "epoch": 19.643970893970895, "grad_norm": 32.17304992675781, "learning_rate": 4.832029531401938e-08, "loss": 0.2865, "num_input_tokens_seen": 6806224, "step": 75590 }, { "epoch": 19.64527027027027, "grad_norm": 15.77306842803955, "learning_rate": 4.7968604914605534e-08, "loss": 0.2045, "num_input_tokens_seen": 6806688, "step": 75595 }, { "epoch": 19.646569646569645, "grad_norm": 0.08795826137065887, "learning_rate": 4.761819782207466e-08, "loss": 0.4816, "num_input_tokens_seen": 6807136, "step": 75600 }, { "epoch": 19.647869022869024, "grad_norm": 4.14409065246582, "learning_rate": 4.726907405444569e-08, "loss": 0.0026, "num_input_tokens_seen": 6807584, "step": 75605 }, { "epoch": 19.6491683991684, "grad_norm": 0.04408526048064232, "learning_rate": 4.692123362967926e-08, "loss": 0.3166, "num_input_tokens_seen": 6808048, "step": 75610 }, { "epoch": 19.650467775467774, "grad_norm": 0.3515833914279938, "learning_rate": 4.657467656566106e-08, "loss": 0.1089, "num_input_tokens_seen": 6808480, "step": 75615 }, { "epoch": 19.651767151767153, "grad_norm": 0.04991752654314041, "learning_rate": 4.6229402880215713e-08, "loss": 0.119, "num_input_tokens_seen": 6808896, "step": 75620 }, { "epoch": 19.653066528066528, "grad_norm": 0.33611223101615906, "learning_rate": 4.5885412591104016e-08, "loss": 0.0025, "num_input_tokens_seen": 6809344, "step": 75625 }, { "epoch": 19.654365904365903, "grad_norm": 75.97476959228516, "learning_rate": 4.55427057160146e-08, "loss": 0.0943, "num_input_tokens_seen": 6809808, "step": 75630 }, { "epoch": 19.655665280665282, "grad_norm": 66.24229431152344, "learning_rate": 4.520128227257226e-08, "loss": 0.1022, "num_input_tokens_seen": 6810256, "step": 75635 }, { "epoch": 19.656964656964657, "grad_norm": 0.05969037488102913, "learning_rate": 4.4861142278340704e-08, "loss": 0.0413, "num_input_tokens_seen": 6810688, "step": 75640 }, { "epoch": 19.658264033264032, "grad_norm": 2.4284019470214844, "learning_rate": 4.452228575081152e-08, "loss": 0.0015, "num_input_tokens_seen": 6811152, "step": 75645 }, { "epoch": 19.65956340956341, "grad_norm": 0.08730456978082657, "learning_rate": 4.4184712707412426e-08, "loss": 0.0008, "num_input_tokens_seen": 6811600, "step": 75650 }, { "epoch": 19.660862785862786, "grad_norm": 0.24481067061424255, "learning_rate": 4.384842316550453e-08, "loss": 0.001, "num_input_tokens_seen": 6812016, "step": 75655 }, { "epoch": 19.66216216216216, "grad_norm": 0.06587683409452438, "learning_rate": 4.351341714238233e-08, "loss": 0.0009, "num_input_tokens_seen": 6812464, "step": 75660 }, { "epoch": 19.66346153846154, "grad_norm": 34.7009391784668, "learning_rate": 4.317969465527927e-08, "loss": 0.3542, "num_input_tokens_seen": 6812928, "step": 75665 }, { "epoch": 19.664760914760915, "grad_norm": 0.008628391660749912, "learning_rate": 4.2847255721356616e-08, "loss": 0.308, "num_input_tokens_seen": 6813392, "step": 75670 }, { "epoch": 19.66606029106029, "grad_norm": 0.0005988162010908127, "learning_rate": 4.251610035771181e-08, "loss": 0.0014, "num_input_tokens_seen": 6813840, "step": 75675 }, { "epoch": 19.66735966735967, "grad_norm": 6.3634114265441895, "learning_rate": 4.218622858137844e-08, "loss": 0.136, "num_input_tokens_seen": 6814320, "step": 75680 }, { "epoch": 19.668659043659044, "grad_norm": 0.2832840383052826, "learning_rate": 4.1857640409317946e-08, "loss": 0.1466, "num_input_tokens_seen": 6814816, "step": 75685 }, { "epoch": 19.66995841995842, "grad_norm": 0.03171202540397644, "learning_rate": 4.153033585843624e-08, "loss": 0.2458, "num_input_tokens_seen": 6815264, "step": 75690 }, { "epoch": 19.671257796257795, "grad_norm": 0.2419014722108841, "learning_rate": 4.120431494556154e-08, "loss": 0.0044, "num_input_tokens_seen": 6815728, "step": 75695 }, { "epoch": 19.672557172557173, "grad_norm": 0.014721276238560677, "learning_rate": 4.087957768746375e-08, "loss": 0.0549, "num_input_tokens_seen": 6816176, "step": 75700 }, { "epoch": 19.67385654885655, "grad_norm": 0.00223342957906425, "learning_rate": 4.055612410084342e-08, "loss": 0.0057, "num_input_tokens_seen": 6816624, "step": 75705 }, { "epoch": 19.675155925155924, "grad_norm": 0.8377135396003723, "learning_rate": 4.023395420233722e-08, "loss": 0.0007, "num_input_tokens_seen": 6817056, "step": 75710 }, { "epoch": 19.676455301455302, "grad_norm": 0.01528382208198309, "learning_rate": 3.9913068008512466e-08, "loss": 0.0073, "num_input_tokens_seen": 6817488, "step": 75715 }, { "epoch": 19.677754677754677, "grad_norm": 0.0017662591999396682, "learning_rate": 3.9593465535875396e-08, "loss": 0.0003, "num_input_tokens_seen": 6817968, "step": 75720 }, { "epoch": 19.679054054054053, "grad_norm": 0.010928395204246044, "learning_rate": 3.927514680086286e-08, "loss": 0.0019, "num_input_tokens_seen": 6818416, "step": 75725 }, { "epoch": 19.68035343035343, "grad_norm": 0.029271524399518967, "learning_rate": 3.89581118198451e-08, "loss": 0.0029, "num_input_tokens_seen": 6818864, "step": 75730 }, { "epoch": 19.681652806652806, "grad_norm": 0.3127504885196686, "learning_rate": 3.8642360609128516e-08, "loss": 0.0011, "num_input_tokens_seen": 6819328, "step": 75735 }, { "epoch": 19.68295218295218, "grad_norm": 0.07111038267612457, "learning_rate": 3.832789318495289e-08, "loss": 0.0006, "num_input_tokens_seen": 6819792, "step": 75740 }, { "epoch": 19.68425155925156, "grad_norm": 1.5383111238479614, "learning_rate": 3.8014709563488625e-08, "loss": 0.4826, "num_input_tokens_seen": 6820256, "step": 75745 }, { "epoch": 19.685550935550935, "grad_norm": 0.0851874053478241, "learning_rate": 3.7702809760847833e-08, "loss": 0.2011, "num_input_tokens_seen": 6820688, "step": 75750 }, { "epoch": 19.68685031185031, "grad_norm": 9.041690826416016, "learning_rate": 3.7392193793067684e-08, "loss": 0.0188, "num_input_tokens_seen": 6821088, "step": 75755 }, { "epoch": 19.68814968814969, "grad_norm": 57.52450180053711, "learning_rate": 3.708286167612707e-08, "loss": 0.039, "num_input_tokens_seen": 6821552, "step": 75760 }, { "epoch": 19.689449064449065, "grad_norm": 15.268380165100098, "learning_rate": 3.677481342592992e-08, "loss": 0.0093, "num_input_tokens_seen": 6822016, "step": 75765 }, { "epoch": 19.69074844074844, "grad_norm": 0.09382805228233337, "learning_rate": 3.646804905832468e-08, "loss": 0.195, "num_input_tokens_seen": 6822464, "step": 75770 }, { "epoch": 19.69204781704782, "grad_norm": 3.140293598175049, "learning_rate": 3.6162568589084845e-08, "loss": 0.5548, "num_input_tokens_seen": 6822912, "step": 75775 }, { "epoch": 19.693347193347194, "grad_norm": 0.010019992478191853, "learning_rate": 3.585837203392561e-08, "loss": 0.2642, "num_input_tokens_seen": 6823344, "step": 75780 }, { "epoch": 19.69464656964657, "grad_norm": 0.0005839387304149568, "learning_rate": 3.555545940848726e-08, "loss": 0.0004, "num_input_tokens_seen": 6823792, "step": 75785 }, { "epoch": 19.695945945945947, "grad_norm": 0.033343806862831116, "learning_rate": 3.525383072835453e-08, "loss": 0.0105, "num_input_tokens_seen": 6824224, "step": 75790 }, { "epoch": 19.697245322245323, "grad_norm": 0.6827938556671143, "learning_rate": 3.495348600903448e-08, "loss": 0.0004, "num_input_tokens_seen": 6824656, "step": 75795 }, { "epoch": 19.698544698544698, "grad_norm": 0.04949639365077019, "learning_rate": 3.4654425265978616e-08, "loss": 0.0059, "num_input_tokens_seen": 6825152, "step": 75800 }, { "epoch": 19.699844074844076, "grad_norm": 0.002148739527910948, "learning_rate": 3.435664851456632e-08, "loss": 0.1261, "num_input_tokens_seen": 6825600, "step": 75805 }, { "epoch": 19.70114345114345, "grad_norm": 9.060246467590332, "learning_rate": 3.406015577011312e-08, "loss": 0.2064, "num_input_tokens_seen": 6826048, "step": 75810 }, { "epoch": 19.702442827442827, "grad_norm": 0.5741959810256958, "learning_rate": 3.376494704786515e-08, "loss": 0.0035, "num_input_tokens_seen": 6826464, "step": 75815 }, { "epoch": 19.703742203742205, "grad_norm": 0.40757840871810913, "learning_rate": 3.347102236301025e-08, "loss": 0.0056, "num_input_tokens_seen": 6826880, "step": 75820 }, { "epoch": 19.70504158004158, "grad_norm": 0.13933241367340088, "learning_rate": 3.317838173066135e-08, "loss": 0.2354, "num_input_tokens_seen": 6827344, "step": 75825 }, { "epoch": 19.706340956340956, "grad_norm": 81.38285827636719, "learning_rate": 3.28870251658675e-08, "loss": 0.1935, "num_input_tokens_seen": 6827776, "step": 75830 }, { "epoch": 19.70764033264033, "grad_norm": 0.0031016722787171602, "learning_rate": 3.259695268361951e-08, "loss": 0.0107, "num_input_tokens_seen": 6828224, "step": 75835 }, { "epoch": 19.70893970893971, "grad_norm": 0.33564257621765137, "learning_rate": 3.230816429883321e-08, "loss": 0.0024, "num_input_tokens_seen": 6828656, "step": 75840 }, { "epoch": 19.710239085239085, "grad_norm": 17.639421463012695, "learning_rate": 3.2020660026360615e-08, "loss": 0.0162, "num_input_tokens_seen": 6829104, "step": 75845 }, { "epoch": 19.71153846153846, "grad_norm": 9.363804817199707, "learning_rate": 3.173443988098712e-08, "loss": 0.3247, "num_input_tokens_seen": 6829552, "step": 75850 }, { "epoch": 19.71283783783784, "grad_norm": 3.3624637126922607, "learning_rate": 3.144950387743428e-08, "loss": 0.0046, "num_input_tokens_seen": 6830016, "step": 75855 }, { "epoch": 19.714137214137214, "grad_norm": 0.040389880537986755, "learning_rate": 3.116585203035705e-08, "loss": 0.0574, "num_input_tokens_seen": 6830448, "step": 75860 }, { "epoch": 19.71543659043659, "grad_norm": 0.8176955580711365, "learning_rate": 3.0883484354346514e-08, "loss": 0.1859, "num_input_tokens_seen": 6830912, "step": 75865 }, { "epoch": 19.716735966735968, "grad_norm": 0.03956996649503708, "learning_rate": 3.060240086392163e-08, "loss": 0.0023, "num_input_tokens_seen": 6831440, "step": 75870 }, { "epoch": 19.718035343035343, "grad_norm": 0.1047387346625328, "learning_rate": 3.032260157354028e-08, "loss": 0.0068, "num_input_tokens_seen": 6831920, "step": 75875 }, { "epoch": 19.719334719334718, "grad_norm": 0.0993034690618515, "learning_rate": 3.004408649759094e-08, "loss": 0.0112, "num_input_tokens_seen": 6832320, "step": 75880 }, { "epoch": 19.720634095634097, "grad_norm": 0.003171202028170228, "learning_rate": 2.9766855650398273e-08, "loss": 0.0656, "num_input_tokens_seen": 6832768, "step": 75885 }, { "epoch": 19.721933471933472, "grad_norm": 0.0030553608667105436, "learning_rate": 2.9490909046225867e-08, "loss": 0.0072, "num_input_tokens_seen": 6833168, "step": 75890 }, { "epoch": 19.723232848232847, "grad_norm": 0.0053432476706802845, "learning_rate": 2.921624669925682e-08, "loss": 0.0004, "num_input_tokens_seen": 6833600, "step": 75895 }, { "epoch": 19.724532224532226, "grad_norm": 0.07762841880321503, "learning_rate": 2.8942868623624276e-08, "loss": 0.0002, "num_input_tokens_seen": 6834048, "step": 75900 }, { "epoch": 19.7258316008316, "grad_norm": 0.0073032071813941, "learning_rate": 2.8670774833386426e-08, "loss": 0.053, "num_input_tokens_seen": 6834496, "step": 75905 }, { "epoch": 19.727130977130976, "grad_norm": 1.6007219552993774, "learning_rate": 2.8399965342537637e-08, "loss": 0.0009, "num_input_tokens_seen": 6834944, "step": 75910 }, { "epoch": 19.728430353430355, "grad_norm": 3.086681842803955, "learning_rate": 2.813044016500288e-08, "loss": 0.0032, "num_input_tokens_seen": 6835360, "step": 75915 }, { "epoch": 19.72972972972973, "grad_norm": 0.0012634472222998738, "learning_rate": 2.786219931464884e-08, "loss": 0.0808, "num_input_tokens_seen": 6835824, "step": 75920 }, { "epoch": 19.731029106029105, "grad_norm": 4.4301838874816895, "learning_rate": 2.7595242805267262e-08, "loss": 0.0466, "num_input_tokens_seen": 6836288, "step": 75925 }, { "epoch": 19.732328482328484, "grad_norm": 36.67881774902344, "learning_rate": 2.7329570650591606e-08, "loss": 0.1236, "num_input_tokens_seen": 6836768, "step": 75930 }, { "epoch": 19.73362785862786, "grad_norm": 0.3964948356151581, "learning_rate": 2.7065182864283167e-08, "loss": 0.0459, "num_input_tokens_seen": 6837216, "step": 75935 }, { "epoch": 19.734927234927234, "grad_norm": 0.18780681490898132, "learning_rate": 2.68020794599394e-08, "loss": 0.0004, "num_input_tokens_seen": 6837696, "step": 75940 }, { "epoch": 19.736226611226613, "grad_norm": 0.010438330471515656, "learning_rate": 2.6540260451093922e-08, "loss": 0.085, "num_input_tokens_seen": 6838176, "step": 75945 }, { "epoch": 19.737525987525988, "grad_norm": 0.04173412173986435, "learning_rate": 2.6279725851208194e-08, "loss": 0.004, "num_input_tokens_seen": 6838592, "step": 75950 }, { "epoch": 19.738825363825363, "grad_norm": 0.008956675417721272, "learning_rate": 2.602047567368815e-08, "loss": 0.0005, "num_input_tokens_seen": 6839072, "step": 75955 }, { "epoch": 19.74012474012474, "grad_norm": 2.1792218685150146, "learning_rate": 2.5762509931862023e-08, "loss": 0.0017, "num_input_tokens_seen": 6839536, "step": 75960 }, { "epoch": 19.741424116424117, "grad_norm": 0.5391731858253479, "learning_rate": 2.5505828639002527e-08, "loss": 0.1345, "num_input_tokens_seen": 6839968, "step": 75965 }, { "epoch": 19.742723492723492, "grad_norm": 0.8170632123947144, "learning_rate": 2.5250431808304665e-08, "loss": 0.0043, "num_input_tokens_seen": 6840384, "step": 75970 }, { "epoch": 19.74402286902287, "grad_norm": 0.02564174495637417, "learning_rate": 2.4996319452907925e-08, "loss": 0.0031, "num_input_tokens_seen": 6840832, "step": 75975 }, { "epoch": 19.745322245322246, "grad_norm": 0.5864288806915283, "learning_rate": 2.474349158587963e-08, "loss": 0.0558, "num_input_tokens_seen": 6841296, "step": 75980 }, { "epoch": 19.74662162162162, "grad_norm": 59.910274505615234, "learning_rate": 2.449194822022327e-08, "loss": 0.1946, "num_input_tokens_seen": 6841760, "step": 75985 }, { "epoch": 19.747920997921, "grad_norm": 8.957484245300293, "learning_rate": 2.4241689368878494e-08, "loss": 0.0609, "num_input_tokens_seen": 6842240, "step": 75990 }, { "epoch": 19.749220374220375, "grad_norm": 0.040742430835962296, "learning_rate": 2.3992715044710012e-08, "loss": 0.1413, "num_input_tokens_seen": 6842704, "step": 75995 }, { "epoch": 19.75051975051975, "grad_norm": 0.19772343337535858, "learning_rate": 2.374502526053257e-08, "loss": 0.0078, "num_input_tokens_seen": 6843120, "step": 76000 }, { "epoch": 19.751819126819125, "grad_norm": 0.024444641545414925, "learning_rate": 2.349862002907488e-08, "loss": 0.171, "num_input_tokens_seen": 6843584, "step": 76005 }, { "epoch": 19.753118503118504, "grad_norm": 0.019068017601966858, "learning_rate": 2.325349936301846e-08, "loss": 0.0009, "num_input_tokens_seen": 6844032, "step": 76010 }, { "epoch": 19.75441787941788, "grad_norm": 0.003451975993812084, "learning_rate": 2.300966327496157e-08, "loss": 0.4221, "num_input_tokens_seen": 6844512, "step": 76015 }, { "epoch": 19.755717255717254, "grad_norm": 0.37686023116111755, "learning_rate": 2.27671117774525e-08, "loss": 0.0592, "num_input_tokens_seen": 6844928, "step": 76020 }, { "epoch": 19.757016632016633, "grad_norm": 0.024353958666324615, "learning_rate": 2.2525844882964607e-08, "loss": 0.0016, "num_input_tokens_seen": 6845376, "step": 76025 }, { "epoch": 19.758316008316008, "grad_norm": 1.2518504858016968, "learning_rate": 2.2285862603901865e-08, "loss": 0.0011, "num_input_tokens_seen": 6845856, "step": 76030 }, { "epoch": 19.759615384615383, "grad_norm": 0.0025642341934144497, "learning_rate": 2.2047164952609944e-08, "loss": 0.0067, "num_input_tokens_seen": 6846320, "step": 76035 }, { "epoch": 19.760914760914762, "grad_norm": 1.515264868736267, "learning_rate": 2.1809751941365142e-08, "loss": 0.0047, "num_input_tokens_seen": 6846752, "step": 76040 }, { "epoch": 19.762214137214137, "grad_norm": 0.018656276166439056, "learning_rate": 2.1573623582377133e-08, "loss": 0.0018, "num_input_tokens_seen": 6847232, "step": 76045 }, { "epoch": 19.763513513513512, "grad_norm": 0.028752904385328293, "learning_rate": 2.1338779887794534e-08, "loss": 0.0017, "num_input_tokens_seen": 6847664, "step": 76050 }, { "epoch": 19.76481288981289, "grad_norm": 0.001549256849102676, "learning_rate": 2.1105220869688246e-08, "loss": 0.0627, "num_input_tokens_seen": 6848080, "step": 76055 }, { "epoch": 19.766112266112266, "grad_norm": 0.06300808489322662, "learning_rate": 2.0872946540076433e-08, "loss": 0.0167, "num_input_tokens_seen": 6848576, "step": 76060 }, { "epoch": 19.76741164241164, "grad_norm": 0.0022962531074881554, "learning_rate": 2.064195691089954e-08, "loss": 0.1462, "num_input_tokens_seen": 6849008, "step": 76065 }, { "epoch": 19.76871101871102, "grad_norm": 0.0018657840555533767, "learning_rate": 2.0412251994042508e-08, "loss": 0.0903, "num_input_tokens_seen": 6849488, "step": 76070 }, { "epoch": 19.770010395010395, "grad_norm": 0.002074823947623372, "learning_rate": 2.018383180131811e-08, "loss": 0.001, "num_input_tokens_seen": 6849920, "step": 76075 }, { "epoch": 19.77130977130977, "grad_norm": 47.669044494628906, "learning_rate": 1.995669634447528e-08, "loss": 0.044, "num_input_tokens_seen": 6850368, "step": 76080 }, { "epoch": 19.77260914760915, "grad_norm": 0.005666770972311497, "learning_rate": 1.9730845635190788e-08, "loss": 0.0139, "num_input_tokens_seen": 6850800, "step": 76085 }, { "epoch": 19.773908523908524, "grad_norm": 0.19182319939136505, "learning_rate": 1.950627968508589e-08, "loss": 0.0009, "num_input_tokens_seen": 6851280, "step": 76090 }, { "epoch": 19.7752079002079, "grad_norm": 1.026369571685791, "learning_rate": 1.9282998505709693e-08, "loss": 0.1665, "num_input_tokens_seen": 6851696, "step": 76095 }, { "epoch": 19.776507276507278, "grad_norm": 0.005626131314784288, "learning_rate": 1.906100210854189e-08, "loss": 0.0001, "num_input_tokens_seen": 6852128, "step": 76100 }, { "epoch": 19.777806652806653, "grad_norm": 0.04511838033795357, "learning_rate": 1.8840290505001134e-08, "loss": 0.117, "num_input_tokens_seen": 6852608, "step": 76105 }, { "epoch": 19.77910602910603, "grad_norm": 0.11559414118528366, "learning_rate": 1.8620863706442228e-08, "loss": 0.0005, "num_input_tokens_seen": 6853040, "step": 76110 }, { "epoch": 19.780405405405407, "grad_norm": 0.007827413268387318, "learning_rate": 1.840272172414781e-08, "loss": 0.0001, "num_input_tokens_seen": 6853472, "step": 76115 }, { "epoch": 19.781704781704782, "grad_norm": 0.017876869067549706, "learning_rate": 1.8185864569336687e-08, "loss": 0.2944, "num_input_tokens_seen": 6853904, "step": 76120 }, { "epoch": 19.783004158004157, "grad_norm": 1.0468409061431885, "learning_rate": 1.79702922531666e-08, "loss": 0.1953, "num_input_tokens_seen": 6854384, "step": 76125 }, { "epoch": 19.784303534303533, "grad_norm": 0.005556178744882345, "learning_rate": 1.7756004786717572e-08, "loss": 0.1222, "num_input_tokens_seen": 6854816, "step": 76130 }, { "epoch": 19.78560291060291, "grad_norm": 38.64916229248047, "learning_rate": 1.7543002181014125e-08, "loss": 0.039, "num_input_tokens_seen": 6855264, "step": 76135 }, { "epoch": 19.786902286902286, "grad_norm": 1.0946499109268188, "learning_rate": 1.7331284447011377e-08, "loss": 0.0031, "num_input_tokens_seen": 6855696, "step": 76140 }, { "epoch": 19.78820166320166, "grad_norm": 0.937355637550354, "learning_rate": 1.712085159559784e-08, "loss": 0.128, "num_input_tokens_seen": 6856144, "step": 76145 }, { "epoch": 19.78950103950104, "grad_norm": 0.053353920578956604, "learning_rate": 1.69117036375982e-08, "loss": 0.5711, "num_input_tokens_seen": 6856592, "step": 76150 }, { "epoch": 19.790800415800415, "grad_norm": 3.909071445465088, "learning_rate": 1.670384058376495e-08, "loss": 0.0028, "num_input_tokens_seen": 6857056, "step": 76155 }, { "epoch": 19.79209979209979, "grad_norm": 0.27138590812683105, "learning_rate": 1.6497262444792326e-08, "loss": 0.0715, "num_input_tokens_seen": 6857488, "step": 76160 }, { "epoch": 19.79339916839917, "grad_norm": 0.08089489489793777, "learning_rate": 1.629196923130516e-08, "loss": 0.1582, "num_input_tokens_seen": 6857936, "step": 76165 }, { "epoch": 19.794698544698544, "grad_norm": 0.013633144088089466, "learning_rate": 1.608796095385612e-08, "loss": 0.0332, "num_input_tokens_seen": 6858384, "step": 76170 }, { "epoch": 19.79599792099792, "grad_norm": 1.7863194942474365, "learning_rate": 1.5885237622945136e-08, "loss": 0.0243, "num_input_tokens_seen": 6858816, "step": 76175 }, { "epoch": 19.7972972972973, "grad_norm": 3.098716974258423, "learning_rate": 1.5683799248994436e-08, "loss": 0.0034, "num_input_tokens_seen": 6859248, "step": 76180 }, { "epoch": 19.798596673596673, "grad_norm": 0.06169237941503525, "learning_rate": 1.5483645842362392e-08, "loss": 0.0144, "num_input_tokens_seen": 6859664, "step": 76185 }, { "epoch": 19.79989604989605, "grad_norm": 0.2861577570438385, "learning_rate": 1.5284777413349106e-08, "loss": 0.0049, "num_input_tokens_seen": 6860128, "step": 76190 }, { "epoch": 19.801195426195427, "grad_norm": 64.45181274414062, "learning_rate": 1.508719397217695e-08, "loss": 0.1375, "num_input_tokens_seen": 6860624, "step": 76195 }, { "epoch": 19.802494802494802, "grad_norm": 0.07043952494859695, "learning_rate": 1.4890895529010019e-08, "loss": 0.0974, "num_input_tokens_seen": 6861088, "step": 76200 }, { "epoch": 19.803794178794178, "grad_norm": 0.0034007439389824867, "learning_rate": 1.4695882093943015e-08, "loss": 0.0001, "num_input_tokens_seen": 6861536, "step": 76205 }, { "epoch": 19.805093555093556, "grad_norm": 0.008534288965165615, "learning_rate": 1.4502153677006802e-08, "loss": 0.1276, "num_input_tokens_seen": 6861968, "step": 76210 }, { "epoch": 19.80639293139293, "grad_norm": 6.2757344245910645, "learning_rate": 1.4309710288165634e-08, "loss": 0.0076, "num_input_tokens_seen": 6862400, "step": 76215 }, { "epoch": 19.807692307692307, "grad_norm": 2.0583269596099854, "learning_rate": 1.4118551937314372e-08, "loss": 0.0028, "num_input_tokens_seen": 6862848, "step": 76220 }, { "epoch": 19.808991683991685, "grad_norm": 0.007184094283729792, "learning_rate": 1.3928678634289593e-08, "loss": 0.0002, "num_input_tokens_seen": 6863296, "step": 76225 }, { "epoch": 19.81029106029106, "grad_norm": 1.3504440784454346, "learning_rate": 1.3740090388850158e-08, "loss": 0.0014, "num_input_tokens_seen": 6863712, "step": 76230 }, { "epoch": 19.811590436590436, "grad_norm": 0.7735159397125244, "learning_rate": 1.3552787210699413e-08, "loss": 0.0011, "num_input_tokens_seen": 6864144, "step": 76235 }, { "epoch": 19.812889812889814, "grad_norm": 0.01667802222073078, "learning_rate": 1.3366769109471321e-08, "loss": 0.0002, "num_input_tokens_seen": 6864576, "step": 76240 }, { "epoch": 19.81418918918919, "grad_norm": 0.046621739864349365, "learning_rate": 1.3182036094730454e-08, "loss": 0.3562, "num_input_tokens_seen": 6865024, "step": 76245 }, { "epoch": 19.815488565488565, "grad_norm": 0.10767251253128052, "learning_rate": 1.2998588175977544e-08, "loss": 0.0005, "num_input_tokens_seen": 6865504, "step": 76250 }, { "epoch": 19.816787941787943, "grad_norm": 0.020195914432406425, "learning_rate": 1.2816425362649487e-08, "loss": 0.0002, "num_input_tokens_seen": 6865968, "step": 76255 }, { "epoch": 19.81808731808732, "grad_norm": 0.0551580935716629, "learning_rate": 1.263554766411379e-08, "loss": 0.0618, "num_input_tokens_seen": 6866432, "step": 76260 }, { "epoch": 19.819386694386694, "grad_norm": 0.024159666150808334, "learning_rate": 1.245595508967412e-08, "loss": 0.0001, "num_input_tokens_seen": 6866896, "step": 76265 }, { "epoch": 19.820686070686072, "grad_norm": 0.008944054134190083, "learning_rate": 1.2277647648567537e-08, "loss": 0.0001, "num_input_tokens_seen": 6867376, "step": 76270 }, { "epoch": 19.821985446985448, "grad_norm": 0.130115807056427, "learning_rate": 1.2100625349961702e-08, "loss": 0.2063, "num_input_tokens_seen": 6867808, "step": 76275 }, { "epoch": 19.823284823284823, "grad_norm": 0.08914625644683838, "learning_rate": 1.1924888202963224e-08, "loss": 0.0066, "num_input_tokens_seen": 6868272, "step": 76280 }, { "epoch": 19.8245841995842, "grad_norm": 0.18802811205387115, "learning_rate": 1.1750436216612092e-08, "loss": 0.0853, "num_input_tokens_seen": 6868752, "step": 76285 }, { "epoch": 19.825883575883577, "grad_norm": 21.862146377563477, "learning_rate": 1.1577269399876135e-08, "loss": 0.0533, "num_input_tokens_seen": 6869184, "step": 76290 }, { "epoch": 19.82718295218295, "grad_norm": 0.0015638775657862425, "learning_rate": 1.1405387761664887e-08, "loss": 0.0023, "num_input_tokens_seen": 6869648, "step": 76295 }, { "epoch": 19.828482328482327, "grad_norm": 0.01472054049372673, "learning_rate": 1.1234791310818504e-08, "loss": 0.0003, "num_input_tokens_seen": 6870080, "step": 76300 }, { "epoch": 19.829781704781706, "grad_norm": 0.1377822309732437, "learning_rate": 1.1065480056110522e-08, "loss": 0.0026, "num_input_tokens_seen": 6870528, "step": 76305 }, { "epoch": 19.83108108108108, "grad_norm": 0.04533276706933975, "learning_rate": 1.0897454006245089e-08, "loss": 0.0223, "num_input_tokens_seen": 6870944, "step": 76310 }, { "epoch": 19.832380457380456, "grad_norm": 0.5655477643013, "learning_rate": 1.0730713169868067e-08, "loss": 0.0105, "num_input_tokens_seen": 6871392, "step": 76315 }, { "epoch": 19.833679833679835, "grad_norm": 0.005642004776746035, "learning_rate": 1.056525755555593e-08, "loss": 0.0081, "num_input_tokens_seen": 6871840, "step": 76320 }, { "epoch": 19.83497920997921, "grad_norm": 0.04861006513237953, "learning_rate": 1.040108717181576e-08, "loss": 0.0007, "num_input_tokens_seen": 6872304, "step": 76325 }, { "epoch": 19.836278586278585, "grad_norm": 0.16984646022319794, "learning_rate": 1.0238202027090804e-08, "loss": 0.0081, "num_input_tokens_seen": 6872736, "step": 76330 }, { "epoch": 19.837577962577964, "grad_norm": 0.0027567429933696985, "learning_rate": 1.0076602129757696e-08, "loss": 0.0765, "num_input_tokens_seen": 6873184, "step": 76335 }, { "epoch": 19.83887733887734, "grad_norm": 0.007887603715062141, "learning_rate": 9.916287488132003e-09, "loss": 0.0005, "num_input_tokens_seen": 6873616, "step": 76340 }, { "epoch": 19.840176715176714, "grad_norm": 0.021708304062485695, "learning_rate": 9.757258110454359e-09, "loss": 0.0001, "num_input_tokens_seen": 6874080, "step": 76345 }, { "epoch": 19.841476091476093, "grad_norm": 33.52342224121094, "learning_rate": 9.599514004904331e-09, "loss": 0.0367, "num_input_tokens_seen": 6874528, "step": 76350 }, { "epoch": 19.842775467775468, "grad_norm": 6.67249870300293, "learning_rate": 9.443055179597648e-09, "loss": 0.0984, "num_input_tokens_seen": 6874944, "step": 76355 }, { "epoch": 19.844074844074843, "grad_norm": 0.0899181142449379, "learning_rate": 9.287881642577878e-09, "loss": 0.0713, "num_input_tokens_seen": 6875424, "step": 76360 }, { "epoch": 19.84537422037422, "grad_norm": 1.4736601114273071, "learning_rate": 9.133993401830298e-09, "loss": 0.0129, "num_input_tokens_seen": 6875856, "step": 76365 }, { "epoch": 19.846673596673597, "grad_norm": 0.013236316852271557, "learning_rate": 8.981390465262474e-09, "loss": 0.0001, "num_input_tokens_seen": 6876304, "step": 76370 }, { "epoch": 19.847972972972972, "grad_norm": 0.0159183070063591, "learning_rate": 8.83007284072923e-09, "loss": 0.3114, "num_input_tokens_seen": 6876736, "step": 76375 }, { "epoch": 19.84927234927235, "grad_norm": 0.002536606742069125, "learning_rate": 8.680040536010458e-09, "loss": 0.0009, "num_input_tokens_seen": 6877168, "step": 76380 }, { "epoch": 19.850571725571726, "grad_norm": 0.08129555732011795, "learning_rate": 8.531293558824982e-09, "loss": 0.0008, "num_input_tokens_seen": 6877616, "step": 76385 }, { "epoch": 19.8518711018711, "grad_norm": 34.34809494018555, "learning_rate": 8.383831916816686e-09, "loss": 0.0193, "num_input_tokens_seen": 6878080, "step": 76390 }, { "epoch": 19.85317047817048, "grad_norm": 0.368803471326828, "learning_rate": 8.237655617576723e-09, "loss": 0.0249, "num_input_tokens_seen": 6878544, "step": 76395 }, { "epoch": 19.854469854469855, "grad_norm": 0.07588983327150345, "learning_rate": 8.092764668618524e-09, "loss": 0.0025, "num_input_tokens_seen": 6878976, "step": 76400 }, { "epoch": 19.85576923076923, "grad_norm": 0.04747232049703598, "learning_rate": 7.949159077397238e-09, "loss": 0.0219, "num_input_tokens_seen": 6879456, "step": 76405 }, { "epoch": 19.85706860706861, "grad_norm": 63.34657669067383, "learning_rate": 7.80683885129585e-09, "loss": 0.0785, "num_input_tokens_seen": 6879888, "step": 76410 }, { "epoch": 19.858367983367984, "grad_norm": 0.2450868785381317, "learning_rate": 7.665803997633503e-09, "loss": 0.0053, "num_input_tokens_seen": 6880352, "step": 76415 }, { "epoch": 19.85966735966736, "grad_norm": 0.2626173496246338, "learning_rate": 7.52605452366828e-09, "loss": 0.0046, "num_input_tokens_seen": 6880768, "step": 76420 }, { "epoch": 19.860966735966738, "grad_norm": 0.014178267680108547, "learning_rate": 7.387590436583325e-09, "loss": 0.0644, "num_input_tokens_seen": 6881168, "step": 76425 }, { "epoch": 19.862266112266113, "grad_norm": 0.07839421182870865, "learning_rate": 7.250411743500718e-09, "loss": 0.0005, "num_input_tokens_seen": 6881632, "step": 76430 }, { "epoch": 19.863565488565488, "grad_norm": 0.011426420882344246, "learning_rate": 7.114518451478702e-09, "loss": 0.0013, "num_input_tokens_seen": 6882096, "step": 76435 }, { "epoch": 19.864864864864863, "grad_norm": 0.5796525478363037, "learning_rate": 6.979910567500581e-09, "loss": 0.0841, "num_input_tokens_seen": 6882544, "step": 76440 }, { "epoch": 19.866164241164242, "grad_norm": 0.00087803287897259, "learning_rate": 6.8465880984941444e-09, "loss": 0.1963, "num_input_tokens_seen": 6883008, "step": 76445 }, { "epoch": 19.867463617463617, "grad_norm": 30.729921340942383, "learning_rate": 6.714551051317796e-09, "loss": 0.0197, "num_input_tokens_seen": 6883456, "step": 76450 }, { "epoch": 19.868762993762992, "grad_norm": 0.011487183161079884, "learning_rate": 6.583799432755e-09, "loss": 0.0011, "num_input_tokens_seen": 6883936, "step": 76455 }, { "epoch": 19.87006237006237, "grad_norm": 0.008133826777338982, "learning_rate": 6.454333249536482e-09, "loss": 0.0995, "num_input_tokens_seen": 6884448, "step": 76460 }, { "epoch": 19.871361746361746, "grad_norm": 0.001176060875877738, "learning_rate": 6.326152508320804e-09, "loss": 0.0009, "num_input_tokens_seen": 6884960, "step": 76465 }, { "epoch": 19.87266112266112, "grad_norm": 0.371488481760025, "learning_rate": 6.199257215697141e-09, "loss": 0.0172, "num_input_tokens_seen": 6885472, "step": 76470 }, { "epoch": 19.8739604989605, "grad_norm": 0.03551585599780083, "learning_rate": 6.073647378196379e-09, "loss": 0.0822, "num_input_tokens_seen": 6885888, "step": 76475 }, { "epoch": 19.875259875259875, "grad_norm": 0.0013912991853430867, "learning_rate": 5.94932300227169e-09, "loss": 0.2098, "num_input_tokens_seen": 6886368, "step": 76480 }, { "epoch": 19.87655925155925, "grad_norm": 0.011977690272033215, "learning_rate": 5.8262840943235085e-09, "loss": 0.0026, "num_input_tokens_seen": 6886816, "step": 76485 }, { "epoch": 19.87785862785863, "grad_norm": 0.004894776735454798, "learning_rate": 5.7045306606801075e-09, "loss": 0.3259, "num_input_tokens_seen": 6887248, "step": 76490 }, { "epoch": 19.879158004158004, "grad_norm": 0.14505018293857574, "learning_rate": 5.584062707597593e-09, "loss": 0.0005, "num_input_tokens_seen": 6887728, "step": 76495 }, { "epoch": 19.88045738045738, "grad_norm": 0.020680008456110954, "learning_rate": 5.46488024127656e-09, "loss": 0.1159, "num_input_tokens_seen": 6888160, "step": 76500 }, { "epoch": 19.881756756756758, "grad_norm": 0.3769668936729431, "learning_rate": 5.34698326784544e-09, "loss": 0.2307, "num_input_tokens_seen": 6888640, "step": 76505 }, { "epoch": 19.883056133056133, "grad_norm": 0.01838027313351631, "learning_rate": 5.230371793368827e-09, "loss": 0.0098, "num_input_tokens_seen": 6889072, "step": 76510 }, { "epoch": 19.884355509355508, "grad_norm": 0.020145704969763756, "learning_rate": 5.115045823841924e-09, "loss": 0.002, "num_input_tokens_seen": 6889504, "step": 76515 }, { "epoch": 19.885654885654887, "grad_norm": 0.015768464654684067, "learning_rate": 5.001005365196098e-09, "loss": 0.0234, "num_input_tokens_seen": 6889936, "step": 76520 }, { "epoch": 19.886954261954262, "grad_norm": 0.0024539967998862267, "learning_rate": 4.888250423298879e-09, "loss": 0.0001, "num_input_tokens_seen": 6890384, "step": 76525 }, { "epoch": 19.888253638253637, "grad_norm": 0.0855599120259285, "learning_rate": 4.776781003948405e-09, "loss": 0.0055, "num_input_tokens_seen": 6890800, "step": 76530 }, { "epoch": 19.889553014553016, "grad_norm": 0.0006013945094309747, "learning_rate": 4.666597112876203e-09, "loss": 0.0007, "num_input_tokens_seen": 6891264, "step": 76535 }, { "epoch": 19.89085239085239, "grad_norm": 90.3938217163086, "learning_rate": 4.557698755749962e-09, "loss": 0.1765, "num_input_tokens_seen": 6891712, "step": 76540 }, { "epoch": 19.892151767151766, "grad_norm": 0.024628493934869766, "learning_rate": 4.450085938170756e-09, "loss": 0.028, "num_input_tokens_seen": 6892160, "step": 76545 }, { "epoch": 19.893451143451145, "grad_norm": 0.004909084644168615, "learning_rate": 4.3437586656758236e-09, "loss": 0.003, "num_input_tokens_seen": 6892640, "step": 76550 }, { "epoch": 19.89475051975052, "grad_norm": 0.00357419578358531, "learning_rate": 4.238716943727461e-09, "loss": 0.0046, "num_input_tokens_seen": 6893136, "step": 76555 }, { "epoch": 19.896049896049895, "grad_norm": 0.5501202940940857, "learning_rate": 4.1349607777352305e-09, "loss": 0.0007, "num_input_tokens_seen": 6893584, "step": 76560 }, { "epoch": 19.897349272349274, "grad_norm": 0.5884853601455688, "learning_rate": 4.032490173030978e-09, "loss": 0.0018, "num_input_tokens_seen": 6894048, "step": 76565 }, { "epoch": 19.89864864864865, "grad_norm": 66.85023498535156, "learning_rate": 3.931305134882712e-09, "loss": 0.274, "num_input_tokens_seen": 6894496, "step": 76570 }, { "epoch": 19.899948024948024, "grad_norm": 0.0022508702240884304, "learning_rate": 3.831405668500154e-09, "loss": 0.0008, "num_input_tokens_seen": 6894944, "step": 76575 }, { "epoch": 19.901247401247403, "grad_norm": 59.804901123046875, "learning_rate": 3.732791779018086e-09, "loss": 0.4099, "num_input_tokens_seen": 6895376, "step": 76580 }, { "epoch": 19.902546777546778, "grad_norm": 0.06754239648580551, "learning_rate": 3.6354634715102255e-09, "loss": 0.0253, "num_input_tokens_seen": 6895808, "step": 76585 }, { "epoch": 19.903846153846153, "grad_norm": 98.21891784667969, "learning_rate": 3.5394207509781287e-09, "loss": 0.3439, "num_input_tokens_seen": 6896256, "step": 76590 }, { "epoch": 19.90514553014553, "grad_norm": 0.2664313018321991, "learning_rate": 3.444663622365063e-09, "loss": 0.0131, "num_input_tokens_seen": 6896736, "step": 76595 }, { "epoch": 19.906444906444907, "grad_norm": 64.65585327148438, "learning_rate": 3.351192090544908e-09, "loss": 0.2191, "num_input_tokens_seen": 6897216, "step": 76600 }, { "epoch": 19.907744282744282, "grad_norm": 0.021010195836424828, "learning_rate": 3.2590061603221535e-09, "loss": 0.2251, "num_input_tokens_seen": 6897648, "step": 76605 }, { "epoch": 19.909043659043657, "grad_norm": 0.380667120218277, "learning_rate": 3.1681058364402272e-09, "loss": 0.1483, "num_input_tokens_seen": 6898096, "step": 76610 }, { "epoch": 19.910343035343036, "grad_norm": 0.009355886839330196, "learning_rate": 3.078491123573168e-09, "loss": 0.0005, "num_input_tokens_seen": 6898560, "step": 76615 }, { "epoch": 19.91164241164241, "grad_norm": 0.027063211426138878, "learning_rate": 2.9901620263284026e-09, "loss": 0.0055, "num_input_tokens_seen": 6898992, "step": 76620 }, { "epoch": 19.912941787941786, "grad_norm": 0.072351835668087, "learning_rate": 2.9031185492522926e-09, "loss": 0.1088, "num_input_tokens_seen": 6899424, "step": 76625 }, { "epoch": 19.914241164241165, "grad_norm": 0.013066272251307964, "learning_rate": 2.817360696819038e-09, "loss": 0.0001, "num_input_tokens_seen": 6899872, "step": 76630 }, { "epoch": 19.91554054054054, "grad_norm": 0.0035372821148484945, "learning_rate": 2.732888473441775e-09, "loss": 0.0016, "num_input_tokens_seen": 6900288, "step": 76635 }, { "epoch": 19.916839916839916, "grad_norm": 0.32420000433921814, "learning_rate": 2.649701883461475e-09, "loss": 0.3209, "num_input_tokens_seen": 6900784, "step": 76640 }, { "epoch": 19.918139293139294, "grad_norm": 0.003321946132928133, "learning_rate": 2.5678009311608243e-09, "loss": 0.0545, "num_input_tokens_seen": 6901232, "step": 76645 }, { "epoch": 19.91943866943867, "grad_norm": 0.4787004888057709, "learning_rate": 2.4871856207475673e-09, "loss": 0.0007, "num_input_tokens_seen": 6901728, "step": 76650 }, { "epoch": 19.920738045738045, "grad_norm": 56.96125793457031, "learning_rate": 2.407855956368388e-09, "loss": 0.2168, "num_input_tokens_seen": 6902176, "step": 76655 }, { "epoch": 19.922037422037423, "grad_norm": 0.016375860199332237, "learning_rate": 2.329811942108906e-09, "loss": 0.0013, "num_input_tokens_seen": 6902624, "step": 76660 }, { "epoch": 19.9233367983368, "grad_norm": 63.907833099365234, "learning_rate": 2.2530535819742514e-09, "loss": 0.2307, "num_input_tokens_seen": 6903072, "step": 76665 }, { "epoch": 19.924636174636174, "grad_norm": 102.11640167236328, "learning_rate": 2.177580879919594e-09, "loss": 0.357, "num_input_tokens_seen": 6903504, "step": 76670 }, { "epoch": 19.925935550935552, "grad_norm": 1.4456326961517334, "learning_rate": 2.1033938398223872e-09, "loss": 0.0026, "num_input_tokens_seen": 6904000, "step": 76675 }, { "epoch": 19.927234927234927, "grad_norm": 0.33902794122695923, "learning_rate": 2.0304924655017986e-09, "loss": 0.0004, "num_input_tokens_seen": 6904432, "step": 76680 }, { "epoch": 19.928534303534303, "grad_norm": 37.276885986328125, "learning_rate": 1.9588767607020553e-09, "loss": 0.0289, "num_input_tokens_seen": 6904960, "step": 76685 }, { "epoch": 19.92983367983368, "grad_norm": 4.889704704284668, "learning_rate": 1.8885467291090973e-09, "loss": 0.0048, "num_input_tokens_seen": 6905440, "step": 76690 }, { "epoch": 19.931133056133056, "grad_norm": 0.08753594011068344, "learning_rate": 1.8195023743422523e-09, "loss": 0.0693, "num_input_tokens_seen": 6905888, "step": 76695 }, { "epoch": 19.93243243243243, "grad_norm": 0.004811066202819347, "learning_rate": 1.7517436999486825e-09, "loss": 0.0006, "num_input_tokens_seen": 6906352, "step": 76700 }, { "epoch": 19.93373180873181, "grad_norm": 1.8247960805892944, "learning_rate": 1.6852707094172636e-09, "loss": 0.0036, "num_input_tokens_seen": 6906816, "step": 76705 }, { "epoch": 19.935031185031185, "grad_norm": 25.158626556396484, "learning_rate": 1.620083406161932e-09, "loss": 0.0137, "num_input_tokens_seen": 6907248, "step": 76710 }, { "epoch": 19.93633056133056, "grad_norm": 0.635384738445282, "learning_rate": 1.5561817935411116e-09, "loss": 0.0051, "num_input_tokens_seen": 6907712, "step": 76715 }, { "epoch": 19.93762993762994, "grad_norm": 3.0074410438537598, "learning_rate": 1.493565874835512e-09, "loss": 0.4065, "num_input_tokens_seen": 6908192, "step": 76720 }, { "epoch": 19.938929313929314, "grad_norm": 0.20129163563251495, "learning_rate": 1.4322356532703308e-09, "loss": 0.0016, "num_input_tokens_seen": 6908640, "step": 76725 }, { "epoch": 19.94022869022869, "grad_norm": 49.558902740478516, "learning_rate": 1.3721911319958258e-09, "loss": 0.0769, "num_input_tokens_seen": 6909088, "step": 76730 }, { "epoch": 19.941528066528065, "grad_norm": 0.18203219771385193, "learning_rate": 1.3134323141039683e-09, "loss": 0.1907, "num_input_tokens_seen": 6909536, "step": 76735 }, { "epoch": 19.942827442827443, "grad_norm": 52.608455657958984, "learning_rate": 1.255959202614565e-09, "loss": 0.1245, "num_input_tokens_seen": 6910000, "step": 76740 }, { "epoch": 19.94412681912682, "grad_norm": 0.19773463904857635, "learning_rate": 1.199771800480809e-09, "loss": 0.0234, "num_input_tokens_seen": 6910432, "step": 76745 }, { "epoch": 19.945426195426194, "grad_norm": 78.40899658203125, "learning_rate": 1.1448701105976068e-09, "loss": 0.1204, "num_input_tokens_seen": 6910880, "step": 76750 }, { "epoch": 19.946725571725572, "grad_norm": 0.019400369375944138, "learning_rate": 1.0912541357877004e-09, "loss": 0.2203, "num_input_tokens_seen": 6911312, "step": 76755 }, { "epoch": 19.948024948024948, "grad_norm": 0.14607369899749756, "learning_rate": 1.0389238788072187e-09, "loss": 0.0422, "num_input_tokens_seen": 6911792, "step": 76760 }, { "epoch": 19.949324324324323, "grad_norm": 0.30759668350219727, "learning_rate": 9.878793423456767e-10, "loss": 0.0007, "num_input_tokens_seen": 6912240, "step": 76765 }, { "epoch": 19.9506237006237, "grad_norm": 73.05976867675781, "learning_rate": 9.381205290315276e-10, "loss": 0.1742, "num_input_tokens_seen": 6912672, "step": 76770 }, { "epoch": 19.951923076923077, "grad_norm": 2.1716806888580322, "learning_rate": 8.896474414238354e-10, "loss": 0.0017, "num_input_tokens_seen": 6913152, "step": 76775 }, { "epoch": 19.953222453222452, "grad_norm": 0.008465724065899849, "learning_rate": 8.424600820122752e-10, "loss": 0.008, "num_input_tokens_seen": 6913616, "step": 76780 }, { "epoch": 19.95452182952183, "grad_norm": 0.11138513684272766, "learning_rate": 7.965584532282355e-10, "loss": 0.0421, "num_input_tokens_seen": 6914016, "step": 76785 }, { "epoch": 19.955821205821206, "grad_norm": 0.8770356178283691, "learning_rate": 7.519425574281647e-10, "loss": 0.0009, "num_input_tokens_seen": 6914464, "step": 76790 }, { "epoch": 19.95712058212058, "grad_norm": 0.0353841669857502, "learning_rate": 7.086123969102243e-10, "loss": 0.1222, "num_input_tokens_seen": 6914928, "step": 76795 }, { "epoch": 19.95841995841996, "grad_norm": 0.05082513391971588, "learning_rate": 6.665679739031871e-10, "loss": 0.0004, "num_input_tokens_seen": 6915376, "step": 76800 }, { "epoch": 19.959719334719335, "grad_norm": 0.11607377231121063, "learning_rate": 6.258092905636614e-10, "loss": 0.0587, "num_input_tokens_seen": 6915824, "step": 76805 }, { "epoch": 19.96101871101871, "grad_norm": 0.0014680157182738185, "learning_rate": 5.863363489955198e-10, "loss": 0.0001, "num_input_tokens_seen": 6916288, "step": 76810 }, { "epoch": 19.96231808731809, "grad_norm": 19.9289608001709, "learning_rate": 5.481491512249193e-10, "loss": 0.0918, "num_input_tokens_seen": 6916752, "step": 76815 }, { "epoch": 19.963617463617464, "grad_norm": 0.01056475006043911, "learning_rate": 5.11247699214179e-10, "loss": 0.001, "num_input_tokens_seen": 6917168, "step": 76820 }, { "epoch": 19.96491683991684, "grad_norm": 19.805221557617188, "learning_rate": 4.75631994864556e-10, "loss": 0.0145, "num_input_tokens_seen": 6917584, "step": 76825 }, { "epoch": 19.966216216216218, "grad_norm": 0.0006605301168747246, "learning_rate": 4.413020400079182e-10, "loss": 0.0519, "num_input_tokens_seen": 6918032, "step": 76830 }, { "epoch": 19.967515592515593, "grad_norm": 0.013692614622414112, "learning_rate": 4.082578364067446e-10, "loss": 0.0263, "num_input_tokens_seen": 6918496, "step": 76835 }, { "epoch": 19.968814968814968, "grad_norm": 0.0014871995663270354, "learning_rate": 3.764993857624521e-10, "loss": 0.0608, "num_input_tokens_seen": 6918896, "step": 76840 }, { "epoch": 19.970114345114347, "grad_norm": 0.04185549542307854, "learning_rate": 3.460266897098441e-10, "loss": 0.0002, "num_input_tokens_seen": 6919360, "step": 76845 }, { "epoch": 19.97141372141372, "grad_norm": 0.9430644512176514, "learning_rate": 3.168397498115594e-10, "loss": 0.0186, "num_input_tokens_seen": 6919824, "step": 76850 }, { "epoch": 19.972713097713097, "grad_norm": 0.01917734183371067, "learning_rate": 2.889385675747258e-10, "loss": 0.2904, "num_input_tokens_seen": 6920240, "step": 76855 }, { "epoch": 19.974012474012476, "grad_norm": 0.26803019642829895, "learning_rate": 2.6232314443153106e-10, "loss": 0.0087, "num_input_tokens_seen": 6920656, "step": 76860 }, { "epoch": 19.97531185031185, "grad_norm": 4.038375377655029, "learning_rate": 2.3699348174754945e-10, "loss": 0.3398, "num_input_tokens_seen": 6921104, "step": 76865 }, { "epoch": 19.976611226611226, "grad_norm": 0.011131150647997856, "learning_rate": 2.1294958083006855e-10, "loss": 0.0002, "num_input_tokens_seen": 6921568, "step": 76870 }, { "epoch": 19.977910602910605, "grad_norm": 0.06660331040620804, "learning_rate": 1.9019144291421153e-10, "loss": 0.0897, "num_input_tokens_seen": 6922048, "step": 76875 }, { "epoch": 19.97920997920998, "grad_norm": 0.41420772671699524, "learning_rate": 1.6871906917126367e-10, "loss": 0.0067, "num_input_tokens_seen": 6922528, "step": 76880 }, { "epoch": 19.980509355509355, "grad_norm": 0.30093663930892944, "learning_rate": 1.4853246070589689e-10, "loss": 0.0982, "num_input_tokens_seen": 6922976, "step": 76885 }, { "epoch": 19.981808731808734, "grad_norm": 1.1095471382141113, "learning_rate": 1.2963161855339413e-10, "loss": 0.0093, "num_input_tokens_seen": 6923424, "step": 76890 }, { "epoch": 19.98310810810811, "grad_norm": 7.170722484588623, "learning_rate": 1.120165436879761e-10, "loss": 0.0031, "num_input_tokens_seen": 6923888, "step": 76895 }, { "epoch": 19.984407484407484, "grad_norm": 59.213966369628906, "learning_rate": 9.568723701447457e-11, "loss": 0.0694, "num_input_tokens_seen": 6924336, "step": 76900 }, { "epoch": 19.98570686070686, "grad_norm": 0.04296775162220001, "learning_rate": 8.064369937388349e-11, "loss": 0.1367, "num_input_tokens_seen": 6924768, "step": 76905 }, { "epoch": 19.987006237006238, "grad_norm": 0.6967834234237671, "learning_rate": 6.688593154058343e-11, "loss": 0.0127, "num_input_tokens_seen": 6925232, "step": 76910 }, { "epoch": 19.988305613305613, "grad_norm": 0.0156168881803751, "learning_rate": 5.4413934219565974e-11, "loss": 0.0096, "num_input_tokens_seen": 6925664, "step": 76915 }, { "epoch": 19.989604989604988, "grad_norm": 0.0009853022638708353, "learning_rate": 4.3227708054760504e-11, "loss": 0.4628, "num_input_tokens_seen": 6926128, "step": 76920 }, { "epoch": 19.990904365904367, "grad_norm": 0.012771929614245892, "learning_rate": 3.3327253620707433e-11, "loss": 0.0004, "num_input_tokens_seen": 6926576, "step": 76925 }, { "epoch": 19.992203742203742, "grad_norm": 0.05981791391968727, "learning_rate": 2.4712571428109344e-11, "loss": 0.0002, "num_input_tokens_seen": 6927008, "step": 76930 }, { "epoch": 19.993503118503117, "grad_norm": 0.025937000289559364, "learning_rate": 1.7383661915504334e-11, "loss": 0.0001, "num_input_tokens_seen": 6927488, "step": 76935 }, { "epoch": 19.994802494802496, "grad_norm": 1.6587162017822266, "learning_rate": 1.1340525463143791e-11, "loss": 0.0201, "num_input_tokens_seen": 6927968, "step": 76940 }, { "epoch": 19.99610187110187, "grad_norm": 6.192531585693359, "learning_rate": 6.583162381890162e-12, "loss": 0.1956, "num_input_tokens_seen": 6928400, "step": 76945 }, { "epoch": 19.997401247401246, "grad_norm": 0.006729859858751297, "learning_rate": 3.111572915992511e-12, "loss": 0.0006, "num_input_tokens_seen": 6928848, "step": 76950 }, { "epoch": 19.998700623700625, "grad_norm": 0.11080331355333328, "learning_rate": 9.2575724586208e-13, "loss": 0.4653, "num_input_tokens_seen": 6929280, "step": 76955 }, { "epoch": 20.0, "grad_norm": 0.3616178333759308, "learning_rate": 2.5715479745613837e-14, "loss": 0.0006, "num_input_tokens_seen": 6929680, "step": 76960 }, { "epoch": 20.0, "eval_loss": 0.9521387219429016, "eval_runtime": 13.1511, "eval_samples_per_second": 65.09, "eval_steps_per_second": 32.545, "num_input_tokens_seen": 6929680, "step": 76960 }, { "epoch": 20.0, "num_input_tokens_seen": 6929680, "step": 76960, "total_flos": 3.1204035840638976e+17, "train_loss": 0.23893776387709736, "train_runtime": 6717.4331, "train_samples_per_second": 22.911, "train_steps_per_second": 11.457 } ], "logging_steps": 5, "max_steps": 76960, "num_input_tokens_seen": 6929680, "num_train_epochs": 20, "save_steps": 3848, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1204035840638976e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }