diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4767 @@ +{ + "best_metric": 1.2019070386886597, + "best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/cds-llama/checkpoint-6003", + "epoch": 0.9999250093738282, + "eval_steps": 667, + "global_step": 6667, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014998125234345708, + "grad_norm": 4.572068214416504, + "learning_rate": 1.4999999999999999e-05, + "loss": 4.4943, + "step": 10 + }, + { + "epoch": 0.0029996250468691415, + "grad_norm": 3.0534508228302, + "learning_rate": 2.9999999999999997e-05, + "loss": 4.1499, + "step": 20 + }, + { + "epoch": 0.0044994375703037125, + "grad_norm": 2.2651097774505615, + "learning_rate": 4.4999999999999996e-05, + "loss": 3.7895, + "step": 30 + }, + { + "epoch": 0.005999250093738283, + "grad_norm": 1.8512789011001587, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.5688, + "step": 40 + }, + { + "epoch": 0.0074990626171728535, + "grad_norm": 1.5266691446304321, + "learning_rate": 7.5e-05, + "loss": 3.4336, + "step": 50 + }, + { + "epoch": 0.008998875140607425, + "grad_norm": 1.1185054779052734, + "learning_rate": 8.999999999999999e-05, + "loss": 3.2977, + "step": 60 + }, + { + "epoch": 0.010498687664041995, + "grad_norm": 0.9552314877510071, + "learning_rate": 0.00010499999999999999, + "loss": 3.1571, + "step": 70 + }, + { + "epoch": 0.011998500187476566, + "grad_norm": 1.1306709051132202, + "learning_rate": 0.00011999999999999999, + "loss": 2.9127, + "step": 80 + }, + { + "epoch": 0.013498312710911136, + "grad_norm": 1.14328932762146, + "learning_rate": 0.000135, + "loss": 2.7226, + "step": 90 + }, + { + "epoch": 0.014998125234345707, + "grad_norm": 2.9210283756256104, + "learning_rate": 0.00015, + "loss": 2.6076, + "step": 100 + }, + { + "epoch": 0.016497937757780277, + "grad_norm": 1.3305509090423584, + "learning_rate": 0.000165, + "loss": 2.5095, + "step": 110 + }, + { + "epoch": 0.01799775028121485, + "grad_norm": 1.7380200624465942, + "learning_rate": 0.00017999999999999998, + "loss": 2.434, + "step": 120 + }, + { + "epoch": 0.01949756280464942, + "grad_norm": 2.1826679706573486, + "learning_rate": 0.000195, + "loss": 2.3722, + "step": 130 + }, + { + "epoch": 0.02099737532808399, + "grad_norm": 1.3203043937683105, + "learning_rate": 0.00020999999999999998, + "loss": 2.3267, + "step": 140 + }, + { + "epoch": 0.02249718785151856, + "grad_norm": 1.439988136291504, + "learning_rate": 0.000225, + "loss": 2.2467, + "step": 150 + }, + { + "epoch": 0.023997000374953132, + "grad_norm": 1.5344315767288208, + "learning_rate": 0.00023999999999999998, + "loss": 2.1827, + "step": 160 + }, + { + "epoch": 0.0254968128983877, + "grad_norm": 1.6336548328399658, + "learning_rate": 0.00025499999999999996, + "loss": 2.1359, + "step": 170 + }, + { + "epoch": 0.02699662542182227, + "grad_norm": 1.454647421836853, + "learning_rate": 0.00027, + "loss": 2.054, + "step": 180 + }, + { + "epoch": 0.028496437945256844, + "grad_norm": 1.4481509923934937, + "learning_rate": 0.000285, + "loss": 2.0315, + "step": 190 + }, + { + "epoch": 0.029996250468691414, + "grad_norm": 1.7465718984603882, + "learning_rate": 0.0003, + "loss": 1.9847, + "step": 200 + }, + { + "epoch": 0.031496062992125984, + "grad_norm": 1.4593420028686523, + "learning_rate": 0.0002999982300767559, + "loss": 1.9516, + "step": 210 + }, + { + "epoch": 0.032995875515560553, + 
"grad_norm": 1.270998477935791, + "learning_rate": 0.000299992920348792, + "loss": 1.9128, + "step": 220 + }, + { + "epoch": 0.03449568803899512, + "grad_norm": 1.7116246223449707, + "learning_rate": 0.0002999840709414124, + "loss": 1.9023, + "step": 230 + }, + { + "epoch": 0.0359955005624297, + "grad_norm": 1.2121365070343018, + "learning_rate": 0.0002999716820634541, + "loss": 1.8635, + "step": 240 + }, + { + "epoch": 0.03749531308586427, + "grad_norm": 1.5796058177947998, + "learning_rate": 0.000299955754007282, + "loss": 1.8292, + "step": 250 + }, + { + "epoch": 0.03899512560929884, + "grad_norm": 1.2615342140197754, + "learning_rate": 0.00029993628714878185, + "loss": 1.7873, + "step": 260 + }, + { + "epoch": 0.04049493813273341, + "grad_norm": 1.3430827856063843, + "learning_rate": 0.00029991328194735155, + "loss": 1.798, + "step": 270 + }, + { + "epoch": 0.04199475065616798, + "grad_norm": 1.2447484731674194, + "learning_rate": 0.0002998867389458904, + "loss": 1.7618, + "step": 280 + }, + { + "epoch": 0.04349456317960255, + "grad_norm": 1.4957722425460815, + "learning_rate": 0.00029985665877078595, + "loss": 1.7638, + "step": 290 + }, + { + "epoch": 0.04499437570303712, + "grad_norm": 1.152177333831787, + "learning_rate": 0.0002998230421318997, + "loss": 1.7557, + "step": 300 + }, + { + "epoch": 0.046494188226471694, + "grad_norm": 1.0447930097579956, + "learning_rate": 0.0002997858898225498, + "loss": 1.7204, + "step": 310 + }, + { + "epoch": 0.047994000749906264, + "grad_norm": 1.4656999111175537, + "learning_rate": 0.0002997452027194928, + "loss": 1.7295, + "step": 320 + }, + { + "epoch": 0.049493813273340834, + "grad_norm": 1.344132661819458, + "learning_rate": 0.0002997009817829027, + "loss": 1.7353, + "step": 330 + }, + { + "epoch": 0.0509936257967754, + "grad_norm": 1.3033422231674194, + "learning_rate": 0.0002996532280563483, + "loss": 1.7043, + "step": 340 + }, + { + "epoch": 0.05249343832020997, + "grad_norm": 1.070493221282959, + "learning_rate": 0.0002996019426667687, + "loss": 1.6626, + "step": 350 + }, + { + "epoch": 0.05399325084364454, + "grad_norm": 1.1506638526916504, + "learning_rate": 0.00029954712682444656, + "loss": 1.652, + "step": 360 + }, + { + "epoch": 0.05549306336707911, + "grad_norm": 1.296350121498108, + "learning_rate": 0.0002994887818229797, + "loss": 1.6325, + "step": 370 + }, + { + "epoch": 0.05699287589051369, + "grad_norm": 1.0449875593185425, + "learning_rate": 0.0002994269090392505, + "loss": 1.6708, + "step": 380 + }, + { + "epoch": 0.05849268841394826, + "grad_norm": 1.1577085256576538, + "learning_rate": 0.00029936150993339325, + "loss": 1.6341, + "step": 390 + }, + { + "epoch": 0.05999250093738283, + "grad_norm": 1.0227168798446655, + "learning_rate": 0.0002992925860487599, + "loss": 1.6497, + "step": 400 + }, + { + "epoch": 0.0614923134608174, + "grad_norm": 1.073912262916565, + "learning_rate": 0.0002992201390118837, + "loss": 1.6289, + "step": 410 + }, + { + "epoch": 0.06299212598425197, + "grad_norm": 1.1426217555999756, + "learning_rate": 0.00029914417053244054, + "loss": 1.6277, + "step": 420 + }, + { + "epoch": 0.06449193850768654, + "grad_norm": 1.0212661027908325, + "learning_rate": 0.00029906468240320874, + "loss": 1.6146, + "step": 430 + }, + { + "epoch": 0.06599175103112111, + "grad_norm": 1.0049968957901, + "learning_rate": 0.00029898167650002676, + "loss": 1.6091, + "step": 440 + }, + { + "epoch": 0.06749156355455568, + "grad_norm": 0.9726770520210266, + "learning_rate": 0.0002988951547817491, + "loss": 1.5967, + "step": 450 + 
}, + { + "epoch": 0.06899137607799025, + "grad_norm": 1.0028915405273438, + "learning_rate": 0.00029880511929019965, + "loss": 1.6033, + "step": 460 + }, + { + "epoch": 0.07049118860142482, + "grad_norm": 1.164440631866455, + "learning_rate": 0.0002987115721501239, + "loss": 1.5866, + "step": 470 + }, + { + "epoch": 0.0719910011248594, + "grad_norm": 1.1381382942199707, + "learning_rate": 0.00029861451556913865, + "loss": 1.5928, + "step": 480 + }, + { + "epoch": 0.07349081364829396, + "grad_norm": 1.057630181312561, + "learning_rate": 0.00029851395183767983, + "loss": 1.579, + "step": 490 + }, + { + "epoch": 0.07499062617172854, + "grad_norm": 1.0061906576156616, + "learning_rate": 0.00029840988332894864, + "loss": 1.5746, + "step": 500 + }, + { + "epoch": 0.0764904386951631, + "grad_norm": 1.0443668365478516, + "learning_rate": 0.00029830231249885537, + "loss": 1.5546, + "step": 510 + }, + { + "epoch": 0.07799025121859768, + "grad_norm": 1.1543552875518799, + "learning_rate": 0.00029819124188596146, + "loss": 1.553, + "step": 520 + }, + { + "epoch": 0.07949006374203224, + "grad_norm": 0.997122049331665, + "learning_rate": 0.00029807667411141977, + "loss": 1.5639, + "step": 530 + }, + { + "epoch": 0.08098987626546682, + "grad_norm": 1.0399993658065796, + "learning_rate": 0.0002979586118789125, + "loss": 1.5401, + "step": 540 + }, + { + "epoch": 0.0824896887889014, + "grad_norm": 1.0152385234832764, + "learning_rate": 0.0002978370579745876, + "loss": 1.5372, + "step": 550 + }, + { + "epoch": 0.08398950131233596, + "grad_norm": 0.9546723961830139, + "learning_rate": 0.00029771201526699264, + "loss": 1.528, + "step": 560 + }, + { + "epoch": 0.08548931383577053, + "grad_norm": 1.0119845867156982, + "learning_rate": 0.0002975834867070077, + "loss": 1.518, + "step": 570 + }, + { + "epoch": 0.0869891263592051, + "grad_norm": 1.063570261001587, + "learning_rate": 0.00029745147532777514, + "loss": 1.5108, + "step": 580 + }, + { + "epoch": 0.08848893888263967, + "grad_norm": 1.0260778665542603, + "learning_rate": 0.0002973159842446285, + "loss": 1.5021, + "step": 590 + }, + { + "epoch": 0.08998875140607424, + "grad_norm": 0.9610818028450012, + "learning_rate": 0.00029717701665501865, + "loss": 1.516, + "step": 600 + }, + { + "epoch": 0.09148856392950881, + "grad_norm": 1.0446592569351196, + "learning_rate": 0.00029703457583843846, + "loss": 1.5103, + "step": 610 + }, + { + "epoch": 0.09298837645294339, + "grad_norm": 0.9651235342025757, + "learning_rate": 0.00029688866515634546, + "loss": 1.5173, + "step": 620 + }, + { + "epoch": 0.09448818897637795, + "grad_norm": 0.9919349551200867, + "learning_rate": 0.00029673928805208237, + "loss": 1.5078, + "step": 630 + }, + { + "epoch": 0.09598800149981253, + "grad_norm": 0.967276394367218, + "learning_rate": 0.00029658644805079606, + "loss": 1.5167, + "step": 640 + }, + { + "epoch": 0.09748781402324709, + "grad_norm": 1.0425859689712524, + "learning_rate": 0.00029643014875935404, + "loss": 1.5134, + "step": 650 + }, + { + "epoch": 0.09898762654668167, + "grad_norm": 0.9938341975212097, + "learning_rate": 0.00029627039386625976, + "loss": 1.4941, + "step": 660 + }, + { + "epoch": 0.10003749531308587, + "eval_loss": 1.5367733240127563, + "eval_runtime": 35.5683, + "eval_samples_per_second": 702.874, + "eval_steps_per_second": 87.859, + "step": 667 + }, + { + "epoch": 0.10048743907011623, + "grad_norm": 1.0440524816513062, + "learning_rate": 0.0002961071871415651, + "loss": 1.474, + "step": 670 + }, + { + "epoch": 0.1019872515935508, + "grad_norm": 
0.9019431471824646, + "learning_rate": 0.00029594053243678175, + "loss": 1.5061, + "step": 680 + }, + { + "epoch": 0.10348706411698538, + "grad_norm": 1.0530225038528442, + "learning_rate": 0.00029577043368479017, + "loss": 1.4618, + "step": 690 + }, + { + "epoch": 0.10498687664041995, + "grad_norm": 0.9635890126228333, + "learning_rate": 0.0002955968948997469, + "loss": 1.4822, + "step": 700 + }, + { + "epoch": 0.10648668916385452, + "grad_norm": 0.97013920545578, + "learning_rate": 0.00029541992017698956, + "loss": 1.4458, + "step": 710 + }, + { + "epoch": 0.10798650168728909, + "grad_norm": 1.0321599245071411, + "learning_rate": 0.0002952395136929406, + "loss": 1.4708, + "step": 720 + }, + { + "epoch": 0.10948631421072366, + "grad_norm": 0.8976233601570129, + "learning_rate": 0.00029505567970500833, + "loss": 1.4572, + "step": 730 + }, + { + "epoch": 0.11098612673415822, + "grad_norm": 0.9593002796173096, + "learning_rate": 0.0002948684225514868, + "loss": 1.4506, + "step": 740 + }, + { + "epoch": 0.1124859392575928, + "grad_norm": 0.941360354423523, + "learning_rate": 0.0002946777466514531, + "loss": 1.4683, + "step": 750 + }, + { + "epoch": 0.11398575178102738, + "grad_norm": 1.0240117311477661, + "learning_rate": 0.00029448365650466336, + "loss": 1.4679, + "step": 760 + }, + { + "epoch": 0.11548556430446194, + "grad_norm": 1.041799545288086, + "learning_rate": 0.0002942861566914465, + "loss": 1.4544, + "step": 770 + }, + { + "epoch": 0.11698537682789652, + "grad_norm": 0.9296654462814331, + "learning_rate": 0.0002940852518725959, + "loss": 1.4473, + "step": 780 + }, + { + "epoch": 0.11848518935133108, + "grad_norm": 0.9188225269317627, + "learning_rate": 0.0002938809467892596, + "loss": 1.4461, + "step": 790 + }, + { + "epoch": 0.11998500187476566, + "grad_norm": 0.9431837201118469, + "learning_rate": 0.0002936732462628287, + "loss": 1.4413, + "step": 800 + }, + { + "epoch": 0.12148481439820022, + "grad_norm": 0.9874680042266846, + "learning_rate": 0.0002934621551948229, + "loss": 1.4435, + "step": 810 + }, + { + "epoch": 0.1229846269216348, + "grad_norm": 0.9407353401184082, + "learning_rate": 0.0002932476785667754, + "loss": 1.4256, + "step": 820 + }, + { + "epoch": 0.12448443944506937, + "grad_norm": 0.9208086729049683, + "learning_rate": 0.00029302982144011514, + "loss": 1.4556, + "step": 830 + }, + { + "epoch": 0.12598425196850394, + "grad_norm": 0.8780565857887268, + "learning_rate": 0.00029280858895604727, + "loss": 1.4389, + "step": 840 + }, + { + "epoch": 0.1274840644919385, + "grad_norm": 0.9499910473823547, + "learning_rate": 0.0002925839863354322, + "loss": 1.4312, + "step": 850 + }, + { + "epoch": 0.1289838770153731, + "grad_norm": 0.9616991281509399, + "learning_rate": 0.00029235601887866167, + "loss": 1.4203, + "step": 860 + }, + { + "epoch": 0.13048368953880765, + "grad_norm": 0.8856968283653259, + "learning_rate": 0.00029212469196553456, + "loss": 1.4192, + "step": 870 + }, + { + "epoch": 0.13198350206224221, + "grad_norm": 0.9547106027603149, + "learning_rate": 0.00029189001105512914, + "loss": 1.4346, + "step": 880 + }, + { + "epoch": 0.13348331458567678, + "grad_norm": 0.9112891554832458, + "learning_rate": 0.0002916519816856748, + "loss": 1.4377, + "step": 890 + }, + { + "epoch": 0.13498312710911137, + "grad_norm": 0.9346863031387329, + "learning_rate": 0.000291410609474421, + "loss": 1.4468, + "step": 900 + }, + { + "epoch": 0.13648293963254593, + "grad_norm": 0.9371203184127808, + "learning_rate": 0.0002911659001175049, + "loss": 1.4067, + "step": 910 + }, + { 
+ "epoch": 0.1379827521559805, + "grad_norm": 0.8880829811096191, + "learning_rate": 0.000290917859389817, + "loss": 1.4319, + "step": 920 + }, + { + "epoch": 0.13948256467941508, + "grad_norm": 1.0249537229537964, + "learning_rate": 0.0002906664931448645, + "loss": 1.4336, + "step": 930 + }, + { + "epoch": 0.14098237720284965, + "grad_norm": 0.8886300325393677, + "learning_rate": 0.00029041180731463357, + "loss": 1.4253, + "step": 940 + }, + { + "epoch": 0.1424821897262842, + "grad_norm": 0.9199953079223633, + "learning_rate": 0.00029015380790944916, + "loss": 1.4305, + "step": 950 + }, + { + "epoch": 0.1439820022497188, + "grad_norm": 0.9134594202041626, + "learning_rate": 0.0002898925010178332, + "loss": 1.4137, + "step": 960 + }, + { + "epoch": 0.14548181477315336, + "grad_norm": 0.9190260767936707, + "learning_rate": 0.00028962789280636083, + "loss": 1.4079, + "step": 970 + }, + { + "epoch": 0.14698162729658792, + "grad_norm": 0.9165851473808289, + "learning_rate": 0.00028935998951951515, + "loss": 1.4166, + "step": 980 + }, + { + "epoch": 0.1484814398200225, + "grad_norm": 0.9126707315444946, + "learning_rate": 0.00028908879747953955, + "loss": 1.4025, + "step": 990 + }, + { + "epoch": 0.14998125234345708, + "grad_norm": 0.9610698819160461, + "learning_rate": 0.00028881432308628855, + "loss": 1.3994, + "step": 1000 + }, + { + "epoch": 0.15148106486689164, + "grad_norm": 0.9160841703414917, + "learning_rate": 0.00028853657281707696, + "loss": 1.4105, + "step": 1010 + }, + { + "epoch": 0.1529808773903262, + "grad_norm": 0.9419758319854736, + "learning_rate": 0.0002882555532265269, + "loss": 1.4107, + "step": 1020 + }, + { + "epoch": 0.1544806899137608, + "grad_norm": 0.8920834064483643, + "learning_rate": 0.0002879712709464131, + "loss": 1.4076, + "step": 1030 + }, + { + "epoch": 0.15598050243719536, + "grad_norm": 0.88045734167099, + "learning_rate": 0.0002876837326855064, + "loss": 1.3878, + "step": 1040 + }, + { + "epoch": 0.15748031496062992, + "grad_norm": 0.8166473507881165, + "learning_rate": 0.00028739294522941555, + "loss": 1.3883, + "step": 1050 + }, + { + "epoch": 0.15898012748406448, + "grad_norm": 0.9151524901390076, + "learning_rate": 0.00028709891544042687, + "loss": 1.3817, + "step": 1060 + }, + { + "epoch": 0.16047994000749907, + "grad_norm": 0.9363940954208374, + "learning_rate": 0.0002868016502573425, + "loss": 1.3969, + "step": 1070 + }, + { + "epoch": 0.16197975253093364, + "grad_norm": 0.8952043056488037, + "learning_rate": 0.00028650115669531654, + "loss": 1.3784, + "step": 1080 + }, + { + "epoch": 0.1634795650543682, + "grad_norm": 0.89581698179245, + "learning_rate": 0.00028619744184568946, + "loss": 1.3764, + "step": 1090 + }, + { + "epoch": 0.1649793775778028, + "grad_norm": 0.856798529624939, + "learning_rate": 0.00028589051287582093, + "loss": 1.3826, + "step": 1100 + }, + { + "epoch": 0.16647919010123735, + "grad_norm": 0.9560316801071167, + "learning_rate": 0.0002855803770289206, + "loss": 1.3924, + "step": 1110 + }, + { + "epoch": 0.1679790026246719, + "grad_norm": 0.9139536619186401, + "learning_rate": 0.0002852670416238769, + "loss": 1.3655, + "step": 1120 + }, + { + "epoch": 0.16947881514810648, + "grad_norm": 0.8980649709701538, + "learning_rate": 0.0002849505140550848, + "loss": 1.3826, + "step": 1130 + }, + { + "epoch": 0.17097862767154107, + "grad_norm": 0.8634670376777649, + "learning_rate": 0.00028463080179227105, + "loss": 1.3837, + "step": 1140 + }, + { + "epoch": 0.17247844019497563, + "grad_norm": 0.9725201725959778, + "learning_rate": 
0.00028430791238031775, + "loss": 1.4023, + "step": 1150 + }, + { + "epoch": 0.1739782527184102, + "grad_norm": 0.7960573434829712, + "learning_rate": 0.00028398185343908464, + "loss": 1.3784, + "step": 1160 + }, + { + "epoch": 0.17547806524184478, + "grad_norm": 0.866399884223938, + "learning_rate": 0.000283652632663229, + "loss": 1.3919, + "step": 1170 + }, + { + "epoch": 0.17697787776527935, + "grad_norm": 0.8062925934791565, + "learning_rate": 0.0002833202578220242, + "loss": 1.3724, + "step": 1180 + }, + { + "epoch": 0.1784776902887139, + "grad_norm": 0.8660501837730408, + "learning_rate": 0.0002829847367591764, + "loss": 1.3948, + "step": 1190 + }, + { + "epoch": 0.17997750281214847, + "grad_norm": 0.8984708189964294, + "learning_rate": 0.0002826460773926393, + "loss": 1.3664, + "step": 1200 + }, + { + "epoch": 0.18147731533558306, + "grad_norm": 0.8522405624389648, + "learning_rate": 0.00028230428771442725, + "loss": 1.3623, + "step": 1210 + }, + { + "epoch": 0.18297712785901762, + "grad_norm": 0.8871493935585022, + "learning_rate": 0.000281959375790427, + "loss": 1.3609, + "step": 1220 + }, + { + "epoch": 0.1844769403824522, + "grad_norm": 0.9038013815879822, + "learning_rate": 0.0002816113497602069, + "loss": 1.3708, + "step": 1230 + }, + { + "epoch": 0.18597675290588678, + "grad_norm": 0.8472639918327332, + "learning_rate": 0.0002812602178368251, + "loss": 1.3683, + "step": 1240 + }, + { + "epoch": 0.18747656542932134, + "grad_norm": 0.8597409725189209, + "learning_rate": 0.00028090598830663566, + "loss": 1.3672, + "step": 1250 + }, + { + "epoch": 0.1889763779527559, + "grad_norm": 0.8871280550956726, + "learning_rate": 0.00028054866952909296, + "loss": 1.3935, + "step": 1260 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.8564477562904358, + "learning_rate": 0.00028018826993655445, + "loss": 1.3611, + "step": 1270 + }, + { + "epoch": 0.19197600299962506, + "grad_norm": 0.8285048007965088, + "learning_rate": 0.00027982479803408166, + "loss": 1.3513, + "step": 1280 + }, + { + "epoch": 0.19347581552305962, + "grad_norm": 0.8391554355621338, + "learning_rate": 0.00027945826239923955, + "loss": 1.3692, + "step": 1290 + }, + { + "epoch": 0.19497562804649418, + "grad_norm": 0.9388437271118164, + "learning_rate": 0.000279088671681894, + "loss": 1.3611, + "step": 1300 + }, + { + "epoch": 0.19647544056992877, + "grad_norm": 0.9204142093658447, + "learning_rate": 0.0002787160346040076, + "loss": 1.3392, + "step": 1310 + }, + { + "epoch": 0.19797525309336333, + "grad_norm": 0.9090197086334229, + "learning_rate": 0.00027834035995943413, + "loss": 1.3555, + "step": 1320 + }, + { + "epoch": 0.1994750656167979, + "grad_norm": 0.8415820002555847, + "learning_rate": 0.00027796165661371074, + "loss": 1.3371, + "step": 1330 + }, + { + "epoch": 0.20007499062617173, + "eval_loss": 1.3978526592254639, + "eval_runtime": 35.0057, + "eval_samples_per_second": 714.169, + "eval_steps_per_second": 89.271, + "step": 1334 + }, + { + "epoch": 0.20097487814023246, + "grad_norm": 0.8407223224639893, + "learning_rate": 0.00027757993350384873, + "loss": 1.3453, + "step": 1340 + }, + { + "epoch": 0.20247469066366705, + "grad_norm": 0.8398892879486084, + "learning_rate": 0.00027719519963812286, + "loss": 1.3543, + "step": 1350 + }, + { + "epoch": 0.2039745031871016, + "grad_norm": 0.8169878721237183, + "learning_rate": 0.00027680746409585865, + "loss": 1.3542, + "step": 1360 + }, + { + "epoch": 0.20547431571053618, + "grad_norm": 0.8487715721130371, + "learning_rate": 0.00027641673602721805, + "loss": 
1.3308, + "step": 1370 + }, + { + "epoch": 0.20697412823397077, + "grad_norm": 0.9306272268295288, + "learning_rate": 0.00027602302465298367, + "loss": 1.3381, + "step": 1380 + }, + { + "epoch": 0.20847394075740533, + "grad_norm": 0.8499324321746826, + "learning_rate": 0.0002756263392643409, + "loss": 1.3371, + "step": 1390 + }, + { + "epoch": 0.2099737532808399, + "grad_norm": 0.7504796385765076, + "learning_rate": 0.0002752266892226591, + "loss": 1.3359, + "step": 1400 + }, + { + "epoch": 0.21147356580427445, + "grad_norm": 0.8636272549629211, + "learning_rate": 0.0002748240839592701, + "loss": 1.3378, + "step": 1410 + }, + { + "epoch": 0.21297337832770905, + "grad_norm": 0.9339527487754822, + "learning_rate": 0.00027441853297524615, + "loss": 1.3743, + "step": 1420 + }, + { + "epoch": 0.2144731908511436, + "grad_norm": 0.9132825136184692, + "learning_rate": 0.00027401004584117535, + "loss": 1.3427, + "step": 1430 + }, + { + "epoch": 0.21597300337457817, + "grad_norm": 0.8461547493934631, + "learning_rate": 0.00027359863219693614, + "loss": 1.3338, + "step": 1440 + }, + { + "epoch": 0.21747281589801276, + "grad_norm": 0.8285521268844604, + "learning_rate": 0.00027318430175146934, + "loss": 1.3419, + "step": 1450 + }, + { + "epoch": 0.21897262842144732, + "grad_norm": 0.9066304564476013, + "learning_rate": 0.00027276706428254965, + "loss": 1.344, + "step": 1460 + }, + { + "epoch": 0.2204724409448819, + "grad_norm": 0.9125919342041016, + "learning_rate": 0.00027234692963655407, + "loss": 1.3395, + "step": 1470 + }, + { + "epoch": 0.22197225346831645, + "grad_norm": 0.8508041501045227, + "learning_rate": 0.00027192390772823045, + "loss": 1.3419, + "step": 1480 + }, + { + "epoch": 0.22347206599175104, + "grad_norm": 0.9331530928611755, + "learning_rate": 0.00027149800854046283, + "loss": 1.335, + "step": 1490 + }, + { + "epoch": 0.2249718785151856, + "grad_norm": 0.9175031781196594, + "learning_rate": 0.0002710692421240362, + "loss": 1.3341, + "step": 1500 + }, + { + "epoch": 0.22647169103862017, + "grad_norm": 0.8572413921356201, + "learning_rate": 0.0002706376185973991, + "loss": 1.3411, + "step": 1510 + }, + { + "epoch": 0.22797150356205476, + "grad_norm": 0.8543765544891357, + "learning_rate": 0.0002702031481464252, + "loss": 1.3164, + "step": 1520 + }, + { + "epoch": 0.22947131608548932, + "grad_norm": 0.8498407006263733, + "learning_rate": 0.00026976584102417233, + "loss": 1.3411, + "step": 1530 + }, + { + "epoch": 0.23097112860892388, + "grad_norm": 0.8382455110549927, + "learning_rate": 0.0002693257075506411, + "loss": 1.3418, + "step": 1540 + }, + { + "epoch": 0.23247094113235844, + "grad_norm": 0.8555087447166443, + "learning_rate": 0.00026888275811253105, + "loss": 1.3438, + "step": 1550 + }, + { + "epoch": 0.23397075365579303, + "grad_norm": 0.8968890309333801, + "learning_rate": 0.00026843700316299564, + "loss": 1.3292, + "step": 1560 + }, + { + "epoch": 0.2354705661792276, + "grad_norm": 0.8054748773574829, + "learning_rate": 0.0002679884532213954, + "loss": 1.3019, + "step": 1570 + }, + { + "epoch": 0.23697037870266216, + "grad_norm": 0.8258795738220215, + "learning_rate": 0.00026753711887304995, + "loss": 1.3379, + "step": 1580 + }, + { + "epoch": 0.23847019122609675, + "grad_norm": 0.9339794516563416, + "learning_rate": 0.000267083010768988, + "loss": 1.3397, + "step": 1590 + }, + { + "epoch": 0.2399700037495313, + "grad_norm": 0.8125893473625183, + "learning_rate": 0.0002666261396256961, + "loss": 1.319, + "step": 1600 + }, + { + "epoch": 0.24146981627296588, + "grad_norm": 
0.8743818402290344, + "learning_rate": 0.0002661665162248656, + "loss": 1.3271, + "step": 1610 + }, + { + "epoch": 0.24296962879640044, + "grad_norm": 0.8262471556663513, + "learning_rate": 0.0002657041514131385, + "loss": 1.3342, + "step": 1620 + }, + { + "epoch": 0.24446944131983503, + "grad_norm": 0.8480871319770813, + "learning_rate": 0.000265239056101851, + "loss": 1.3228, + "step": 1630 + }, + { + "epoch": 0.2459692538432696, + "grad_norm": 0.8479325771331787, + "learning_rate": 0.0002647712412667765, + "loss": 1.3152, + "step": 1640 + }, + { + "epoch": 0.24746906636670415, + "grad_norm": 0.8265785574913025, + "learning_rate": 0.00026430071794786644, + "loss": 1.3234, + "step": 1650 + }, + { + "epoch": 0.24896887889013875, + "grad_norm": 0.8676069974899292, + "learning_rate": 0.00026382749724898955, + "loss": 1.2985, + "step": 1660 + }, + { + "epoch": 0.2504686914135733, + "grad_norm": 0.8200732469558716, + "learning_rate": 0.00026335159033766996, + "loss": 1.3306, + "step": 1670 + }, + { + "epoch": 0.25196850393700787, + "grad_norm": 0.8174329996109009, + "learning_rate": 0.0002628730084448239, + "loss": 1.3251, + "step": 1680 + }, + { + "epoch": 0.25346831646044243, + "grad_norm": 0.8221441507339478, + "learning_rate": 0.000262391762864494, + "loss": 1.3298, + "step": 1690 + }, + { + "epoch": 0.254968128983877, + "grad_norm": 0.8342576026916504, + "learning_rate": 0.00026190786495358366, + "loss": 1.3139, + "step": 1700 + }, + { + "epoch": 0.25646794150731156, + "grad_norm": 0.8533841967582703, + "learning_rate": 0.0002614213261315883, + "loss": 1.3109, + "step": 1710 + }, + { + "epoch": 0.2579677540307462, + "grad_norm": 0.8298860788345337, + "learning_rate": 0.0002609321578803261, + "loss": 1.3212, + "step": 1720 + }, + { + "epoch": 0.25946756655418074, + "grad_norm": 0.8570966720581055, + "learning_rate": 0.00026044037174366734, + "loss": 1.306, + "step": 1730 + }, + { + "epoch": 0.2609673790776153, + "grad_norm": 0.8347874879837036, + "learning_rate": 0.00025994597932726135, + "loss": 1.3215, + "step": 1740 + }, + { + "epoch": 0.26246719160104987, + "grad_norm": 0.8482568860054016, + "learning_rate": 0.0002594489922982633, + "loss": 1.3244, + "step": 1750 + }, + { + "epoch": 0.26396700412448443, + "grad_norm": 0.7775010466575623, + "learning_rate": 0.0002589494223850584, + "loss": 1.2984, + "step": 1760 + }, + { + "epoch": 0.265466816647919, + "grad_norm": 0.8049771189689636, + "learning_rate": 0.00025844728137698543, + "loss": 1.33, + "step": 1770 + }, + { + "epoch": 0.26696662917135355, + "grad_norm": 0.8238893747329712, + "learning_rate": 0.0002579425811240582, + "loss": 1.3175, + "step": 1780 + }, + { + "epoch": 0.26846644169478817, + "grad_norm": 0.8227720260620117, + "learning_rate": 0.00025743533353668626, + "loss": 1.3069, + "step": 1790 + }, + { + "epoch": 0.26996625421822273, + "grad_norm": 0.8744603991508484, + "learning_rate": 0.0002569255505853934, + "loss": 1.3132, + "step": 1800 + }, + { + "epoch": 0.2714660667416573, + "grad_norm": 0.8706551790237427, + "learning_rate": 0.0002564132443005356, + "loss": 1.3075, + "step": 1810 + }, + { + "epoch": 0.27296587926509186, + "grad_norm": 0.8768461346626282, + "learning_rate": 0.00025589842677201693, + "loss": 1.3012, + "step": 1820 + }, + { + "epoch": 0.2744656917885264, + "grad_norm": 0.8715064525604248, + "learning_rate": 0.0002553811101490042, + "loss": 1.303, + "step": 1830 + }, + { + "epoch": 0.275965504311961, + "grad_norm": 0.8031432628631592, + "learning_rate": 0.00025486130663964016, + "loss": 1.3038, + "step": 
1840 + }, + { + "epoch": 0.27746531683539555, + "grad_norm": 0.78215491771698, + "learning_rate": 0.00025433902851075584, + "loss": 1.31, + "step": 1850 + }, + { + "epoch": 0.27896512935883017, + "grad_norm": 0.8224983215332031, + "learning_rate": 0.0002538142880875805, + "loss": 1.2931, + "step": 1860 + }, + { + "epoch": 0.28046494188226473, + "grad_norm": 0.9267176389694214, + "learning_rate": 0.00025328709775345105, + "loss": 1.3136, + "step": 1870 + }, + { + "epoch": 0.2819647544056993, + "grad_norm": 0.7887846827507019, + "learning_rate": 0.0002527574699495199, + "loss": 1.3086, + "step": 1880 + }, + { + "epoch": 0.28346456692913385, + "grad_norm": 0.867173969745636, + "learning_rate": 0.00025222541717446117, + "loss": 1.3029, + "step": 1890 + }, + { + "epoch": 0.2849643794525684, + "grad_norm": 0.8227114677429199, + "learning_rate": 0.00025169095198417584, + "loss": 1.2938, + "step": 1900 + }, + { + "epoch": 0.286464191976003, + "grad_norm": 0.816207766532898, + "learning_rate": 0.00025115408699149546, + "loss": 1.3115, + "step": 1910 + }, + { + "epoch": 0.2879640044994376, + "grad_norm": 0.91056227684021, + "learning_rate": 0.00025061483486588435, + "loss": 1.3171, + "step": 1920 + }, + { + "epoch": 0.28946381702287216, + "grad_norm": 0.7912834882736206, + "learning_rate": 0.00025007320833314085, + "loss": 1.2853, + "step": 1930 + }, + { + "epoch": 0.2909636295463067, + "grad_norm": 0.7663973569869995, + "learning_rate": 0.00024952922017509687, + "loss": 1.3014, + "step": 1940 + }, + { + "epoch": 0.2924634420697413, + "grad_norm": 0.7914089560508728, + "learning_rate": 0.00024898288322931615, + "loss": 1.2922, + "step": 1950 + }, + { + "epoch": 0.29396325459317585, + "grad_norm": 0.8117234706878662, + "learning_rate": 0.00024843421038879147, + "loss": 1.2953, + "step": 1960 + }, + { + "epoch": 0.2954630671166104, + "grad_norm": 0.8552126288414001, + "learning_rate": 0.0002478832146016404, + "loss": 1.3, + "step": 1970 + }, + { + "epoch": 0.296962879640045, + "grad_norm": 0.8411305546760559, + "learning_rate": 0.0002473299088707996, + "loss": 1.2945, + "step": 1980 + }, + { + "epoch": 0.2984626921634796, + "grad_norm": 0.8959758281707764, + "learning_rate": 0.00024677430625371803, + "loss": 1.2945, + "step": 1990 + }, + { + "epoch": 0.29996250468691416, + "grad_norm": 0.8792350888252258, + "learning_rate": 0.0002462164198620489, + "loss": 1.3009, + "step": 2000 + }, + { + "epoch": 0.3001124859392576, + "eval_loss": 1.3354628086090088, + "eval_runtime": 34.4241, + "eval_samples_per_second": 726.236, + "eval_steps_per_second": 90.78, + "step": 2001 + }, + { + "epoch": 0.3014623172103487, + "grad_norm": 0.8166361451148987, + "learning_rate": 0.00024565626286134003, + "loss": 1.2829, + "step": 2010 + }, + { + "epoch": 0.3029621297337833, + "grad_norm": 0.8526578545570374, + "learning_rate": 0.0002450938484707234, + "loss": 1.2799, + "step": 2020 + }, + { + "epoch": 0.30446194225721784, + "grad_norm": 0.8354383707046509, + "learning_rate": 0.0002445291899626031, + "loss": 1.2828, + "step": 2030 + }, + { + "epoch": 0.3059617547806524, + "grad_norm": 0.8629779815673828, + "learning_rate": 0.000243962300662342, + "loss": 1.3066, + "step": 2040 + }, + { + "epoch": 0.30746156730408697, + "grad_norm": 0.8469873666763306, + "learning_rate": 0.00024339319394794742, + "loss": 1.2998, + "step": 2050 + }, + { + "epoch": 0.3089613798275216, + "grad_norm": 0.8346788883209229, + "learning_rate": 0.00024282188324975534, + "loss": 1.2917, + "step": 2060 + }, + { + "epoch": 0.31046119235095615, + 
"grad_norm": 0.8433026671409607, + "learning_rate": 0.0002422483820501136, + "loss": 1.2878, + "step": 2070 + }, + { + "epoch": 0.3119610048743907, + "grad_norm": 0.8289304375648499, + "learning_rate": 0.00024167270388306366, + "loss": 1.2865, + "step": 2080 + }, + { + "epoch": 0.3134608173978253, + "grad_norm": 0.8034661412239075, + "learning_rate": 0.00024109486233402102, + "loss": 1.2858, + "step": 2090 + }, + { + "epoch": 0.31496062992125984, + "grad_norm": 0.8111044764518738, + "learning_rate": 0.00024051487103945486, + "loss": 1.2693, + "step": 2100 + }, + { + "epoch": 0.3164604424446944, + "grad_norm": 0.823731541633606, + "learning_rate": 0.00023993274368656618, + "loss": 1.2802, + "step": 2110 + }, + { + "epoch": 0.31796025496812896, + "grad_norm": 0.7805794477462769, + "learning_rate": 0.00023934849401296472, + "loss": 1.2961, + "step": 2120 + }, + { + "epoch": 0.3194600674915636, + "grad_norm": 0.8786959648132324, + "learning_rate": 0.0002387621358063449, + "loss": 1.2817, + "step": 2130 + }, + { + "epoch": 0.32095988001499814, + "grad_norm": 0.8563496470451355, + "learning_rate": 0.00023817368290416036, + "loss": 1.2837, + "step": 2140 + }, + { + "epoch": 0.3224596925384327, + "grad_norm": 0.7847155928611755, + "learning_rate": 0.00023758314919329726, + "loss": 1.304, + "step": 2150 + }, + { + "epoch": 0.32395950506186727, + "grad_norm": 0.8151364326477051, + "learning_rate": 0.00023699054860974682, + "loss": 1.2699, + "step": 2160 + }, + { + "epoch": 0.32545931758530183, + "grad_norm": 0.8278871774673462, + "learning_rate": 0.00023639589513827636, + "loss": 1.2706, + "step": 2170 + }, + { + "epoch": 0.3269591301087364, + "grad_norm": 0.8579797148704529, + "learning_rate": 0.0002357992028120993, + "loss": 1.286, + "step": 2180 + }, + { + "epoch": 0.32845894263217096, + "grad_norm": 0.8568677306175232, + "learning_rate": 0.00023520048571254378, + "loss": 1.2753, + "step": 2190 + }, + { + "epoch": 0.3299587551556056, + "grad_norm": 0.8455263376235962, + "learning_rate": 0.00023459975796872063, + "loss": 1.2721, + "step": 2200 + }, + { + "epoch": 0.33145856767904014, + "grad_norm": 0.8218129277229309, + "learning_rate": 0.0002339970337571899, + "loss": 1.2663, + "step": 2210 + }, + { + "epoch": 0.3329583802024747, + "grad_norm": 0.8314395546913147, + "learning_rate": 0.000233392327301626, + "loss": 1.2991, + "step": 2220 + }, + { + "epoch": 0.33445819272590926, + "grad_norm": 0.7503239512443542, + "learning_rate": 0.0002327856528724825, + "loss": 1.2568, + "step": 2230 + }, + { + "epoch": 0.3359580052493438, + "grad_norm": 0.8333805799484253, + "learning_rate": 0.0002321770247866551, + "loss": 1.2844, + "step": 2240 + }, + { + "epoch": 0.3374578177727784, + "grad_norm": 0.7753694653511047, + "learning_rate": 0.00023156645740714368, + "loss": 1.3001, + "step": 2250 + }, + { + "epoch": 0.33895763029621295, + "grad_norm": 0.7813096642494202, + "learning_rate": 0.00023095396514271355, + "loss": 1.2735, + "step": 2260 + }, + { + "epoch": 0.34045744281964757, + "grad_norm": 0.8016805648803711, + "learning_rate": 0.0002303395624475553, + "loss": 1.2938, + "step": 2270 + }, + { + "epoch": 0.34195725534308213, + "grad_norm": 0.7955138087272644, + "learning_rate": 0.00022972326382094378, + "loss": 1.2712, + "step": 2280 + }, + { + "epoch": 0.3434570678665167, + "grad_norm": 0.8395068645477295, + "learning_rate": 0.00022910508380689584, + "loss": 1.2711, + "step": 2290 + }, + { + "epoch": 0.34495688038995126, + "grad_norm": 0.8002228140830994, + "learning_rate": 0.00022848503699382717, + 
"loss": 1.2985, + "step": 2300 + }, + { + "epoch": 0.3464566929133858, + "grad_norm": 0.8872259259223938, + "learning_rate": 0.00022786313801420794, + "loss": 1.2639, + "step": 2310 + }, + { + "epoch": 0.3479565054368204, + "grad_norm": 0.8235191702842712, + "learning_rate": 0.0002272394015442177, + "loss": 1.2903, + "step": 2320 + }, + { + "epoch": 0.34945631796025495, + "grad_norm": 0.8455343246459961, + "learning_rate": 0.0002266138423033987, + "loss": 1.2871, + "step": 2330 + }, + { + "epoch": 0.35095613048368957, + "grad_norm": 0.8051894307136536, + "learning_rate": 0.00022598647505430895, + "loss": 1.2577, + "step": 2340 + }, + { + "epoch": 0.35245594300712413, + "grad_norm": 0.7616756558418274, + "learning_rate": 0.0002253573146021733, + "loss": 1.261, + "step": 2350 + }, + { + "epoch": 0.3539557555305587, + "grad_norm": 0.804066002368927, + "learning_rate": 0.0002247263757945347, + "loss": 1.2954, + "step": 2360 + }, + { + "epoch": 0.35545556805399325, + "grad_norm": 0.810148298740387, + "learning_rate": 0.00022409367352090322, + "loss": 1.2603, + "step": 2370 + }, + { + "epoch": 0.3569553805774278, + "grad_norm": 0.790664553642273, + "learning_rate": 0.00022345922271240496, + "loss": 1.2606, + "step": 2380 + }, + { + "epoch": 0.3584551931008624, + "grad_norm": 0.7945841550827026, + "learning_rate": 0.00022282303834142978, + "loss": 1.2605, + "step": 2390 + }, + { + "epoch": 0.35995500562429694, + "grad_norm": 0.8161887526512146, + "learning_rate": 0.0002221851354212777, + "loss": 1.2536, + "step": 2400 + }, + { + "epoch": 0.36145481814773156, + "grad_norm": 0.7882905006408691, + "learning_rate": 0.0002215455290058048, + "loss": 1.284, + "step": 2410 + }, + { + "epoch": 0.3629546306711661, + "grad_norm": 0.8553168773651123, + "learning_rate": 0.000220904234189068, + "loss": 1.2549, + "step": 2420 + }, + { + "epoch": 0.3644544431946007, + "grad_norm": 0.8243351578712463, + "learning_rate": 0.00022026126610496852, + "loss": 1.2526, + "step": 2430 + }, + { + "epoch": 0.36595425571803525, + "grad_norm": 0.8284412622451782, + "learning_rate": 0.0002196166399268952, + "loss": 1.267, + "step": 2440 + }, + { + "epoch": 0.3674540682414698, + "grad_norm": 0.7850016951560974, + "learning_rate": 0.00021897037086736614, + "loss": 1.2648, + "step": 2450 + }, + { + "epoch": 0.3689538807649044, + "grad_norm": 0.8068733811378479, + "learning_rate": 0.0002183224741776697, + "loss": 1.2653, + "step": 2460 + }, + { + "epoch": 0.37045369328833894, + "grad_norm": 0.7865297794342041, + "learning_rate": 0.00021767296514750472, + "loss": 1.2629, + "step": 2470 + }, + { + "epoch": 0.37195350581177355, + "grad_norm": 0.7744786739349365, + "learning_rate": 0.00021702185910461958, + "loss": 1.2597, + "step": 2480 + }, + { + "epoch": 0.3734533183352081, + "grad_norm": 0.8059437870979309, + "learning_rate": 0.00021636917141445056, + "loss": 1.2405, + "step": 2490 + }, + { + "epoch": 0.3749531308586427, + "grad_norm": 0.7744713425636292, + "learning_rate": 0.00021571491747975917, + "loss": 1.2559, + "step": 2500 + }, + { + "epoch": 0.37645294338207724, + "grad_norm": 0.817789614200592, + "learning_rate": 0.0002150591127402687, + "loss": 1.2444, + "step": 2510 + }, + { + "epoch": 0.3779527559055118, + "grad_norm": 0.8189598321914673, + "learning_rate": 0.00021440177267229984, + "loss": 1.2518, + "step": 2520 + }, + { + "epoch": 0.37945256842894637, + "grad_norm": 0.8474273681640625, + "learning_rate": 0.00021374291278840546, + "loss": 1.2634, + "step": 2530 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 
0.8714548349380493, + "learning_rate": 0.00021308254863700452, + "loss": 1.2498, + "step": 2540 + }, + { + "epoch": 0.38245219347581555, + "grad_norm": 0.7918749451637268, + "learning_rate": 0.00021242069580201524, + "loss": 1.2659, + "step": 2550 + }, + { + "epoch": 0.3839520059992501, + "grad_norm": 0.7831052541732788, + "learning_rate": 0.00021175736990248714, + "loss": 1.2748, + "step": 2560 + }, + { + "epoch": 0.3854518185226847, + "grad_norm": 0.770596981048584, + "learning_rate": 0.00021109258659223254, + "loss": 1.2581, + "step": 2570 + }, + { + "epoch": 0.38695163104611924, + "grad_norm": 0.8102443218231201, + "learning_rate": 0.00021042636155945723, + "loss": 1.2365, + "step": 2580 + }, + { + "epoch": 0.3884514435695538, + "grad_norm": 0.7992528080940247, + "learning_rate": 0.00020975871052639024, + "loss": 1.2617, + "step": 2590 + }, + { + "epoch": 0.38995125609298836, + "grad_norm": 0.8378466367721558, + "learning_rate": 0.00020908964924891256, + "loss": 1.2352, + "step": 2600 + }, + { + "epoch": 0.3914510686164229, + "grad_norm": 0.7832515835762024, + "learning_rate": 0.0002084191935161857, + "loss": 1.2733, + "step": 2610 + }, + { + "epoch": 0.39295088113985754, + "grad_norm": 0.7198286056518555, + "learning_rate": 0.0002077473591502788, + "loss": 1.2655, + "step": 2620 + }, + { + "epoch": 0.3944506936632921, + "grad_norm": 0.7759490609169006, + "learning_rate": 0.00020707416200579524, + "loss": 1.2592, + "step": 2630 + }, + { + "epoch": 0.39595050618672667, + "grad_norm": 0.7895490527153015, + "learning_rate": 0.00020639961796949877, + "loss": 1.2495, + "step": 2640 + }, + { + "epoch": 0.39745031871016123, + "grad_norm": 0.8566303849220276, + "learning_rate": 0.00020572374295993822, + "loss": 1.2643, + "step": 2650 + }, + { + "epoch": 0.3989501312335958, + "grad_norm": 0.8570982813835144, + "learning_rate": 0.00020504655292707223, + "loss": 1.2499, + "step": 2660 + }, + { + "epoch": 0.40014998125234347, + "eval_loss": 1.2980915307998657, + "eval_runtime": 34.6825, + "eval_samples_per_second": 720.825, + "eval_steps_per_second": 90.103, + "step": 2668 + }, + { + "epoch": 0.40044994375703036, + "grad_norm": 0.8225275278091431, + "learning_rate": 0.00020436806385189246, + "loss": 1.2649, + "step": 2670 + }, + { + "epoch": 0.4019497562804649, + "grad_norm": 0.8848603367805481, + "learning_rate": 0.00020368829174604667, + "loss": 1.2686, + "step": 2680 + }, + { + "epoch": 0.40344956880389954, + "grad_norm": 0.7693737149238586, + "learning_rate": 0.00020300725265146093, + "loss": 1.2617, + "step": 2690 + }, + { + "epoch": 0.4049493813273341, + "grad_norm": 0.815846860408783, + "learning_rate": 0.00020232496263996092, + "loss": 1.2474, + "step": 2700 + }, + { + "epoch": 0.40644919385076866, + "grad_norm": 0.7660710215568542, + "learning_rate": 0.00020164143781289256, + "loss": 1.246, + "step": 2710 + }, + { + "epoch": 0.4079490063742032, + "grad_norm": 0.8107971549034119, + "learning_rate": 0.00020095669430074235, + "loss": 1.267, + "step": 2720 + }, + { + "epoch": 0.4094488188976378, + "grad_norm": 0.7721084356307983, + "learning_rate": 0.00020027074826275629, + "loss": 1.2613, + "step": 2730 + }, + { + "epoch": 0.41094863142107235, + "grad_norm": 0.7531006336212158, + "learning_rate": 0.00019958361588655888, + "loss": 1.2506, + "step": 2740 + }, + { + "epoch": 0.4124484439445069, + "grad_norm": 0.8231312036514282, + "learning_rate": 0.00019889531338777112, + "loss": 1.26, + "step": 2750 + }, + { + "epoch": 0.41394825646794153, + "grad_norm": 0.8262982368469238, + "learning_rate": 
0.0001982058570096274, + "loss": 1.2426, + "step": 2760 + }, + { + "epoch": 0.4154480689913761, + "grad_norm": 0.7806345820426941, + "learning_rate": 0.00019751526302259271, + "loss": 1.2318, + "step": 2770 + }, + { + "epoch": 0.41694788151481066, + "grad_norm": 0.794988751411438, + "learning_rate": 0.00019682354772397842, + "loss": 1.2336, + "step": 2780 + }, + { + "epoch": 0.4184476940382452, + "grad_norm": 0.7735899686813354, + "learning_rate": 0.00019613072743755755, + "loss": 1.2483, + "step": 2790 + }, + { + "epoch": 0.4199475065616798, + "grad_norm": 0.7777860760688782, + "learning_rate": 0.00019543681851317998, + "loss": 1.2483, + "step": 2800 + }, + { + "epoch": 0.42144731908511435, + "grad_norm": 0.8276849985122681, + "learning_rate": 0.00019474183732638608, + "loss": 1.2464, + "step": 2810 + }, + { + "epoch": 0.4229471316085489, + "grad_norm": 0.7912575602531433, + "learning_rate": 0.0001940458002780206, + "loss": 1.2317, + "step": 2820 + }, + { + "epoch": 0.42444694413198353, + "grad_norm": 0.8454885482788086, + "learning_rate": 0.00019334872379384556, + "loss": 1.2458, + "step": 2830 + }, + { + "epoch": 0.4259467566554181, + "grad_norm": 0.8399609923362732, + "learning_rate": 0.0001926506243241526, + "loss": 1.2405, + "step": 2840 + }, + { + "epoch": 0.42744656917885265, + "grad_norm": 0.7756800651550293, + "learning_rate": 0.00019195151834337473, + "loss": 1.2409, + "step": 2850 + }, + { + "epoch": 0.4289463817022872, + "grad_norm": 0.8564468622207642, + "learning_rate": 0.00019125142234969762, + "loss": 1.2458, + "step": 2860 + }, + { + "epoch": 0.4304461942257218, + "grad_norm": 0.8514673709869385, + "learning_rate": 0.00019055035286467034, + "loss": 1.2414, + "step": 2870 + }, + { + "epoch": 0.43194600674915634, + "grad_norm": 0.8593403100967407, + "learning_rate": 0.00018984832643281513, + "loss": 1.2473, + "step": 2880 + }, + { + "epoch": 0.4334458192725909, + "grad_norm": 0.8827778100967407, + "learning_rate": 0.00018914535962123735, + "loss": 1.2532, + "step": 2890 + }, + { + "epoch": 0.4349456317960255, + "grad_norm": 0.7746068239212036, + "learning_rate": 0.00018844146901923436, + "loss": 1.2368, + "step": 2900 + }, + { + "epoch": 0.4364454443194601, + "grad_norm": 0.823742687702179, + "learning_rate": 0.000187736671237904, + "loss": 1.2326, + "step": 2910 + }, + { + "epoch": 0.43794525684289465, + "grad_norm": 0.7899442911148071, + "learning_rate": 0.0001870309829097526, + "loss": 1.2342, + "step": 2920 + }, + { + "epoch": 0.4394450693663292, + "grad_norm": 0.8022111058235168, + "learning_rate": 0.00018632442068830244, + "loss": 1.2224, + "step": 2930 + }, + { + "epoch": 0.4409448818897638, + "grad_norm": 0.8160791993141174, + "learning_rate": 0.00018561700124769892, + "loss": 1.2268, + "step": 2940 + }, + { + "epoch": 0.44244469441319834, + "grad_norm": 0.8522243499755859, + "learning_rate": 0.0001849087412823168, + "loss": 1.236, + "step": 2950 + }, + { + "epoch": 0.4439445069366329, + "grad_norm": 0.7768913507461548, + "learning_rate": 0.00018419965750636645, + "loss": 1.2368, + "step": 2960 + }, + { + "epoch": 0.4454443194600675, + "grad_norm": 0.907557487487793, + "learning_rate": 0.00018348976665349932, + "loss": 1.2275, + "step": 2970 + }, + { + "epoch": 0.4469441319835021, + "grad_norm": 0.8612179160118103, + "learning_rate": 0.00018277908547641294, + "loss": 1.2352, + "step": 2980 + }, + { + "epoch": 0.44844394450693664, + "grad_norm": 0.832486093044281, + "learning_rate": 0.00018206763074645588, + "loss": 1.2262, + "step": 2990 + }, + { + "epoch": 
0.4499437570303712, + "grad_norm": 0.7332595586776733, + "learning_rate": 0.0001813554192532316, + "loss": 1.2445, + "step": 3000 + }, + { + "epoch": 0.45144356955380577, + "grad_norm": 0.7843475937843323, + "learning_rate": 0.00018064246780420245, + "loss": 1.2453, + "step": 3010 + }, + { + "epoch": 0.45294338207724033, + "grad_norm": 0.8000037670135498, + "learning_rate": 0.000179928793224293, + "loss": 1.2148, + "step": 3020 + }, + { + "epoch": 0.4544431946006749, + "grad_norm": 0.8519952893257141, + "learning_rate": 0.00017921441235549295, + "loss": 1.2369, + "step": 3030 + }, + { + "epoch": 0.4559430071241095, + "grad_norm": 0.8536350131034851, + "learning_rate": 0.00017849934205645967, + "loss": 1.2442, + "step": 3040 + }, + { + "epoch": 0.4574428196475441, + "grad_norm": 0.8602985739707947, + "learning_rate": 0.00017778359920212047, + "loss": 1.2475, + "step": 3050 + }, + { + "epoch": 0.45894263217097864, + "grad_norm": 0.7802624106407166, + "learning_rate": 0.0001770672006832741, + "loss": 1.2341, + "step": 3060 + }, + { + "epoch": 0.4604424446944132, + "grad_norm": 0.791219174861908, + "learning_rate": 0.00017635016340619255, + "loss": 1.2267, + "step": 3070 + }, + { + "epoch": 0.46194225721784776, + "grad_norm": 0.8056305050849915, + "learning_rate": 0.00017563250429222173, + "loss": 1.248, + "step": 3080 + }, + { + "epoch": 0.4634420697412823, + "grad_norm": 0.859767496585846, + "learning_rate": 0.00017491424027738216, + "loss": 1.2495, + "step": 3090 + }, + { + "epoch": 0.4649418822647169, + "grad_norm": 0.8691778182983398, + "learning_rate": 0.0001741953883119696, + "loss": 1.215, + "step": 3100 + }, + { + "epoch": 0.4664416947881515, + "grad_norm": 0.8486020565032959, + "learning_rate": 0.00017347596536015472, + "loss": 1.2339, + "step": 3110 + }, + { + "epoch": 0.46794150731158607, + "grad_norm": 0.798159122467041, + "learning_rate": 0.00017275598839958296, + "loss": 1.2461, + "step": 3120 + }, + { + "epoch": 0.46944131983502063, + "grad_norm": 0.8142710328102112, + "learning_rate": 0.00017203547442097369, + "loss": 1.231, + "step": 3130 + }, + { + "epoch": 0.4709411323584552, + "grad_norm": 0.8455555438995361, + "learning_rate": 0.0001713144404277195, + "loss": 1.2376, + "step": 3140 + }, + { + "epoch": 0.47244094488188976, + "grad_norm": 0.8146346807479858, + "learning_rate": 0.0001705929034354846, + "loss": 1.2204, + "step": 3150 + }, + { + "epoch": 0.4739407574053243, + "grad_norm": 0.8013060688972473, + "learning_rate": 0.0001698708804718037, + "loss": 1.2199, + "step": 3160 + }, + { + "epoch": 0.4754405699287589, + "grad_norm": 0.7504465579986572, + "learning_rate": 0.00016914838857567979, + "loss": 1.2314, + "step": 3170 + }, + { + "epoch": 0.4769403824521935, + "grad_norm": 0.813957691192627, + "learning_rate": 0.00016842544479718215, + "loss": 1.2298, + "step": 3180 + }, + { + "epoch": 0.47844019497562806, + "grad_norm": 0.8849406838417053, + "learning_rate": 0.00016770206619704412, + "loss": 1.2398, + "step": 3190 + }, + { + "epoch": 0.4799400074990626, + "grad_norm": 0.7421510815620422, + "learning_rate": 0.0001669782698462603, + "loss": 1.2274, + "step": 3200 + }, + { + "epoch": 0.4814398200224972, + "grad_norm": 0.8098101019859314, + "learning_rate": 0.00016625407282568394, + "loss": 1.238, + "step": 3210 + }, + { + "epoch": 0.48293963254593175, + "grad_norm": 0.7728010416030884, + "learning_rate": 0.00016552949222562352, + "loss": 1.2467, + "step": 3220 + }, + { + "epoch": 0.4844394450693663, + "grad_norm": 0.8032839298248291, + "learning_rate": 
0.00016480454514543962, + "loss": 1.2288, + "step": 3230 + }, + { + "epoch": 0.4859392575928009, + "grad_norm": 0.7134261727333069, + "learning_rate": 0.00016407924869314144, + "loss": 1.225, + "step": 3240 + }, + { + "epoch": 0.4874390701162355, + "grad_norm": 0.8371984362602234, + "learning_rate": 0.00016335361998498296, + "loss": 1.2015, + "step": 3250 + }, + { + "epoch": 0.48893888263967006, + "grad_norm": 0.8015307188034058, + "learning_rate": 0.00016262767614505912, + "loss": 1.2082, + "step": 3260 + }, + { + "epoch": 0.4904386951631046, + "grad_norm": 0.8056173324584961, + "learning_rate": 0.00016190143430490152, + "loss": 1.2153, + "step": 3270 + }, + { + "epoch": 0.4919385076865392, + "grad_norm": 0.7906458377838135, + "learning_rate": 0.00016117491160307445, + "loss": 1.2337, + "step": 3280 + }, + { + "epoch": 0.49343832020997375, + "grad_norm": 0.8407464027404785, + "learning_rate": 0.00016044812518477007, + "loss": 1.2333, + "step": 3290 + }, + { + "epoch": 0.4949381327334083, + "grad_norm": 0.8177177906036377, + "learning_rate": 0.00015972109220140402, + "loss": 1.216, + "step": 3300 + }, + { + "epoch": 0.49643794525684287, + "grad_norm": 0.77277672290802, + "learning_rate": 0.0001589938298102108, + "loss": 1.2279, + "step": 3310 + }, + { + "epoch": 0.4979377577802775, + "grad_norm": 0.8213476538658142, + "learning_rate": 0.0001582663551738384, + "loss": 1.2272, + "step": 3320 + }, + { + "epoch": 0.49943757030371205, + "grad_norm": 0.8175489902496338, + "learning_rate": 0.00015753868545994378, + "loss": 1.2285, + "step": 3330 + }, + { + "epoch": 0.5001874765654293, + "eval_loss": 1.2690181732177734, + "eval_runtime": 35.2482, + "eval_samples_per_second": 709.256, + "eval_steps_per_second": 88.657, + "step": 3335 + }, + { + "epoch": 0.5009373828271466, + "grad_norm": 0.834984540939331, + "learning_rate": 0.00015681083784078748, + "loss": 1.221, + "step": 3340 + }, + { + "epoch": 0.5024371953505812, + "grad_norm": 0.8637697100639343, + "learning_rate": 0.00015608282949282844, + "loss": 1.2343, + "step": 3350 + }, + { + "epoch": 0.5039370078740157, + "grad_norm": 0.8110714554786682, + "learning_rate": 0.00015535467759631862, + "loss": 1.2341, + "step": 3360 + }, + { + "epoch": 0.5054368203974503, + "grad_norm": 0.8313619494438171, + "learning_rate": 0.00015462639933489753, + "loss": 1.2255, + "step": 3370 + }, + { + "epoch": 0.5069366329208849, + "grad_norm": 0.8548438549041748, + "learning_rate": 0.00015389801189518693, + "loss": 1.2204, + "step": 3380 + }, + { + "epoch": 0.5084364454443194, + "grad_norm": 0.8614264130592346, + "learning_rate": 0.00015316953246638482, + "loss": 1.2182, + "step": 3390 + }, + { + "epoch": 0.509936257967754, + "grad_norm": 0.8315454125404358, + "learning_rate": 0.00015244097823986023, + "loss": 1.2168, + "step": 3400 + }, + { + "epoch": 0.5114360704911886, + "grad_norm": 0.8473772406578064, + "learning_rate": 0.0001517123664087473, + "loss": 1.2329, + "step": 3410 + }, + { + "epoch": 0.5129358830146231, + "grad_norm": 0.8713449835777283, + "learning_rate": 0.00015098371416753963, + "loss": 1.2233, + "step": 3420 + }, + { + "epoch": 0.5144356955380578, + "grad_norm": 0.8293794989585876, + "learning_rate": 0.00015025503871168432, + "loss": 1.2085, + "step": 3430 + }, + { + "epoch": 0.5159355080614924, + "grad_norm": 0.8054031729698181, + "learning_rate": 0.00014952635723717642, + "loss": 1.2443, + "step": 3440 + }, + { + "epoch": 0.5174353205849269, + "grad_norm": 0.8494643568992615, + "learning_rate": 0.000148797686940153, + "loss": 1.2231, + 
"step": 3450 + }, + { + "epoch": 0.5189351331083615, + "grad_norm": 0.8356276154518127, + "learning_rate": 0.0001480690450164873, + "loss": 1.2275, + "step": 3460 + }, + { + "epoch": 0.520434945631796, + "grad_norm": 0.8196589350700378, + "learning_rate": 0.00014734044866138312, + "loss": 1.197, + "step": 3470 + }, + { + "epoch": 0.5219347581552306, + "grad_norm": 0.8253743648529053, + "learning_rate": 0.00014661191506896867, + "loss": 1.2068, + "step": 3480 + }, + { + "epoch": 0.5234345706786652, + "grad_norm": 0.7777687311172485, + "learning_rate": 0.0001458834614318912, + "loss": 1.2072, + "step": 3490 + }, + { + "epoch": 0.5249343832020997, + "grad_norm": 0.771138608455658, + "learning_rate": 0.00014515510494091102, + "loss": 1.2066, + "step": 3500 + }, + { + "epoch": 0.5264341957255343, + "grad_norm": 0.7920674085617065, + "learning_rate": 0.00014442686278449588, + "loss": 1.2078, + "step": 3510 + }, + { + "epoch": 0.5279340082489689, + "grad_norm": 0.8012422919273376, + "learning_rate": 0.00014369875214841548, + "loss": 1.2218, + "step": 3520 + }, + { + "epoch": 0.5294338207724034, + "grad_norm": 0.8012481331825256, + "learning_rate": 0.0001429707902153355, + "loss": 1.229, + "step": 3530 + }, + { + "epoch": 0.530933633295838, + "grad_norm": 0.8194921612739563, + "learning_rate": 0.0001422429941644127, + "loss": 1.2141, + "step": 3540 + }, + { + "epoch": 0.5324334458192725, + "grad_norm": 0.7735179662704468, + "learning_rate": 0.000141515381170889, + "loss": 1.2271, + "step": 3550 + }, + { + "epoch": 0.5339332583427071, + "grad_norm": 0.7934532761573792, + "learning_rate": 0.00014078796840568647, + "loss": 1.2161, + "step": 3560 + }, + { + "epoch": 0.5354330708661418, + "grad_norm": 0.844835102558136, + "learning_rate": 0.0001400607730350018, + "loss": 1.215, + "step": 3570 + }, + { + "epoch": 0.5369328833895763, + "grad_norm": 0.7871401309967041, + "learning_rate": 0.0001393338122199016, + "loss": 1.2248, + "step": 3580 + }, + { + "epoch": 0.5384326959130109, + "grad_norm": 0.7825441956520081, + "learning_rate": 0.00013860710311591713, + "loss": 1.2182, + "step": 3590 + }, + { + "epoch": 0.5399325084364455, + "grad_norm": 0.9266743659973145, + "learning_rate": 0.00013788066287263946, + "loss": 1.2195, + "step": 3600 + }, + { + "epoch": 0.54143232095988, + "grad_norm": 0.8771612048149109, + "learning_rate": 0.00013715450863331495, + "loss": 1.2077, + "step": 3610 + }, + { + "epoch": 0.5429321334833146, + "grad_norm": 0.8083598017692566, + "learning_rate": 0.00013642865753444043, + "loss": 1.2095, + "step": 3620 + }, + { + "epoch": 0.5444319460067492, + "grad_norm": 0.8372387886047363, + "learning_rate": 0.000135703126705359, + "loss": 1.1981, + "step": 3630 + }, + { + "epoch": 0.5459317585301837, + "grad_norm": 0.791763424873352, + "learning_rate": 0.00013497793326785573, + "loss": 1.2181, + "step": 3640 + }, + { + "epoch": 0.5474315710536183, + "grad_norm": 0.784136950969696, + "learning_rate": 0.00013425309433575365, + "loss": 1.2156, + "step": 3650 + }, + { + "epoch": 0.5489313835770528, + "grad_norm": 0.7631322741508484, + "learning_rate": 0.0001335286270145096, + "loss": 1.1944, + "step": 3660 + }, + { + "epoch": 0.5504311961004874, + "grad_norm": 0.8252114057540894, + "learning_rate": 0.00013280454840081105, + "loss": 1.2048, + "step": 3670 + }, + { + "epoch": 0.551931008623922, + "grad_norm": 0.7659439444541931, + "learning_rate": 0.0001320808755821722, + "loss": 1.2113, + "step": 3680 + }, + { + "epoch": 0.5534308211473565, + "grad_norm": 0.8548429608345032, + 
"learning_rate": 0.00013135762563653097, + "loss": 1.2021, + "step": 3690 + }, + { + "epoch": 0.5549306336707911, + "grad_norm": 0.8134576678276062, + "learning_rate": 0.00013063481563184589, + "loss": 1.1896, + "step": 3700 + }, + { + "epoch": 0.5564304461942258, + "grad_norm": 0.7915739417076111, + "learning_rate": 0.00012991246262569327, + "loss": 1.2162, + "step": 3710 + }, + { + "epoch": 0.5579302587176603, + "grad_norm": 0.8490334749221802, + "learning_rate": 0.00012919058366486492, + "loss": 1.2148, + "step": 3720 + }, + { + "epoch": 0.5594300712410949, + "grad_norm": 0.8796569108963013, + "learning_rate": 0.00012846919578496545, + "loss": 1.19, + "step": 3730 + }, + { + "epoch": 0.5609298837645295, + "grad_norm": 0.8151915669441223, + "learning_rate": 0.00012774831601001054, + "loss": 1.2171, + "step": 3740 + }, + { + "epoch": 0.562429696287964, + "grad_norm": 0.8759236335754395, + "learning_rate": 0.00012702796135202518, + "loss": 1.2296, + "step": 3750 + }, + { + "epoch": 0.5639295088113986, + "grad_norm": 0.8563137054443359, + "learning_rate": 0.00012630814881064206, + "loss": 1.2178, + "step": 3760 + }, + { + "epoch": 0.5654293213348331, + "grad_norm": 0.8290548920631409, + "learning_rate": 0.00012558889537270048, + "loss": 1.1993, + "step": 3770 + }, + { + "epoch": 0.5669291338582677, + "grad_norm": 0.8869160413742065, + "learning_rate": 0.0001248702180118455, + "loss": 1.2207, + "step": 3780 + }, + { + "epoch": 0.5684289463817023, + "grad_norm": 0.7675673961639404, + "learning_rate": 0.00012415213368812731, + "loss": 1.2006, + "step": 3790 + }, + { + "epoch": 0.5699287589051368, + "grad_norm": 0.7600024342536926, + "learning_rate": 0.00012343465934760102, + "loss": 1.205, + "step": 3800 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.8360748291015625, + "learning_rate": 0.00012271781192192688, + "loss": 1.2177, + "step": 3810 + }, + { + "epoch": 0.572928383952006, + "grad_norm": 0.7921136021614075, + "learning_rate": 0.00012200160832797046, + "loss": 1.1977, + "step": 3820 + }, + { + "epoch": 0.5744281964754405, + "grad_norm": 0.7750628590583801, + "learning_rate": 0.0001212860654674036, + "loss": 1.223, + "step": 3830 + }, + { + "epoch": 0.5759280089988752, + "grad_norm": 0.8707244992256165, + "learning_rate": 0.00012057120022630546, + "loss": 1.2088, + "step": 3840 + }, + { + "epoch": 0.5774278215223098, + "grad_norm": 0.8273059725761414, + "learning_rate": 0.00011985702947476424, + "loss": 1.201, + "step": 3850 + }, + { + "epoch": 0.5789276340457443, + "grad_norm": 0.9016513824462891, + "learning_rate": 0.00011914357006647877, + "loss": 1.1946, + "step": 3860 + }, + { + "epoch": 0.5804274465691789, + "grad_norm": 0.9097079038619995, + "learning_rate": 0.00011843083883836084, + "loss": 1.2074, + "step": 3870 + }, + { + "epoch": 0.5819272590926134, + "grad_norm": 0.845512866973877, + "learning_rate": 0.0001177188526101381, + "loss": 1.2024, + "step": 3880 + }, + { + "epoch": 0.583427071616048, + "grad_norm": 0.8833523392677307, + "learning_rate": 0.00011700762818395682, + "loss": 1.2116, + "step": 3890 + }, + { + "epoch": 0.5849268841394826, + "grad_norm": 0.7923941016197205, + "learning_rate": 0.0001162971823439856, + "loss": 1.2023, + "step": 3900 + }, + { + "epoch": 0.5864266966629171, + "grad_norm": 0.7897645235061646, + "learning_rate": 0.00011558753185601922, + "loss": 1.1994, + "step": 3910 + }, + { + "epoch": 0.5879265091863517, + "grad_norm": 0.7955695390701294, + "learning_rate": 0.00011487869346708289, + "loss": 1.1875, + "step": 3920 + }, + { + "epoch": 
0.5894263217097863, + "grad_norm": 0.7841424942016602, + "learning_rate": 0.00011417068390503716, + "loss": 1.2065, + "step": 3930 + }, + { + "epoch": 0.5909261342332208, + "grad_norm": 0.8241638541221619, + "learning_rate": 0.00011346351987818307, + "loss": 1.1919, + "step": 3940 + }, + { + "epoch": 0.5924259467566554, + "grad_norm": 0.7868320941925049, + "learning_rate": 0.00011275721807486805, + "loss": 1.2089, + "step": 3950 + }, + { + "epoch": 0.59392575928009, + "grad_norm": 0.8634785413742065, + "learning_rate": 0.00011205179516309172, + "loss": 1.1979, + "step": 3960 + }, + { + "epoch": 0.5954255718035245, + "grad_norm": 0.9329147338867188, + "learning_rate": 0.00011134726779011288, + "loss": 1.2006, + "step": 3970 + }, + { + "epoch": 0.5969253843269592, + "grad_norm": 0.8250072598457336, + "learning_rate": 0.00011064365258205658, + "loss": 1.1937, + "step": 3980 + }, + { + "epoch": 0.5984251968503937, + "grad_norm": 0.7763371467590332, + "learning_rate": 0.00010994096614352153, + "loss": 1.205, + "step": 3990 + }, + { + "epoch": 0.5999250093738283, + "grad_norm": 0.8118007183074951, + "learning_rate": 0.00010923922505718863, + "loss": 1.2063, + "step": 4000 + }, + { + "epoch": 0.6002249718785152, + "eval_loss": 1.246632695198059, + "eval_runtime": 34.4866, + "eval_samples_per_second": 724.918, + "eval_steps_per_second": 90.615, + "step": 4002 + }, + { + "epoch": 0.6014248218972629, + "grad_norm": 0.8617530465126038, + "learning_rate": 0.00010853844588342926, + "loss": 1.1877, + "step": 4010 + }, + { + "epoch": 0.6029246344206974, + "grad_norm": 0.8039788007736206, + "learning_rate": 0.00010783864515991481, + "loss": 1.2154, + "step": 4020 + }, + { + "epoch": 0.604424446944132, + "grad_norm": 0.8393165469169617, + "learning_rate": 0.00010713983940122617, + "loss": 1.2147, + "step": 4030 + }, + { + "epoch": 0.6059242594675666, + "grad_norm": 0.8276166915893555, + "learning_rate": 0.00010644204509846398, + "loss": 1.185, + "step": 4040 + }, + { + "epoch": 0.6074240719910011, + "grad_norm": 0.8333732485771179, + "learning_rate": 0.00010574527871885977, + "loss": 1.2304, + "step": 4050 + }, + { + "epoch": 0.6089238845144357, + "grad_norm": 0.8123278617858887, + "learning_rate": 0.00010504955670538699, + "loss": 1.2036, + "step": 4060 + }, + { + "epoch": 0.6104236970378702, + "grad_norm": 0.8344663977622986, + "learning_rate": 0.00010435489547637316, + "loss": 1.2073, + "step": 4070 + }, + { + "epoch": 0.6119235095613048, + "grad_norm": 0.7638000845909119, + "learning_rate": 0.00010366131142511228, + "loss": 1.2091, + "step": 4080 + }, + { + "epoch": 0.6134233220847394, + "grad_norm": 0.8148689866065979, + "learning_rate": 0.00010296882091947826, + "loss": 1.1973, + "step": 4090 + }, + { + "epoch": 0.6149231346081739, + "grad_norm": 0.8646231293678284, + "learning_rate": 0.00010227744030153821, + "loss": 1.1868, + "step": 4100 + }, + { + "epoch": 0.6164229471316085, + "grad_norm": 0.8547177910804749, + "learning_rate": 0.0001015871858871672, + "loss": 1.2246, + "step": 4110 + }, + { + "epoch": 0.6179227596550432, + "grad_norm": 0.8218623399734497, + "learning_rate": 0.00010089807396566306, + "loss": 1.2139, + "step": 4120 + }, + { + "epoch": 0.6194225721784777, + "grad_norm": 0.8276129961013794, + "learning_rate": 0.00010021012079936174, + "loss": 1.1913, + "step": 4130 + }, + { + "epoch": 0.6209223847019123, + "grad_norm": 0.8115292191505432, + "learning_rate": 9.952334262325399e-05, + "loss": 1.172, + "step": 4140 + }, + { + "epoch": 0.6224221972253469, + "grad_norm": 
0.8721445798873901, + "learning_rate": 9.883775564460193e-05, + "loss": 1.1901, + "step": 4150 + }, + { + "epoch": 0.6239220097487814, + "grad_norm": 0.8343495726585388, + "learning_rate": 9.815337604255665e-05, + "loss": 1.1891, + "step": 4160 + }, + { + "epoch": 0.625421822272216, + "grad_norm": 0.8031916618347168, + "learning_rate": 9.747021996777624e-05, + "loss": 1.1967, + "step": 4170 + }, + { + "epoch": 0.6269216347956506, + "grad_norm": 0.7772048711776733, + "learning_rate": 9.678830354204504e-05, + "loss": 1.2089, + "step": 4180 + }, + { + "epoch": 0.6284214473190851, + "grad_norm": 0.8471980690956116, + "learning_rate": 9.610764285789271e-05, + "loss": 1.1967, + "step": 4190 + }, + { + "epoch": 0.6299212598425197, + "grad_norm": 0.8212400078773499, + "learning_rate": 9.542825397821485e-05, + "loss": 1.1861, + "step": 4200 + }, + { + "epoch": 0.6314210723659542, + "grad_norm": 0.8733875155448914, + "learning_rate": 9.475015293589373e-05, + "loss": 1.1977, + "step": 4210 + }, + { + "epoch": 0.6329208848893888, + "grad_norm": 0.8037174940109253, + "learning_rate": 9.407335573341997e-05, + "loss": 1.1842, + "step": 4220 + }, + { + "epoch": 0.6344206974128234, + "grad_norm": 0.8639466762542725, + "learning_rate": 9.339787834251489e-05, + "loss": 1.195, + "step": 4230 + }, + { + "epoch": 0.6359205099362579, + "grad_norm": 0.8288107514381409, + "learning_rate": 9.272373670375362e-05, + "loss": 1.1919, + "step": 4240 + }, + { + "epoch": 0.6374203224596925, + "grad_norm": 0.7566668391227722, + "learning_rate": 9.205094672618889e-05, + "loss": 1.2009, + "step": 4250 + }, + { + "epoch": 0.6389201349831272, + "grad_norm": 0.8419100046157837, + "learning_rate": 9.137952428697568e-05, + "loss": 1.1658, + "step": 4260 + }, + { + "epoch": 0.6404199475065617, + "grad_norm": 0.7842187881469727, + "learning_rate": 9.070948523099643e-05, + "loss": 1.1876, + "step": 4270 + }, + { + "epoch": 0.6419197600299963, + "grad_norm": 0.7916682958602905, + "learning_rate": 9.004084537048708e-05, + "loss": 1.1952, + "step": 4280 + }, + { + "epoch": 0.6434195725534309, + "grad_norm": 0.7986481189727783, + "learning_rate": 8.937362048466404e-05, + "loss": 1.1925, + "step": 4290 + }, + { + "epoch": 0.6449193850768654, + "grad_norm": 0.7649396061897278, + "learning_rate": 8.870782631935184e-05, + "loss": 1.2007, + "step": 4300 + }, + { + "epoch": 0.6464191976003, + "grad_norm": 0.8236790895462036, + "learning_rate": 8.804347858661131e-05, + "loss": 1.1939, + "step": 4310 + }, + { + "epoch": 0.6479190101237345, + "grad_norm": 0.8257557153701782, + "learning_rate": 8.73805929643691e-05, + "loss": 1.1907, + "step": 4320 + }, + { + "epoch": 0.6494188226471691, + "grad_norm": 0.8270912170410156, + "learning_rate": 8.67191850960475e-05, + "loss": 1.1864, + "step": 4330 + }, + { + "epoch": 0.6509186351706037, + "grad_norm": 0.8583309650421143, + "learning_rate": 8.605927059019528e-05, + "loss": 1.1888, + "step": 4340 + }, + { + "epoch": 0.6524184476940382, + "grad_norm": 0.8187999725341797, + "learning_rate": 8.540086502011935e-05, + "loss": 1.2046, + "step": 4350 + }, + { + "epoch": 0.6539182602174728, + "grad_norm": 0.769585907459259, + "learning_rate": 8.47439839235174e-05, + "loss": 1.2002, + "step": 4360 + }, + { + "epoch": 0.6554180727409074, + "grad_norm": 0.8344823718070984, + "learning_rate": 8.408864280211115e-05, + "loss": 1.1753, + "step": 4370 + }, + { + "epoch": 0.6569178852643419, + "grad_norm": 0.8236811757087708, + "learning_rate": 8.343485712128026e-05, + "loss": 1.1818, + "step": 4380 + }, + { + 
"epoch": 0.6584176977877765, + "grad_norm": 0.807981014251709, + "learning_rate": 8.278264230969769e-05, + "loss": 1.1932, + "step": 4390 + }, + { + "epoch": 0.6599175103112112, + "grad_norm": 0.7782221436500549, + "learning_rate": 8.213201375896563e-05, + "loss": 1.1802, + "step": 4400 + }, + { + "epoch": 0.6614173228346457, + "grad_norm": 0.7768334746360779, + "learning_rate": 8.14829868232519e-05, + "loss": 1.188, + "step": 4410 + }, + { + "epoch": 0.6629171353580803, + "grad_norm": 0.8299062252044678, + "learning_rate": 8.083557681892797e-05, + "loss": 1.1834, + "step": 4420 + }, + { + "epoch": 0.6644169478815148, + "grad_norm": 0.8360872864723206, + "learning_rate": 8.018979902420746e-05, + "loss": 1.1928, + "step": 4430 + }, + { + "epoch": 0.6659167604049494, + "grad_norm": 0.8572105765342712, + "learning_rate": 7.954566867878538e-05, + "loss": 1.1892, + "step": 4440 + }, + { + "epoch": 0.667416572928384, + "grad_norm": 0.7985962629318237, + "learning_rate": 7.890320098347861e-05, + "loss": 1.1725, + "step": 4450 + }, + { + "epoch": 0.6689163854518185, + "grad_norm": 0.7798737287521362, + "learning_rate": 7.82624110998673e-05, + "loss": 1.1814, + "step": 4460 + }, + { + "epoch": 0.6704161979752531, + "grad_norm": 0.9000463485717773, + "learning_rate": 7.762331414993697e-05, + "loss": 1.1793, + "step": 4470 + }, + { + "epoch": 0.6719160104986877, + "grad_norm": 0.8930211067199707, + "learning_rate": 7.698592521572155e-05, + "loss": 1.1812, + "step": 4480 + }, + { + "epoch": 0.6734158230221222, + "grad_norm": 0.8517357707023621, + "learning_rate": 7.635025933894747e-05, + "loss": 1.1984, + "step": 4490 + }, + { + "epoch": 0.6749156355455568, + "grad_norm": 0.8382774591445923, + "learning_rate": 7.571633152067901e-05, + "loss": 1.1956, + "step": 4500 + }, + { + "epoch": 0.6764154480689913, + "grad_norm": 0.8367804884910583, + "learning_rate": 7.508415672096389e-05, + "loss": 1.1892, + "step": 4510 + }, + { + "epoch": 0.6779152605924259, + "grad_norm": 0.8804981708526611, + "learning_rate": 7.445374985848035e-05, + "loss": 1.1712, + "step": 4520 + }, + { + "epoch": 0.6794150731158605, + "grad_norm": 0.8023963570594788, + "learning_rate": 7.382512581018514e-05, + "loss": 1.2116, + "step": 4530 + }, + { + "epoch": 0.6809148856392951, + "grad_norm": 0.7947113513946533, + "learning_rate": 7.31982994109626e-05, + "loss": 1.1829, + "step": 4540 + }, + { + "epoch": 0.6824146981627297, + "grad_norm": 0.7888148427009583, + "learning_rate": 7.25732854532741e-05, + "loss": 1.1979, + "step": 4550 + }, + { + "epoch": 0.6839145106861643, + "grad_norm": 0.8301454782485962, + "learning_rate": 7.195009868680954e-05, + "loss": 1.1884, + "step": 4560 + }, + { + "epoch": 0.6854143232095988, + "grad_norm": 0.8213895559310913, + "learning_rate": 7.13287538181387e-05, + "loss": 1.1782, + "step": 4570 + }, + { + "epoch": 0.6869141357330334, + "grad_norm": 0.8629058599472046, + "learning_rate": 7.070926551036469e-05, + "loss": 1.1728, + "step": 4580 + }, + { + "epoch": 0.688413948256468, + "grad_norm": 0.8480379581451416, + "learning_rate": 7.009164838277754e-05, + "loss": 1.185, + "step": 4590 + }, + { + "epoch": 0.6899137607799025, + "grad_norm": 0.8442770838737488, + "learning_rate": 6.947591701050932e-05, + "loss": 1.2177, + "step": 4600 + }, + { + "epoch": 0.6914135733033371, + "grad_norm": 0.8650897741317749, + "learning_rate": 6.886208592419043e-05, + "loss": 1.1931, + "step": 4610 + }, + { + "epoch": 0.6929133858267716, + "grad_norm": 0.8003455400466919, + "learning_rate": 6.825016960960616e-05, + 
"loss": 1.199, + "step": 4620 + }, + { + "epoch": 0.6944131983502062, + "grad_norm": 0.8398526906967163, + "learning_rate": 6.764018250735532e-05, + "loss": 1.1725, + "step": 4630 + }, + { + "epoch": 0.6959130108736408, + "grad_norm": 0.8158472180366516, + "learning_rate": 6.703213901250931e-05, + "loss": 1.1756, + "step": 4640 + }, + { + "epoch": 0.6974128233970753, + "grad_norm": 0.8212939500808716, + "learning_rate": 6.64260534742723e-05, + "loss": 1.1783, + "step": 4650 + }, + { + "epoch": 0.6989126359205099, + "grad_norm": 0.8833694458007812, + "learning_rate": 6.582194019564266e-05, + "loss": 1.1649, + "step": 4660 + }, + { + "epoch": 0.700262467191601, + "eval_loss": 1.224123239517212, + "eval_runtime": 35.4085, + "eval_samples_per_second": 706.045, + "eval_steps_per_second": 88.256, + "step": 4669 + }, + { + "epoch": 0.7004124484439445, + "grad_norm": 0.8789852857589722, + "learning_rate": 6.521981343307554e-05, + "loss": 1.1856, + "step": 4670 + }, + { + "epoch": 0.7019122609673791, + "grad_norm": 0.7838938236236572, + "learning_rate": 6.461968739614639e-05, + "loss": 1.171, + "step": 4680 + }, + { + "epoch": 0.7034120734908137, + "grad_norm": 0.8347029685974121, + "learning_rate": 6.402157624721546e-05, + "loss": 1.186, + "step": 4690 + }, + { + "epoch": 0.7049118860142483, + "grad_norm": 0.8648792505264282, + "learning_rate": 6.342549410109372e-05, + "loss": 1.1603, + "step": 4700 + }, + { + "epoch": 0.7064116985376828, + "grad_norm": 0.8432384729385376, + "learning_rate": 6.283145502470976e-05, + "loss": 1.1896, + "step": 4710 + }, + { + "epoch": 0.7079115110611174, + "grad_norm": 0.8470872044563293, + "learning_rate": 6.223947303677793e-05, + "loss": 1.1933, + "step": 4720 + }, + { + "epoch": 0.709411323584552, + "grad_norm": 0.7818253636360168, + "learning_rate": 6.164956210746723e-05, + "loss": 1.1783, + "step": 4730 + }, + { + "epoch": 0.7109111361079865, + "grad_norm": 0.8789279460906982, + "learning_rate": 6.106173615807186e-05, + "loss": 1.18, + "step": 4740 + }, + { + "epoch": 0.7124109486314211, + "grad_norm": 0.7911210060119629, + "learning_rate": 6.047600906068269e-05, + "loss": 1.1675, + "step": 4750 + }, + { + "epoch": 0.7139107611548556, + "grad_norm": 0.827257513999939, + "learning_rate": 5.989239463785971e-05, + "loss": 1.1939, + "step": 4760 + }, + { + "epoch": 0.7154105736782902, + "grad_norm": 0.8434903025627136, + "learning_rate": 5.9310906662306125e-05, + "loss": 1.1885, + "step": 4770 + }, + { + "epoch": 0.7169103862017248, + "grad_norm": 0.8206247091293335, + "learning_rate": 5.8731558856542935e-05, + "loss": 1.1795, + "step": 4780 + }, + { + "epoch": 0.7184101987251593, + "grad_norm": 0.8511059880256653, + "learning_rate": 5.8154364892585574e-05, + "loss": 1.1673, + "step": 4790 + }, + { + "epoch": 0.7199100112485939, + "grad_norm": 0.8107950687408447, + "learning_rate": 5.75793383916208e-05, + "loss": 1.1627, + "step": 4800 + }, + { + "epoch": 0.7214098237720284, + "grad_norm": 0.8588976860046387, + "learning_rate": 5.70064929236855e-05, + "loss": 1.1817, + "step": 4810 + }, + { + "epoch": 0.7229096362954631, + "grad_norm": 0.8428772687911987, + "learning_rate": 5.643584200734659e-05, + "loss": 1.188, + "step": 4820 + }, + { + "epoch": 0.7244094488188977, + "grad_norm": 0.9080433249473572, + "learning_rate": 5.586739910938161e-05, + "loss": 1.1858, + "step": 4830 + }, + { + "epoch": 0.7259092613423322, + "grad_norm": 0.8083125352859497, + "learning_rate": 5.5301177644461164e-05, + "loss": 1.1629, + "step": 4840 + }, + { + "epoch": 0.7274090738657668, + 
"grad_norm": 0.8565297722816467, + "learning_rate": 5.4737190974832426e-05, + "loss": 1.1819, + "step": 4850 + }, + { + "epoch": 0.7289088863892014, + "grad_norm": 0.892975926399231, + "learning_rate": 5.417545241000353e-05, + "loss": 1.1745, + "step": 4860 + }, + { + "epoch": 0.7304086989126359, + "grad_norm": 0.9558145403862, + "learning_rate": 5.361597520642981e-05, + "loss": 1.1624, + "step": 4870 + }, + { + "epoch": 0.7319085114360705, + "grad_norm": 0.893839418888092, + "learning_rate": 5.3058772567200595e-05, + "loss": 1.1784, + "step": 4880 + }, + { + "epoch": 0.7334083239595051, + "grad_norm": 0.879852831363678, + "learning_rate": 5.250385764172802e-05, + "loss": 1.1754, + "step": 4890 + }, + { + "epoch": 0.7349081364829396, + "grad_norm": 0.9134666323661804, + "learning_rate": 5.195124352543636e-05, + "loss": 1.1919, + "step": 4900 + }, + { + "epoch": 0.7364079490063742, + "grad_norm": 0.8220536708831787, + "learning_rate": 5.140094325945323e-05, + "loss": 1.1639, + "step": 4910 + }, + { + "epoch": 0.7379077615298087, + "grad_norm": 0.9278730154037476, + "learning_rate": 5.085296983030164e-05, + "loss": 1.1914, + "step": 4920 + }, + { + "epoch": 0.7394075740532433, + "grad_norm": 0.874758780002594, + "learning_rate": 5.030733616959384e-05, + "loss": 1.179, + "step": 4930 + }, + { + "epoch": 0.7409073865766779, + "grad_norm": 0.8796992897987366, + "learning_rate": 4.976405515372577e-05, + "loss": 1.1838, + "step": 4940 + }, + { + "epoch": 0.7424071991001124, + "grad_norm": 0.8270970582962036, + "learning_rate": 4.922313960357336e-05, + "loss": 1.1744, + "step": 4950 + }, + { + "epoch": 0.7439070116235471, + "grad_norm": 0.8671916723251343, + "learning_rate": 4.868460228419003e-05, + "loss": 1.1837, + "step": 4960 + }, + { + "epoch": 0.7454068241469817, + "grad_norm": 0.8556026220321655, + "learning_rate": 4.814845590450544e-05, + "loss": 1.1724, + "step": 4970 + }, + { + "epoch": 0.7469066366704162, + "grad_norm": 0.8527052998542786, + "learning_rate": 4.761471311702541e-05, + "loss": 1.1604, + "step": 4980 + }, + { + "epoch": 0.7484064491938508, + "grad_norm": 0.9000732898712158, + "learning_rate": 4.70833865175334e-05, + "loss": 1.1787, + "step": 4990 + }, + { + "epoch": 0.7499062617172854, + "grad_norm": 0.8143606185913086, + "learning_rate": 4.6554488644793555e-05, + "loss": 1.1808, + "step": 5000 + }, + { + "epoch": 0.7514060742407199, + "grad_norm": 0.8999016880989075, + "learning_rate": 4.602803198025429e-05, + "loss": 1.1774, + "step": 5010 + }, + { + "epoch": 0.7529058867641545, + "grad_norm": 0.8793285489082336, + "learning_rate": 4.550402894775408e-05, + "loss": 1.1567, + "step": 5020 + }, + { + "epoch": 0.754405699287589, + "grad_norm": 0.8987736701965332, + "learning_rate": 4.49824919132283e-05, + "loss": 1.1531, + "step": 5030 + }, + { + "epoch": 0.7559055118110236, + "grad_norm": 0.9041977524757385, + "learning_rate": 4.446343318441719e-05, + "loss": 1.1695, + "step": 5040 + }, + { + "epoch": 0.7574053243344582, + "grad_norm": 0.8655080795288086, + "learning_rate": 4.394686501057553e-05, + "loss": 1.1734, + "step": 5050 + }, + { + "epoch": 0.7589051368578927, + "grad_norm": 0.7983774542808533, + "learning_rate": 4.343279958218352e-05, + "loss": 1.1742, + "step": 5060 + }, + { + "epoch": 0.7604049493813273, + "grad_norm": 0.8692320585250854, + "learning_rate": 4.29212490306592e-05, + "loss": 1.1727, + "step": 5070 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.8991125226020813, + "learning_rate": 4.241222542807211e-05, + "loss": 1.1796, + "step": 5080 + }, 
+ { + "epoch": 0.7634045744281964, + "grad_norm": 0.8381094336509705, + "learning_rate": 4.19057407868583e-05, + "loss": 1.1658, + "step": 5090 + }, + { + "epoch": 0.7649043869516311, + "grad_norm": 0.8688058257102966, + "learning_rate": 4.140180705953689e-05, + "loss": 1.1808, + "step": 5100 + }, + { + "epoch": 0.7664041994750657, + "grad_norm": 0.910092830657959, + "learning_rate": 4.090043613842823e-05, + "loss": 1.1837, + "step": 5110 + }, + { + "epoch": 0.7679040119985002, + "grad_norm": 0.8620381355285645, + "learning_rate": 4.0401639855372884e-05, + "loss": 1.1887, + "step": 5120 + }, + { + "epoch": 0.7694038245219348, + "grad_norm": 0.8097319006919861, + "learning_rate": 3.990542998145262e-05, + "loss": 1.1579, + "step": 5130 + }, + { + "epoch": 0.7709036370453693, + "grad_norm": 0.9091822504997253, + "learning_rate": 3.941181822671273e-05, + "loss": 1.1801, + "step": 5140 + }, + { + "epoch": 0.7724034495688039, + "grad_norm": 0.8581311106681824, + "learning_rate": 3.892081623988541e-05, + "loss": 1.1892, + "step": 5150 + }, + { + "epoch": 0.7739032620922385, + "grad_norm": 0.7965316772460938, + "learning_rate": 3.8432435608115e-05, + "loss": 1.1629, + "step": 5160 + }, + { + "epoch": 0.775403074615673, + "grad_norm": 0.844725489616394, + "learning_rate": 3.794668785668465e-05, + "loss": 1.173, + "step": 5170 + }, + { + "epoch": 0.7769028871391076, + "grad_norm": 0.8694456219673157, + "learning_rate": 3.7463584448744186e-05, + "loss": 1.167, + "step": 5180 + }, + { + "epoch": 0.7784026996625422, + "grad_norm": 0.8310941457748413, + "learning_rate": 3.6983136785039636e-05, + "loss": 1.1647, + "step": 5190 + }, + { + "epoch": 0.7799025121859767, + "grad_norm": 0.7977578043937683, + "learning_rate": 3.650535620364407e-05, + "loss": 1.1704, + "step": 5200 + }, + { + "epoch": 0.7814023247094113, + "grad_norm": 0.8039920330047607, + "learning_rate": 3.603025397969037e-05, + "loss": 1.1585, + "step": 5210 + }, + { + "epoch": 0.7829021372328459, + "grad_norm": 0.974540114402771, + "learning_rate": 3.555784132510472e-05, + "loss": 1.1672, + "step": 5220 + }, + { + "epoch": 0.7844019497562804, + "grad_norm": 0.9114983081817627, + "learning_rate": 3.508812938834227e-05, + "loss": 1.1707, + "step": 5230 + }, + { + "epoch": 0.7859017622797151, + "grad_norm": 0.8260616064071655, + "learning_rate": 3.4621129254124106e-05, + "loss": 1.1548, + "step": 5240 + }, + { + "epoch": 0.7874015748031497, + "grad_norm": 0.8436164855957031, + "learning_rate": 3.415685194317539e-05, + "loss": 1.1552, + "step": 5250 + }, + { + "epoch": 0.7889013873265842, + "grad_norm": 0.8650628924369812, + "learning_rate": 3.3695308411965564e-05, + "loss": 1.1679, + "step": 5260 + }, + { + "epoch": 0.7904011998500188, + "grad_norm": 0.8216997385025024, + "learning_rate": 3.323650955244951e-05, + "loss": 1.1761, + "step": 5270 + }, + { + "epoch": 0.7919010123734533, + "grad_norm": 0.9183224439620972, + "learning_rate": 3.2780466191810905e-05, + "loss": 1.1657, + "step": 5280 + }, + { + "epoch": 0.7934008248968879, + "grad_norm": 0.8745443820953369, + "learning_rate": 3.232718909220631e-05, + "loss": 1.1748, + "step": 5290 + }, + { + "epoch": 0.7949006374203225, + "grad_norm": 0.8624297976493835, + "learning_rate": 3.187668895051135e-05, + "loss": 1.1656, + "step": 5300 + }, + { + "epoch": 0.796400449943757, + "grad_norm": 0.8851115107536316, + "learning_rate": 3.14289763980683e-05, + "loss": 1.1893, + "step": 5310 + }, + { + "epoch": 0.7979002624671916, + "grad_norm": 0.902093768119812, + "learning_rate": 
3.0984062000435276e-05, + "loss": 1.1729, + "step": 5320 + }, + { + "epoch": 0.7994000749906262, + "grad_norm": 0.9625739455223083, + "learning_rate": 3.054195625713668e-05, + "loss": 1.1674, + "step": 5330 + }, + { + "epoch": 0.8002999625046869, + "eval_loss": 1.2090579271316528, + "eval_runtime": 34.3372, + "eval_samples_per_second": 728.073, + "eval_steps_per_second": 91.009, + "step": 5336 + }, + { + "epoch": 0.8008998875140607, + "grad_norm": 0.8621588349342346, + "learning_rate": 3.0102669601415575e-05, + "loss": 1.1566, + "step": 5340 + }, + { + "epoch": 0.8023997000374953, + "grad_norm": 0.7970715165138245, + "learning_rate": 2.966621239998755e-05, + "loss": 1.1581, + "step": 5350 + }, + { + "epoch": 0.8038995125609298, + "grad_norm": 0.78794264793396, + "learning_rate": 2.9232594952795818e-05, + "loss": 1.1581, + "step": 5360 + }, + { + "epoch": 0.8053993250843644, + "grad_norm": 0.8636032342910767, + "learning_rate": 2.8801827492768352e-05, + "loss": 1.1783, + "step": 5370 + }, + { + "epoch": 0.8068991376077991, + "grad_norm": 0.8886037468910217, + "learning_rate": 2.8373920185576375e-05, + "loss": 1.1646, + "step": 5380 + }, + { + "epoch": 0.8083989501312336, + "grad_norm": 0.8844544887542725, + "learning_rate": 2.7948883129394467e-05, + "loss": 1.1627, + "step": 5390 + }, + { + "epoch": 0.8098987626546682, + "grad_norm": 0.8555653691291809, + "learning_rate": 2.7526726354662104e-05, + "loss": 1.1557, + "step": 5400 + }, + { + "epoch": 0.8113985751781028, + "grad_norm": 0.8595440983772278, + "learning_rate": 2.7107459823847106e-05, + "loss": 1.1606, + "step": 5410 + }, + { + "epoch": 0.8128983877015373, + "grad_norm": 0.9353649020195007, + "learning_rate": 2.6691093431210596e-05, + "loss": 1.1755, + "step": 5420 + }, + { + "epoch": 0.8143982002249719, + "grad_norm": 0.8532871603965759, + "learning_rate": 2.6277637002573288e-05, + "loss": 1.1738, + "step": 5430 + }, + { + "epoch": 0.8158980127484065, + "grad_norm": 0.8638527393341064, + "learning_rate": 2.586710029508375e-05, + "loss": 1.1643, + "step": 5440 + }, + { + "epoch": 0.817397825271841, + "grad_norm": 0.9085490107536316, + "learning_rate": 2.54594929969881e-05, + "loss": 1.1604, + "step": 5450 + }, + { + "epoch": 0.8188976377952756, + "grad_norm": 0.8463364839553833, + "learning_rate": 2.5054824727401502e-05, + "loss": 1.1651, + "step": 5460 + }, + { + "epoch": 0.8203974503187101, + "grad_norm": 0.8295713663101196, + "learning_rate": 2.46531050360809e-05, + "loss": 1.1645, + "step": 5470 + }, + { + "epoch": 0.8218972628421447, + "grad_norm": 0.8853150010108948, + "learning_rate": 2.4254343403199945e-05, + "loss": 1.1768, + "step": 5480 + }, + { + "epoch": 0.8233970753655793, + "grad_norm": 0.9157831072807312, + "learning_rate": 2.3858549239125034e-05, + "loss": 1.1601, + "step": 5490 + }, + { + "epoch": 0.8248968878890138, + "grad_norm": 0.8486490845680237, + "learning_rate": 2.346573188419341e-05, + "loss": 1.1647, + "step": 5500 + }, + { + "epoch": 0.8263967004124484, + "grad_norm": 0.7939295768737793, + "learning_rate": 2.3075900608492637e-05, + "loss": 1.1692, + "step": 5510 + }, + { + "epoch": 0.8278965129358831, + "grad_norm": 1.1155019998550415, + "learning_rate": 2.2689064611641794e-05, + "loss": 1.1907, + "step": 5520 + }, + { + "epoch": 0.8293963254593176, + "grad_norm": 0.8578311800956726, + "learning_rate": 2.230523302257461e-05, + "loss": 1.15, + "step": 5530 + }, + { + "epoch": 0.8308961379827522, + "grad_norm": 0.86622154712677, + "learning_rate": 2.192441489932372e-05, + "loss": 1.1708, + "step": 5540 + 
}, + { + "epoch": 0.8323959505061868, + "grad_norm": 0.8797856569290161, + "learning_rate": 2.154661922880708e-05, + "loss": 1.155, + "step": 5550 + }, + { + "epoch": 0.8338957630296213, + "grad_norm": 0.9027743935585022, + "learning_rate": 2.117185492661592e-05, + "loss": 1.1502, + "step": 5560 + }, + { + "epoch": 0.8353955755530559, + "grad_norm": 0.8181419968605042, + "learning_rate": 2.0800130836804214e-05, + "loss": 1.1618, + "step": 5570 + }, + { + "epoch": 0.8368953880764904, + "grad_norm": 0.846794843673706, + "learning_rate": 2.043145573168003e-05, + "loss": 1.1588, + "step": 5580 + }, + { + "epoch": 0.838395200599925, + "grad_norm": 0.9015936255455017, + "learning_rate": 2.0065838311598543e-05, + "loss": 1.1775, + "step": 5590 + }, + { + "epoch": 0.8398950131233596, + "grad_norm": 0.8660979866981506, + "learning_rate": 1.9703287204756757e-05, + "loss": 1.1582, + "step": 5600 + }, + { + "epoch": 0.8413948256467941, + "grad_norm": 0.8045121431350708, + "learning_rate": 1.9343810966989716e-05, + "loss": 1.1778, + "step": 5610 + }, + { + "epoch": 0.8428946381702287, + "grad_norm": 0.8060126304626465, + "learning_rate": 1.8987418081568683e-05, + "loss": 1.1445, + "step": 5620 + }, + { + "epoch": 0.8443944506936633, + "grad_norm": 0.8652163147926331, + "learning_rate": 1.8634116959001106e-05, + "loss": 1.1634, + "step": 5630 + }, + { + "epoch": 0.8458942632170978, + "grad_norm": 0.9784821271896362, + "learning_rate": 1.828391593683185e-05, + "loss": 1.1474, + "step": 5640 + }, + { + "epoch": 0.8473940757405324, + "grad_norm": 0.9049434065818787, + "learning_rate": 1.7936823279446676e-05, + "loss": 1.1548, + "step": 5650 + }, + { + "epoch": 0.8488938882639671, + "grad_norm": 0.8466004133224487, + "learning_rate": 1.7592847177877008e-05, + "loss": 1.1611, + "step": 5660 + }, + { + "epoch": 0.8503937007874016, + "grad_norm": 0.8664677739143372, + "learning_rate": 1.725199574960689e-05, + "loss": 1.1472, + "step": 5670 + }, + { + "epoch": 0.8518935133108362, + "grad_norm": 0.8182629346847534, + "learning_rate": 1.6914277038381145e-05, + "loss": 1.1646, + "step": 5680 + }, + { + "epoch": 0.8533933258342707, + "grad_norm": 0.8338120579719543, + "learning_rate": 1.6579699014015783e-05, + "loss": 1.159, + "step": 5690 + }, + { + "epoch": 0.8548931383577053, + "grad_norm": 0.910591185092926, + "learning_rate": 1.6248269572209716e-05, + "loss": 1.1562, + "step": 5700 + }, + { + "epoch": 0.8563929508811399, + "grad_norm": 0.9760018587112427, + "learning_rate": 1.5919996534358635e-05, + "loss": 1.1413, + "step": 5710 + }, + { + "epoch": 0.8578927634045744, + "grad_norm": 0.9556133151054382, + "learning_rate": 1.5594887647370263e-05, + "loss": 1.1537, + "step": 5720 + }, + { + "epoch": 0.859392575928009, + "grad_norm": 0.8409389853477478, + "learning_rate": 1.527295058348154e-05, + "loss": 1.1351, + "step": 5730 + }, + { + "epoch": 0.8608923884514436, + "grad_norm": 0.8387997150421143, + "learning_rate": 1.4954192940077809e-05, + "loss": 1.1509, + "step": 5740 + }, + { + "epoch": 0.8623922009748781, + "grad_norm": 0.9269035458564758, + "learning_rate": 1.463862223951317e-05, + "loss": 1.1634, + "step": 5750 + }, + { + "epoch": 0.8638920134983127, + "grad_norm": 0.8396034836769104, + "learning_rate": 1.4326245928933178e-05, + "loss": 1.1663, + "step": 5760 + }, + { + "epoch": 0.8653918260217472, + "grad_norm": 0.8768131732940674, + "learning_rate": 1.4017071380099132e-05, + "loss": 1.1597, + "step": 5770 + }, + { + "epoch": 0.8668916385451818, + "grad_norm": 0.8506944179534912, + "learning_rate": 
1.3711105889213908e-05, + "loss": 1.1605, + "step": 5780 + }, + { + "epoch": 0.8683914510686164, + "grad_norm": 0.8659221529960632, + "learning_rate": 1.3408356676750043e-05, + "loss": 1.1792, + "step": 5790 + }, + { + "epoch": 0.869891263592051, + "grad_norm": 0.869342029094696, + "learning_rate": 1.310883088727902e-05, + "loss": 1.1579, + "step": 5800 + }, + { + "epoch": 0.8713910761154856, + "grad_norm": 0.8290470242500305, + "learning_rate": 1.2812535589303024e-05, + "loss": 1.143, + "step": 5810 + }, + { + "epoch": 0.8728908886389202, + "grad_norm": 0.8000004887580872, + "learning_rate": 1.2519477775087805e-05, + "loss": 1.1688, + "step": 5820 + }, + { + "epoch": 0.8743907011623547, + "grad_norm": 0.907409131526947, + "learning_rate": 1.222966436049786e-05, + "loss": 1.1787, + "step": 5830 + }, + { + "epoch": 0.8758905136857893, + "grad_norm": 0.8386558294296265, + "learning_rate": 1.1943102184833165e-05, + "loss": 1.1736, + "step": 5840 + }, + { + "epoch": 0.8773903262092239, + "grad_norm": 0.8760896325111389, + "learning_rate": 1.165979801066782e-05, + "loss": 1.1539, + "step": 5850 + }, + { + "epoch": 0.8788901387326584, + "grad_norm": 0.959135890007019, + "learning_rate": 1.1379758523690413e-05, + "loss": 1.1756, + "step": 5860 + }, + { + "epoch": 0.880389951256093, + "grad_norm": 0.8947364091873169, + "learning_rate": 1.1102990332546175e-05, + "loss": 1.1567, + "step": 5870 + }, + { + "epoch": 0.8818897637795275, + "grad_norm": 0.8616886138916016, + "learning_rate": 1.0829499968681204e-05, + "loss": 1.1636, + "step": 5880 + }, + { + "epoch": 0.8833895763029621, + "grad_norm": 0.899998664855957, + "learning_rate": 1.0559293886188246e-05, + "loss": 1.1737, + "step": 5890 + }, + { + "epoch": 0.8848893888263967, + "grad_norm": 0.8765754699707031, + "learning_rate": 1.029237846165426e-05, + "loss": 1.165, + "step": 5900 + }, + { + "epoch": 0.8863892013498312, + "grad_norm": 0.8620162010192871, + "learning_rate": 1.0028759994010071e-05, + "loss": 1.1688, + "step": 5910 + }, + { + "epoch": 0.8878890138732658, + "grad_norm": 0.8123705387115479, + "learning_rate": 9.768444704381811e-06, + "loss": 1.157, + "step": 5920 + }, + { + "epoch": 0.8893888263967004, + "grad_norm": 0.9160408973693848, + "learning_rate": 9.511438735943849e-06, + "loss": 1.1718, + "step": 5930 + }, + { + "epoch": 0.890888638920135, + "grad_norm": 0.9307278990745544, + "learning_rate": 9.257748153773992e-06, + "loss": 1.155, + "step": 5940 + }, + { + "epoch": 0.8923884514435696, + "grad_norm": 0.8938122391700745, + "learning_rate": 9.007378944710431e-06, + "loss": 1.1486, + "step": 5950 + }, + { + "epoch": 0.8938882639670042, + "grad_norm": 0.8921361565589905, + "learning_rate": 8.760337017210206e-06, + "loss": 1.1456, + "step": 5960 + }, + { + "epoch": 0.8953880764904387, + "grad_norm": 0.9233677983283997, + "learning_rate": 8.516628201209985e-06, + "loss": 1.1566, + "step": 5970 + }, + { + "epoch": 0.8968878890138733, + "grad_norm": 0.8670746088027954, + "learning_rate": 8.276258247988437e-06, + "loss": 1.1533, + "step": 5980 + }, + { + "epoch": 0.8983877015373078, + "grad_norm": 0.8692810535430908, + "learning_rate": 8.039232830030413e-06, + "loss": 1.1672, + "step": 5990 + }, + { + "epoch": 0.8998875140607424, + "grad_norm": 0.8850069046020508, + "learning_rate": 7.805557540893276e-06, + "loss": 1.1712, + "step": 6000 + }, + { + "epoch": 0.9003374578177727, + "eval_loss": 1.2019070386886597, + "eval_runtime": 34.3384, + "eval_samples_per_second": 728.048, + "eval_steps_per_second": 91.006, + "step": 6003 + }, + { 
+ "epoch": 0.901387326584177, + "grad_norm": 0.8891724944114685, + "learning_rate": 7.575237895074637e-06, + "loss": 1.167, + "step": 6010 + }, + { + "epoch": 0.9028871391076115, + "grad_norm": 0.8959289789199829, + "learning_rate": 7.348279327882467e-06, + "loss": 1.1651, + "step": 6020 + }, + { + "epoch": 0.9043869516310461, + "grad_norm": 0.8475953340530396, + "learning_rate": 7.1246871953066666e-06, + "loss": 1.1508, + "step": 6030 + }, + { + "epoch": 0.9058867641544807, + "grad_norm": 0.8530213832855225, + "learning_rate": 6.9044667738927365e-06, + "loss": 1.1631, + "step": 6040 + }, + { + "epoch": 0.9073865766779152, + "grad_norm": 0.8265974521636963, + "learning_rate": 6.6876232606172255e-06, + "loss": 1.1553, + "step": 6050 + }, + { + "epoch": 0.9088863892013498, + "grad_norm": 0.8897525668144226, + "learning_rate": 6.4741617727651626e-06, + "loss": 1.1457, + "step": 6060 + }, + { + "epoch": 0.9103862017247843, + "grad_norm": 0.926990270614624, + "learning_rate": 6.264087347809188e-06, + "loss": 1.171, + "step": 6070 + }, + { + "epoch": 0.911886014248219, + "grad_norm": 0.9227252006530762, + "learning_rate": 6.0574049432907115e-06, + "loss": 1.1704, + "step": 6080 + }, + { + "epoch": 0.9133858267716536, + "grad_norm": 0.8871036171913147, + "learning_rate": 5.854119436702976e-06, + "loss": 1.1648, + "step": 6090 + }, + { + "epoch": 0.9148856392950881, + "grad_norm": 0.9545475244522095, + "learning_rate": 5.65423562537593e-06, + "loss": 1.1612, + "step": 6100 + }, + { + "epoch": 0.9163854518185227, + "grad_norm": 0.8905931711196899, + "learning_rate": 5.4577582263629235e-06, + "loss": 1.1673, + "step": 6110 + }, + { + "epoch": 0.9178852643419573, + "grad_norm": 0.857297420501709, + "learning_rate": 5.264691876329474e-06, + "loss": 1.1436, + "step": 6120 + }, + { + "epoch": 0.9193850768653918, + "grad_norm": 0.8451759219169617, + "learning_rate": 5.075041131443891e-06, + "loss": 1.1594, + "step": 6130 + }, + { + "epoch": 0.9208848893888264, + "grad_norm": 0.8909957408905029, + "learning_rate": 4.88881046726966e-06, + "loss": 1.1514, + "step": 6140 + }, + { + "epoch": 0.922384701912261, + "grad_norm": 0.9147069454193115, + "learning_rate": 4.706004278659831e-06, + "loss": 1.153, + "step": 6150 + }, + { + "epoch": 0.9238845144356955, + "grad_norm": 0.8463402390480042, + "learning_rate": 4.526626879653428e-06, + "loss": 1.168, + "step": 6160 + }, + { + "epoch": 0.9253843269591301, + "grad_norm": 0.8234553933143616, + "learning_rate": 4.350682503373437e-06, + "loss": 1.1484, + "step": 6170 + }, + { + "epoch": 0.9268841394825647, + "grad_norm": 0.9229360222816467, + "learning_rate": 4.178175301927101e-06, + "loss": 1.1751, + "step": 6180 + }, + { + "epoch": 0.9283839520059992, + "grad_norm": 0.884353518486023, + "learning_rate": 4.009109346307792e-06, + "loss": 1.1613, + "step": 6190 + }, + { + "epoch": 0.9298837645294338, + "grad_norm": 0.8972669839859009, + "learning_rate": 3.8434886262991015e-06, + "loss": 1.1664, + "step": 6200 + }, + { + "epoch": 0.9313835770528683, + "grad_norm": 0.9175205230712891, + "learning_rate": 3.6813170503804834e-06, + "loss": 1.1777, + "step": 6210 + }, + { + "epoch": 0.932883389576303, + "grad_norm": 0.8350104093551636, + "learning_rate": 3.522598445635172e-06, + "loss": 1.1365, + "step": 6220 + }, + { + "epoch": 0.9343832020997376, + "grad_norm": 0.851258397102356, + "learning_rate": 3.3673365576598e-06, + "loss": 1.1595, + "step": 6230 + }, + { + "epoch": 0.9358830146231721, + "grad_norm": 0.9081841111183167, + "learning_rate": 3.21553505047602e-06, + 
"loss": 1.1698, + "step": 6240 + }, + { + "epoch": 0.9373828271466067, + "grad_norm": 0.9121673107147217, + "learning_rate": 3.067197506444058e-06, + "loss": 1.1611, + "step": 6250 + }, + { + "epoch": 0.9388826396700413, + "grad_norm": 0.9540317058563232, + "learning_rate": 2.922327426178128e-06, + "loss": 1.1404, + "step": 6260 + }, + { + "epoch": 0.9403824521934758, + "grad_norm": 0.8566182255744934, + "learning_rate": 2.7809282284638855e-06, + "loss": 1.1821, + "step": 6270 + }, + { + "epoch": 0.9418822647169104, + "grad_norm": 0.9044936895370483, + "learning_rate": 2.643003250177672e-06, + "loss": 1.1455, + "step": 6280 + }, + { + "epoch": 0.943382077240345, + "grad_norm": 0.8130437135696411, + "learning_rate": 2.5085557462078134e-06, + "loss": 1.1486, + "step": 6290 + }, + { + "epoch": 0.9448818897637795, + "grad_norm": 0.9217805862426758, + "learning_rate": 2.377588889377813e-06, + "loss": 1.1758, + "step": 6300 + }, + { + "epoch": 0.9463817022872141, + "grad_norm": 0.9020255208015442, + "learning_rate": 2.2501057703714797e-06, + "loss": 1.1762, + "step": 6310 + }, + { + "epoch": 0.9478815148106486, + "grad_norm": 0.8997666239738464, + "learning_rate": 2.1261093976599365e-06, + "loss": 1.1698, + "step": 6320 + }, + { + "epoch": 0.9493813273340832, + "grad_norm": 0.8945279121398926, + "learning_rate": 2.005602697430675e-06, + "loss": 1.1653, + "step": 6330 + }, + { + "epoch": 0.9508811398575178, + "grad_norm": 0.8765221238136292, + "learning_rate": 1.8885885135184963e-06, + "loss": 1.1671, + "step": 6340 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.8852826356887817, + "learning_rate": 1.7750696073383974e-06, + "loss": 1.1514, + "step": 6350 + }, + { + "epoch": 0.953880764904387, + "grad_norm": 0.8397039175033569, + "learning_rate": 1.6650486578203725e-06, + "loss": 1.1617, + "step": 6360 + }, + { + "epoch": 0.9553805774278216, + "grad_norm": 0.85945725440979, + "learning_rate": 1.558528261346248e-06, + "loss": 1.1444, + "step": 6370 + }, + { + "epoch": 0.9568803899512561, + "grad_norm": 0.917316198348999, + "learning_rate": 1.455510931688364e-06, + "loss": 1.1542, + "step": 6380 + }, + { + "epoch": 0.9583802024746907, + "grad_norm": 0.8585231304168701, + "learning_rate": 1.3559990999502556e-06, + "loss": 1.1611, + "step": 6390 + }, + { + "epoch": 0.9598800149981253, + "grad_norm": 0.9111419320106506, + "learning_rate": 1.2599951145093157e-06, + "loss": 1.155, + "step": 6400 + }, + { + "epoch": 0.9613798275215598, + "grad_norm": 0.8682249188423157, + "learning_rate": 1.1675012409613715e-06, + "loss": 1.1484, + "step": 6410 + }, + { + "epoch": 0.9628796400449944, + "grad_norm": 0.8944458365440369, + "learning_rate": 1.0785196620671455e-06, + "loss": 1.1566, + "step": 6420 + }, + { + "epoch": 0.9643794525684289, + "grad_norm": 0.838281512260437, + "learning_rate": 9.93052477700862e-07, + "loss": 1.1691, + "step": 6430 + }, + { + "epoch": 0.9658792650918635, + "grad_norm": 0.8258534669876099, + "learning_rate": 9.111017048005876e-07, + "loss": 1.1631, + "step": 6440 + }, + { + "epoch": 0.9673790776152981, + "grad_norm": 0.7644683718681335, + "learning_rate": 8.326692773207189e-07, + "loss": 1.1599, + "step": 6450 + }, + { + "epoch": 0.9688788901387326, + "grad_norm": 0.922005832195282, + "learning_rate": 7.577570461862359e-07, + "loss": 1.1589, + "step": 6460 + }, + { + "epoch": 0.9703787026621672, + "grad_norm": 0.8499112129211426, + "learning_rate": 6.863667792491534e-07, + "loss": 1.1573, + "step": 6470 + }, + { + "epoch": 0.9718785151856018, + "grad_norm": 
0.9114837646484375, + "learning_rate": 6.185001612467044e-07, + "loss": 1.1716, + "step": 6480 + }, + { + "epoch": 0.9733783277090363, + "grad_norm": 0.8339487314224243, + "learning_rate": 5.541587937616221e-07, + "loss": 1.1591, + "step": 6490 + }, + { + "epoch": 0.974878140232471, + "grad_norm": 0.8110185265541077, + "learning_rate": 4.933441951843198e-07, + "loss": 1.1539, + "step": 6500 + }, + { + "epoch": 0.9763779527559056, + "grad_norm": 0.8427588939666748, + "learning_rate": 4.360578006770865e-07, + "loss": 1.1619, + "step": 6510 + }, + { + "epoch": 0.9778777652793401, + "grad_norm": 0.8306043148040771, + "learning_rate": 3.82300962140214e-07, + "loss": 1.1609, + "step": 6520 + }, + { + "epoch": 0.9793775778027747, + "grad_norm": 0.9311191439628601, + "learning_rate": 3.320749481800888e-07, + "loss": 1.1556, + "step": 6530 + }, + { + "epoch": 0.9808773903262092, + "grad_norm": 0.932629406452179, + "learning_rate": 2.8538094407919987e-07, + "loss": 1.1523, + "step": 6540 + }, + { + "epoch": 0.9823772028496438, + "grad_norm": 0.8704735040664673, + "learning_rate": 2.4222005176829375e-07, + "loss": 1.1471, + "step": 6550 + }, + { + "epoch": 0.9838770153730784, + "grad_norm": 0.980547308921814, + "learning_rate": 2.025932898002458e-07, + "loss": 1.1687, + "step": 6560 + }, + { + "epoch": 0.9853768278965129, + "grad_norm": 0.8425877094268799, + "learning_rate": 1.6650159332607939e-07, + "loss": 1.1513, + "step": 6570 + }, + { + "epoch": 0.9868766404199475, + "grad_norm": 0.9170466065406799, + "learning_rate": 1.3394581407289996e-07, + "loss": 1.1726, + "step": 6580 + }, + { + "epoch": 0.9883764529433821, + "grad_norm": 0.9073484539985657, + "learning_rate": 1.0492672032377803e-07, + "loss": 1.1627, + "step": 6590 + }, + { + "epoch": 0.9898762654668166, + "grad_norm": 0.9039649963378906, + "learning_rate": 7.944499689961358e-08, + "loss": 1.1544, + "step": 6600 + }, + { + "epoch": 0.9913760779902512, + "grad_norm": 0.9152038097381592, + "learning_rate": 5.7501245143015685e-08, + "loss": 1.1618, + "step": 6610 + }, + { + "epoch": 0.9928758905136857, + "grad_norm": 0.922379732131958, + "learning_rate": 3.9095982904080447e-08, + "loss": 1.1587, + "step": 6620 + }, + { + "epoch": 0.9943757030371203, + "grad_norm": 0.8688434362411499, + "learning_rate": 2.4229644528150905e-08, + "loss": 1.1668, + "step": 6630 + }, + { + "epoch": 0.995875515560555, + "grad_norm": 0.8602100014686584, + "learning_rate": 1.290258084557516e-08, + "loss": 1.1596, + "step": 6640 + }, + { + "epoch": 0.9973753280839895, + "grad_norm": 0.8412054777145386, + "learning_rate": 5.115059163496304e-09, + "loss": 1.1424, + "step": 6650 + }, + { + "epoch": 0.9988751406074241, + "grad_norm": 0.9167276620864868, + "learning_rate": 8.672632594408646e-10, + "loss": 1.1601, + "step": 6660 + } + ], + "logging_steps": 10, + "max_steps": 6667, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 667, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2426610074517504.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}