diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4767 @@ +{ + "best_metric": 1.2019070386886597, + "best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/cds-llama/checkpoint-6003", + "epoch": 0.9999250093738282, + "eval_steps": 667, + "global_step": 6667, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014998125234345708, + "grad_norm": 4.572068214416504, + "learning_rate": 1.4999999999999999e-05, + "loss": 4.4943, + "step": 10 + }, + { + "epoch": 0.0029996250468691415, + "grad_norm": 3.0534508228302, + "learning_rate": 2.9999999999999997e-05, + "loss": 4.1499, + "step": 20 + }, + { + "epoch": 0.0044994375703037125, + "grad_norm": 2.2651097774505615, + "learning_rate": 4.4999999999999996e-05, + "loss": 3.7895, + "step": 30 + }, + { + "epoch": 0.005999250093738283, + "grad_norm": 1.8512789011001587, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.5688, + "step": 40 + }, + { + "epoch": 0.0074990626171728535, + "grad_norm": 1.5266691446304321, + "learning_rate": 7.5e-05, + "loss": 3.4336, + "step": 50 + }, + { + "epoch": 0.008998875140607425, + "grad_norm": 1.1185054779052734, + "learning_rate": 8.999999999999999e-05, + "loss": 3.2977, + "step": 60 + }, + { + "epoch": 0.010498687664041995, + "grad_norm": 0.9552314877510071, + "learning_rate": 0.00010499999999999999, + "loss": 3.1571, + "step": 70 + }, + { + "epoch": 0.011998500187476566, + "grad_norm": 1.1306709051132202, + "learning_rate": 0.00011999999999999999, + "loss": 2.9127, + "step": 80 + }, + { + "epoch": 0.013498312710911136, + "grad_norm": 1.14328932762146, + "learning_rate": 0.000135, + "loss": 2.7226, + "step": 90 + }, + { + "epoch": 0.014998125234345707, + "grad_norm": 2.9210283756256104, + "learning_rate": 0.00015, + "loss": 2.6076, + "step": 100 + }, + { + "epoch": 0.016497937757780277, + "grad_norm": 1.3305509090423584, + "learning_rate": 0.000165, + "loss": 2.5095, + "step": 110 + }, + { + "epoch": 0.01799775028121485, + "grad_norm": 1.7380200624465942, + "learning_rate": 0.00017999999999999998, + "loss": 2.434, + "step": 120 + }, + { + "epoch": 0.01949756280464942, + "grad_norm": 2.1826679706573486, + "learning_rate": 0.000195, + "loss": 2.3722, + "step": 130 + }, + { + "epoch": 0.02099737532808399, + "grad_norm": 1.3203043937683105, + "learning_rate": 0.00020999999999999998, + "loss": 2.3267, + "step": 140 + }, + { + "epoch": 0.02249718785151856, + "grad_norm": 1.439988136291504, + "learning_rate": 0.000225, + "loss": 2.2467, + "step": 150 + }, + { + "epoch": 0.023997000374953132, + "grad_norm": 1.5344315767288208, + "learning_rate": 0.00023999999999999998, + "loss": 2.1827, + "step": 160 + }, + { + "epoch": 0.0254968128983877, + "grad_norm": 1.6336548328399658, + "learning_rate": 0.00025499999999999996, + "loss": 2.1359, + "step": 170 + }, + { + "epoch": 0.02699662542182227, + "grad_norm": 1.454647421836853, + "learning_rate": 0.00027, + "loss": 2.054, + "step": 180 + }, + { + "epoch": 0.028496437945256844, + "grad_norm": 1.4481509923934937, + "learning_rate": 0.000285, + "loss": 2.0315, + "step": 190 + }, + { + "epoch": 0.029996250468691414, + "grad_norm": 1.7465718984603882, + "learning_rate": 0.0003, + "loss": 1.9847, + "step": 200 + }, + { + "epoch": 0.031496062992125984, + "grad_norm": 1.4593420028686523, + "learning_rate": 0.0002999982300767559, + "loss": 1.9516, + "step": 210 + }, + { + "epoch": 0.032995875515560553, + 
"grad_norm": 1.270998477935791, + "learning_rate": 0.000299992920348792, + "loss": 1.9128, + "step": 220 + }, + { + "epoch": 0.03449568803899512, + "grad_norm": 1.7116246223449707, + "learning_rate": 0.0002999840709414124, + "loss": 1.9023, + "step": 230 + }, + { + "epoch": 0.0359955005624297, + "grad_norm": 1.2121365070343018, + "learning_rate": 0.0002999716820634541, + "loss": 1.8635, + "step": 240 + }, + { + "epoch": 0.03749531308586427, + "grad_norm": 1.5796058177947998, + "learning_rate": 0.000299955754007282, + "loss": 1.8292, + "step": 250 + }, + { + "epoch": 0.03899512560929884, + "grad_norm": 1.2615342140197754, + "learning_rate": 0.00029993628714878185, + "loss": 1.7873, + "step": 260 + }, + { + "epoch": 0.04049493813273341, + "grad_norm": 1.3430827856063843, + "learning_rate": 0.00029991328194735155, + "loss": 1.798, + "step": 270 + }, + { + "epoch": 0.04199475065616798, + "grad_norm": 1.2447484731674194, + "learning_rate": 0.0002998867389458904, + "loss": 1.7618, + "step": 280 + }, + { + "epoch": 0.04349456317960255, + "grad_norm": 1.4957722425460815, + "learning_rate": 0.00029985665877078595, + "loss": 1.7638, + "step": 290 + }, + { + "epoch": 0.04499437570303712, + "grad_norm": 1.152177333831787, + "learning_rate": 0.0002998230421318997, + "loss": 1.7557, + "step": 300 + }, + { + "epoch": 0.046494188226471694, + "grad_norm": 1.0447930097579956, + "learning_rate": 0.0002997858898225498, + "loss": 1.7204, + "step": 310 + }, + { + "epoch": 0.047994000749906264, + "grad_norm": 1.4656999111175537, + "learning_rate": 0.0002997452027194928, + "loss": 1.7295, + "step": 320 + }, + { + "epoch": 0.049493813273340834, + "grad_norm": 1.344132661819458, + "learning_rate": 0.0002997009817829027, + "loss": 1.7353, + "step": 330 + }, + { + "epoch": 0.0509936257967754, + "grad_norm": 1.3033422231674194, + "learning_rate": 0.0002996532280563483, + "loss": 1.7043, + "step": 340 + }, + { + "epoch": 0.05249343832020997, + "grad_norm": 1.070493221282959, + "learning_rate": 0.0002996019426667687, + "loss": 1.6626, + "step": 350 + }, + { + "epoch": 0.05399325084364454, + "grad_norm": 1.1506638526916504, + "learning_rate": 0.00029954712682444656, + "loss": 1.652, + "step": 360 + }, + { + "epoch": 0.05549306336707911, + "grad_norm": 1.296350121498108, + "learning_rate": 0.0002994887818229797, + "loss": 1.6325, + "step": 370 + }, + { + "epoch": 0.05699287589051369, + "grad_norm": 1.0449875593185425, + "learning_rate": 0.0002994269090392505, + "loss": 1.6708, + "step": 380 + }, + { + "epoch": 0.05849268841394826, + "grad_norm": 1.1577085256576538, + "learning_rate": 0.00029936150993339325, + "loss": 1.6341, + "step": 390 + }, + { + "epoch": 0.05999250093738283, + "grad_norm": 1.0227168798446655, + "learning_rate": 0.0002992925860487599, + "loss": 1.6497, + "step": 400 + }, + { + "epoch": 0.0614923134608174, + "grad_norm": 1.073912262916565, + "learning_rate": 0.0002992201390118837, + "loss": 1.6289, + "step": 410 + }, + { + "epoch": 0.06299212598425197, + "grad_norm": 1.1426217555999756, + "learning_rate": 0.00029914417053244054, + "loss": 1.6277, + "step": 420 + }, + { + "epoch": 0.06449193850768654, + "grad_norm": 1.0212661027908325, + "learning_rate": 0.00029906468240320874, + "loss": 1.6146, + "step": 430 + }, + { + "epoch": 0.06599175103112111, + "grad_norm": 1.0049968957901, + "learning_rate": 0.00029898167650002676, + "loss": 1.6091, + "step": 440 + }, + { + "epoch": 0.06749156355455568, + "grad_norm": 0.9726770520210266, + "learning_rate": 0.0002988951547817491, + "loss": 1.5967, + "step": 450 + 
}, + { + "epoch": 0.06899137607799025, + "grad_norm": 1.0028915405273438, + "learning_rate": 0.00029880511929019965, + "loss": 1.6033, + "step": 460 + }, + { + "epoch": 0.07049118860142482, + "grad_norm": 1.164440631866455, + "learning_rate": 0.0002987115721501239, + "loss": 1.5866, + "step": 470 + }, + { + "epoch": 0.0719910011248594, + "grad_norm": 1.1381382942199707, + "learning_rate": 0.00029861451556913865, + "loss": 1.5928, + "step": 480 + }, + { + "epoch": 0.07349081364829396, + "grad_norm": 1.057630181312561, + "learning_rate": 0.00029851395183767983, + "loss": 1.579, + "step": 490 + }, + { + "epoch": 0.07499062617172854, + "grad_norm": 1.0061906576156616, + "learning_rate": 0.00029840988332894864, + "loss": 1.5746, + "step": 500 + }, + { + "epoch": 0.0764904386951631, + "grad_norm": 1.0443668365478516, + "learning_rate": 0.00029830231249885537, + "loss": 1.5546, + "step": 510 + }, + { + "epoch": 0.07799025121859768, + "grad_norm": 1.1543552875518799, + "learning_rate": 0.00029819124188596146, + "loss": 1.553, + "step": 520 + }, + { + "epoch": 0.07949006374203224, + "grad_norm": 0.997122049331665, + "learning_rate": 0.00029807667411141977, + "loss": 1.5639, + "step": 530 + }, + { + "epoch": 0.08098987626546682, + "grad_norm": 1.0399993658065796, + "learning_rate": 0.0002979586118789125, + "loss": 1.5401, + "step": 540 + }, + { + "epoch": 0.0824896887889014, + "grad_norm": 1.0152385234832764, + "learning_rate": 0.0002978370579745876, + "loss": 1.5372, + "step": 550 + }, + { + "epoch": 0.08398950131233596, + "grad_norm": 0.9546723961830139, + "learning_rate": 0.00029771201526699264, + "loss": 1.528, + "step": 560 + }, + { + "epoch": 0.08548931383577053, + "grad_norm": 1.0119845867156982, + "learning_rate": 0.0002975834867070077, + "loss": 1.518, + "step": 570 + }, + { + "epoch": 0.0869891263592051, + "grad_norm": 1.063570261001587, + "learning_rate": 0.00029745147532777514, + "loss": 1.5108, + "step": 580 + }, + { + "epoch": 0.08848893888263967, + "grad_norm": 1.0260778665542603, + "learning_rate": 0.0002973159842446285, + "loss": 1.5021, + "step": 590 + }, + { + "epoch": 0.08998875140607424, + "grad_norm": 0.9610818028450012, + "learning_rate": 0.00029717701665501865, + "loss": 1.516, + "step": 600 + }, + { + "epoch": 0.09148856392950881, + "grad_norm": 1.0446592569351196, + "learning_rate": 0.00029703457583843846, + "loss": 1.5103, + "step": 610 + }, + { + "epoch": 0.09298837645294339, + "grad_norm": 0.9651235342025757, + "learning_rate": 0.00029688866515634546, + "loss": 1.5173, + "step": 620 + }, + { + "epoch": 0.09448818897637795, + "grad_norm": 0.9919349551200867, + "learning_rate": 0.00029673928805208237, + "loss": 1.5078, + "step": 630 + }, + { + "epoch": 0.09598800149981253, + "grad_norm": 0.967276394367218, + "learning_rate": 0.00029658644805079606, + "loss": 1.5167, + "step": 640 + }, + { + "epoch": 0.09748781402324709, + "grad_norm": 1.0425859689712524, + "learning_rate": 0.00029643014875935404, + "loss": 1.5134, + "step": 650 + }, + { + "epoch": 0.09898762654668167, + "grad_norm": 0.9938341975212097, + "learning_rate": 0.00029627039386625976, + "loss": 1.4941, + "step": 660 + }, + { + "epoch": 0.10003749531308587, + "eval_loss": 1.5367733240127563, + "eval_runtime": 35.5683, + "eval_samples_per_second": 702.874, + "eval_steps_per_second": 87.859, + "step": 667 + }, + { + "epoch": 0.10048743907011623, + "grad_norm": 1.0440524816513062, + "learning_rate": 0.0002961071871415651, + "loss": 1.474, + "step": 670 + }, + { + "epoch": 0.1019872515935508, + "grad_norm": 
0.9019431471824646, + "learning_rate": 0.00029594053243678175, + "loss": 1.5061, + "step": 680 + }, + { + "epoch": 0.10348706411698538, + "grad_norm": 1.0530225038528442, + "learning_rate": 0.00029577043368479017, + "loss": 1.4618, + "step": 690 + }, + { + "epoch": 0.10498687664041995, + "grad_norm": 0.9635890126228333, + "learning_rate": 0.0002955968948997469, + "loss": 1.4822, + "step": 700 + }, + { + "epoch": 0.10648668916385452, + "grad_norm": 0.97013920545578, + "learning_rate": 0.00029541992017698956, + "loss": 1.4458, + "step": 710 + }, + { + "epoch": 0.10798650168728909, + "grad_norm": 1.0321599245071411, + "learning_rate": 0.0002952395136929406, + "loss": 1.4708, + "step": 720 + }, + { + "epoch": 0.10948631421072366, + "grad_norm": 0.8976233601570129, + "learning_rate": 0.00029505567970500833, + "loss": 1.4572, + "step": 730 + }, + { + "epoch": 0.11098612673415822, + "grad_norm": 0.9593002796173096, + "learning_rate": 0.0002948684225514868, + "loss": 1.4506, + "step": 740 + }, + { + "epoch": 0.1124859392575928, + "grad_norm": 0.941360354423523, + "learning_rate": 0.0002946777466514531, + "loss": 1.4683, + "step": 750 + }, + { + "epoch": 0.11398575178102738, + "grad_norm": 1.0240117311477661, + "learning_rate": 0.00029448365650466336, + "loss": 1.4679, + "step": 760 + }, + { + "epoch": 0.11548556430446194, + "grad_norm": 1.041799545288086, + "learning_rate": 0.0002942861566914465, + "loss": 1.4544, + "step": 770 + }, + { + "epoch": 0.11698537682789652, + "grad_norm": 0.9296654462814331, + "learning_rate": 0.0002940852518725959, + "loss": 1.4473, + "step": 780 + }, + { + "epoch": 0.11848518935133108, + "grad_norm": 0.9188225269317627, + "learning_rate": 0.0002938809467892596, + "loss": 1.4461, + "step": 790 + }, + { + "epoch": 0.11998500187476566, + "grad_norm": 0.9431837201118469, + "learning_rate": 0.0002936732462628287, + "loss": 1.4413, + "step": 800 + }, + { + "epoch": 0.12148481439820022, + "grad_norm": 0.9874680042266846, + "learning_rate": 0.0002934621551948229, + "loss": 1.4435, + "step": 810 + }, + { + "epoch": 0.1229846269216348, + "grad_norm": 0.9407353401184082, + "learning_rate": 0.0002932476785667754, + "loss": 1.4256, + "step": 820 + }, + { + "epoch": 0.12448443944506937, + "grad_norm": 0.9208086729049683, + "learning_rate": 0.00029302982144011514, + "loss": 1.4556, + "step": 830 + }, + { + "epoch": 0.12598425196850394, + "grad_norm": 0.8780565857887268, + "learning_rate": 0.00029280858895604727, + "loss": 1.4389, + "step": 840 + }, + { + "epoch": 0.1274840644919385, + "grad_norm": 0.9499910473823547, + "learning_rate": 0.0002925839863354322, + "loss": 1.4312, + "step": 850 + }, + { + "epoch": 0.1289838770153731, + "grad_norm": 0.9616991281509399, + "learning_rate": 0.00029235601887866167, + "loss": 1.4203, + "step": 860 + }, + { + "epoch": 0.13048368953880765, + "grad_norm": 0.8856968283653259, + "learning_rate": 0.00029212469196553456, + "loss": 1.4192, + "step": 870 + }, + { + "epoch": 0.13198350206224221, + "grad_norm": 0.9547106027603149, + "learning_rate": 0.00029189001105512914, + "loss": 1.4346, + "step": 880 + }, + { + "epoch": 0.13348331458567678, + "grad_norm": 0.9112891554832458, + "learning_rate": 0.0002916519816856748, + "loss": 1.4377, + "step": 890 + }, + { + "epoch": 0.13498312710911137, + "grad_norm": 0.9346863031387329, + "learning_rate": 0.000291410609474421, + "loss": 1.4468, + "step": 900 + }, + { + "epoch": 0.13648293963254593, + "grad_norm": 0.9371203184127808, + "learning_rate": 0.0002911659001175049, + "loss": 1.4067, + "step": 910 + }, + { 
+ "epoch": 0.1379827521559805, + "grad_norm": 0.8880829811096191, + "learning_rate": 0.000290917859389817, + "loss": 1.4319, + "step": 920 + }, + { + "epoch": 0.13948256467941508, + "grad_norm": 1.0249537229537964, + "learning_rate": 0.0002906664931448645, + "loss": 1.4336, + "step": 930 + }, + { + "epoch": 0.14098237720284965, + "grad_norm": 0.8886300325393677, + "learning_rate": 0.00029041180731463357, + "loss": 1.4253, + "step": 940 + }, + { + "epoch": 0.1424821897262842, + "grad_norm": 0.9199953079223633, + "learning_rate": 0.00029015380790944916, + "loss": 1.4305, + "step": 950 + }, + { + "epoch": 0.1439820022497188, + "grad_norm": 0.9134594202041626, + "learning_rate": 0.0002898925010178332, + "loss": 1.4137, + "step": 960 + }, + { + "epoch": 0.14548181477315336, + "grad_norm": 0.9190260767936707, + "learning_rate": 0.00028962789280636083, + "loss": 1.4079, + "step": 970 + }, + { + "epoch": 0.14698162729658792, + "grad_norm": 0.9165851473808289, + "learning_rate": 0.00028935998951951515, + "loss": 1.4166, + "step": 980 + }, + { + "epoch": 0.1484814398200225, + "grad_norm": 0.9126707315444946, + "learning_rate": 0.00028908879747953955, + "loss": 1.4025, + "step": 990 + }, + { + "epoch": 0.14998125234345708, + "grad_norm": 0.9610698819160461, + "learning_rate": 0.00028881432308628855, + "loss": 1.3994, + "step": 1000 + }, + { + "epoch": 0.15148106486689164, + "grad_norm": 0.9160841703414917, + "learning_rate": 0.00028853657281707696, + "loss": 1.4105, + "step": 1010 + }, + { + "epoch": 0.1529808773903262, + "grad_norm": 0.9419758319854736, + "learning_rate": 0.0002882555532265269, + "loss": 1.4107, + "step": 1020 + }, + { + "epoch": 0.1544806899137608, + "grad_norm": 0.8920834064483643, + "learning_rate": 0.0002879712709464131, + "loss": 1.4076, + "step": 1030 + }, + { + "epoch": 0.15598050243719536, + "grad_norm": 0.88045734167099, + "learning_rate": 0.0002876837326855064, + "loss": 1.3878, + "step": 1040 + }, + { + "epoch": 0.15748031496062992, + "grad_norm": 0.8166473507881165, + "learning_rate": 0.00028739294522941555, + "loss": 1.3883, + "step": 1050 + }, + { + "epoch": 0.15898012748406448, + "grad_norm": 0.9151524901390076, + "learning_rate": 0.00028709891544042687, + "loss": 1.3817, + "step": 1060 + }, + { + "epoch": 0.16047994000749907, + "grad_norm": 0.9363940954208374, + "learning_rate": 0.0002868016502573425, + "loss": 1.3969, + "step": 1070 + }, + { + "epoch": 0.16197975253093364, + "grad_norm": 0.8952043056488037, + "learning_rate": 0.00028650115669531654, + "loss": 1.3784, + "step": 1080 + }, + { + "epoch": 0.1634795650543682, + "grad_norm": 0.89581698179245, + "learning_rate": 0.00028619744184568946, + "loss": 1.3764, + "step": 1090 + }, + { + "epoch": 0.1649793775778028, + "grad_norm": 0.856798529624939, + "learning_rate": 0.00028589051287582093, + "loss": 1.3826, + "step": 1100 + }, + { + "epoch": 0.16647919010123735, + "grad_norm": 0.9560316801071167, + "learning_rate": 0.0002855803770289206, + "loss": 1.3924, + "step": 1110 + }, + { + "epoch": 0.1679790026246719, + "grad_norm": 0.9139536619186401, + "learning_rate": 0.0002852670416238769, + "loss": 1.3655, + "step": 1120 + }, + { + "epoch": 0.16947881514810648, + "grad_norm": 0.8980649709701538, + "learning_rate": 0.0002849505140550848, + "loss": 1.3826, + "step": 1130 + }, + { + "epoch": 0.17097862767154107, + "grad_norm": 0.8634670376777649, + "learning_rate": 0.00028463080179227105, + "loss": 1.3837, + "step": 1140 + }, + { + "epoch": 0.17247844019497563, + "grad_norm": 0.9725201725959778, + "learning_rate": 
0.00028430791238031775, + "loss": 1.4023, + "step": 1150 + }, + { + "epoch": 0.1739782527184102, + "grad_norm": 0.7960573434829712, + "learning_rate": 0.00028398185343908464, + "loss": 1.3784, + "step": 1160 + }, + { + "epoch": 0.17547806524184478, + "grad_norm": 0.866399884223938, + "learning_rate": 0.000283652632663229, + "loss": 1.3919, + "step": 1170 + }, + { + "epoch": 0.17697787776527935, + "grad_norm": 0.8062925934791565, + "learning_rate": 0.0002833202578220242, + "loss": 1.3724, + "step": 1180 + }, + { + "epoch": 0.1784776902887139, + "grad_norm": 0.8660501837730408, + "learning_rate": 0.0002829847367591764, + "loss": 1.3948, + "step": 1190 + }, + { + "epoch": 0.17997750281214847, + "grad_norm": 0.8984708189964294, + "learning_rate": 0.0002826460773926393, + "loss": 1.3664, + "step": 1200 + }, + { + "epoch": 0.18147731533558306, + "grad_norm": 0.8522405624389648, + "learning_rate": 0.00028230428771442725, + "loss": 1.3623, + "step": 1210 + }, + { + "epoch": 0.18297712785901762, + "grad_norm": 0.8871493935585022, + "learning_rate": 0.000281959375790427, + "loss": 1.3609, + "step": 1220 + }, + { + "epoch": 0.1844769403824522, + "grad_norm": 0.9038013815879822, + "learning_rate": 0.0002816113497602069, + "loss": 1.3708, + "step": 1230 + }, + { + "epoch": 0.18597675290588678, + "grad_norm": 0.8472639918327332, + "learning_rate": 0.0002812602178368251, + "loss": 1.3683, + "step": 1240 + }, + { + "epoch": 0.18747656542932134, + "grad_norm": 0.8597409725189209, + "learning_rate": 0.00028090598830663566, + "loss": 1.3672, + "step": 1250 + }, + { + "epoch": 0.1889763779527559, + "grad_norm": 0.8871280550956726, + "learning_rate": 0.00028054866952909296, + "loss": 1.3935, + "step": 1260 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.8564477562904358, + "learning_rate": 0.00028018826993655445, + "loss": 1.3611, + "step": 1270 + }, + { + "epoch": 0.19197600299962506, + "grad_norm": 0.8285048007965088, + "learning_rate": 0.00027982479803408166, + "loss": 1.3513, + "step": 1280 + }, + { + "epoch": 0.19347581552305962, + "grad_norm": 0.8391554355621338, + "learning_rate": 0.00027945826239923955, + "loss": 1.3692, + "step": 1290 + }, + { + "epoch": 0.19497562804649418, + "grad_norm": 0.9388437271118164, + "learning_rate": 0.000279088671681894, + "loss": 1.3611, + "step": 1300 + }, + { + "epoch": 0.19647544056992877, + "grad_norm": 0.9204142093658447, + "learning_rate": 0.0002787160346040076, + "loss": 1.3392, + "step": 1310 + }, + { + "epoch": 0.19797525309336333, + "grad_norm": 0.9090197086334229, + "learning_rate": 0.00027834035995943413, + "loss": 1.3555, + "step": 1320 + }, + { + "epoch": 0.1994750656167979, + "grad_norm": 0.8415820002555847, + "learning_rate": 0.00027796165661371074, + "loss": 1.3371, + "step": 1330 + }, + { + "epoch": 0.20007499062617173, + "eval_loss": 1.3978526592254639, + "eval_runtime": 35.0057, + "eval_samples_per_second": 714.169, + "eval_steps_per_second": 89.271, + "step": 1334 + }, + { + "epoch": 0.20097487814023246, + "grad_norm": 0.8407223224639893, + "learning_rate": 0.00027757993350384873, + "loss": 1.3453, + "step": 1340 + }, + { + "epoch": 0.20247469066366705, + "grad_norm": 0.8398892879486084, + "learning_rate": 0.00027719519963812286, + "loss": 1.3543, + "step": 1350 + }, + { + "epoch": 0.2039745031871016, + "grad_norm": 0.8169878721237183, + "learning_rate": 0.00027680746409585865, + "loss": 1.3542, + "step": 1360 + }, + { + "epoch": 0.20547431571053618, + "grad_norm": 0.8487715721130371, + "learning_rate": 0.00027641673602721805, + "loss": 
1.3308, + "step": 1370 + }, + { + "epoch": 0.20697412823397077, + "grad_norm": 0.9306272268295288, + "learning_rate": 0.00027602302465298367, + "loss": 1.3381, + "step": 1380 + }, + { + "epoch": 0.20847394075740533, + "grad_norm": 0.8499324321746826, + "learning_rate": 0.0002756263392643409, + "loss": 1.3371, + "step": 1390 + }, + { + "epoch": 0.2099737532808399, + "grad_norm": 0.7504796385765076, + "learning_rate": 0.0002752266892226591, + "loss": 1.3359, + "step": 1400 + }, + { + "epoch": 0.21147356580427445, + "grad_norm": 0.8636272549629211, + "learning_rate": 0.0002748240839592701, + "loss": 1.3378, + "step": 1410 + }, + { + "epoch": 0.21297337832770905, + "grad_norm": 0.9339527487754822, + "learning_rate": 0.00027441853297524615, + "loss": 1.3743, + "step": 1420 + }, + { + "epoch": 0.2144731908511436, + "grad_norm": 0.9132825136184692, + "learning_rate": 0.00027401004584117535, + "loss": 1.3427, + "step": 1430 + }, + { + "epoch": 0.21597300337457817, + "grad_norm": 0.8461547493934631, + "learning_rate": 0.00027359863219693614, + "loss": 1.3338, + "step": 1440 + }, + { + "epoch": 0.21747281589801276, + "grad_norm": 0.8285521268844604, + "learning_rate": 0.00027318430175146934, + "loss": 1.3419, + "step": 1450 + }, + { + "epoch": 0.21897262842144732, + "grad_norm": 0.9066304564476013, + "learning_rate": 0.00027276706428254965, + "loss": 1.344, + "step": 1460 + }, + { + "epoch": 0.2204724409448819, + "grad_norm": 0.9125919342041016, + "learning_rate": 0.00027234692963655407, + "loss": 1.3395, + "step": 1470 + }, + { + "epoch": 0.22197225346831645, + "grad_norm": 0.8508041501045227, + "learning_rate": 0.00027192390772823045, + "loss": 1.3419, + "step": 1480 + }, + { + "epoch": 0.22347206599175104, + "grad_norm": 0.9331530928611755, + "learning_rate": 0.00027149800854046283, + "loss": 1.335, + "step": 1490 + }, + { + "epoch": 0.2249718785151856, + "grad_norm": 0.9175031781196594, + "learning_rate": 0.0002710692421240362, + "loss": 1.3341, + "step": 1500 + }, + { + "epoch": 0.22647169103862017, + "grad_norm": 0.8572413921356201, + "learning_rate": 0.0002706376185973991, + "loss": 1.3411, + "step": 1510 + }, + { + "epoch": 0.22797150356205476, + "grad_norm": 0.8543765544891357, + "learning_rate": 0.0002702031481464252, + "loss": 1.3164, + "step": 1520 + }, + { + "epoch": 0.22947131608548932, + "grad_norm": 0.8498407006263733, + "learning_rate": 0.00026976584102417233, + "loss": 1.3411, + "step": 1530 + }, + { + "epoch": 0.23097112860892388, + "grad_norm": 0.8382455110549927, + "learning_rate": 0.0002693257075506411, + "loss": 1.3418, + "step": 1540 + }, + { + "epoch": 0.23247094113235844, + "grad_norm": 0.8555087447166443, + "learning_rate": 0.00026888275811253105, + "loss": 1.3438, + "step": 1550 + }, + { + "epoch": 0.23397075365579303, + "grad_norm": 0.8968890309333801, + "learning_rate": 0.00026843700316299564, + "loss": 1.3292, + "step": 1560 + }, + { + "epoch": 0.2354705661792276, + "grad_norm": 0.8054748773574829, + "learning_rate": 0.0002679884532213954, + "loss": 1.3019, + "step": 1570 + }, + { + "epoch": 0.23697037870266216, + "grad_norm": 0.8258795738220215, + "learning_rate": 0.00026753711887304995, + "loss": 1.3379, + "step": 1580 + }, + { + "epoch": 0.23847019122609675, + "grad_norm": 0.9339794516563416, + "learning_rate": 0.000267083010768988, + "loss": 1.3397, + "step": 1590 + }, + { + "epoch": 0.2399700037495313, + "grad_norm": 0.8125893473625183, + "learning_rate": 0.0002666261396256961, + "loss": 1.319, + "step": 1600 + }, + { + "epoch": 0.24146981627296588, + "grad_norm": 
0.8743818402290344, + "learning_rate": 0.0002661665162248656, + "loss": 1.3271, + "step": 1610 + }, + { + "epoch": 0.24296962879640044, + "grad_norm": 0.8262471556663513, + "learning_rate": 0.0002657041514131385, + "loss": 1.3342, + "step": 1620 + }, + { + "epoch": 0.24446944131983503, + "grad_norm": 0.8480871319770813, + "learning_rate": 0.000265239056101851, + "loss": 1.3228, + "step": 1630 + }, + { + "epoch": 0.2459692538432696, + "grad_norm": 0.8479325771331787, + "learning_rate": 0.0002647712412667765, + "loss": 1.3152, + "step": 1640 + }, + { + "epoch": 0.24746906636670415, + "grad_norm": 0.8265785574913025, + "learning_rate": 0.00026430071794786644, + "loss": 1.3234, + "step": 1650 + }, + { + "epoch": 0.24896887889013875, + "grad_norm": 0.8676069974899292, + "learning_rate": 0.00026382749724898955, + "loss": 1.2985, + "step": 1660 + }, + { + "epoch": 0.2504686914135733, + "grad_norm": 0.8200732469558716, + "learning_rate": 0.00026335159033766996, + "loss": 1.3306, + "step": 1670 + }, + { + "epoch": 0.25196850393700787, + "grad_norm": 0.8174329996109009, + "learning_rate": 0.0002628730084448239, + "loss": 1.3251, + "step": 1680 + }, + { + "epoch": 0.25346831646044243, + "grad_norm": 0.8221441507339478, + "learning_rate": 0.000262391762864494, + "loss": 1.3298, + "step": 1690 + }, + { + "epoch": 0.254968128983877, + "grad_norm": 0.8342576026916504, + "learning_rate": 0.00026190786495358366, + "loss": 1.3139, + "step": 1700 + }, + { + "epoch": 0.25646794150731156, + "grad_norm": 0.8533841967582703, + "learning_rate": 0.0002614213261315883, + "loss": 1.3109, + "step": 1710 + }, + { + "epoch": 0.2579677540307462, + "grad_norm": 0.8298860788345337, + "learning_rate": 0.0002609321578803261, + "loss": 1.3212, + "step": 1720 + }, + { + "epoch": 0.25946756655418074, + "grad_norm": 0.8570966720581055, + "learning_rate": 0.00026044037174366734, + "loss": 1.306, + "step": 1730 + }, + { + "epoch": 0.2609673790776153, + "grad_norm": 0.8347874879837036, + "learning_rate": 0.00025994597932726135, + "loss": 1.3215, + "step": 1740 + }, + { + "epoch": 0.26246719160104987, + "grad_norm": 0.8482568860054016, + "learning_rate": 0.0002594489922982633, + "loss": 1.3244, + "step": 1750 + }, + { + "epoch": 0.26396700412448443, + "grad_norm": 0.7775010466575623, + "learning_rate": 0.0002589494223850584, + "loss": 1.2984, + "step": 1760 + }, + { + "epoch": 0.265466816647919, + "grad_norm": 0.8049771189689636, + "learning_rate": 0.00025844728137698543, + "loss": 1.33, + "step": 1770 + }, + { + "epoch": 0.26696662917135355, + "grad_norm": 0.8238893747329712, + "learning_rate": 0.0002579425811240582, + "loss": 1.3175, + "step": 1780 + }, + { + "epoch": 0.26846644169478817, + "grad_norm": 0.8227720260620117, + "learning_rate": 0.00025743533353668626, + "loss": 1.3069, + "step": 1790 + }, + { + "epoch": 0.26996625421822273, + "grad_norm": 0.8744603991508484, + "learning_rate": 0.0002569255505853934, + "loss": 1.3132, + "step": 1800 + }, + { + "epoch": 0.2714660667416573, + "grad_norm": 0.8706551790237427, + "learning_rate": 0.0002564132443005356, + "loss": 1.3075, + "step": 1810 + }, + { + "epoch": 0.27296587926509186, + "grad_norm": 0.8768461346626282, + "learning_rate": 0.00025589842677201693, + "loss": 1.3012, + "step": 1820 + }, + { + "epoch": 0.2744656917885264, + "grad_norm": 0.8715064525604248, + "learning_rate": 0.0002553811101490042, + "loss": 1.303, + "step": 1830 + }, + { + "epoch": 0.275965504311961, + "grad_norm": 0.8031432628631592, + "learning_rate": 0.00025486130663964016, + "loss": 1.3038, + "step": 
1840 + }, + { + "epoch": 0.27746531683539555, + "grad_norm": 0.78215491771698, + "learning_rate": 0.00025433902851075584, + "loss": 1.31, + "step": 1850 + }, + { + "epoch": 0.27896512935883017, + "grad_norm": 0.8224983215332031, + "learning_rate": 0.0002538142880875805, + "loss": 1.2931, + "step": 1860 + }, + { + "epoch": 0.28046494188226473, + "grad_norm": 0.9267176389694214, + "learning_rate": 0.00025328709775345105, + "loss": 1.3136, + "step": 1870 + }, + { + "epoch": 0.2819647544056993, + "grad_norm": 0.7887846827507019, + "learning_rate": 0.0002527574699495199, + "loss": 1.3086, + "step": 1880 + }, + { + "epoch": 0.28346456692913385, + "grad_norm": 0.867173969745636, + "learning_rate": 0.00025222541717446117, + "loss": 1.3029, + "step": 1890 + }, + { + "epoch": 0.2849643794525684, + "grad_norm": 0.8227114677429199, + "learning_rate": 0.00025169095198417584, + "loss": 1.2938, + "step": 1900 + }, + { + "epoch": 0.286464191976003, + "grad_norm": 0.816207766532898, + "learning_rate": 0.00025115408699149546, + "loss": 1.3115, + "step": 1910 + }, + { + "epoch": 0.2879640044994376, + "grad_norm": 0.91056227684021, + "learning_rate": 0.00025061483486588435, + "loss": 1.3171, + "step": 1920 + }, + { + "epoch": 0.28946381702287216, + "grad_norm": 0.7912834882736206, + "learning_rate": 0.00025007320833314085, + "loss": 1.2853, + "step": 1930 + }, + { + "epoch": 0.2909636295463067, + "grad_norm": 0.7663973569869995, + "learning_rate": 0.00024952922017509687, + "loss": 1.3014, + "step": 1940 + }, + { + "epoch": 0.2924634420697413, + "grad_norm": 0.7914089560508728, + "learning_rate": 0.00024898288322931615, + "loss": 1.2922, + "step": 1950 + }, + { + "epoch": 0.29396325459317585, + "grad_norm": 0.8117234706878662, + "learning_rate": 0.00024843421038879147, + "loss": 1.2953, + "step": 1960 + }, + { + "epoch": 0.2954630671166104, + "grad_norm": 0.8552126288414001, + "learning_rate": 0.0002478832146016404, + "loss": 1.3, + "step": 1970 + }, + { + "epoch": 0.296962879640045, + "grad_norm": 0.8411305546760559, + "learning_rate": 0.0002473299088707996, + "loss": 1.2945, + "step": 1980 + }, + { + "epoch": 0.2984626921634796, + "grad_norm": 0.8959758281707764, + "learning_rate": 0.00024677430625371803, + "loss": 1.2945, + "step": 1990 + }, + { + "epoch": 0.29996250468691416, + "grad_norm": 0.8792350888252258, + "learning_rate": 0.0002462164198620489, + "loss": 1.3009, + "step": 2000 + }, + { + "epoch": 0.3001124859392576, + "eval_loss": 1.3354628086090088, + "eval_runtime": 34.4241, + "eval_samples_per_second": 726.236, + "eval_steps_per_second": 90.78, + "step": 2001 + }, + { + "epoch": 0.3014623172103487, + "grad_norm": 0.8166361451148987, + "learning_rate": 0.00024565626286134003, + "loss": 1.2829, + "step": 2010 + }, + { + "epoch": 0.3029621297337833, + "grad_norm": 0.8526578545570374, + "learning_rate": 0.0002450938484707234, + "loss": 1.2799, + "step": 2020 + }, + { + "epoch": 0.30446194225721784, + "grad_norm": 0.8354383707046509, + "learning_rate": 0.0002445291899626031, + "loss": 1.2828, + "step": 2030 + }, + { + "epoch": 0.3059617547806524, + "grad_norm": 0.8629779815673828, + "learning_rate": 0.000243962300662342, + "loss": 1.3066, + "step": 2040 + }, + { + "epoch": 0.30746156730408697, + "grad_norm": 0.8469873666763306, + "learning_rate": 0.00024339319394794742, + "loss": 1.2998, + "step": 2050 + }, + { + "epoch": 0.3089613798275216, + "grad_norm": 0.8346788883209229, + "learning_rate": 0.00024282188324975534, + "loss": 1.2917, + "step": 2060 + }, + { + "epoch": 0.31046119235095615, + 
"grad_norm": 0.8433026671409607, + "learning_rate": 0.0002422483820501136, + "loss": 1.2878, + "step": 2070 + }, + { + "epoch": 0.3119610048743907, + "grad_norm": 0.8289304375648499, + "learning_rate": 0.00024167270388306366, + "loss": 1.2865, + "step": 2080 + }, + { + "epoch": 0.3134608173978253, + "grad_norm": 0.8034661412239075, + "learning_rate": 0.00024109486233402102, + "loss": 1.2858, + "step": 2090 + }, + { + "epoch": 0.31496062992125984, + "grad_norm": 0.8111044764518738, + "learning_rate": 0.00024051487103945486, + "loss": 1.2693, + "step": 2100 + }, + { + "epoch": 0.3164604424446944, + "grad_norm": 0.823731541633606, + "learning_rate": 0.00023993274368656618, + "loss": 1.2802, + "step": 2110 + }, + { + "epoch": 0.31796025496812896, + "grad_norm": 0.7805794477462769, + "learning_rate": 0.00023934849401296472, + "loss": 1.2961, + "step": 2120 + }, + { + "epoch": 0.3194600674915636, + "grad_norm": 0.8786959648132324, + "learning_rate": 0.0002387621358063449, + "loss": 1.2817, + "step": 2130 + }, + { + "epoch": 0.32095988001499814, + "grad_norm": 0.8563496470451355, + "learning_rate": 0.00023817368290416036, + "loss": 1.2837, + "step": 2140 + }, + { + "epoch": 0.3224596925384327, + "grad_norm": 0.7847155928611755, + "learning_rate": 0.00023758314919329726, + "loss": 1.304, + "step": 2150 + }, + { + "epoch": 0.32395950506186727, + "grad_norm": 0.8151364326477051, + "learning_rate": 0.00023699054860974682, + "loss": 1.2699, + "step": 2160 + }, + { + "epoch": 0.32545931758530183, + "grad_norm": 0.8278871774673462, + "learning_rate": 0.00023639589513827636, + "loss": 1.2706, + "step": 2170 + }, + { + "epoch": 0.3269591301087364, + "grad_norm": 0.8579797148704529, + "learning_rate": 0.0002357992028120993, + "loss": 1.286, + "step": 2180 + }, + { + "epoch": 0.32845894263217096, + "grad_norm": 0.8568677306175232, + "learning_rate": 0.00023520048571254378, + "loss": 1.2753, + "step": 2190 + }, + { + "epoch": 0.3299587551556056, + "grad_norm": 0.8455263376235962, + "learning_rate": 0.00023459975796872063, + "loss": 1.2721, + "step": 2200 + }, + { + "epoch": 0.33145856767904014, + "grad_norm": 0.8218129277229309, + "learning_rate": 0.0002339970337571899, + "loss": 1.2663, + "step": 2210 + }, + { + "epoch": 0.3329583802024747, + "grad_norm": 0.8314395546913147, + "learning_rate": 0.000233392327301626, + "loss": 1.2991, + "step": 2220 + }, + { + "epoch": 0.33445819272590926, + "grad_norm": 0.7503239512443542, + "learning_rate": 0.0002327856528724825, + "loss": 1.2568, + "step": 2230 + }, + { + "epoch": 0.3359580052493438, + "grad_norm": 0.8333805799484253, + "learning_rate": 0.0002321770247866551, + "loss": 1.2844, + "step": 2240 + }, + { + "epoch": 0.3374578177727784, + "grad_norm": 0.7753694653511047, + "learning_rate": 0.00023156645740714368, + "loss": 1.3001, + "step": 2250 + }, + { + "epoch": 0.33895763029621295, + "grad_norm": 0.7813096642494202, + "learning_rate": 0.00023095396514271355, + "loss": 1.2735, + "step": 2260 + }, + { + "epoch": 0.34045744281964757, + "grad_norm": 0.8016805648803711, + "learning_rate": 0.0002303395624475553, + "loss": 1.2938, + "step": 2270 + }, + { + "epoch": 0.34195725534308213, + "grad_norm": 0.7955138087272644, + "learning_rate": 0.00022972326382094378, + "loss": 1.2712, + "step": 2280 + }, + { + "epoch": 0.3434570678665167, + "grad_norm": 0.8395068645477295, + "learning_rate": 0.00022910508380689584, + "loss": 1.2711, + "step": 2290 + }, + { + "epoch": 0.34495688038995126, + "grad_norm": 0.8002228140830994, + "learning_rate": 0.00022848503699382717, + 
"loss": 1.2985, + "step": 2300 + }, + { + "epoch": 0.3464566929133858, + "grad_norm": 0.8872259259223938, + "learning_rate": 0.00022786313801420794, + "loss": 1.2639, + "step": 2310 + }, + { + "epoch": 0.3479565054368204, + "grad_norm": 0.8235191702842712, + "learning_rate": 0.0002272394015442177, + "loss": 1.2903, + "step": 2320 + }, + { + "epoch": 0.34945631796025495, + "grad_norm": 0.8455343246459961, + "learning_rate": 0.0002266138423033987, + "loss": 1.2871, + "step": 2330 + }, + { + "epoch": 0.35095613048368957, + "grad_norm": 0.8051894307136536, + "learning_rate": 0.00022598647505430895, + "loss": 1.2577, + "step": 2340 + }, + { + "epoch": 0.35245594300712413, + "grad_norm": 0.7616756558418274, + "learning_rate": 0.0002253573146021733, + "loss": 1.261, + "step": 2350 + }, + { + "epoch": 0.3539557555305587, + "grad_norm": 0.804066002368927, + "learning_rate": 0.0002247263757945347, + "loss": 1.2954, + "step": 2360 + }, + { + "epoch": 0.35545556805399325, + "grad_norm": 0.810148298740387, + "learning_rate": 0.00022409367352090322, + "loss": 1.2603, + "step": 2370 + }, + { + "epoch": 0.3569553805774278, + "grad_norm": 0.790664553642273, + "learning_rate": 0.00022345922271240496, + "loss": 1.2606, + "step": 2380 + }, + { + "epoch": 0.3584551931008624, + "grad_norm": 0.7945841550827026, + "learning_rate": 0.00022282303834142978, + "loss": 1.2605, + "step": 2390 + }, + { + "epoch": 0.35995500562429694, + "grad_norm": 0.8161887526512146, + "learning_rate": 0.0002221851354212777, + "loss": 1.2536, + "step": 2400 + }, + { + "epoch": 0.36145481814773156, + "grad_norm": 0.7882905006408691, + "learning_rate": 0.0002215455290058048, + "loss": 1.284, + "step": 2410 + }, + { + "epoch": 0.3629546306711661, + "grad_norm": 0.8553168773651123, + "learning_rate": 0.000220904234189068, + "loss": 1.2549, + "step": 2420 + }, + { + "epoch": 0.3644544431946007, + "grad_norm": 0.8243351578712463, + "learning_rate": 0.00022026126610496852, + "loss": 1.2526, + "step": 2430 + }, + { + "epoch": 0.36595425571803525, + "grad_norm": 0.8284412622451782, + "learning_rate": 0.0002196166399268952, + "loss": 1.267, + "step": 2440 + }, + { + "epoch": 0.3674540682414698, + "grad_norm": 0.7850016951560974, + "learning_rate": 0.00021897037086736614, + "loss": 1.2648, + "step": 2450 + }, + { + "epoch": 0.3689538807649044, + "grad_norm": 0.8068733811378479, + "learning_rate": 0.0002183224741776697, + "loss": 1.2653, + "step": 2460 + }, + { + "epoch": 0.37045369328833894, + "grad_norm": 0.7865297794342041, + "learning_rate": 0.00021767296514750472, + "loss": 1.2629, + "step": 2470 + }, + { + "epoch": 0.37195350581177355, + "grad_norm": 0.7744786739349365, + "learning_rate": 0.00021702185910461958, + "loss": 1.2597, + "step": 2480 + }, + { + "epoch": 0.3734533183352081, + "grad_norm": 0.8059437870979309, + "learning_rate": 0.00021636917141445056, + "loss": 1.2405, + "step": 2490 + }, + { + "epoch": 0.3749531308586427, + "grad_norm": 0.7744713425636292, + "learning_rate": 0.00021571491747975917, + "loss": 1.2559, + "step": 2500 + }, + { + "epoch": 0.37645294338207724, + "grad_norm": 0.817789614200592, + "learning_rate": 0.0002150591127402687, + "loss": 1.2444, + "step": 2510 + }, + { + "epoch": 0.3779527559055118, + "grad_norm": 0.8189598321914673, + "learning_rate": 0.00021440177267229984, + "loss": 1.2518, + "step": 2520 + }, + { + "epoch": 0.37945256842894637, + "grad_norm": 0.8474273681640625, + "learning_rate": 0.00021374291278840546, + "loss": 1.2634, + "step": 2530 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 
0.8714548349380493, + "learning_rate": 0.00021308254863700452, + "loss": 1.2498, + "step": 2540 + }, + { + "epoch": 0.38245219347581555, + "grad_norm": 0.7918749451637268, + "learning_rate": 0.00021242069580201524, + "loss": 1.2659, + "step": 2550 + }, + { + "epoch": 0.3839520059992501, + "grad_norm": 0.7831052541732788, + "learning_rate": 0.00021175736990248714, + "loss": 1.2748, + "step": 2560 + }, + { + "epoch": 0.3854518185226847, + "grad_norm": 0.770596981048584, + "learning_rate": 0.00021109258659223254, + "loss": 1.2581, + "step": 2570 + }, + { + "epoch": 0.38695163104611924, + "grad_norm": 0.8102443218231201, + "learning_rate": 0.00021042636155945723, + "loss": 1.2365, + "step": 2580 + }, + { + "epoch": 0.3884514435695538, + "grad_norm": 0.7992528080940247, + "learning_rate": 0.00020975871052639024, + "loss": 1.2617, + "step": 2590 + }, + { + "epoch": 0.38995125609298836, + "grad_norm": 0.8378466367721558, + "learning_rate": 0.00020908964924891256, + "loss": 1.2352, + "step": 2600 + }, + { + "epoch": 0.3914510686164229, + "grad_norm": 0.7832515835762024, + "learning_rate": 0.0002084191935161857, + "loss": 1.2733, + "step": 2610 + }, + { + "epoch": 0.39295088113985754, + "grad_norm": 0.7198286056518555, + "learning_rate": 0.0002077473591502788, + "loss": 1.2655, + "step": 2620 + }, + { + "epoch": 0.3944506936632921, + "grad_norm": 0.7759490609169006, + "learning_rate": 0.00020707416200579524, + "loss": 1.2592, + "step": 2630 + }, + { + "epoch": 0.39595050618672667, + "grad_norm": 0.7895490527153015, + "learning_rate": 0.00020639961796949877, + "loss": 1.2495, + "step": 2640 + }, + { + "epoch": 0.39745031871016123, + "grad_norm": 0.8566303849220276, + "learning_rate": 0.00020572374295993822, + "loss": 1.2643, + "step": 2650 + }, + { + "epoch": 0.3989501312335958, + "grad_norm": 0.8570982813835144, + "learning_rate": 0.00020504655292707223, + "loss": 1.2499, + "step": 2660 + }, + { + "epoch": 0.40014998125234347, + "eval_loss": 1.2980915307998657, + "eval_runtime": 34.6825, + "eval_samples_per_second": 720.825, + "eval_steps_per_second": 90.103, + "step": 2668 + }, + { + "epoch": 0.40044994375703036, + "grad_norm": 0.8225275278091431, + "learning_rate": 0.00020436806385189246, + "loss": 1.2649, + "step": 2670 + }, + { + "epoch": 0.4019497562804649, + "grad_norm": 0.8848603367805481, + "learning_rate": 0.00020368829174604667, + "loss": 1.2686, + "step": 2680 + }, + { + "epoch": 0.40344956880389954, + "grad_norm": 0.7693737149238586, + "learning_rate": 0.00020300725265146093, + "loss": 1.2617, + "step": 2690 + }, + { + "epoch": 0.4049493813273341, + "grad_norm": 0.815846860408783, + "learning_rate": 0.00020232496263996092, + "loss": 1.2474, + "step": 2700 + }, + { + "epoch": 0.40644919385076866, + "grad_norm": 0.7660710215568542, + "learning_rate": 0.00020164143781289256, + "loss": 1.246, + "step": 2710 + }, + { + "epoch": 0.4079490063742032, + "grad_norm": 0.8107971549034119, + "learning_rate": 0.00020095669430074235, + "loss": 1.267, + "step": 2720 + }, + { + "epoch": 0.4094488188976378, + "grad_norm": 0.7721084356307983, + "learning_rate": 0.00020027074826275629, + "loss": 1.2613, + "step": 2730 + }, + { + "epoch": 0.41094863142107235, + "grad_norm": 0.7531006336212158, + "learning_rate": 0.00019958361588655888, + "loss": 1.2506, + "step": 2740 + }, + { + "epoch": 0.4124484439445069, + "grad_norm": 0.8231312036514282, + "learning_rate": 0.00019889531338777112, + "loss": 1.26, + "step": 2750 + }, + { + "epoch": 0.41394825646794153, + "grad_norm": 0.8262982368469238, + "learning_rate": 
0.0001982058570096274, + "loss": 1.2426, + "step": 2760 + }, + { + "epoch": 0.4154480689913761, + "grad_norm": 0.7806345820426941, + "learning_rate": 0.00019751526302259271, + "loss": 1.2318, + "step": 2770 + }, + { + "epoch": 0.41694788151481066, + "grad_norm": 0.794988751411438, + "learning_rate": 0.00019682354772397842, + "loss": 1.2336, + "step": 2780 + }, + { + "epoch": 0.4184476940382452, + "grad_norm": 0.7735899686813354, + "learning_rate": 0.00019613072743755755, + "loss": 1.2483, + "step": 2790 + }, + { + "epoch": 0.4199475065616798, + "grad_norm": 0.7777860760688782, + "learning_rate": 0.00019543681851317998, + "loss": 1.2483, + "step": 2800 + }, + { + "epoch": 0.42144731908511435, + "grad_norm": 0.8276849985122681, + "learning_rate": 0.00019474183732638608, + "loss": 1.2464, + "step": 2810 + }, + { + "epoch": 0.4229471316085489, + "grad_norm": 0.7912575602531433, + "learning_rate": 0.0001940458002780206, + "loss": 1.2317, + "step": 2820 + }, + { + "epoch": 0.42444694413198353, + "grad_norm": 0.8454885482788086, + "learning_rate": 0.00019334872379384556, + "loss": 1.2458, + "step": 2830 + }, + { + "epoch": 0.4259467566554181, + "grad_norm": 0.8399609923362732, + "learning_rate": 0.0001926506243241526, + "loss": 1.2405, + "step": 2840 + }, + { + "epoch": 0.42744656917885265, + "grad_norm": 0.7756800651550293, + "learning_rate": 0.00019195151834337473, + "loss": 1.2409, + "step": 2850 + }, + { + "epoch": 0.4289463817022872, + "grad_norm": 0.8564468622207642, + "learning_rate": 0.00019125142234969762, + "loss": 1.2458, + "step": 2860 + }, + { + "epoch": 0.4304461942257218, + "grad_norm": 0.8514673709869385, + "learning_rate": 0.00019055035286467034, + "loss": 1.2414, + "step": 2870 + }, + { + "epoch": 0.43194600674915634, + "grad_norm": 0.8593403100967407, + "learning_rate": 0.00018984832643281513, + "loss": 1.2473, + "step": 2880 + }, + { + "epoch": 0.4334458192725909, + "grad_norm": 0.8827778100967407, + "learning_rate": 0.00018914535962123735, + "loss": 1.2532, + "step": 2890 + }, + { + "epoch": 0.4349456317960255, + "grad_norm": 0.7746068239212036, + "learning_rate": 0.00018844146901923436, + "loss": 1.2368, + "step": 2900 + }, + { + "epoch": 0.4364454443194601, + "grad_norm": 0.823742687702179, + "learning_rate": 0.000187736671237904, + "loss": 1.2326, + "step": 2910 + }, + { + "epoch": 0.43794525684289465, + "grad_norm": 0.7899442911148071, + "learning_rate": 0.0001870309829097526, + "loss": 1.2342, + "step": 2920 + }, + { + "epoch": 0.4394450693663292, + "grad_norm": 0.8022111058235168, + "learning_rate": 0.00018632442068830244, + "loss": 1.2224, + "step": 2930 + }, + { + "epoch": 0.4409448818897638, + "grad_norm": 0.8160791993141174, + "learning_rate": 0.00018561700124769892, + "loss": 1.2268, + "step": 2940 + }, + { + "epoch": 0.44244469441319834, + "grad_norm": 0.8522243499755859, + "learning_rate": 0.0001849087412823168, + "loss": 1.236, + "step": 2950 + }, + { + "epoch": 0.4439445069366329, + "grad_norm": 0.7768913507461548, + "learning_rate": 0.00018419965750636645, + "loss": 1.2368, + "step": 2960 + }, + { + "epoch": 0.4454443194600675, + "grad_norm": 0.907557487487793, + "learning_rate": 0.00018348976665349932, + "loss": 1.2275, + "step": 2970 + }, + { + "epoch": 0.4469441319835021, + "grad_norm": 0.8612179160118103, + "learning_rate": 0.00018277908547641294, + "loss": 1.2352, + "step": 2980 + }, + { + "epoch": 0.44844394450693664, + "grad_norm": 0.832486093044281, + "learning_rate": 0.00018206763074645588, + "loss": 1.2262, + "step": 2990 + }, + { + "epoch": 
0.4499437570303712, + "grad_norm": 0.7332595586776733, + "learning_rate": 0.0001813554192532316, + "loss": 1.2445, + "step": 3000 + }, + { + "epoch": 0.45144356955380577, + "grad_norm": 0.7843475937843323, + "learning_rate": 0.00018064246780420245, + "loss": 1.2453, + "step": 3010 + }, + { + "epoch": 0.45294338207724033, + "grad_norm": 0.8000037670135498, + "learning_rate": 0.000179928793224293, + "loss": 1.2148, + "step": 3020 + }, + { + "epoch": 0.4544431946006749, + "grad_norm": 0.8519952893257141, + "learning_rate": 0.00017921441235549295, + "loss": 1.2369, + "step": 3030 + }, + { + "epoch": 0.4559430071241095, + "grad_norm": 0.8536350131034851, + "learning_rate": 0.00017849934205645967, + "loss": 1.2442, + "step": 3040 + }, + { + "epoch": 0.4574428196475441, + "grad_norm": 0.8602985739707947, + "learning_rate": 0.00017778359920212047, + "loss": 1.2475, + "step": 3050 + }, + { + "epoch": 0.45894263217097864, + "grad_norm": 0.7802624106407166, + "learning_rate": 0.0001770672006832741, + "loss": 1.2341, + "step": 3060 + }, + { + "epoch": 0.4604424446944132, + "grad_norm": 0.791219174861908, + "learning_rate": 0.00017635016340619255, + "loss": 1.2267, + "step": 3070 + }, + { + "epoch": 0.46194225721784776, + "grad_norm": 0.8056305050849915, + "learning_rate": 0.00017563250429222173, + "loss": 1.248, + "step": 3080 + }, + { + "epoch": 0.4634420697412823, + "grad_norm": 0.859767496585846, + "learning_rate": 0.00017491424027738216, + "loss": 1.2495, + "step": 3090 + }, + { + "epoch": 0.4649418822647169, + "grad_norm": 0.8691778182983398, + "learning_rate": 0.0001741953883119696, + "loss": 1.215, + "step": 3100 + }, + { + "epoch": 0.4664416947881515, + "grad_norm": 0.8486020565032959, + "learning_rate": 0.00017347596536015472, + "loss": 1.2339, + "step": 3110 + }, + { + "epoch": 0.46794150731158607, + "grad_norm": 0.798159122467041, + "learning_rate": 0.00017275598839958296, + "loss": 1.2461, + "step": 3120 + }, + { + "epoch": 0.46944131983502063, + "grad_norm": 0.8142710328102112, + "learning_rate": 0.00017203547442097369, + "loss": 1.231, + "step": 3130 + }, + { + "epoch": 0.4709411323584552, + "grad_norm": 0.8455555438995361, + "learning_rate": 0.0001713144404277195, + "loss": 1.2376, + "step": 3140 + }, + { + "epoch": 0.47244094488188976, + "grad_norm": 0.8146346807479858, + "learning_rate": 0.0001705929034354846, + "loss": 1.2204, + "step": 3150 + }, + { + "epoch": 0.4739407574053243, + "grad_norm": 0.8013060688972473, + "learning_rate": 0.0001698708804718037, + "loss": 1.2199, + "step": 3160 + }, + { + "epoch": 0.4754405699287589, + "grad_norm": 0.7504465579986572, + "learning_rate": 0.00016914838857567979, + "loss": 1.2314, + "step": 3170 + }, + { + "epoch": 0.4769403824521935, + "grad_norm": 0.813957691192627, + "learning_rate": 0.00016842544479718215, + "loss": 1.2298, + "step": 3180 + }, + { + "epoch": 0.47844019497562806, + "grad_norm": 0.8849406838417053, + "learning_rate": 0.00016770206619704412, + "loss": 1.2398, + "step": 3190 + }, + { + "epoch": 0.4799400074990626, + "grad_norm": 0.7421510815620422, + "learning_rate": 0.0001669782698462603, + "loss": 1.2274, + "step": 3200 + }, + { + "epoch": 0.4814398200224972, + "grad_norm": 0.8098101019859314, + "learning_rate": 0.00016625407282568394, + "loss": 1.238, + "step": 3210 + }, + { + "epoch": 0.48293963254593175, + "grad_norm": 0.7728010416030884, + "learning_rate": 0.00016552949222562352, + "loss": 1.2467, + "step": 3220 + }, + { + "epoch": 0.4844394450693663, + "grad_norm": 0.8032839298248291, + "learning_rate": 
0.00016480454514543962, + "loss": 1.2288, + "step": 3230 + }, + { + "epoch": 0.4859392575928009, + "grad_norm": 0.7134261727333069, + "learning_rate": 0.00016407924869314144, + "loss": 1.225, + "step": 3240 + }, + { + "epoch": 0.4874390701162355, + "grad_norm": 0.8371984362602234, + "learning_rate": 0.00016335361998498296, + "loss": 1.2015, + "step": 3250 + }, + { + "epoch": 0.48893888263967006, + "grad_norm": 0.8015307188034058, + "learning_rate": 0.00016262767614505912, + "loss": 1.2082, + "step": 3260 + }, + { + "epoch": 0.4904386951631046, + "grad_norm": 0.8056173324584961, + "learning_rate": 0.00016190143430490152, + "loss": 1.2153, + "step": 3270 + }, + { + "epoch": 0.4919385076865392, + "grad_norm": 0.7906458377838135, + "learning_rate": 0.00016117491160307445, + "loss": 1.2337, + "step": 3280 + }, + { + "epoch": 0.49343832020997375, + "grad_norm": 0.8407464027404785, + "learning_rate": 0.00016044812518477007, + "loss": 1.2333, + "step": 3290 + }, + { + "epoch": 0.4949381327334083, + "grad_norm": 0.8177177906036377, + "learning_rate": 0.00015972109220140402, + "loss": 1.216, + "step": 3300 + }, + { + "epoch": 0.49643794525684287, + "grad_norm": 0.77277672290802, + "learning_rate": 0.0001589938298102108, + "loss": 1.2279, + "step": 3310 + }, + { + "epoch": 0.4979377577802775, + "grad_norm": 0.8213476538658142, + "learning_rate": 0.0001582663551738384, + "loss": 1.2272, + "step": 3320 + }, + { + "epoch": 0.49943757030371205, + "grad_norm": 0.8175489902496338, + "learning_rate": 0.00015753868545994378, + "loss": 1.2285, + "step": 3330 + }, + { + "epoch": 0.5001874765654293, + "eval_loss": 1.2690181732177734, + "eval_runtime": 35.2482, + "eval_samples_per_second": 709.256, + "eval_steps_per_second": 88.657, + "step": 3335 + }, + { + "epoch": 0.5009373828271466, + "grad_norm": 0.834984540939331, + "learning_rate": 0.00015681083784078748, + "loss": 1.221, + "step": 3340 + }, + { + "epoch": 0.5024371953505812, + "grad_norm": 0.8637697100639343, + "learning_rate": 0.00015608282949282844, + "loss": 1.2343, + "step": 3350 + }, + { + "epoch": 0.5039370078740157, + "grad_norm": 0.8110714554786682, + "learning_rate": 0.00015535467759631862, + "loss": 1.2341, + "step": 3360 + }, + { + "epoch": 0.5054368203974503, + "grad_norm": 0.8313619494438171, + "learning_rate": 0.00015462639933489753, + "loss": 1.2255, + "step": 3370 + }, + { + "epoch": 0.5069366329208849, + "grad_norm": 0.8548438549041748, + "learning_rate": 0.00015389801189518693, + "loss": 1.2204, + "step": 3380 + }, + { + "epoch": 0.5084364454443194, + "grad_norm": 0.8614264130592346, + "learning_rate": 0.00015316953246638482, + "loss": 1.2182, + "step": 3390 + }, + { + "epoch": 0.509936257967754, + "grad_norm": 0.8315454125404358, + "learning_rate": 0.00015244097823986023, + "loss": 1.2168, + "step": 3400 + }, + { + "epoch": 0.5114360704911886, + "grad_norm": 0.8473772406578064, + "learning_rate": 0.0001517123664087473, + "loss": 1.2329, + "step": 3410 + }, + { + "epoch": 0.5129358830146231, + "grad_norm": 0.8713449835777283, + "learning_rate": 0.00015098371416753963, + "loss": 1.2233, + "step": 3420 + }, + { + "epoch": 0.5144356955380578, + "grad_norm": 0.8293794989585876, + "learning_rate": 0.00015025503871168432, + "loss": 1.2085, + "step": 3430 + }, + { + "epoch": 0.5159355080614924, + "grad_norm": 0.8054031729698181, + "learning_rate": 0.00014952635723717642, + "loss": 1.2443, + "step": 3440 + }, + { + "epoch": 0.5174353205849269, + "grad_norm": 0.8494643568992615, + "learning_rate": 0.000148797686940153, + "loss": 1.2231, + 
"step": 3450 + }, + { + "epoch": 0.5189351331083615, + "grad_norm": 0.8356276154518127, + "learning_rate": 0.0001480690450164873, + "loss": 1.2275, + "step": 3460 + }, + { + "epoch": 0.520434945631796, + "grad_norm": 0.8196589350700378, + "learning_rate": 0.00014734044866138312, + "loss": 1.197, + "step": 3470 + }, + { + "epoch": 0.5219347581552306, + "grad_norm": 0.8253743648529053, + "learning_rate": 0.00014661191506896867, + "loss": 1.2068, + "step": 3480 + }, + { + "epoch": 0.5234345706786652, + "grad_norm": 0.7777687311172485, + "learning_rate": 0.0001458834614318912, + "loss": 1.2072, + "step": 3490 + }, + { + "epoch": 0.5249343832020997, + "grad_norm": 0.771138608455658, + "learning_rate": 0.00014515510494091102, + "loss": 1.2066, + "step": 3500 + }, + { + "epoch": 0.5264341957255343, + "grad_norm": 0.7920674085617065, + "learning_rate": 0.00014442686278449588, + "loss": 1.2078, + "step": 3510 + }, + { + "epoch": 0.5279340082489689, + "grad_norm": 0.8012422919273376, + "learning_rate": 0.00014369875214841548, + "loss": 1.2218, + "step": 3520 + }, + { + "epoch": 0.5294338207724034, + "grad_norm": 0.8012481331825256, + "learning_rate": 0.0001429707902153355, + "loss": 1.229, + "step": 3530 + }, + { + "epoch": 0.530933633295838, + "grad_norm": 0.8194921612739563, + "learning_rate": 0.0001422429941644127, + "loss": 1.2141, + "step": 3540 + }, + { + "epoch": 0.5324334458192725, + "grad_norm": 0.7735179662704468, + "learning_rate": 0.000141515381170889, + "loss": 1.2271, + "step": 3550 + }, + { + "epoch": 0.5339332583427071, + "grad_norm": 0.7934532761573792, + "learning_rate": 0.00014078796840568647, + "loss": 1.2161, + "step": 3560 + }, + { + "epoch": 0.5354330708661418, + "grad_norm": 0.844835102558136, + "learning_rate": 0.0001400607730350018, + "loss": 1.215, + "step": 3570 + }, + { + "epoch": 0.5369328833895763, + "grad_norm": 0.7871401309967041, + "learning_rate": 0.0001393338122199016, + "loss": 1.2248, + "step": 3580 + }, + { + "epoch": 0.5384326959130109, + "grad_norm": 0.7825441956520081, + "learning_rate": 0.00013860710311591713, + "loss": 1.2182, + "step": 3590 + }, + { + "epoch": 0.5399325084364455, + "grad_norm": 0.9266743659973145, + "learning_rate": 0.00013788066287263946, + "loss": 1.2195, + "step": 3600 + }, + { + "epoch": 0.54143232095988, + "grad_norm": 0.8771612048149109, + "learning_rate": 0.00013715450863331495, + "loss": 1.2077, + "step": 3610 + }, + { + "epoch": 0.5429321334833146, + "grad_norm": 0.8083598017692566, + "learning_rate": 0.00013642865753444043, + "loss": 1.2095, + "step": 3620 + }, + { + "epoch": 0.5444319460067492, + "grad_norm": 0.8372387886047363, + "learning_rate": 0.000135703126705359, + "loss": 1.1981, + "step": 3630 + }, + { + "epoch": 0.5459317585301837, + "grad_norm": 0.791763424873352, + "learning_rate": 0.00013497793326785573, + "loss": 1.2181, + "step": 3640 + }, + { + "epoch": 0.5474315710536183, + "grad_norm": 0.784136950969696, + "learning_rate": 0.00013425309433575365, + "loss": 1.2156, + "step": 3650 + }, + { + "epoch": 0.5489313835770528, + "grad_norm": 0.7631322741508484, + "learning_rate": 0.0001335286270145096, + "loss": 1.1944, + "step": 3660 + }, + { + "epoch": 0.5504311961004874, + "grad_norm": 0.8252114057540894, + "learning_rate": 0.00013280454840081105, + "loss": 1.2048, + "step": 3670 + }, + { + "epoch": 0.551931008623922, + "grad_norm": 0.7659439444541931, + "learning_rate": 0.0001320808755821722, + "loss": 1.2113, + "step": 3680 + }, + { + "epoch": 0.5534308211473565, + "grad_norm": 0.8548429608345032, + 
"learning_rate": 0.00013135762563653097, + "loss": 1.2021, + "step": 3690 + }, + { + "epoch": 0.5549306336707911, + "grad_norm": 0.8134576678276062, + "learning_rate": 0.00013063481563184589, + "loss": 1.1896, + "step": 3700 + }, + { + "epoch": 0.5564304461942258, + "grad_norm": 0.7915739417076111, + "learning_rate": 0.00012991246262569327, + "loss": 1.2162, + "step": 3710 + }, + { + "epoch": 0.5579302587176603, + "grad_norm": 0.8490334749221802, + "learning_rate": 0.00012919058366486492, + "loss": 1.2148, + "step": 3720 + }, + { + "epoch": 0.5594300712410949, + "grad_norm": 0.8796569108963013, + "learning_rate": 0.00012846919578496545, + "loss": 1.19, + "step": 3730 + }, + { + "epoch": 0.5609298837645295, + "grad_norm": 0.8151915669441223, + "learning_rate": 0.00012774831601001054, + "loss": 1.2171, + "step": 3740 + }, + { + "epoch": 0.562429696287964, + "grad_norm": 0.8759236335754395, + "learning_rate": 0.00012702796135202518, + "loss": 1.2296, + "step": 3750 + }, + { + "epoch": 0.5639295088113986, + "grad_norm": 0.8563137054443359, + "learning_rate": 0.00012630814881064206, + "loss": 1.2178, + "step": 3760 + }, + { + "epoch": 0.5654293213348331, + "grad_norm": 0.8290548920631409, + "learning_rate": 0.00012558889537270048, + "loss": 1.1993, + "step": 3770 + }, + { + "epoch": 0.5669291338582677, + "grad_norm": 0.8869160413742065, + "learning_rate": 0.0001248702180118455, + "loss": 1.2207, + "step": 3780 + }, + { + "epoch": 0.5684289463817023, + "grad_norm": 0.7675673961639404, + "learning_rate": 0.00012415213368812731, + "loss": 1.2006, + "step": 3790 + }, + { + "epoch": 0.5699287589051368, + "grad_norm": 0.7600024342536926, + "learning_rate": 0.00012343465934760102, + "loss": 1.205, + "step": 3800 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.8360748291015625, + "learning_rate": 0.00012271781192192688, + "loss": 1.2177, + "step": 3810 + }, + { + "epoch": 0.572928383952006, + "grad_norm": 0.7921136021614075, + "learning_rate": 0.00012200160832797046, + "loss": 1.1977, + "step": 3820 + }, + { + "epoch": 0.5744281964754405, + "grad_norm": 0.7750628590583801, + "learning_rate": 0.0001212860654674036, + "loss": 1.223, + "step": 3830 + }, + { + "epoch": 0.5759280089988752, + "grad_norm": 0.8707244992256165, + "learning_rate": 0.00012057120022630546, + "loss": 1.2088, + "step": 3840 + }, + { + "epoch": 0.5774278215223098, + "grad_norm": 0.8273059725761414, + "learning_rate": 0.00011985702947476424, + "loss": 1.201, + "step": 3850 + }, + { + "epoch": 0.5789276340457443, + "grad_norm": 0.9016513824462891, + "learning_rate": 0.00011914357006647877, + "loss": 1.1946, + "step": 3860 + }, + { + "epoch": 0.5804274465691789, + "grad_norm": 0.9097079038619995, + "learning_rate": 0.00011843083883836084, + "loss": 1.2074, + "step": 3870 + }, + { + "epoch": 0.5819272590926134, + "grad_norm": 0.845512866973877, + "learning_rate": 0.0001177188526101381, + "loss": 1.2024, + "step": 3880 + }, + { + "epoch": 0.583427071616048, + "grad_norm": 0.8833523392677307, + "learning_rate": 0.00011700762818395682, + "loss": 1.2116, + "step": 3890 + }, + { + "epoch": 0.5849268841394826, + "grad_norm": 0.7923941016197205, + "learning_rate": 0.0001162971823439856, + "loss": 1.2023, + "step": 3900 + }, + { + "epoch": 0.5864266966629171, + "grad_norm": 0.7897645235061646, + "learning_rate": 0.00011558753185601922, + "loss": 1.1994, + "step": 3910 + }, + { + "epoch": 0.5879265091863517, + "grad_norm": 0.7955695390701294, + "learning_rate": 0.00011487869346708289, + "loss": 1.1875, + "step": 3920 + }, + { + "epoch": 
0.5894263217097863, + "grad_norm": 0.7841424942016602, + "learning_rate": 0.00011417068390503716, + "loss": 1.2065, + "step": 3930 + }, + { + "epoch": 0.5909261342332208, + "grad_norm": 0.8241638541221619, + "learning_rate": 0.00011346351987818307, + "loss": 1.1919, + "step": 3940 + }, + { + "epoch": 0.5924259467566554, + "grad_norm": 0.7868320941925049, + "learning_rate": 0.00011275721807486805, + "loss": 1.2089, + "step": 3950 + }, + { + "epoch": 0.59392575928009, + "grad_norm": 0.8634785413742065, + "learning_rate": 0.00011205179516309172, + "loss": 1.1979, + "step": 3960 + }, + { + "epoch": 0.5954255718035245, + "grad_norm": 0.9329147338867188, + "learning_rate": 0.00011134726779011288, + "loss": 1.2006, + "step": 3970 + }, + { + "epoch": 0.5969253843269592, + "grad_norm": 0.8250072598457336, + "learning_rate": 0.00011064365258205658, + "loss": 1.1937, + "step": 3980 + }, + { + "epoch": 0.5984251968503937, + "grad_norm": 0.7763371467590332, + "learning_rate": 0.00010994096614352153, + "loss": 1.205, + "step": 3990 + }, + { + "epoch": 0.5999250093738283, + "grad_norm": 0.8118007183074951, + "learning_rate": 0.00010923922505718863, + "loss": 1.2063, + "step": 4000 + }, + { + "epoch": 0.6002249718785152, + "eval_loss": 1.246632695198059, + "eval_runtime": 34.4866, + "eval_samples_per_second": 724.918, + "eval_steps_per_second": 90.615, + "step": 4002 + }, + { + "epoch": 0.6014248218972629, + "grad_norm": 0.8617530465126038, + "learning_rate": 0.00010853844588342926, + "loss": 1.1877, + "step": 4010 + }, + { + "epoch": 0.6029246344206974, + "grad_norm": 0.8039788007736206, + "learning_rate": 0.00010783864515991481, + "loss": 1.2154, + "step": 4020 + }, + { + "epoch": 0.604424446944132, + "grad_norm": 0.8393165469169617, + "learning_rate": 0.00010713983940122617, + "loss": 1.2147, + "step": 4030 + }, + { + "epoch": 0.6059242594675666, + "grad_norm": 0.8276166915893555, + "learning_rate": 0.00010644204509846398, + "loss": 1.185, + "step": 4040 + }, + { + "epoch": 0.6074240719910011, + "grad_norm": 0.8333732485771179, + "learning_rate": 0.00010574527871885977, + "loss": 1.2304, + "step": 4050 + }, + { + "epoch": 0.6089238845144357, + "grad_norm": 0.8123278617858887, + "learning_rate": 0.00010504955670538699, + "loss": 1.2036, + "step": 4060 + }, + { + "epoch": 0.6104236970378702, + "grad_norm": 0.8344663977622986, + "learning_rate": 0.00010435489547637316, + "loss": 1.2073, + "step": 4070 + }, + { + "epoch": 0.6119235095613048, + "grad_norm": 0.7638000845909119, + "learning_rate": 0.00010366131142511228, + "loss": 1.2091, + "step": 4080 + }, + { + "epoch": 0.6134233220847394, + "grad_norm": 0.8148689866065979, + "learning_rate": 0.00010296882091947826, + "loss": 1.1973, + "step": 4090 + }, + { + "epoch": 0.6149231346081739, + "grad_norm": 0.8646231293678284, + "learning_rate": 0.00010227744030153821, + "loss": 1.1868, + "step": 4100 + }, + { + "epoch": 0.6164229471316085, + "grad_norm": 0.8547177910804749, + "learning_rate": 0.0001015871858871672, + "loss": 1.2246, + "step": 4110 + }, + { + "epoch": 0.6179227596550432, + "grad_norm": 0.8218623399734497, + "learning_rate": 0.00010089807396566306, + "loss": 1.2139, + "step": 4120 + }, + { + "epoch": 0.6194225721784777, + "grad_norm": 0.8276129961013794, + "learning_rate": 0.00010021012079936174, + "loss": 1.1913, + "step": 4130 + }, + { + "epoch": 0.6209223847019123, + "grad_norm": 0.8115292191505432, + "learning_rate": 9.952334262325399e-05, + "loss": 1.172, + "step": 4140 + }, + { + "epoch": 0.6224221972253469, + "grad_norm": 
0.8721445798873901, + "learning_rate": 9.883775564460193e-05, + "loss": 1.1901, + "step": 4150 + }, + { + "epoch": 0.6239220097487814, + "grad_norm": 0.8343495726585388, + "learning_rate": 9.815337604255665e-05, + "loss": 1.1891, + "step": 4160 + }, + { + "epoch": 0.625421822272216, + "grad_norm": 0.8031916618347168, + "learning_rate": 9.747021996777624e-05, + "loss": 1.1967, + "step": 4170 + }, + { + "epoch": 0.6269216347956506, + "grad_norm": 0.7772048711776733, + "learning_rate": 9.678830354204504e-05, + "loss": 1.2089, + "step": 4180 + }, + { + "epoch": 0.6284214473190851, + "grad_norm": 0.8471980690956116, + "learning_rate": 9.610764285789271e-05, + "loss": 1.1967, + "step": 4190 + }, + { + "epoch": 0.6299212598425197, + "grad_norm": 0.8212400078773499, + "learning_rate": 9.542825397821485e-05, + "loss": 1.1861, + "step": 4200 + }, + { + "epoch": 0.6314210723659542, + "grad_norm": 0.8733875155448914, + "learning_rate": 9.475015293589373e-05, + "loss": 1.1977, + "step": 4210 + }, + { + "epoch": 0.6329208848893888, + "grad_norm": 0.8037174940109253, + "learning_rate": 9.407335573341997e-05, + "loss": 1.1842, + "step": 4220 + }, + { + "epoch": 0.6344206974128234, + "grad_norm": 0.8639466762542725, + "learning_rate": 9.339787834251489e-05, + "loss": 1.195, + "step": 4230 + }, + { + "epoch": 0.6359205099362579, + "grad_norm": 0.8288107514381409, + "learning_rate": 9.272373670375362e-05, + "loss": 1.1919, + "step": 4240 + }, + { + "epoch": 0.6374203224596925, + "grad_norm": 0.7566668391227722, + "learning_rate": 9.205094672618889e-05, + "loss": 1.2009, + "step": 4250 + }, + { + "epoch": 0.6389201349831272, + "grad_norm": 0.8419100046157837, + "learning_rate": 9.137952428697568e-05, + "loss": 1.1658, + "step": 4260 + }, + { + "epoch": 0.6404199475065617, + "grad_norm": 0.7842187881469727, + "learning_rate": 9.070948523099643e-05, + "loss": 1.1876, + "step": 4270 + }, + { + "epoch": 0.6419197600299963, + "grad_norm": 0.7916682958602905, + "learning_rate": 9.004084537048708e-05, + "loss": 1.1952, + "step": 4280 + }, + { + "epoch": 0.6434195725534309, + "grad_norm": 0.7986481189727783, + "learning_rate": 8.937362048466404e-05, + "loss": 1.1925, + "step": 4290 + }, + { + "epoch": 0.6449193850768654, + "grad_norm": 0.7649396061897278, + "learning_rate": 8.870782631935184e-05, + "loss": 1.2007, + "step": 4300 + }, + { + "epoch": 0.6464191976003, + "grad_norm": 0.8236790895462036, + "learning_rate": 8.804347858661131e-05, + "loss": 1.1939, + "step": 4310 + }, + { + "epoch": 0.6479190101237345, + "grad_norm": 0.8257557153701782, + "learning_rate": 8.73805929643691e-05, + "loss": 1.1907, + "step": 4320 + }, + { + "epoch": 0.6494188226471691, + "grad_norm": 0.8270912170410156, + "learning_rate": 8.67191850960475e-05, + "loss": 1.1864, + "step": 4330 + }, + { + "epoch": 0.6509186351706037, + "grad_norm": 0.8583309650421143, + "learning_rate": 8.605927059019528e-05, + "loss": 1.1888, + "step": 4340 + }, + { + "epoch": 0.6524184476940382, + "grad_norm": 0.8187999725341797, + "learning_rate": 8.540086502011935e-05, + "loss": 1.2046, + "step": 4350 + }, + { + "epoch": 0.6539182602174728, + "grad_norm": 0.769585907459259, + "learning_rate": 8.47439839235174e-05, + "loss": 1.2002, + "step": 4360 + }, + { + "epoch": 0.6554180727409074, + "grad_norm": 0.8344823718070984, + "learning_rate": 8.408864280211115e-05, + "loss": 1.1753, + "step": 4370 + }, + { + "epoch": 0.6569178852643419, + "grad_norm": 0.8236811757087708, + "learning_rate": 8.343485712128026e-05, + "loss": 1.1818, + "step": 4380 + }, + { + 
"epoch": 0.6584176977877765, + "grad_norm": 0.807981014251709, + "learning_rate": 8.278264230969769e-05, + "loss": 1.1932, + "step": 4390 + }, + { + "epoch": 0.6599175103112112, + "grad_norm": 0.7782221436500549, + "learning_rate": 8.213201375896563e-05, + "loss": 1.1802, + "step": 4400 + }, + { + "epoch": 0.6614173228346457, + "grad_norm": 0.7768334746360779, + "learning_rate": 8.14829868232519e-05, + "loss": 1.188, + "step": 4410 + }, + { + "epoch": 0.6629171353580803, + "grad_norm": 0.8299062252044678, + "learning_rate": 8.083557681892797e-05, + "loss": 1.1834, + "step": 4420 + }, + { + "epoch": 0.6644169478815148, + "grad_norm": 0.8360872864723206, + "learning_rate": 8.018979902420746e-05, + "loss": 1.1928, + "step": 4430 + }, + { + "epoch": 0.6659167604049494, + "grad_norm": 0.8572105765342712, + "learning_rate": 7.954566867878538e-05, + "loss": 1.1892, + "step": 4440 + }, + { + "epoch": 0.667416572928384, + "grad_norm": 0.7985962629318237, + "learning_rate": 7.890320098347861e-05, + "loss": 1.1725, + "step": 4450 + }, + { + "epoch": 0.6689163854518185, + "grad_norm": 0.7798737287521362, + "learning_rate": 7.82624110998673e-05, + "loss": 1.1814, + "step": 4460 + }, + { + "epoch": 0.6704161979752531, + "grad_norm": 0.9000463485717773, + "learning_rate": 7.762331414993697e-05, + "loss": 1.1793, + "step": 4470 + }, + { + "epoch": 0.6719160104986877, + "grad_norm": 0.8930211067199707, + "learning_rate": 7.698592521572155e-05, + "loss": 1.1812, + "step": 4480 + }, + { + "epoch": 0.6734158230221222, + "grad_norm": 0.8517357707023621, + "learning_rate": 7.635025933894747e-05, + "loss": 1.1984, + "step": 4490 + }, + { + "epoch": 0.6749156355455568, + "grad_norm": 0.8382774591445923, + "learning_rate": 7.571633152067901e-05, + "loss": 1.1956, + "step": 4500 + }, + { + "epoch": 0.6764154480689913, + "grad_norm": 0.8367804884910583, + "learning_rate": 7.508415672096389e-05, + "loss": 1.1892, + "step": 4510 + }, + { + "epoch": 0.6779152605924259, + "grad_norm": 0.8804981708526611, + "learning_rate": 7.445374985848035e-05, + "loss": 1.1712, + "step": 4520 + }, + { + "epoch": 0.6794150731158605, + "grad_norm": 0.8023963570594788, + "learning_rate": 7.382512581018514e-05, + "loss": 1.2116, + "step": 4530 + }, + { + "epoch": 0.6809148856392951, + "grad_norm": 0.7947113513946533, + "learning_rate": 7.31982994109626e-05, + "loss": 1.1829, + "step": 4540 + }, + { + "epoch": 0.6824146981627297, + "grad_norm": 0.7888148427009583, + "learning_rate": 7.25732854532741e-05, + "loss": 1.1979, + "step": 4550 + }, + { + "epoch": 0.6839145106861643, + "grad_norm": 0.8301454782485962, + "learning_rate": 7.195009868680954e-05, + "loss": 1.1884, + "step": 4560 + }, + { + "epoch": 0.6854143232095988, + "grad_norm": 0.8213895559310913, + "learning_rate": 7.13287538181387e-05, + "loss": 1.1782, + "step": 4570 + }, + { + "epoch": 0.6869141357330334, + "grad_norm": 0.8629058599472046, + "learning_rate": 7.070926551036469e-05, + "loss": 1.1728, + "step": 4580 + }, + { + "epoch": 0.688413948256468, + "grad_norm": 0.8480379581451416, + "learning_rate": 7.009164838277754e-05, + "loss": 1.185, + "step": 4590 + }, + { + "epoch": 0.6899137607799025, + "grad_norm": 0.8442770838737488, + "learning_rate": 6.947591701050932e-05, + "loss": 1.2177, + "step": 4600 + }, + { + "epoch": 0.6914135733033371, + "grad_norm": 0.8650897741317749, + "learning_rate": 6.886208592419043e-05, + "loss": 1.1931, + "step": 4610 + }, + { + "epoch": 0.6929133858267716, + "grad_norm": 0.8003455400466919, + "learning_rate": 6.825016960960616e-05, + 
"loss": 1.199, + "step": 4620 + }, + { + "epoch": 0.6944131983502062, + "grad_norm": 0.8398526906967163, + "learning_rate": 6.764018250735532e-05, + "loss": 1.1725, + "step": 4630 + }, + { + "epoch": 0.6959130108736408, + "grad_norm": 0.8158472180366516, + "learning_rate": 6.703213901250931e-05, + "loss": 1.1756, + "step": 4640 + }, + { + "epoch": 0.6974128233970753, + "grad_norm": 0.8212939500808716, + "learning_rate": 6.64260534742723e-05, + "loss": 1.1783, + "step": 4650 + }, + { + "epoch": 0.6989126359205099, + "grad_norm": 0.8833694458007812, + "learning_rate": 6.582194019564266e-05, + "loss": 1.1649, + "step": 4660 + }, + { + "epoch": 0.700262467191601, + "eval_loss": 1.224123239517212, + "eval_runtime": 35.4085, + "eval_samples_per_second": 706.045, + "eval_steps_per_second": 88.256, + "step": 4669 + }, + { + "epoch": 0.7004124484439445, + "grad_norm": 0.8789852857589722, + "learning_rate": 6.521981343307554e-05, + "loss": 1.1856, + "step": 4670 + }, + { + "epoch": 0.7019122609673791, + "grad_norm": 0.7838938236236572, + "learning_rate": 6.461968739614639e-05, + "loss": 1.171, + "step": 4680 + }, + { + "epoch": 0.7034120734908137, + "grad_norm": 0.8347029685974121, + "learning_rate": 6.402157624721546e-05, + "loss": 1.186, + "step": 4690 + }, + { + "epoch": 0.7049118860142483, + "grad_norm": 0.8648792505264282, + "learning_rate": 6.342549410109372e-05, + "loss": 1.1603, + "step": 4700 + }, + { + "epoch": 0.7064116985376828, + "grad_norm": 0.8432384729385376, + "learning_rate": 6.283145502470976e-05, + "loss": 1.1896, + "step": 4710 + }, + { + "epoch": 0.7079115110611174, + "grad_norm": 0.8470872044563293, + "learning_rate": 6.223947303677793e-05, + "loss": 1.1933, + "step": 4720 + }, + { + "epoch": 0.709411323584552, + "grad_norm": 0.7818253636360168, + "learning_rate": 6.164956210746723e-05, + "loss": 1.1783, + "step": 4730 + }, + { + "epoch": 0.7109111361079865, + "grad_norm": 0.8789279460906982, + "learning_rate": 6.106173615807186e-05, + "loss": 1.18, + "step": 4740 + }, + { + "epoch": 0.7124109486314211, + "grad_norm": 0.7911210060119629, + "learning_rate": 6.047600906068269e-05, + "loss": 1.1675, + "step": 4750 + }, + { + "epoch": 0.7139107611548556, + "grad_norm": 0.827257513999939, + "learning_rate": 5.989239463785971e-05, + "loss": 1.1939, + "step": 4760 + }, + { + "epoch": 0.7154105736782902, + "grad_norm": 0.8434903025627136, + "learning_rate": 5.9310906662306125e-05, + "loss": 1.1885, + "step": 4770 + }, + { + "epoch": 0.7169103862017248, + "grad_norm": 0.8206247091293335, + "learning_rate": 5.8731558856542935e-05, + "loss": 1.1795, + "step": 4780 + }, + { + "epoch": 0.7184101987251593, + "grad_norm": 0.8511059880256653, + "learning_rate": 5.8154364892585574e-05, + "loss": 1.1673, + "step": 4790 + }, + { + "epoch": 0.7199100112485939, + "grad_norm": 0.8107950687408447, + "learning_rate": 5.75793383916208e-05, + "loss": 1.1627, + "step": 4800 + }, + { + "epoch": 0.7214098237720284, + "grad_norm": 0.8588976860046387, + "learning_rate": 5.70064929236855e-05, + "loss": 1.1817, + "step": 4810 + }, + { + "epoch": 0.7229096362954631, + "grad_norm": 0.8428772687911987, + "learning_rate": 5.643584200734659e-05, + "loss": 1.188, + "step": 4820 + }, + { + "epoch": 0.7244094488188977, + "grad_norm": 0.9080433249473572, + "learning_rate": 5.586739910938161e-05, + "loss": 1.1858, + "step": 4830 + }, + { + "epoch": 0.7259092613423322, + "grad_norm": 0.8083125352859497, + "learning_rate": 5.5301177644461164e-05, + "loss": 1.1629, + "step": 4840 + }, + { + "epoch": 0.7274090738657668, + 
"grad_norm": 0.8565297722816467, + "learning_rate": 5.4737190974832426e-05, + "loss": 1.1819, + "step": 4850 + }, + { + "epoch": 0.7289088863892014, + "grad_norm": 0.892975926399231, + "learning_rate": 5.417545241000353e-05, + "loss": 1.1745, + "step": 4860 + }, + { + "epoch": 0.7304086989126359, + "grad_norm": 0.9558145403862, + "learning_rate": 5.361597520642981e-05, + "loss": 1.1624, + "step": 4870 + }, + { + "epoch": 0.7319085114360705, + "grad_norm": 0.893839418888092, + "learning_rate": 5.3058772567200595e-05, + "loss": 1.1784, + "step": 4880 + }, + { + "epoch": 0.7334083239595051, + "grad_norm": 0.879852831363678, + "learning_rate": 5.250385764172802e-05, + "loss": 1.1754, + "step": 4890 + }, + { + "epoch": 0.7349081364829396, + "grad_norm": 0.9134666323661804, + "learning_rate": 5.195124352543636e-05, + "loss": 1.1919, + "step": 4900 + }, + { + "epoch": 0.7364079490063742, + "grad_norm": 0.8220536708831787, + "learning_rate": 5.140094325945323e-05, + "loss": 1.1639, + "step": 4910 + }, + { + "epoch": 0.7379077615298087, + "grad_norm": 0.9278730154037476, + "learning_rate": 5.085296983030164e-05, + "loss": 1.1914, + "step": 4920 + }, + { + "epoch": 0.7394075740532433, + "grad_norm": 0.874758780002594, + "learning_rate": 5.030733616959384e-05, + "loss": 1.179, + "step": 4930 + }, + { + "epoch": 0.7409073865766779, + "grad_norm": 0.8796992897987366, + "learning_rate": 4.976405515372577e-05, + "loss": 1.1838, + "step": 4940 + }, + { + "epoch": 0.7424071991001124, + "grad_norm": 0.8270970582962036, + "learning_rate": 4.922313960357336e-05, + "loss": 1.1744, + "step": 4950 + }, + { + "epoch": 0.7439070116235471, + "grad_norm": 0.8671916723251343, + "learning_rate": 4.868460228419003e-05, + "loss": 1.1837, + "step": 4960 + }, + { + "epoch": 0.7454068241469817, + "grad_norm": 0.8556026220321655, + "learning_rate": 4.814845590450544e-05, + "loss": 1.1724, + "step": 4970 + }, + { + "epoch": 0.7469066366704162, + "grad_norm": 0.8527052998542786, + "learning_rate": 4.761471311702541e-05, + "loss": 1.1604, + "step": 4980 + }, + { + "epoch": 0.7484064491938508, + "grad_norm": 0.9000732898712158, + "learning_rate": 4.70833865175334e-05, + "loss": 1.1787, + "step": 4990 + }, + { + "epoch": 0.7499062617172854, + "grad_norm": 0.8143606185913086, + "learning_rate": 4.6554488644793555e-05, + "loss": 1.1808, + "step": 5000 + }, + { + "epoch": 0.7514060742407199, + "grad_norm": 0.8999016880989075, + "learning_rate": 4.602803198025429e-05, + "loss": 1.1774, + "step": 5010 + }, + { + "epoch": 0.7529058867641545, + "grad_norm": 0.8793285489082336, + "learning_rate": 4.550402894775408e-05, + "loss": 1.1567, + "step": 5020 + }, + { + "epoch": 0.754405699287589, + "grad_norm": 0.8987736701965332, + "learning_rate": 4.49824919132283e-05, + "loss": 1.1531, + "step": 5030 + }, + { + "epoch": 0.7559055118110236, + "grad_norm": 0.9041977524757385, + "learning_rate": 4.446343318441719e-05, + "loss": 1.1695, + "step": 5040 + }, + { + "epoch": 0.7574053243344582, + "grad_norm": 0.8655080795288086, + "learning_rate": 4.394686501057553e-05, + "loss": 1.1734, + "step": 5050 + }, + { + "epoch": 0.7589051368578927, + "grad_norm": 0.7983774542808533, + "learning_rate": 4.343279958218352e-05, + "loss": 1.1742, + "step": 5060 + }, + { + "epoch": 0.7604049493813273, + "grad_norm": 0.8692320585250854, + "learning_rate": 4.29212490306592e-05, + "loss": 1.1727, + "step": 5070 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.8991125226020813, + "learning_rate": 4.241222542807211e-05, + "loss": 1.1796, + "step": 5080 + }, 
+ { + "epoch": 0.7634045744281964, + "grad_norm": 0.8381094336509705, + "learning_rate": 4.19057407868583e-05, + "loss": 1.1658, + "step": 5090 + }, + { + "epoch": 0.7649043869516311, + "grad_norm": 0.8688058257102966, + "learning_rate": 4.140180705953689e-05, + "loss": 1.1808, + "step": 5100 + }, + { + "epoch": 0.7664041994750657, + "grad_norm": 0.910092830657959, + "learning_rate": 4.090043613842823e-05, + "loss": 1.1837, + "step": 5110 + }, + { + "epoch": 0.7679040119985002, + "grad_norm": 0.8620381355285645, + "learning_rate": 4.0401639855372884e-05, + "loss": 1.1887, + "step": 5120 + }, + { + "epoch": 0.7694038245219348, + "grad_norm": 0.8097319006919861, + "learning_rate": 3.990542998145262e-05, + "loss": 1.1579, + "step": 5130 + }, + { + "epoch": 0.7709036370453693, + "grad_norm": 0.9091822504997253, + "learning_rate": 3.941181822671273e-05, + "loss": 1.1801, + "step": 5140 + }, + { + "epoch": 0.7724034495688039, + "grad_norm": 0.8581311106681824, + "learning_rate": 3.892081623988541e-05, + "loss": 1.1892, + "step": 5150 + }, + { + "epoch": 0.7739032620922385, + "grad_norm": 0.7965316772460938, + "learning_rate": 3.8432435608115e-05, + "loss": 1.1629, + "step": 5160 + }, + { + "epoch": 0.775403074615673, + "grad_norm": 0.844725489616394, + "learning_rate": 3.794668785668465e-05, + "loss": 1.173, + "step": 5170 + }, + { + "epoch": 0.7769028871391076, + "grad_norm": 0.8694456219673157, + "learning_rate": 3.7463584448744186e-05, + "loss": 1.167, + "step": 5180 + }, + { + "epoch": 0.7784026996625422, + "grad_norm": 0.8310941457748413, + "learning_rate": 3.6983136785039636e-05, + "loss": 1.1647, + "step": 5190 + }, + { + "epoch": 0.7799025121859767, + "grad_norm": 0.7977578043937683, + "learning_rate": 3.650535620364407e-05, + "loss": 1.1704, + "step": 5200 + }, + { + "epoch": 0.7814023247094113, + "grad_norm": 0.8039920330047607, + "learning_rate": 3.603025397969037e-05, + "loss": 1.1585, + "step": 5210 + }, + { + "epoch": 0.7829021372328459, + "grad_norm": 0.974540114402771, + "learning_rate": 3.555784132510472e-05, + "loss": 1.1672, + "step": 5220 + }, + { + "epoch": 0.7844019497562804, + "grad_norm": 0.9114983081817627, + "learning_rate": 3.508812938834227e-05, + "loss": 1.1707, + "step": 5230 + }, + { + "epoch": 0.7859017622797151, + "grad_norm": 0.8260616064071655, + "learning_rate": 3.4621129254124106e-05, + "loss": 1.1548, + "step": 5240 + }, + { + "epoch": 0.7874015748031497, + "grad_norm": 0.8436164855957031, + "learning_rate": 3.415685194317539e-05, + "loss": 1.1552, + "step": 5250 + }, + { + "epoch": 0.7889013873265842, + "grad_norm": 0.8650628924369812, + "learning_rate": 3.3695308411965564e-05, + "loss": 1.1679, + "step": 5260 + }, + { + "epoch": 0.7904011998500188, + "grad_norm": 0.8216997385025024, + "learning_rate": 3.323650955244951e-05, + "loss": 1.1761, + "step": 5270 + }, + { + "epoch": 0.7919010123734533, + "grad_norm": 0.9183224439620972, + "learning_rate": 3.2780466191810905e-05, + "loss": 1.1657, + "step": 5280 + }, + { + "epoch": 0.7934008248968879, + "grad_norm": 0.8745443820953369, + "learning_rate": 3.232718909220631e-05, + "loss": 1.1748, + "step": 5290 + }, + { + "epoch": 0.7949006374203225, + "grad_norm": 0.8624297976493835, + "learning_rate": 3.187668895051135e-05, + "loss": 1.1656, + "step": 5300 + }, + { + "epoch": 0.796400449943757, + "grad_norm": 0.8851115107536316, + "learning_rate": 3.14289763980683e-05, + "loss": 1.1893, + "step": 5310 + }, + { + "epoch": 0.7979002624671916, + "grad_norm": 0.902093768119812, + "learning_rate": 
3.0984062000435276e-05, + "loss": 1.1729, + "step": 5320 + }, + { + "epoch": 0.7994000749906262, + "grad_norm": 0.9625739455223083, + "learning_rate": 3.054195625713668e-05, + "loss": 1.1674, + "step": 5330 + }, + { + "epoch": 0.8002999625046869, + "eval_loss": 1.2090579271316528, + "eval_runtime": 34.3372, + "eval_samples_per_second": 728.073, + "eval_steps_per_second": 91.009, + "step": 5336 + }, + { + "epoch": 0.8008998875140607, + "grad_norm": 0.8621588349342346, + "learning_rate": 3.0102669601415575e-05, + "loss": 1.1566, + "step": 5340 + }, + { + "epoch": 0.8023997000374953, + "grad_norm": 0.7970715165138245, + "learning_rate": 2.966621239998755e-05, + "loss": 1.1581, + "step": 5350 + }, + { + "epoch": 0.8038995125609298, + "grad_norm": 0.78794264793396, + "learning_rate": 2.9232594952795818e-05, + "loss": 1.1581, + "step": 5360 + }, + { + "epoch": 0.8053993250843644, + "grad_norm": 0.8636032342910767, + "learning_rate": 2.8801827492768352e-05, + "loss": 1.1783, + "step": 5370 + }, + { + "epoch": 0.8068991376077991, + "grad_norm": 0.8886037468910217, + "learning_rate": 2.8373920185576375e-05, + "loss": 1.1646, + "step": 5380 + }, + { + "epoch": 0.8083989501312336, + "grad_norm": 0.8844544887542725, + "learning_rate": 2.7948883129394467e-05, + "loss": 1.1627, + "step": 5390 + }, + { + "epoch": 0.8098987626546682, + "grad_norm": 0.8555653691291809, + "learning_rate": 2.7526726354662104e-05, + "loss": 1.1557, + "step": 5400 + }, + { + "epoch": 0.8113985751781028, + "grad_norm": 0.8595440983772278, + "learning_rate": 2.7107459823847106e-05, + "loss": 1.1606, + "step": 5410 + }, + { + "epoch": 0.8128983877015373, + "grad_norm": 0.9353649020195007, + "learning_rate": 2.6691093431210596e-05, + "loss": 1.1755, + "step": 5420 + }, + { + "epoch": 0.8143982002249719, + "grad_norm": 0.8532871603965759, + "learning_rate": 2.6277637002573288e-05, + "loss": 1.1738, + "step": 5430 + }, + { + "epoch": 0.8158980127484065, + "grad_norm": 0.8638527393341064, + "learning_rate": 2.586710029508375e-05, + "loss": 1.1643, + "step": 5440 + }, + { + "epoch": 0.817397825271841, + "grad_norm": 0.9085490107536316, + "learning_rate": 2.54594929969881e-05, + "loss": 1.1604, + "step": 5450 + }, + { + "epoch": 0.8188976377952756, + "grad_norm": 0.8463364839553833, + "learning_rate": 2.5054824727401502e-05, + "loss": 1.1651, + "step": 5460 + }, + { + "epoch": 0.8203974503187101, + "grad_norm": 0.8295713663101196, + "learning_rate": 2.46531050360809e-05, + "loss": 1.1645, + "step": 5470 + }, + { + "epoch": 0.8218972628421447, + "grad_norm": 0.8853150010108948, + "learning_rate": 2.4254343403199945e-05, + "loss": 1.1768, + "step": 5480 + }, + { + "epoch": 0.8233970753655793, + "grad_norm": 0.9157831072807312, + "learning_rate": 2.3858549239125034e-05, + "loss": 1.1601, + "step": 5490 + }, + { + "epoch": 0.8248968878890138, + "grad_norm": 0.8486490845680237, + "learning_rate": 2.346573188419341e-05, + "loss": 1.1647, + "step": 5500 + }, + { + "epoch": 0.8263967004124484, + "grad_norm": 0.7939295768737793, + "learning_rate": 2.3075900608492637e-05, + "loss": 1.1692, + "step": 5510 + }, + { + "epoch": 0.8278965129358831, + "grad_norm": 1.1155019998550415, + "learning_rate": 2.2689064611641794e-05, + "loss": 1.1907, + "step": 5520 + }, + { + "epoch": 0.8293963254593176, + "grad_norm": 0.8578311800956726, + "learning_rate": 2.230523302257461e-05, + "loss": 1.15, + "step": 5530 + }, + { + "epoch": 0.8308961379827522, + "grad_norm": 0.86622154712677, + "learning_rate": 2.192441489932372e-05, + "loss": 1.1708, + "step": 5540 + 
}, + { + "epoch": 0.8323959505061868, + "grad_norm": 0.8797856569290161, + "learning_rate": 2.154661922880708e-05, + "loss": 1.155, + "step": 5550 + }, + { + "epoch": 0.8338957630296213, + "grad_norm": 0.9027743935585022, + "learning_rate": 2.117185492661592e-05, + "loss": 1.1502, + "step": 5560 + }, + { + "epoch": 0.8353955755530559, + "grad_norm": 0.8181419968605042, + "learning_rate": 2.0800130836804214e-05, + "loss": 1.1618, + "step": 5570 + }, + { + "epoch": 0.8368953880764904, + "grad_norm": 0.846794843673706, + "learning_rate": 2.043145573168003e-05, + "loss": 1.1588, + "step": 5580 + }, + { + "epoch": 0.838395200599925, + "grad_norm": 0.9015936255455017, + "learning_rate": 2.0065838311598543e-05, + "loss": 1.1775, + "step": 5590 + }, + { + "epoch": 0.8398950131233596, + "grad_norm": 0.8660979866981506, + "learning_rate": 1.9703287204756757e-05, + "loss": 1.1582, + "step": 5600 + }, + { + "epoch": 0.8413948256467941, + "grad_norm": 0.8045121431350708, + "learning_rate": 1.9343810966989716e-05, + "loss": 1.1778, + "step": 5610 + }, + { + "epoch": 0.8428946381702287, + "grad_norm": 0.8060126304626465, + "learning_rate": 1.8987418081568683e-05, + "loss": 1.1445, + "step": 5620 + }, + { + "epoch": 0.8443944506936633, + "grad_norm": 0.8652163147926331, + "learning_rate": 1.8634116959001106e-05, + "loss": 1.1634, + "step": 5630 + }, + { + "epoch": 0.8458942632170978, + "grad_norm": 0.9784821271896362, + "learning_rate": 1.828391593683185e-05, + "loss": 1.1474, + "step": 5640 + }, + { + "epoch": 0.8473940757405324, + "grad_norm": 0.9049434065818787, + "learning_rate": 1.7936823279446676e-05, + "loss": 1.1548, + "step": 5650 + }, + { + "epoch": 0.8488938882639671, + "grad_norm": 0.8466004133224487, + "learning_rate": 1.7592847177877008e-05, + "loss": 1.1611, + "step": 5660 + }, + { + "epoch": 0.8503937007874016, + "grad_norm": 0.8664677739143372, + "learning_rate": 1.725199574960689e-05, + "loss": 1.1472, + "step": 5670 + }, + { + "epoch": 0.8518935133108362, + "grad_norm": 0.8182629346847534, + "learning_rate": 1.6914277038381145e-05, + "loss": 1.1646, + "step": 5680 + }, + { + "epoch": 0.8533933258342707, + "grad_norm": 0.8338120579719543, + "learning_rate": 1.6579699014015783e-05, + "loss": 1.159, + "step": 5690 + }, + { + "epoch": 0.8548931383577053, + "grad_norm": 0.910591185092926, + "learning_rate": 1.6248269572209716e-05, + "loss": 1.1562, + "step": 5700 + }, + { + "epoch": 0.8563929508811399, + "grad_norm": 0.9760018587112427, + "learning_rate": 1.5919996534358635e-05, + "loss": 1.1413, + "step": 5710 + }, + { + "epoch": 0.8578927634045744, + "grad_norm": 0.9556133151054382, + "learning_rate": 1.5594887647370263e-05, + "loss": 1.1537, + "step": 5720 + }, + { + "epoch": 0.859392575928009, + "grad_norm": 0.8409389853477478, + "learning_rate": 1.527295058348154e-05, + "loss": 1.1351, + "step": 5730 + }, + { + "epoch": 0.8608923884514436, + "grad_norm": 0.8387997150421143, + "learning_rate": 1.4954192940077809e-05, + "loss": 1.1509, + "step": 5740 + }, + { + "epoch": 0.8623922009748781, + "grad_norm": 0.9269035458564758, + "learning_rate": 1.463862223951317e-05, + "loss": 1.1634, + "step": 5750 + }, + { + "epoch": 0.8638920134983127, + "grad_norm": 0.8396034836769104, + "learning_rate": 1.4326245928933178e-05, + "loss": 1.1663, + "step": 5760 + }, + { + "epoch": 0.8653918260217472, + "grad_norm": 0.8768131732940674, + "learning_rate": 1.4017071380099132e-05, + "loss": 1.1597, + "step": 5770 + }, + { + "epoch": 0.8668916385451818, + "grad_norm": 0.8506944179534912, + "learning_rate": 
1.3711105889213908e-05, + "loss": 1.1605, + "step": 5780 + }, + { + "epoch": 0.8683914510686164, + "grad_norm": 0.8659221529960632, + "learning_rate": 1.3408356676750043e-05, + "loss": 1.1792, + "step": 5790 + }, + { + "epoch": 0.869891263592051, + "grad_norm": 0.869342029094696, + "learning_rate": 1.310883088727902e-05, + "loss": 1.1579, + "step": 5800 + }, + { + "epoch": 0.8713910761154856, + "grad_norm": 0.8290470242500305, + "learning_rate": 1.2812535589303024e-05, + "loss": 1.143, + "step": 5810 + }, + { + "epoch": 0.8728908886389202, + "grad_norm": 0.8000004887580872, + "learning_rate": 1.2519477775087805e-05, + "loss": 1.1688, + "step": 5820 + }, + { + "epoch": 0.8743907011623547, + "grad_norm": 0.907409131526947, + "learning_rate": 1.222966436049786e-05, + "loss": 1.1787, + "step": 5830 + }, + { + "epoch": 0.8758905136857893, + "grad_norm": 0.8386558294296265, + "learning_rate": 1.1943102184833165e-05, + "loss": 1.1736, + "step": 5840 + }, + { + "epoch": 0.8773903262092239, + "grad_norm": 0.8760896325111389, + "learning_rate": 1.165979801066782e-05, + "loss": 1.1539, + "step": 5850 + }, + { + "epoch": 0.8788901387326584, + "grad_norm": 0.959135890007019, + "learning_rate": 1.1379758523690413e-05, + "loss": 1.1756, + "step": 5860 + }, + { + "epoch": 0.880389951256093, + "grad_norm": 0.8947364091873169, + "learning_rate": 1.1102990332546175e-05, + "loss": 1.1567, + "step": 5870 + }, + { + "epoch": 0.8818897637795275, + "grad_norm": 0.8616886138916016, + "learning_rate": 1.0829499968681204e-05, + "loss": 1.1636, + "step": 5880 + }, + { + "epoch": 0.8833895763029621, + "grad_norm": 0.899998664855957, + "learning_rate": 1.0559293886188246e-05, + "loss": 1.1737, + "step": 5890 + }, + { + "epoch": 0.8848893888263967, + "grad_norm": 0.8765754699707031, + "learning_rate": 1.029237846165426e-05, + "loss": 1.165, + "step": 5900 + }, + { + "epoch": 0.8863892013498312, + "grad_norm": 0.8620162010192871, + "learning_rate": 1.0028759994010071e-05, + "loss": 1.1688, + "step": 5910 + }, + { + "epoch": 0.8878890138732658, + "grad_norm": 0.8123705387115479, + "learning_rate": 9.768444704381811e-06, + "loss": 1.157, + "step": 5920 + }, + { + "epoch": 0.8893888263967004, + "grad_norm": 0.9160408973693848, + "learning_rate": 9.511438735943849e-06, + "loss": 1.1718, + "step": 5930 + }, + { + "epoch": 0.890888638920135, + "grad_norm": 0.9307278990745544, + "learning_rate": 9.257748153773992e-06, + "loss": 1.155, + "step": 5940 + }, + { + "epoch": 0.8923884514435696, + "grad_norm": 0.8938122391700745, + "learning_rate": 9.007378944710431e-06, + "loss": 1.1486, + "step": 5950 + }, + { + "epoch": 0.8938882639670042, + "grad_norm": 0.8921361565589905, + "learning_rate": 8.760337017210206e-06, + "loss": 1.1456, + "step": 5960 + }, + { + "epoch": 0.8953880764904387, + "grad_norm": 0.9233677983283997, + "learning_rate": 8.516628201209985e-06, + "loss": 1.1566, + "step": 5970 + }, + { + "epoch": 0.8968878890138733, + "grad_norm": 0.8670746088027954, + "learning_rate": 8.276258247988437e-06, + "loss": 1.1533, + "step": 5980 + }, + { + "epoch": 0.8983877015373078, + "grad_norm": 0.8692810535430908, + "learning_rate": 8.039232830030413e-06, + "loss": 1.1672, + "step": 5990 + }, + { + "epoch": 0.8998875140607424, + "grad_norm": 0.8850069046020508, + "learning_rate": 7.805557540893276e-06, + "loss": 1.1712, + "step": 6000 + }, + { + "epoch": 0.9003374578177727, + "eval_loss": 1.2019070386886597, + "eval_runtime": 34.3384, + "eval_samples_per_second": 728.048, + "eval_steps_per_second": 91.006, + "step": 6003 + }, + { 
+ "epoch": 0.901387326584177, + "grad_norm": 0.8891724944114685, + "learning_rate": 7.575237895074637e-06, + "loss": 1.167, + "step": 6010 + }, + { + "epoch": 0.9028871391076115, + "grad_norm": 0.8959289789199829, + "learning_rate": 7.348279327882467e-06, + "loss": 1.1651, + "step": 6020 + }, + { + "epoch": 0.9043869516310461, + "grad_norm": 0.8475953340530396, + "learning_rate": 7.1246871953066666e-06, + "loss": 1.1508, + "step": 6030 + }, + { + "epoch": 0.9058867641544807, + "grad_norm": 0.8530213832855225, + "learning_rate": 6.9044667738927365e-06, + "loss": 1.1631, + "step": 6040 + }, + { + "epoch": 0.9073865766779152, + "grad_norm": 0.8265974521636963, + "learning_rate": 6.6876232606172255e-06, + "loss": 1.1553, + "step": 6050 + }, + { + "epoch": 0.9088863892013498, + "grad_norm": 0.8897525668144226, + "learning_rate": 6.4741617727651626e-06, + "loss": 1.1457, + "step": 6060 + }, + { + "epoch": 0.9103862017247843, + "grad_norm": 0.926990270614624, + "learning_rate": 6.264087347809188e-06, + "loss": 1.171, + "step": 6070 + }, + { + "epoch": 0.911886014248219, + "grad_norm": 0.9227252006530762, + "learning_rate": 6.0574049432907115e-06, + "loss": 1.1704, + "step": 6080 + }, + { + "epoch": 0.9133858267716536, + "grad_norm": 0.8871036171913147, + "learning_rate": 5.854119436702976e-06, + "loss": 1.1648, + "step": 6090 + }, + { + "epoch": 0.9148856392950881, + "grad_norm": 0.9545475244522095, + "learning_rate": 5.65423562537593e-06, + "loss": 1.1612, + "step": 6100 + }, + { + "epoch": 0.9163854518185227, + "grad_norm": 0.8905931711196899, + "learning_rate": 5.4577582263629235e-06, + "loss": 1.1673, + "step": 6110 + }, + { + "epoch": 0.9178852643419573, + "grad_norm": 0.857297420501709, + "learning_rate": 5.264691876329474e-06, + "loss": 1.1436, + "step": 6120 + }, + { + "epoch": 0.9193850768653918, + "grad_norm": 0.8451759219169617, + "learning_rate": 5.075041131443891e-06, + "loss": 1.1594, + "step": 6130 + }, + { + "epoch": 0.9208848893888264, + "grad_norm": 0.8909957408905029, + "learning_rate": 4.88881046726966e-06, + "loss": 1.1514, + "step": 6140 + }, + { + "epoch": 0.922384701912261, + "grad_norm": 0.9147069454193115, + "learning_rate": 4.706004278659831e-06, + "loss": 1.153, + "step": 6150 + }, + { + "epoch": 0.9238845144356955, + "grad_norm": 0.8463402390480042, + "learning_rate": 4.526626879653428e-06, + "loss": 1.168, + "step": 6160 + }, + { + "epoch": 0.9253843269591301, + "grad_norm": 0.8234553933143616, + "learning_rate": 4.350682503373437e-06, + "loss": 1.1484, + "step": 6170 + }, + { + "epoch": 0.9268841394825647, + "grad_norm": 0.9229360222816467, + "learning_rate": 4.178175301927101e-06, + "loss": 1.1751, + "step": 6180 + }, + { + "epoch": 0.9283839520059992, + "grad_norm": 0.884353518486023, + "learning_rate": 4.009109346307792e-06, + "loss": 1.1613, + "step": 6190 + }, + { + "epoch": 0.9298837645294338, + "grad_norm": 0.8972669839859009, + "learning_rate": 3.8434886262991015e-06, + "loss": 1.1664, + "step": 6200 + }, + { + "epoch": 0.9313835770528683, + "grad_norm": 0.9175205230712891, + "learning_rate": 3.6813170503804834e-06, + "loss": 1.1777, + "step": 6210 + }, + { + "epoch": 0.932883389576303, + "grad_norm": 0.8350104093551636, + "learning_rate": 3.522598445635172e-06, + "loss": 1.1365, + "step": 6220 + }, + { + "epoch": 0.9343832020997376, + "grad_norm": 0.851258397102356, + "learning_rate": 3.3673365576598e-06, + "loss": 1.1595, + "step": 6230 + }, + { + "epoch": 0.9358830146231721, + "grad_norm": 0.9081841111183167, + "learning_rate": 3.21553505047602e-06, + 
"loss": 1.1698, + "step": 6240 + }, + { + "epoch": 0.9373828271466067, + "grad_norm": 0.9121673107147217, + "learning_rate": 3.067197506444058e-06, + "loss": 1.1611, + "step": 6250 + }, + { + "epoch": 0.9388826396700413, + "grad_norm": 0.9540317058563232, + "learning_rate": 2.922327426178128e-06, + "loss": 1.1404, + "step": 6260 + }, + { + "epoch": 0.9403824521934758, + "grad_norm": 0.8566182255744934, + "learning_rate": 2.7809282284638855e-06, + "loss": 1.1821, + "step": 6270 + }, + { + "epoch": 0.9418822647169104, + "grad_norm": 0.9044936895370483, + "learning_rate": 2.643003250177672e-06, + "loss": 1.1455, + "step": 6280 + }, + { + "epoch": 0.943382077240345, + "grad_norm": 0.8130437135696411, + "learning_rate": 2.5085557462078134e-06, + "loss": 1.1486, + "step": 6290 + }, + { + "epoch": 0.9448818897637795, + "grad_norm": 0.9217805862426758, + "learning_rate": 2.377588889377813e-06, + "loss": 1.1758, + "step": 6300 + }, + { + "epoch": 0.9463817022872141, + "grad_norm": 0.9020255208015442, + "learning_rate": 2.2501057703714797e-06, + "loss": 1.1762, + "step": 6310 + }, + { + "epoch": 0.9478815148106486, + "grad_norm": 0.8997666239738464, + "learning_rate": 2.1261093976599365e-06, + "loss": 1.1698, + "step": 6320 + }, + { + "epoch": 0.9493813273340832, + "grad_norm": 0.8945279121398926, + "learning_rate": 2.005602697430675e-06, + "loss": 1.1653, + "step": 6330 + }, + { + "epoch": 0.9508811398575178, + "grad_norm": 0.8765221238136292, + "learning_rate": 1.8885885135184963e-06, + "loss": 1.1671, + "step": 6340 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.8852826356887817, + "learning_rate": 1.7750696073383974e-06, + "loss": 1.1514, + "step": 6350 + }, + { + "epoch": 0.953880764904387, + "grad_norm": 0.8397039175033569, + "learning_rate": 1.6650486578203725e-06, + "loss": 1.1617, + "step": 6360 + }, + { + "epoch": 0.9553805774278216, + "grad_norm": 0.85945725440979, + "learning_rate": 1.558528261346248e-06, + "loss": 1.1444, + "step": 6370 + }, + { + "epoch": 0.9568803899512561, + "grad_norm": 0.917316198348999, + "learning_rate": 1.455510931688364e-06, + "loss": 1.1542, + "step": 6380 + }, + { + "epoch": 0.9583802024746907, + "grad_norm": 0.8585231304168701, + "learning_rate": 1.3559990999502556e-06, + "loss": 1.1611, + "step": 6390 + }, + { + "epoch": 0.9598800149981253, + "grad_norm": 0.9111419320106506, + "learning_rate": 1.2599951145093157e-06, + "loss": 1.155, + "step": 6400 + }, + { + "epoch": 0.9613798275215598, + "grad_norm": 0.8682249188423157, + "learning_rate": 1.1675012409613715e-06, + "loss": 1.1484, + "step": 6410 + }, + { + "epoch": 0.9628796400449944, + "grad_norm": 0.8944458365440369, + "learning_rate": 1.0785196620671455e-06, + "loss": 1.1566, + "step": 6420 + }, + { + "epoch": 0.9643794525684289, + "grad_norm": 0.838281512260437, + "learning_rate": 9.93052477700862e-07, + "loss": 1.1691, + "step": 6430 + }, + { + "epoch": 0.9658792650918635, + "grad_norm": 0.8258534669876099, + "learning_rate": 9.111017048005876e-07, + "loss": 1.1631, + "step": 6440 + }, + { + "epoch": 0.9673790776152981, + "grad_norm": 0.7644683718681335, + "learning_rate": 8.326692773207189e-07, + "loss": 1.1599, + "step": 6450 + }, + { + "epoch": 0.9688788901387326, + "grad_norm": 0.922005832195282, + "learning_rate": 7.577570461862359e-07, + "loss": 1.1589, + "step": 6460 + }, + { + "epoch": 0.9703787026621672, + "grad_norm": 0.8499112129211426, + "learning_rate": 6.863667792491534e-07, + "loss": 1.1573, + "step": 6470 + }, + { + "epoch": 0.9718785151856018, + "grad_norm": 
0.9114837646484375, + "learning_rate": 6.185001612467044e-07, + "loss": 1.1716, + "step": 6480 + }, + { + "epoch": 0.9733783277090363, + "grad_norm": 0.8339487314224243, + "learning_rate": 5.541587937616221e-07, + "loss": 1.1591, + "step": 6490 + }, + { + "epoch": 0.974878140232471, + "grad_norm": 0.8110185265541077, + "learning_rate": 4.933441951843198e-07, + "loss": 1.1539, + "step": 6500 + }, + { + "epoch": 0.9763779527559056, + "grad_norm": 0.8427588939666748, + "learning_rate": 4.360578006770865e-07, + "loss": 1.1619, + "step": 6510 + }, + { + "epoch": 0.9778777652793401, + "grad_norm": 0.8306043148040771, + "learning_rate": 3.82300962140214e-07, + "loss": 1.1609, + "step": 6520 + }, + { + "epoch": 0.9793775778027747, + "grad_norm": 0.9311191439628601, + "learning_rate": 3.320749481800888e-07, + "loss": 1.1556, + "step": 6530 + }, + { + "epoch": 0.9808773903262092, + "grad_norm": 0.932629406452179, + "learning_rate": 2.8538094407919987e-07, + "loss": 1.1523, + "step": 6540 + }, + { + "epoch": 0.9823772028496438, + "grad_norm": 0.8704735040664673, + "learning_rate": 2.4222005176829375e-07, + "loss": 1.1471, + "step": 6550 + }, + { + "epoch": 0.9838770153730784, + "grad_norm": 0.980547308921814, + "learning_rate": 2.025932898002458e-07, + "loss": 1.1687, + "step": 6560 + }, + { + "epoch": 0.9853768278965129, + "grad_norm": 0.8425877094268799, + "learning_rate": 1.6650159332607939e-07, + "loss": 1.1513, + "step": 6570 + }, + { + "epoch": 0.9868766404199475, + "grad_norm": 0.9170466065406799, + "learning_rate": 1.3394581407289996e-07, + "loss": 1.1726, + "step": 6580 + }, + { + "epoch": 0.9883764529433821, + "grad_norm": 0.9073484539985657, + "learning_rate": 1.0492672032377803e-07, + "loss": 1.1627, + "step": 6590 + }, + { + "epoch": 0.9898762654668166, + "grad_norm": 0.9039649963378906, + "learning_rate": 7.944499689961358e-08, + "loss": 1.1544, + "step": 6600 + }, + { + "epoch": 0.9913760779902512, + "grad_norm": 0.9152038097381592, + "learning_rate": 5.7501245143015685e-08, + "loss": 1.1618, + "step": 6610 + }, + { + "epoch": 0.9928758905136857, + "grad_norm": 0.922379732131958, + "learning_rate": 3.9095982904080447e-08, + "loss": 1.1587, + "step": 6620 + }, + { + "epoch": 0.9943757030371203, + "grad_norm": 0.8688434362411499, + "learning_rate": 2.4229644528150905e-08, + "loss": 1.1668, + "step": 6630 + }, + { + "epoch": 0.995875515560555, + "grad_norm": 0.8602100014686584, + "learning_rate": 1.290258084557516e-08, + "loss": 1.1596, + "step": 6640 + }, + { + "epoch": 0.9973753280839895, + "grad_norm": 0.8412054777145386, + "learning_rate": 5.115059163496304e-09, + "loss": 1.1424, + "step": 6650 + }, + { + "epoch": 0.9988751406074241, + "grad_norm": 0.9167276620864868, + "learning_rate": 8.672632594408646e-10, + "loss": 1.1601, + "step": 6660 + } + ], + "logging_steps": 10, + "max_steps": 6667, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 667, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2426610074517504.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}