{ "best_metric": 1.203166127204895, "best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/cds-llama85/checkpoint-6003", "epoch": 0.9999250093738282, "eval_steps": 667, "global_step": 6667, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014998125234345708, "grad_norm": 4.941356658935547, "learning_rate": 1.4999999999999999e-05, "loss": 4.5697, "step": 10 }, { "epoch": 0.0029996250468691415, "grad_norm": 3.326807737350464, "learning_rate": 2.9999999999999997e-05, "loss": 4.207, "step": 20 }, { "epoch": 0.0044994375703037125, "grad_norm": 2.287435531616211, "learning_rate": 4.4999999999999996e-05, "loss": 3.7885, "step": 30 }, { "epoch": 0.005999250093738283, "grad_norm": 1.804822325706482, "learning_rate": 5.9999999999999995e-05, "loss": 3.5512, "step": 40 }, { "epoch": 0.0074990626171728535, "grad_norm": 1.4687652587890625, "learning_rate": 7.5e-05, "loss": 3.4201, "step": 50 }, { "epoch": 0.008998875140607425, "grad_norm": 1.0859005451202393, "learning_rate": 8.999999999999999e-05, "loss": 3.2891, "step": 60 }, { "epoch": 0.010498687664041995, "grad_norm": 0.9512425661087036, "learning_rate": 0.00010499999999999999, "loss": 3.1539, "step": 70 }, { "epoch": 0.011998500187476566, "grad_norm": 1.2439535856246948, "learning_rate": 0.00011999999999999999, "loss": 2.919, "step": 80 }, { "epoch": 0.013498312710911136, "grad_norm": 1.0196483135223389, "learning_rate": 0.000135, "loss": 2.728, "step": 90 }, { "epoch": 0.014998125234345707, "grad_norm": 3.3797309398651123, "learning_rate": 0.00015, "loss": 2.5985, "step": 100 }, { "epoch": 0.016497937757780277, "grad_norm": 1.4080921411514282, "learning_rate": 0.000165, "loss": 2.4991, "step": 110 }, { "epoch": 0.01799775028121485, "grad_norm": 1.177751898765564, "learning_rate": 0.00017999999999999998, "loss": 2.4234, "step": 120 }, { "epoch": 0.01949756280464942, "grad_norm": 1.9201545715332031, "learning_rate": 0.000195, "loss": 2.36, "step": 130 }, { "epoch": 0.02099737532808399, "grad_norm": 1.969179630279541, "learning_rate": 0.00020999999999999998, "loss": 2.3224, "step": 140 }, { "epoch": 0.02249718785151856, "grad_norm": 1.7428991794586182, "learning_rate": 0.000225, "loss": 2.2435, "step": 150 }, { "epoch": 0.023997000374953132, "grad_norm": 1.231548547744751, "learning_rate": 0.00023999999999999998, "loss": 2.1859, "step": 160 }, { "epoch": 0.0254968128983877, "grad_norm": 1.4751386642456055, "learning_rate": 0.00025499999999999996, "loss": 2.1358, "step": 170 }, { "epoch": 0.02699662542182227, "grad_norm": 1.457726001739502, "learning_rate": 0.00027, "loss": 2.0645, "step": 180 }, { "epoch": 0.028496437945256844, "grad_norm": 1.2510344982147217, "learning_rate": 0.000285, "loss": 2.0373, "step": 190 }, { "epoch": 0.029996250468691414, "grad_norm": 1.5325850248336792, "learning_rate": 0.0003, "loss": 1.9925, "step": 200 }, { "epoch": 0.031496062992125984, "grad_norm": 1.3607863187789917, "learning_rate": 0.0002999982300767559, "loss": 1.9542, "step": 210 }, { "epoch": 0.032995875515560553, "grad_norm": 1.13303804397583, "learning_rate": 0.000299992920348792, "loss": 1.9144, "step": 220 }, { "epoch": 0.03449568803899512, "grad_norm": 1.341699242591858, "learning_rate": 0.0002999840709414124, "loss": 1.9061, "step": 230 }, { "epoch": 0.0359955005624297, "grad_norm": 1.2910326719284058, "learning_rate": 0.0002999716820634541, "loss": 1.8744, "step": 240 }, { "epoch": 0.03749531308586427, "grad_norm": 1.3730792999267578, "learning_rate": 0.000299955754007282, "loss": 1.8369, "step": 250 }, { "epoch": 0.03899512560929884, "grad_norm": 1.2474159002304077, "learning_rate": 0.00029993628714878185, "loss": 1.796, "step": 260 }, { "epoch": 0.04049493813273341, "grad_norm": 1.2987885475158691, "learning_rate": 0.00029991328194735155, "loss": 1.8051, "step": 270 }, { "epoch": 0.04199475065616798, "grad_norm": 1.0884069204330444, "learning_rate": 0.0002998867389458904, "loss": 1.7658, "step": 280 }, { "epoch": 0.04349456317960255, "grad_norm": 1.3922066688537598, "learning_rate": 0.00029985665877078595, "loss": 1.7632, "step": 290 }, { "epoch": 0.04499437570303712, "grad_norm": 1.1491564512252808, "learning_rate": 0.0002998230421318997, "loss": 1.7608, "step": 300 }, { "epoch": 0.046494188226471694, "grad_norm": 1.2716453075408936, "learning_rate": 0.0002997858898225498, "loss": 1.7231, "step": 310 }, { "epoch": 0.047994000749906264, "grad_norm": 1.3676432371139526, "learning_rate": 0.0002997452027194928, "loss": 1.7303, "step": 320 }, { "epoch": 0.049493813273340834, "grad_norm": 1.1510239839553833, "learning_rate": 0.0002997009817829027, "loss": 1.7337, "step": 330 }, { "epoch": 0.0509936257967754, "grad_norm": 1.2083615064620972, "learning_rate": 0.0002996532280563483, "loss": 1.708, "step": 340 }, { "epoch": 0.05249343832020997, "grad_norm": 1.0882530212402344, "learning_rate": 0.0002996019426667687, "loss": 1.6717, "step": 350 }, { "epoch": 0.05399325084364454, "grad_norm": 1.2023850679397583, "learning_rate": 0.00029954712682444656, "loss": 1.6565, "step": 360 }, { "epoch": 0.05549306336707911, "grad_norm": 1.0490646362304688, "learning_rate": 0.0002994887818229797, "loss": 1.639, "step": 370 }, { "epoch": 0.05699287589051369, "grad_norm": 1.0732958316802979, "learning_rate": 0.0002994269090392505, "loss": 1.6685, "step": 380 }, { "epoch": 0.05849268841394826, "grad_norm": 1.1127232313156128, "learning_rate": 0.00029936150993339325, "loss": 1.6365, "step": 390 }, { "epoch": 0.05999250093738283, "grad_norm": 1.1355839967727661, "learning_rate": 0.0002992925860487599, "loss": 1.6495, "step": 400 }, { "epoch": 0.0614923134608174, "grad_norm": 1.0531744956970215, "learning_rate": 0.0002992201390118837, "loss": 1.6279, "step": 410 }, { "epoch": 0.06299212598425197, "grad_norm": 1.165711522102356, "learning_rate": 0.00029914417053244054, "loss": 1.6342, "step": 420 }, { "epoch": 0.06449193850768654, "grad_norm": 1.1082898378372192, "learning_rate": 0.00029906468240320874, "loss": 1.6184, "step": 430 }, { "epoch": 0.06599175103112111, "grad_norm": 1.07638680934906, "learning_rate": 0.00029898167650002676, "loss": 1.6078, "step": 440 }, { "epoch": 0.06749156355455568, "grad_norm": 0.9482781887054443, "learning_rate": 0.0002988951547817491, "loss": 1.6024, "step": 450 }, { "epoch": 0.06899137607799025, "grad_norm": 0.9475975036621094, "learning_rate": 0.00029880511929019965, "loss": 1.6056, "step": 460 }, { "epoch": 0.07049118860142482, "grad_norm": 1.2477397918701172, "learning_rate": 0.0002987115721501239, "loss": 1.5891, "step": 470 }, { "epoch": 0.0719910011248594, "grad_norm": 1.0981507301330566, "learning_rate": 0.00029861451556913865, "loss": 1.5971, "step": 480 }, { "epoch": 0.07349081364829396, "grad_norm": 1.0291006565093994, "learning_rate": 0.00029851395183767983, "loss": 1.5748, "step": 490 }, { "epoch": 0.07499062617172854, "grad_norm": 1.0406434535980225, "learning_rate": 0.00029840988332894864, "loss": 1.5739, "step": 500 }, { "epoch": 0.0764904386951631, "grad_norm": 0.9979759454727173, "learning_rate": 0.00029830231249885537, "loss": 1.5593, "step": 510 }, { "epoch": 0.07799025121859768, "grad_norm": 1.1204349994659424, "learning_rate": 0.00029819124188596146, "loss": 1.5598, "step": 520 }, { "epoch": 0.07949006374203224, "grad_norm": 1.0350697040557861, "learning_rate": 0.00029807667411141977, "loss": 1.5625, "step": 530 }, { "epoch": 0.08098987626546682, "grad_norm": 1.0373060703277588, "learning_rate": 0.0002979586118789125, "loss": 1.5413, "step": 540 }, { "epoch": 0.0824896887889014, "grad_norm": 0.9631483554840088, "learning_rate": 0.0002978370579745876, "loss": 1.5431, "step": 550 }, { "epoch": 0.08398950131233596, "grad_norm": 0.9262831211090088, "learning_rate": 0.00029771201526699264, "loss": 1.5288, "step": 560 }, { "epoch": 0.08548931383577053, "grad_norm": 1.0329458713531494, "learning_rate": 0.0002975834867070077, "loss": 1.5172, "step": 570 }, { "epoch": 0.0869891263592051, "grad_norm": 0.9907341599464417, "learning_rate": 0.00029745147532777514, "loss": 1.5118, "step": 580 }, { "epoch": 0.08848893888263967, "grad_norm": 0.9761767983436584, "learning_rate": 0.0002973159842446285, "loss": 1.5067, "step": 590 }, { "epoch": 0.08998875140607424, "grad_norm": 0.9076977968215942, "learning_rate": 0.00029717701665501865, "loss": 1.5173, "step": 600 }, { "epoch": 0.09148856392950881, "grad_norm": 0.9733080863952637, "learning_rate": 0.00029703457583843846, "loss": 1.5101, "step": 610 }, { "epoch": 0.09298837645294339, "grad_norm": 0.965733528137207, "learning_rate": 0.00029688866515634546, "loss": 1.5192, "step": 620 }, { "epoch": 0.09448818897637795, "grad_norm": 0.9676783680915833, "learning_rate": 0.00029673928805208237, "loss": 1.5123, "step": 630 }, { "epoch": 0.09598800149981253, "grad_norm": 0.9860779047012329, "learning_rate": 0.00029658644805079606, "loss": 1.5205, "step": 640 }, { "epoch": 0.09748781402324709, "grad_norm": 1.026779294013977, "learning_rate": 0.00029643014875935404, "loss": 1.5175, "step": 650 }, { "epoch": 0.09898762654668167, "grad_norm": 0.9782848954200745, "learning_rate": 0.00029627039386625976, "loss": 1.4966, "step": 660 }, { "epoch": 0.10003749531308587, "eval_loss": 1.540096402168274, "eval_runtime": 34.7434, "eval_samples_per_second": 719.561, "eval_steps_per_second": 89.945, "step": 667 }, { "epoch": 0.10048743907011623, "grad_norm": 1.0175178050994873, "learning_rate": 0.0002961071871415651, "loss": 1.4742, "step": 670 }, { "epoch": 0.1019872515935508, "grad_norm": 0.9086681008338928, "learning_rate": 0.00029594053243678175, "loss": 1.5138, "step": 680 }, { "epoch": 0.10348706411698538, "grad_norm": 1.0384730100631714, "learning_rate": 0.00029577043368479017, "loss": 1.4645, "step": 690 }, { "epoch": 0.10498687664041995, "grad_norm": 0.9737691283226013, "learning_rate": 0.0002955968948997469, "loss": 1.4812, "step": 700 }, { "epoch": 0.10648668916385452, "grad_norm": 0.9272564053535461, "learning_rate": 0.00029541992017698956, "loss": 1.4489, "step": 710 }, { "epoch": 0.10798650168728909, "grad_norm": 0.9667676687240601, "learning_rate": 0.0002952395136929406, "loss": 1.4726, "step": 720 }, { "epoch": 0.10948631421072366, "grad_norm": 0.8968635201454163, "learning_rate": 0.00029505567970500833, "loss": 1.4585, "step": 730 }, { "epoch": 0.11098612673415822, "grad_norm": 0.9180886745452881, "learning_rate": 0.0002948684225514868, "loss": 1.4507, "step": 740 }, { "epoch": 0.1124859392575928, "grad_norm": 0.9335956573486328, "learning_rate": 0.0002946777466514531, "loss": 1.4707, "step": 750 }, { "epoch": 0.11398575178102738, "grad_norm": 1.0144034624099731, "learning_rate": 0.00029448365650466336, "loss": 1.4697, "step": 760 }, { "epoch": 0.11548556430446194, "grad_norm": 1.003219485282898, "learning_rate": 0.0002942861566914465, "loss": 1.455, "step": 770 }, { "epoch": 0.11698537682789652, "grad_norm": 0.9664227962493896, "learning_rate": 0.0002940852518725959, "loss": 1.4472, "step": 780 }, { "epoch": 0.11848518935133108, "grad_norm": 0.9362863898277283, "learning_rate": 0.0002938809467892596, "loss": 1.4411, "step": 790 }, { "epoch": 0.11998500187476566, "grad_norm": 0.9592990279197693, "learning_rate": 0.0002936732462628287, "loss": 1.4459, "step": 800 }, { "epoch": 0.12148481439820022, "grad_norm": 0.9843602180480957, "learning_rate": 0.0002934621551948229, "loss": 1.4439, "step": 810 }, { "epoch": 0.1229846269216348, "grad_norm": 0.9243733286857605, "learning_rate": 0.0002932476785667754, "loss": 1.4312, "step": 820 }, { "epoch": 0.12448443944506937, "grad_norm": 0.9494661092758179, "learning_rate": 0.00029302982144011514, "loss": 1.457, "step": 830 }, { "epoch": 0.12598425196850394, "grad_norm": 0.8792492747306824, "learning_rate": 0.00029280858895604727, "loss": 1.4365, "step": 840 }, { "epoch": 0.1274840644919385, "grad_norm": 0.9076453447341919, "learning_rate": 0.0002925839863354322, "loss": 1.4302, "step": 850 }, { "epoch": 0.1289838770153731, "grad_norm": 0.9594305157661438, "learning_rate": 0.00029235601887866167, "loss": 1.4238, "step": 860 }, { "epoch": 0.13048368953880765, "grad_norm": 0.8772804737091064, "learning_rate": 0.00029212469196553456, "loss": 1.4174, "step": 870 }, { "epoch": 0.13198350206224221, "grad_norm": 0.9967095851898193, "learning_rate": 0.00029189001105512914, "loss": 1.4379, "step": 880 }, { "epoch": 0.13348331458567678, "grad_norm": 0.9739612936973572, "learning_rate": 0.0002916519816856748, "loss": 1.4396, "step": 890 }, { "epoch": 0.13498312710911137, "grad_norm": 0.9377513527870178, "learning_rate": 0.000291410609474421, "loss": 1.4512, "step": 900 }, { "epoch": 0.13648293963254593, "grad_norm": 0.9376864433288574, "learning_rate": 0.0002911659001175049, "loss": 1.4093, "step": 910 }, { "epoch": 0.1379827521559805, "grad_norm": 0.8760865330696106, "learning_rate": 0.000290917859389817, "loss": 1.4286, "step": 920 }, { "epoch": 0.13948256467941508, "grad_norm": 1.0221220254898071, "learning_rate": 0.0002906664931448645, "loss": 1.4424, "step": 930 }, { "epoch": 0.14098237720284965, "grad_norm": 0.8309196829795837, "learning_rate": 0.00029041180731463357, "loss": 1.426, "step": 940 }, { "epoch": 0.1424821897262842, "grad_norm": 0.9327015280723572, "learning_rate": 0.00029015380790944916, "loss": 1.4279, "step": 950 }, { "epoch": 0.1439820022497188, "grad_norm": 0.8863860964775085, "learning_rate": 0.0002898925010178332, "loss": 1.4184, "step": 960 }, { "epoch": 0.14548181477315336, "grad_norm": 0.9390380382537842, "learning_rate": 0.00028962789280636083, "loss": 1.4131, "step": 970 }, { "epoch": 0.14698162729658792, "grad_norm": 0.9246792197227478, "learning_rate": 0.00028935998951951515, "loss": 1.4148, "step": 980 }, { "epoch": 0.1484814398200225, "grad_norm": 0.8679428696632385, "learning_rate": 0.00028908879747953955, "loss": 1.405, "step": 990 }, { "epoch": 0.14998125234345708, "grad_norm": 0.9332796931266785, "learning_rate": 0.00028881432308628855, "loss": 1.3973, "step": 1000 }, { "epoch": 0.15148106486689164, "grad_norm": 0.9386698007583618, "learning_rate": 0.00028853657281707696, "loss": 1.4142, "step": 1010 }, { "epoch": 0.1529808773903262, "grad_norm": 0.8631579279899597, "learning_rate": 0.0002882555532265269, "loss": 1.4148, "step": 1020 }, { "epoch": 0.1544806899137608, "grad_norm": 0.892475962638855, "learning_rate": 0.0002879712709464131, "loss": 1.4077, "step": 1030 }, { "epoch": 0.15598050243719536, "grad_norm": 0.8318502306938171, "learning_rate": 0.0002876837326855064, "loss": 1.3898, "step": 1040 }, { "epoch": 0.15748031496062992, "grad_norm": 0.8452482223510742, "learning_rate": 0.00028739294522941555, "loss": 1.3882, "step": 1050 }, { "epoch": 0.15898012748406448, "grad_norm": 0.9173560738563538, "learning_rate": 0.00028709891544042687, "loss": 1.3831, "step": 1060 }, { "epoch": 0.16047994000749907, "grad_norm": 0.86043381690979, "learning_rate": 0.0002868016502573425, "loss": 1.3982, "step": 1070 }, { "epoch": 0.16197975253093364, "grad_norm": 0.8810198903083801, "learning_rate": 0.00028650115669531654, "loss": 1.3806, "step": 1080 }, { "epoch": 0.1634795650543682, "grad_norm": 0.966907799243927, "learning_rate": 0.00028619744184568946, "loss": 1.3766, "step": 1090 }, { "epoch": 0.1649793775778028, "grad_norm": 0.8428529500961304, "learning_rate": 0.00028589051287582093, "loss": 1.3873, "step": 1100 }, { "epoch": 0.16647919010123735, "grad_norm": 0.9754992723464966, "learning_rate": 0.0002855803770289206, "loss": 1.3956, "step": 1110 }, { "epoch": 0.1679790026246719, "grad_norm": 0.9287955164909363, "learning_rate": 0.0002852670416238769, "loss": 1.3714, "step": 1120 }, { "epoch": 0.16947881514810648, "grad_norm": 0.9051069617271423, "learning_rate": 0.0002849505140550848, "loss": 1.3866, "step": 1130 }, { "epoch": 0.17097862767154107, "grad_norm": 0.8417872190475464, "learning_rate": 0.00028463080179227105, "loss": 1.3827, "step": 1140 }, { "epoch": 0.17247844019497563, "grad_norm": 1.0358750820159912, "learning_rate": 0.00028430791238031775, "loss": 1.4054, "step": 1150 }, { "epoch": 0.1739782527184102, "grad_norm": 0.8023399710655212, "learning_rate": 0.00028398185343908464, "loss": 1.3819, "step": 1160 }, { "epoch": 0.17547806524184478, "grad_norm": 0.887250542640686, "learning_rate": 0.000283652632663229, "loss": 1.3997, "step": 1170 }, { "epoch": 0.17697787776527935, "grad_norm": 0.8158445358276367, "learning_rate": 0.0002833202578220242, "loss": 1.3762, "step": 1180 }, { "epoch": 0.1784776902887139, "grad_norm": 0.799800455570221, "learning_rate": 0.0002829847367591764, "loss": 1.3974, "step": 1190 }, { "epoch": 0.17997750281214847, "grad_norm": 0.8806690573692322, "learning_rate": 0.0002826460773926393, "loss": 1.3694, "step": 1200 }, { "epoch": 0.18147731533558306, "grad_norm": 0.9353827834129333, "learning_rate": 0.00028230428771442725, "loss": 1.3646, "step": 1210 }, { "epoch": 0.18297712785901762, "grad_norm": 0.8729731440544128, "learning_rate": 0.000281959375790427, "loss": 1.3693, "step": 1220 }, { "epoch": 0.1844769403824522, "grad_norm": 0.8897218108177185, "learning_rate": 0.0002816113497602069, "loss": 1.3737, "step": 1230 }, { "epoch": 0.18597675290588678, "grad_norm": 0.8456818461418152, "learning_rate": 0.0002812602178368251, "loss": 1.3628, "step": 1240 }, { "epoch": 0.18747656542932134, "grad_norm": 0.8601028323173523, "learning_rate": 0.00028090598830663566, "loss": 1.3691, "step": 1250 }, { "epoch": 0.1889763779527559, "grad_norm": 0.9002561569213867, "learning_rate": 0.00028054866952909296, "loss": 1.3955, "step": 1260 }, { "epoch": 0.19047619047619047, "grad_norm": 0.8424673080444336, "learning_rate": 0.00028018826993655445, "loss": 1.3606, "step": 1270 }, { "epoch": 0.19197600299962506, "grad_norm": 0.8275784254074097, "learning_rate": 0.00027982479803408166, "loss": 1.3566, "step": 1280 }, { "epoch": 0.19347581552305962, "grad_norm": 0.8737898468971252, "learning_rate": 0.00027945826239923955, "loss": 1.3677, "step": 1290 }, { "epoch": 0.19497562804649418, "grad_norm": 0.901017963886261, "learning_rate": 0.000279088671681894, "loss": 1.3691, "step": 1300 }, { "epoch": 0.19647544056992877, "grad_norm": 0.9295936822891235, "learning_rate": 0.0002787160346040076, "loss": 1.3403, "step": 1310 }, { "epoch": 0.19797525309336333, "grad_norm": 0.9055348038673401, "learning_rate": 0.00027834035995943413, "loss": 1.3562, "step": 1320 }, { "epoch": 0.1994750656167979, "grad_norm": 0.8951241970062256, "learning_rate": 0.00027796165661371074, "loss": 1.3415, "step": 1330 }, { "epoch": 0.20007499062617173, "eval_loss": 1.4021942615509033, "eval_runtime": 34.3883, "eval_samples_per_second": 726.991, "eval_steps_per_second": 90.874, "step": 1334 }, { "epoch": 0.20097487814023246, "grad_norm": 0.8198707699775696, "learning_rate": 0.00027757993350384873, "loss": 1.3479, "step": 1340 }, { "epoch": 0.20247469066366705, "grad_norm": 0.8340823650360107, "learning_rate": 0.00027719519963812286, "loss": 1.3498, "step": 1350 }, { "epoch": 0.2039745031871016, "grad_norm": 0.8691400289535522, "learning_rate": 0.00027680746409585865, "loss": 1.3531, "step": 1360 }, { "epoch": 0.20547431571053618, "grad_norm": 0.8769707679748535, "learning_rate": 0.00027641673602721805, "loss": 1.3337, "step": 1370 }, { "epoch": 0.20697412823397077, "grad_norm": 0.8912369012832642, "learning_rate": 0.00027602302465298367, "loss": 1.3398, "step": 1380 }, { "epoch": 0.20847394075740533, "grad_norm": 0.8747676014900208, "learning_rate": 0.0002756263392643409, "loss": 1.3427, "step": 1390 }, { "epoch": 0.2099737532808399, "grad_norm": 0.8138441443443298, "learning_rate": 0.0002752266892226591, "loss": 1.3397, "step": 1400 }, { "epoch": 0.21147356580427445, "grad_norm": 0.8546535968780518, "learning_rate": 0.0002748240839592701, "loss": 1.3423, "step": 1410 }, { "epoch": 0.21297337832770905, "grad_norm": 0.907558023929596, "learning_rate": 0.00027441853297524615, "loss": 1.3753, "step": 1420 }, { "epoch": 0.2144731908511436, "grad_norm": 0.8942850232124329, "learning_rate": 0.00027401004584117535, "loss": 1.3413, "step": 1430 }, { "epoch": 0.21597300337457817, "grad_norm": 0.8546082377433777, "learning_rate": 0.00027359863219693614, "loss": 1.3349, "step": 1440 }, { "epoch": 0.21747281589801276, "grad_norm": 0.8140226006507874, "learning_rate": 0.00027318430175146934, "loss": 1.3418, "step": 1450 }, { "epoch": 0.21897262842144732, "grad_norm": 0.8695687651634216, "learning_rate": 0.00027276706428254965, "loss": 1.345, "step": 1460 }, { "epoch": 0.2204724409448819, "grad_norm": 0.9050869941711426, "learning_rate": 0.00027234692963655407, "loss": 1.3443, "step": 1470 }, { "epoch": 0.22197225346831645, "grad_norm": 0.8348211646080017, "learning_rate": 0.00027192390772823045, "loss": 1.3453, "step": 1480 }, { "epoch": 0.22347206599175104, "grad_norm": 0.9440985918045044, "learning_rate": 0.00027149800854046283, "loss": 1.3336, "step": 1490 }, { "epoch": 0.2249718785151856, "grad_norm": 0.8988614678382874, "learning_rate": 0.0002710692421240362, "loss": 1.3411, "step": 1500 }, { "epoch": 0.22647169103862017, "grad_norm": 0.8702236413955688, "learning_rate": 0.0002706376185973991, "loss": 1.3423, "step": 1510 }, { "epoch": 0.22797150356205476, "grad_norm": 0.8790213465690613, "learning_rate": 0.0002702031481464252, "loss": 1.3192, "step": 1520 }, { "epoch": 0.22947131608548932, "grad_norm": 0.8360859155654907, "learning_rate": 0.00026976584102417233, "loss": 1.3411, "step": 1530 }, { "epoch": 0.23097112860892388, "grad_norm": 0.8287367820739746, "learning_rate": 0.0002693257075506411, "loss": 1.3423, "step": 1540 }, { "epoch": 0.23247094113235844, "grad_norm": 0.8342249989509583, "learning_rate": 0.00026888275811253105, "loss": 1.3485, "step": 1550 }, { "epoch": 0.23397075365579303, "grad_norm": 0.8485095500946045, "learning_rate": 0.00026843700316299564, "loss": 1.328, "step": 1560 }, { "epoch": 0.2354705661792276, "grad_norm": 0.8335973620414734, "learning_rate": 0.0002679884532213954, "loss": 1.3002, "step": 1570 }, { "epoch": 0.23697037870266216, "grad_norm": 0.8893833160400391, "learning_rate": 0.00026753711887304995, "loss": 1.3364, "step": 1580 }, { "epoch": 0.23847019122609675, "grad_norm": 0.9313494563102722, "learning_rate": 0.000267083010768988, "loss": 1.3419, "step": 1590 }, { "epoch": 0.2399700037495313, "grad_norm": 0.7982856631278992, "learning_rate": 0.0002666261396256961, "loss": 1.3219, "step": 1600 }, { "epoch": 0.24146981627296588, "grad_norm": 0.8474456071853638, "learning_rate": 0.0002661665162248656, "loss": 1.329, "step": 1610 }, { "epoch": 0.24296962879640044, "grad_norm": 0.8094434142112732, "learning_rate": 0.0002657041514131385, "loss": 1.3344, "step": 1620 }, { "epoch": 0.24446944131983503, "grad_norm": 0.8186250925064087, "learning_rate": 0.000265239056101851, "loss": 1.3266, "step": 1630 }, { "epoch": 0.2459692538432696, "grad_norm": 0.8790938854217529, "learning_rate": 0.0002647712412667765, "loss": 1.3141, "step": 1640 }, { "epoch": 0.24746906636670415, "grad_norm": 0.8377759456634521, "learning_rate": 0.00026430071794786644, "loss": 1.3285, "step": 1650 }, { "epoch": 0.24896887889013875, "grad_norm": 0.9131957292556763, "learning_rate": 0.00026382749724898955, "loss": 1.3029, "step": 1660 }, { "epoch": 0.2504686914135733, "grad_norm": 0.8385202884674072, "learning_rate": 0.00026335159033766996, "loss": 1.3329, "step": 1670 }, { "epoch": 0.25196850393700787, "grad_norm": 0.7972739338874817, "learning_rate": 0.0002628730084448239, "loss": 1.3253, "step": 1680 }, { "epoch": 0.25346831646044243, "grad_norm": 0.8282127380371094, "learning_rate": 0.000262391762864494, "loss": 1.3327, "step": 1690 }, { "epoch": 0.254968128983877, "grad_norm": 0.8004878163337708, "learning_rate": 0.00026190786495358366, "loss": 1.3186, "step": 1700 }, { "epoch": 0.25646794150731156, "grad_norm": 0.825681746006012, "learning_rate": 0.0002614213261315883, "loss": 1.3129, "step": 1710 }, { "epoch": 0.2579677540307462, "grad_norm": 0.8196373581886292, "learning_rate": 0.0002609321578803261, "loss": 1.3185, "step": 1720 }, { "epoch": 0.25946756655418074, "grad_norm": 0.8522502779960632, "learning_rate": 0.00026044037174366734, "loss": 1.3107, "step": 1730 }, { "epoch": 0.2609673790776153, "grad_norm": 0.8095912933349609, "learning_rate": 0.00025994597932726135, "loss": 1.3218, "step": 1740 }, { "epoch": 0.26246719160104987, "grad_norm": 0.8493536710739136, "learning_rate": 0.0002594489922982633, "loss": 1.3234, "step": 1750 }, { "epoch": 0.26396700412448443, "grad_norm": 0.8144869208335876, "learning_rate": 0.0002589494223850584, "loss": 1.3, "step": 1760 }, { "epoch": 0.265466816647919, "grad_norm": 0.8083682060241699, "learning_rate": 0.00025844728137698543, "loss": 1.3283, "step": 1770 }, { "epoch": 0.26696662917135355, "grad_norm": 0.8459505438804626, "learning_rate": 0.0002579425811240582, "loss": 1.319, "step": 1780 }, { "epoch": 0.26846644169478817, "grad_norm": 0.8262299299240112, "learning_rate": 0.00025743533353668626, "loss": 1.3089, "step": 1790 }, { "epoch": 0.26996625421822273, "grad_norm": 0.8823822736740112, "learning_rate": 0.0002569255505853934, "loss": 1.3157, "step": 1800 }, { "epoch": 0.2714660667416573, "grad_norm": 0.8654087781906128, "learning_rate": 0.0002564132443005356, "loss": 1.3113, "step": 1810 }, { "epoch": 0.27296587926509186, "grad_norm": 0.8660631775856018, "learning_rate": 0.00025589842677201693, "loss": 1.303, "step": 1820 }, { "epoch": 0.2744656917885264, "grad_norm": 0.8453037142753601, "learning_rate": 0.0002553811101490042, "loss": 1.3036, "step": 1830 }, { "epoch": 0.275965504311961, "grad_norm": 0.7789093255996704, "learning_rate": 0.00025486130663964016, "loss": 1.3064, "step": 1840 }, { "epoch": 0.27746531683539555, "grad_norm": 0.8085753321647644, "learning_rate": 0.00025433902851075584, "loss": 1.3135, "step": 1850 }, { "epoch": 0.27896512935883017, "grad_norm": 0.8380696773529053, "learning_rate": 0.0002538142880875805, "loss": 1.2949, "step": 1860 }, { "epoch": 0.28046494188226473, "grad_norm": 0.8569440245628357, "learning_rate": 0.00025328709775345105, "loss": 1.3174, "step": 1870 }, { "epoch": 0.2819647544056993, "grad_norm": 0.7963806390762329, "learning_rate": 0.0002527574699495199, "loss": 1.3079, "step": 1880 }, { "epoch": 0.28346456692913385, "grad_norm": 0.8502215147018433, "learning_rate": 0.00025222541717446117, "loss": 1.3019, "step": 1890 }, { "epoch": 0.2849643794525684, "grad_norm": 0.8192076086997986, "learning_rate": 0.00025169095198417584, "loss": 1.2963, "step": 1900 }, { "epoch": 0.286464191976003, "grad_norm": 0.8132408261299133, "learning_rate": 0.00025115408699149546, "loss": 1.3122, "step": 1910 }, { "epoch": 0.2879640044994376, "grad_norm": 0.9454010128974915, "learning_rate": 0.00025061483486588435, "loss": 1.3203, "step": 1920 }, { "epoch": 0.28946381702287216, "grad_norm": 0.7738835215568542, "learning_rate": 0.00025007320833314085, "loss": 1.2868, "step": 1930 }, { "epoch": 0.2909636295463067, "grad_norm": 0.7828739881515503, "learning_rate": 0.00024952922017509687, "loss": 1.3065, "step": 1940 }, { "epoch": 0.2924634420697413, "grad_norm": 0.8362312316894531, "learning_rate": 0.00024898288322931615, "loss": 1.2927, "step": 1950 }, { "epoch": 0.29396325459317585, "grad_norm": 0.80490642786026, "learning_rate": 0.00024843421038879147, "loss": 1.2976, "step": 1960 }, { "epoch": 0.2954630671166104, "grad_norm": 0.8396286368370056, "learning_rate": 0.0002478832146016404, "loss": 1.3003, "step": 1970 }, { "epoch": 0.296962879640045, "grad_norm": 0.8507101535797119, "learning_rate": 0.0002473299088707996, "loss": 1.298, "step": 1980 }, { "epoch": 0.2984626921634796, "grad_norm": 0.886080801486969, "learning_rate": 0.00024677430625371803, "loss": 1.3002, "step": 1990 }, { "epoch": 0.29996250468691416, "grad_norm": 0.8753901124000549, "learning_rate": 0.0002462164198620489, "loss": 1.3057, "step": 2000 }, { "epoch": 0.3001124859392576, "eval_loss": 1.337799072265625, "eval_runtime": 34.4016, "eval_samples_per_second": 726.711, "eval_steps_per_second": 90.839, "step": 2001 }, { "epoch": 0.3014623172103487, "grad_norm": 0.8367862701416016, "learning_rate": 0.00024565626286134003, "loss": 1.2853, "step": 2010 }, { "epoch": 0.3029621297337833, "grad_norm": 0.8323600888252258, "learning_rate": 0.0002450938484707234, "loss": 1.2831, "step": 2020 }, { "epoch": 0.30446194225721784, "grad_norm": 0.8133791089057922, "learning_rate": 0.0002445291899626031, "loss": 1.2837, "step": 2030 }, { "epoch": 0.3059617547806524, "grad_norm": 0.8445649743080139, "learning_rate": 0.000243962300662342, "loss": 1.3076, "step": 2040 }, { "epoch": 0.30746156730408697, "grad_norm": 0.8368006944656372, "learning_rate": 0.00024339319394794742, "loss": 1.3018, "step": 2050 }, { "epoch": 0.3089613798275216, "grad_norm": 0.8313194513320923, "learning_rate": 0.00024282188324975534, "loss": 1.2931, "step": 2060 }, { "epoch": 0.31046119235095615, "grad_norm": 0.8213202357292175, "learning_rate": 0.0002422483820501136, "loss": 1.2962, "step": 2070 }, { "epoch": 0.3119610048743907, "grad_norm": 0.8784091472625732, "learning_rate": 0.00024167270388306366, "loss": 1.288, "step": 2080 }, { "epoch": 0.3134608173978253, "grad_norm": 0.7992005348205566, "learning_rate": 0.00024109486233402102, "loss": 1.29, "step": 2090 }, { "epoch": 0.31496062992125984, "grad_norm": 0.7810459136962891, "learning_rate": 0.00024051487103945486, "loss": 1.2769, "step": 2100 }, { "epoch": 0.3164604424446944, "grad_norm": 0.7913289666175842, "learning_rate": 0.00023993274368656618, "loss": 1.2822, "step": 2110 }, { "epoch": 0.31796025496812896, "grad_norm": 0.7812384366989136, "learning_rate": 0.00023934849401296472, "loss": 1.2962, "step": 2120 }, { "epoch": 0.3194600674915636, "grad_norm": 0.8585249185562134, "learning_rate": 0.0002387621358063449, "loss": 1.2842, "step": 2130 }, { "epoch": 0.32095988001499814, "grad_norm": 0.8342397212982178, "learning_rate": 0.00023817368290416036, "loss": 1.287, "step": 2140 }, { "epoch": 0.3224596925384327, "grad_norm": 0.7857896089553833, "learning_rate": 0.00023758314919329726, "loss": 1.3053, "step": 2150 }, { "epoch": 0.32395950506186727, "grad_norm": 0.7923012971878052, "learning_rate": 0.00023699054860974682, "loss": 1.2731, "step": 2160 }, { "epoch": 0.32545931758530183, "grad_norm": 0.7796991467475891, "learning_rate": 0.00023639589513827636, "loss": 1.2716, "step": 2170 }, { "epoch": 0.3269591301087364, "grad_norm": 0.8867438435554504, "learning_rate": 0.0002357992028120993, "loss": 1.2908, "step": 2180 }, { "epoch": 0.32845894263217096, "grad_norm": 0.8547908067703247, "learning_rate": 0.00023520048571254378, "loss": 1.2772, "step": 2190 }, { "epoch": 0.3299587551556056, "grad_norm": 0.8480901122093201, "learning_rate": 0.00023459975796872063, "loss": 1.2716, "step": 2200 }, { "epoch": 0.33145856767904014, "grad_norm": 0.8187602758407593, "learning_rate": 0.0002339970337571899, "loss": 1.2724, "step": 2210 }, { "epoch": 0.3329583802024747, "grad_norm": 0.8206058740615845, "learning_rate": 0.000233392327301626, "loss": 1.3034, "step": 2220 }, { "epoch": 0.33445819272590926, "grad_norm": 0.76264888048172, "learning_rate": 0.0002327856528724825, "loss": 1.2576, "step": 2230 }, { "epoch": 0.3359580052493438, "grad_norm": 0.8335399627685547, "learning_rate": 0.0002321770247866551, "loss": 1.2857, "step": 2240 }, { "epoch": 0.3374578177727784, "grad_norm": 0.7656426429748535, "learning_rate": 0.00023156645740714368, "loss": 1.2978, "step": 2250 }, { "epoch": 0.33895763029621295, "grad_norm": 0.7743305563926697, "learning_rate": 0.00023095396514271355, "loss": 1.2803, "step": 2260 }, { "epoch": 0.34045744281964757, "grad_norm": 0.7768455147743225, "learning_rate": 0.0002303395624475553, "loss": 1.2978, "step": 2270 }, { "epoch": 0.34195725534308213, "grad_norm": 0.8204723596572876, "learning_rate": 0.00022972326382094378, "loss": 1.2708, "step": 2280 }, { "epoch": 0.3434570678665167, "grad_norm": 0.8377450108528137, "learning_rate": 0.00022910508380689584, "loss": 1.276, "step": 2290 }, { "epoch": 0.34495688038995126, "grad_norm": 0.7735800743103027, "learning_rate": 0.00022848503699382717, "loss": 1.2987, "step": 2300 }, { "epoch": 0.3464566929133858, "grad_norm": 0.8727670907974243, "learning_rate": 0.00022786313801420794, "loss": 1.267, "step": 2310 }, { "epoch": 0.3479565054368204, "grad_norm": 0.7944260835647583, "learning_rate": 0.0002272394015442177, "loss": 1.2937, "step": 2320 }, { "epoch": 0.34945631796025495, "grad_norm": 0.8214771747589111, "learning_rate": 0.0002266138423033987, "loss": 1.2879, "step": 2330 }, { "epoch": 0.35095613048368957, "grad_norm": 0.7794116139411926, "learning_rate": 0.00022598647505430895, "loss": 1.2599, "step": 2340 }, { "epoch": 0.35245594300712413, "grad_norm": 0.76594078540802, "learning_rate": 0.0002253573146021733, "loss": 1.2613, "step": 2350 }, { "epoch": 0.3539557555305587, "grad_norm": 0.8062904477119446, "learning_rate": 0.0002247263757945347, "loss": 1.2959, "step": 2360 }, { "epoch": 0.35545556805399325, "grad_norm": 0.8257420063018799, "learning_rate": 0.00022409367352090322, "loss": 1.2567, "step": 2370 }, { "epoch": 0.3569553805774278, "grad_norm": 0.8322898149490356, "learning_rate": 0.00022345922271240496, "loss": 1.2684, "step": 2380 }, { "epoch": 0.3584551931008624, "grad_norm": 0.8116471171379089, "learning_rate": 0.00022282303834142978, "loss": 1.2643, "step": 2390 }, { "epoch": 0.35995500562429694, "grad_norm": 0.8192791938781738, "learning_rate": 0.0002221851354212777, "loss": 1.2586, "step": 2400 }, { "epoch": 0.36145481814773156, "grad_norm": 0.7919474244117737, "learning_rate": 0.0002215455290058048, "loss": 1.2869, "step": 2410 }, { "epoch": 0.3629546306711661, "grad_norm": 0.8426802158355713, "learning_rate": 0.000220904234189068, "loss": 1.2589, "step": 2420 }, { "epoch": 0.3644544431946007, "grad_norm": 0.851420521736145, "learning_rate": 0.00022026126610496852, "loss": 1.2569, "step": 2430 }, { "epoch": 0.36595425571803525, "grad_norm": 0.8213547468185425, "learning_rate": 0.0002196166399268952, "loss": 1.2698, "step": 2440 }, { "epoch": 0.3674540682414698, "grad_norm": 0.7695969343185425, "learning_rate": 0.00021897037086736614, "loss": 1.2668, "step": 2450 }, { "epoch": 0.3689538807649044, "grad_norm": 0.7834669351577759, "learning_rate": 0.0002183224741776697, "loss": 1.2662, "step": 2460 }, { "epoch": 0.37045369328833894, "grad_norm": 0.7951564788818359, "learning_rate": 0.00021767296514750472, "loss": 1.2661, "step": 2470 }, { "epoch": 0.37195350581177355, "grad_norm": 0.7705678939819336, "learning_rate": 0.00021702185910461958, "loss": 1.2623, "step": 2480 }, { "epoch": 0.3734533183352081, "grad_norm": 0.850374162197113, "learning_rate": 0.00021636917141445056, "loss": 1.2386, "step": 2490 }, { "epoch": 0.3749531308586427, "grad_norm": 0.795702338218689, "learning_rate": 0.00021571491747975917, "loss": 1.2604, "step": 2500 }, { "epoch": 0.37645294338207724, "grad_norm": 0.8536216020584106, "learning_rate": 0.0002150591127402687, "loss": 1.2497, "step": 2510 }, { "epoch": 0.3779527559055118, "grad_norm": 0.813890278339386, "learning_rate": 0.00021440177267229984, "loss": 1.2505, "step": 2520 }, { "epoch": 0.37945256842894637, "grad_norm": 0.8229677081108093, "learning_rate": 0.00021374291278840546, "loss": 1.2634, "step": 2530 }, { "epoch": 0.38095238095238093, "grad_norm": 0.868016242980957, "learning_rate": 0.00021308254863700452, "loss": 1.2537, "step": 2540 }, { "epoch": 0.38245219347581555, "grad_norm": 0.8083469271659851, "learning_rate": 0.00021242069580201524, "loss": 1.2702, "step": 2550 }, { "epoch": 0.3839520059992501, "grad_norm": 0.7700805068016052, "learning_rate": 0.00021175736990248714, "loss": 1.2755, "step": 2560 }, { "epoch": 0.3854518185226847, "grad_norm": 0.787372350692749, "learning_rate": 0.00021109258659223254, "loss": 1.2581, "step": 2570 }, { "epoch": 0.38695163104611924, "grad_norm": 0.8132008910179138, "learning_rate": 0.00021042636155945723, "loss": 1.2408, "step": 2580 }, { "epoch": 0.3884514435695538, "grad_norm": 0.7871599793434143, "learning_rate": 0.00020975871052639024, "loss": 1.2622, "step": 2590 }, { "epoch": 0.38995125609298836, "grad_norm": 0.841528594493866, "learning_rate": 0.00020908964924891256, "loss": 1.2382, "step": 2600 }, { "epoch": 0.3914510686164229, "grad_norm": 0.7999451756477356, "learning_rate": 0.0002084191935161857, "loss": 1.2771, "step": 2610 }, { "epoch": 0.39295088113985754, "grad_norm": 0.7474108934402466, "learning_rate": 0.0002077473591502788, "loss": 1.2656, "step": 2620 }, { "epoch": 0.3944506936632921, "grad_norm": 0.8236092329025269, "learning_rate": 0.00020707416200579524, "loss": 1.2576, "step": 2630 }, { "epoch": 0.39595050618672667, "grad_norm": 0.8108281493186951, "learning_rate": 0.00020639961796949877, "loss": 1.2534, "step": 2640 }, { "epoch": 0.39745031871016123, "grad_norm": 0.8653910756111145, "learning_rate": 0.00020572374295993822, "loss": 1.2666, "step": 2650 }, { "epoch": 0.3989501312335958, "grad_norm": 0.8193902969360352, "learning_rate": 0.00020504655292707223, "loss": 1.2528, "step": 2660 }, { "epoch": 0.40014998125234347, "eval_loss": 1.300592303276062, "eval_runtime": 34.4278, "eval_samples_per_second": 726.158, "eval_steps_per_second": 90.77, "step": 2668 }, { "epoch": 0.40044994375703036, "grad_norm": 0.8101006150245667, "learning_rate": 0.00020436806385189246, "loss": 1.2646, "step": 2670 }, { "epoch": 0.4019497562804649, "grad_norm": 0.8838757276535034, "learning_rate": 0.00020368829174604667, "loss": 1.2667, "step": 2680 }, { "epoch": 0.40344956880389954, "grad_norm": 0.7894991636276245, "learning_rate": 0.00020300725265146093, "loss": 1.266, "step": 2690 }, { "epoch": 0.4049493813273341, "grad_norm": 0.8235330581665039, "learning_rate": 0.00020232496263996092, "loss": 1.2503, "step": 2700 }, { "epoch": 0.40644919385076866, "grad_norm": 0.7787916660308838, "learning_rate": 0.00020164143781289256, "loss": 1.2521, "step": 2710 }, { "epoch": 0.4079490063742032, "grad_norm": 0.8202505707740784, "learning_rate": 0.00020095669430074235, "loss": 1.2717, "step": 2720 }, { "epoch": 0.4094488188976378, "grad_norm": 0.7967453598976135, "learning_rate": 0.00020027074826275629, "loss": 1.2636, "step": 2730 }, { "epoch": 0.41094863142107235, "grad_norm": 0.7384628653526306, "learning_rate": 0.00019958361588655888, "loss": 1.2531, "step": 2740 }, { "epoch": 0.4124484439445069, "grad_norm": 0.8367340564727783, "learning_rate": 0.00019889531338777112, "loss": 1.262, "step": 2750 }, { "epoch": 0.41394825646794153, "grad_norm": 0.8179819583892822, "learning_rate": 0.0001982058570096274, "loss": 1.2453, "step": 2760 }, { "epoch": 0.4154480689913761, "grad_norm": 0.7929909825325012, "learning_rate": 0.00019751526302259271, "loss": 1.2332, "step": 2770 }, { "epoch": 0.41694788151481066, "grad_norm": 0.803432285785675, "learning_rate": 0.00019682354772397842, "loss": 1.2376, "step": 2780 }, { "epoch": 0.4184476940382452, "grad_norm": 0.7572347521781921, "learning_rate": 0.00019613072743755755, "loss": 1.2535, "step": 2790 }, { "epoch": 0.4199475065616798, "grad_norm": 0.7730807662010193, "learning_rate": 0.00019543681851317998, "loss": 1.2457, "step": 2800 }, { "epoch": 0.42144731908511435, "grad_norm": 0.8271188735961914, "learning_rate": 0.00019474183732638608, "loss": 1.2525, "step": 2810 }, { "epoch": 0.4229471316085489, "grad_norm": 0.8117753863334656, "learning_rate": 0.0001940458002780206, "loss": 1.2321, "step": 2820 }, { "epoch": 0.42444694413198353, "grad_norm": 0.8418020606040955, "learning_rate": 0.00019334872379384556, "loss": 1.2501, "step": 2830 }, { "epoch": 0.4259467566554181, "grad_norm": 0.8009008765220642, "learning_rate": 0.0001926506243241526, "loss": 1.2398, "step": 2840 }, { "epoch": 0.42744656917885265, "grad_norm": 0.7942110300064087, "learning_rate": 0.00019195151834337473, "loss": 1.244, "step": 2850 }, { "epoch": 0.4289463817022872, "grad_norm": 0.8147196173667908, "learning_rate": 0.00019125142234969762, "loss": 1.2499, "step": 2860 }, { "epoch": 0.4304461942257218, "grad_norm": 0.843659520149231, "learning_rate": 0.00019055035286467034, "loss": 1.2424, "step": 2870 }, { "epoch": 0.43194600674915634, "grad_norm": 0.8289938569068909, "learning_rate": 0.00018984832643281513, "loss": 1.2529, "step": 2880 }, { "epoch": 0.4334458192725909, "grad_norm": 0.8647093772888184, "learning_rate": 0.00018914535962123735, "loss": 1.2579, "step": 2890 }, { "epoch": 0.4349456317960255, "grad_norm": 0.7539538145065308, "learning_rate": 0.00018844146901923436, "loss": 1.2408, "step": 2900 }, { "epoch": 0.4364454443194601, "grad_norm": 0.8231214284896851, "learning_rate": 0.000187736671237904, "loss": 1.2346, "step": 2910 }, { "epoch": 0.43794525684289465, "grad_norm": 0.7767258882522583, "learning_rate": 0.0001870309829097526, "loss": 1.2379, "step": 2920 }, { "epoch": 0.4394450693663292, "grad_norm": 0.8100400567054749, "learning_rate": 0.00018632442068830244, "loss": 1.2248, "step": 2930 }, { "epoch": 0.4409448818897638, "grad_norm": 0.7970197796821594, "learning_rate": 0.00018561700124769892, "loss": 1.2312, "step": 2940 }, { "epoch": 0.44244469441319834, "grad_norm": 0.8453084826469421, "learning_rate": 0.0001849087412823168, "loss": 1.2379, "step": 2950 }, { "epoch": 0.4439445069366329, "grad_norm": 0.7835219502449036, "learning_rate": 0.00018419965750636645, "loss": 1.2377, "step": 2960 }, { "epoch": 0.4454443194600675, "grad_norm": 0.9176828861236572, "learning_rate": 0.00018348976665349932, "loss": 1.2322, "step": 2970 }, { "epoch": 0.4469441319835021, "grad_norm": 0.8404021859169006, "learning_rate": 0.00018277908547641294, "loss": 1.2364, "step": 2980 }, { "epoch": 0.44844394450693664, "grad_norm": 0.8273976445198059, "learning_rate": 0.00018206763074645588, "loss": 1.2242, "step": 2990 }, { "epoch": 0.4499437570303712, "grad_norm": 0.7463936805725098, "learning_rate": 0.0001813554192532316, "loss": 1.2459, "step": 3000 }, { "epoch": 0.45144356955380577, "grad_norm": 0.7790806293487549, "learning_rate": 0.00018064246780420245, "loss": 1.2473, "step": 3010 }, { "epoch": 0.45294338207724033, "grad_norm": 0.766952633857727, "learning_rate": 0.000179928793224293, "loss": 1.219, "step": 3020 }, { "epoch": 0.4544431946006749, "grad_norm": 0.8558096289634705, "learning_rate": 0.00017921441235549295, "loss": 1.2413, "step": 3030 }, { "epoch": 0.4559430071241095, "grad_norm": 0.8494218587875366, "learning_rate": 0.00017849934205645967, "loss": 1.2492, "step": 3040 }, { "epoch": 0.4574428196475441, "grad_norm": 0.8279352784156799, "learning_rate": 0.00017778359920212047, "loss": 1.2509, "step": 3050 }, { "epoch": 0.45894263217097864, "grad_norm": 0.7695614695549011, "learning_rate": 0.0001770672006832741, "loss": 1.2375, "step": 3060 }, { "epoch": 0.4604424446944132, "grad_norm": 0.8258097767829895, "learning_rate": 0.00017635016340619255, "loss": 1.2286, "step": 3070 }, { "epoch": 0.46194225721784776, "grad_norm": 0.7904211282730103, "learning_rate": 0.00017563250429222173, "loss": 1.2527, "step": 3080 }, { "epoch": 0.4634420697412823, "grad_norm": 0.8494943976402283, "learning_rate": 0.00017491424027738216, "loss": 1.2484, "step": 3090 }, { "epoch": 0.4649418822647169, "grad_norm": 0.8655872941017151, "learning_rate": 0.0001741953883119696, "loss": 1.2172, "step": 3100 }, { "epoch": 0.4664416947881515, "grad_norm": 0.8473703861236572, "learning_rate": 0.00017347596536015472, "loss": 1.2376, "step": 3110 }, { "epoch": 0.46794150731158607, "grad_norm": 0.8274358510971069, "learning_rate": 0.00017275598839958296, "loss": 1.2458, "step": 3120 }, { "epoch": 0.46944131983502063, "grad_norm": 0.8414213061332703, "learning_rate": 0.00017203547442097369, "loss": 1.233, "step": 3130 }, { "epoch": 0.4709411323584552, "grad_norm": 0.8282588720321655, "learning_rate": 0.0001713144404277195, "loss": 1.2398, "step": 3140 }, { "epoch": 0.47244094488188976, "grad_norm": 0.8157975077629089, "learning_rate": 0.0001705929034354846, "loss": 1.2236, "step": 3150 }, { "epoch": 0.4739407574053243, "grad_norm": 0.8301715850830078, "learning_rate": 0.0001698708804718037, "loss": 1.2214, "step": 3160 }, { "epoch": 0.4754405699287589, "grad_norm": 0.7506479620933533, "learning_rate": 0.00016914838857567979, "loss": 1.2332, "step": 3170 }, { "epoch": 0.4769403824521935, "grad_norm": 0.8075399994850159, "learning_rate": 0.00016842544479718215, "loss": 1.2344, "step": 3180 }, { "epoch": 0.47844019497562806, "grad_norm": 0.9165489673614502, "learning_rate": 0.00016770206619704412, "loss": 1.2393, "step": 3190 }, { "epoch": 0.4799400074990626, "grad_norm": 0.7626012563705444, "learning_rate": 0.0001669782698462603, "loss": 1.2273, "step": 3200 }, { "epoch": 0.4814398200224972, "grad_norm": 0.8293296694755554, "learning_rate": 0.00016625407282568394, "loss": 1.2378, "step": 3210 }, { "epoch": 0.48293963254593175, "grad_norm": 0.7572868466377258, "learning_rate": 0.00016552949222562352, "loss": 1.2449, "step": 3220 }, { "epoch": 0.4844394450693663, "grad_norm": 0.7891345024108887, "learning_rate": 0.00016480454514543962, "loss": 1.2336, "step": 3230 }, { "epoch": 0.4859392575928009, "grad_norm": 0.7002502083778381, "learning_rate": 0.00016407924869314144, "loss": 1.2249, "step": 3240 }, { "epoch": 0.4874390701162355, "grad_norm": 0.811424732208252, "learning_rate": 0.00016335361998498296, "loss": 1.2053, "step": 3250 }, { "epoch": 0.48893888263967006, "grad_norm": 0.8211063146591187, "learning_rate": 0.00016262767614505912, "loss": 1.2139, "step": 3260 }, { "epoch": 0.4904386951631046, "grad_norm": 0.8250209093093872, "learning_rate": 0.00016190143430490152, "loss": 1.2163, "step": 3270 }, { "epoch": 0.4919385076865392, "grad_norm": 0.7996110320091248, "learning_rate": 0.00016117491160307445, "loss": 1.2361, "step": 3280 }, { "epoch": 0.49343832020997375, "grad_norm": 0.8566946983337402, "learning_rate": 0.00016044812518477007, "loss": 1.2353, "step": 3290 }, { "epoch": 0.4949381327334083, "grad_norm": 0.7891775369644165, "learning_rate": 0.00015972109220140402, "loss": 1.2174, "step": 3300 }, { "epoch": 0.49643794525684287, "grad_norm": 0.769114077091217, "learning_rate": 0.0001589938298102108, "loss": 1.2293, "step": 3310 }, { "epoch": 0.4979377577802775, "grad_norm": 0.8417636156082153, "learning_rate": 0.0001582663551738384, "loss": 1.2303, "step": 3320 }, { "epoch": 0.49943757030371205, "grad_norm": 0.7648106813430786, "learning_rate": 0.00015753868545994378, "loss": 1.2287, "step": 3330 }, { "epoch": 0.5001874765654293, "eval_loss": 1.2699334621429443, "eval_runtime": 34.8375, "eval_samples_per_second": 717.618, "eval_steps_per_second": 89.702, "step": 3335 }, { "epoch": 0.5009373828271466, "grad_norm": 0.8185369372367859, "learning_rate": 0.00015681083784078748, "loss": 1.2221, "step": 3340 }, { "epoch": 0.5024371953505812, "grad_norm": 0.8825401067733765, "learning_rate": 0.00015608282949282844, "loss": 1.2339, "step": 3350 }, { "epoch": 0.5039370078740157, "grad_norm": 0.8161605596542358, "learning_rate": 0.00015535467759631862, "loss": 1.2352, "step": 3360 }, { "epoch": 0.5054368203974503, "grad_norm": 0.8088217973709106, "learning_rate": 0.00015462639933489753, "loss": 1.2212, "step": 3370 }, { "epoch": 0.5069366329208849, "grad_norm": 0.8628859519958496, "learning_rate": 0.00015389801189518693, "loss": 1.2222, "step": 3380 }, { "epoch": 0.5084364454443194, "grad_norm": 0.8813005089759827, "learning_rate": 0.00015316953246638482, "loss": 1.2165, "step": 3390 }, { "epoch": 0.509936257967754, "grad_norm": 0.7920242547988892, "learning_rate": 0.00015244097823986023, "loss": 1.2197, "step": 3400 }, { "epoch": 0.5114360704911886, "grad_norm": 0.8255246877670288, "learning_rate": 0.0001517123664087473, "loss": 1.2359, "step": 3410 }, { "epoch": 0.5129358830146231, "grad_norm": 0.8508349657058716, "learning_rate": 0.00015098371416753963, "loss": 1.2222, "step": 3420 }, { "epoch": 0.5144356955380578, "grad_norm": 0.870098352432251, "learning_rate": 0.00015025503871168432, "loss": 1.2107, "step": 3430 }, { "epoch": 0.5159355080614924, "grad_norm": 0.7887611985206604, "learning_rate": 0.00014952635723717642, "loss": 1.2469, "step": 3440 }, { "epoch": 0.5174353205849269, "grad_norm": 0.8463996052742004, "learning_rate": 0.000148797686940153, "loss": 1.2251, "step": 3450 }, { "epoch": 0.5189351331083615, "grad_norm": 0.8258456587791443, "learning_rate": 0.0001480690450164873, "loss": 1.2241, "step": 3460 }, { "epoch": 0.520434945631796, "grad_norm": 0.8315289616584778, "learning_rate": 0.00014734044866138312, "loss": 1.1998, "step": 3470 }, { "epoch": 0.5219347581552306, "grad_norm": 0.8053680062294006, "learning_rate": 0.00014661191506896867, "loss": 1.2062, "step": 3480 }, { "epoch": 0.5234345706786652, "grad_norm": 0.7926360368728638, "learning_rate": 0.0001458834614318912, "loss": 1.2092, "step": 3490 }, { "epoch": 0.5249343832020997, "grad_norm": 0.7700707912445068, "learning_rate": 0.00014515510494091102, "loss": 1.2077, "step": 3500 }, { "epoch": 0.5264341957255343, "grad_norm": 0.8049729466438293, "learning_rate": 0.00014442686278449588, "loss": 1.2134, "step": 3510 }, { "epoch": 0.5279340082489689, "grad_norm": 0.8229398727416992, "learning_rate": 0.00014369875214841548, "loss": 1.2235, "step": 3520 }, { "epoch": 0.5294338207724034, "grad_norm": 0.7805364727973938, "learning_rate": 0.0001429707902153355, "loss": 1.2315, "step": 3530 }, { "epoch": 0.530933633295838, "grad_norm": 0.7947820425033569, "learning_rate": 0.0001422429941644127, "loss": 1.2166, "step": 3540 }, { "epoch": 0.5324334458192725, "grad_norm": 0.7772228121757507, "learning_rate": 0.000141515381170889, "loss": 1.2267, "step": 3550 }, { "epoch": 0.5339332583427071, "grad_norm": 0.7693078517913818, "learning_rate": 0.00014078796840568647, "loss": 1.2173, "step": 3560 }, { "epoch": 0.5354330708661418, "grad_norm": 0.8406286239624023, "learning_rate": 0.0001400607730350018, "loss": 1.2161, "step": 3570 }, { "epoch": 0.5369328833895763, "grad_norm": 0.7883846163749695, "learning_rate": 0.0001393338122199016, "loss": 1.23, "step": 3580 }, { "epoch": 0.5384326959130109, "grad_norm": 0.7825552821159363, "learning_rate": 0.00013860710311591713, "loss": 1.2165, "step": 3590 }, { "epoch": 0.5399325084364455, "grad_norm": 0.9384158253669739, "learning_rate": 0.00013788066287263946, "loss": 1.2197, "step": 3600 }, { "epoch": 0.54143232095988, "grad_norm": 0.888299286365509, "learning_rate": 0.00013715450863331495, "loss": 1.2082, "step": 3610 }, { "epoch": 0.5429321334833146, "grad_norm": 0.8535633087158203, "learning_rate": 0.00013642865753444043, "loss": 1.212, "step": 3620 }, { "epoch": 0.5444319460067492, "grad_norm": 0.8287407159805298, "learning_rate": 0.000135703126705359, "loss": 1.2009, "step": 3630 }, { "epoch": 0.5459317585301837, "grad_norm": 0.7710711359977722, "learning_rate": 0.00013497793326785573, "loss": 1.2205, "step": 3640 }, { "epoch": 0.5474315710536183, "grad_norm": 0.7808111310005188, "learning_rate": 0.00013425309433575365, "loss": 1.2149, "step": 3650 }, { "epoch": 0.5489313835770528, "grad_norm": 0.7702746987342834, "learning_rate": 0.0001335286270145096, "loss": 1.197, "step": 3660 }, { "epoch": 0.5504311961004874, "grad_norm": 0.832952618598938, "learning_rate": 0.00013280454840081105, "loss": 1.2075, "step": 3670 }, { "epoch": 0.551931008623922, "grad_norm": 0.7749794125556946, "learning_rate": 0.0001320808755821722, "loss": 1.2136, "step": 3680 }, { "epoch": 0.5534308211473565, "grad_norm": 0.8737798929214478, "learning_rate": 0.00013135762563653097, "loss": 1.2017, "step": 3690 }, { "epoch": 0.5549306336707911, "grad_norm": 0.8434765934944153, "learning_rate": 0.00013063481563184589, "loss": 1.1912, "step": 3700 }, { "epoch": 0.5564304461942258, "grad_norm": 0.7952425479888916, "learning_rate": 0.00012991246262569327, "loss": 1.2148, "step": 3710 }, { "epoch": 0.5579302587176603, "grad_norm": 0.8273133039474487, "learning_rate": 0.00012919058366486492, "loss": 1.219, "step": 3720 }, { "epoch": 0.5594300712410949, "grad_norm": 0.86753249168396, "learning_rate": 0.00012846919578496545, "loss": 1.1893, "step": 3730 }, { "epoch": 0.5609298837645295, "grad_norm": 0.8196529746055603, "learning_rate": 0.00012774831601001054, "loss": 1.2166, "step": 3740 }, { "epoch": 0.562429696287964, "grad_norm": 0.9246373772621155, "learning_rate": 0.00012702796135202518, "loss": 1.2296, "step": 3750 }, { "epoch": 0.5639295088113986, "grad_norm": 0.8339414000511169, "learning_rate": 0.00012630814881064206, "loss": 1.2164, "step": 3760 }, { "epoch": 0.5654293213348331, "grad_norm": 0.8195322155952454, "learning_rate": 0.00012558889537270048, "loss": 1.2031, "step": 3770 }, { "epoch": 0.5669291338582677, "grad_norm": 0.8975194096565247, "learning_rate": 0.0001248702180118455, "loss": 1.2236, "step": 3780 }, { "epoch": 0.5684289463817023, "grad_norm": 0.7996919751167297, "learning_rate": 0.00012415213368812731, "loss": 1.1993, "step": 3790 }, { "epoch": 0.5699287589051368, "grad_norm": 0.7755063772201538, "learning_rate": 0.00012343465934760102, "loss": 1.2084, "step": 3800 }, { "epoch": 0.5714285714285714, "grad_norm": 0.8294559717178345, "learning_rate": 0.00012271781192192688, "loss": 1.2175, "step": 3810 }, { "epoch": 0.572928383952006, "grad_norm": 0.8047690391540527, "learning_rate": 0.00012200160832797046, "loss": 1.1986, "step": 3820 }, { "epoch": 0.5744281964754405, "grad_norm": 0.7778790593147278, "learning_rate": 0.0001212860654674036, "loss": 1.2211, "step": 3830 }, { "epoch": 0.5759280089988752, "grad_norm": 0.8431829810142517, "learning_rate": 0.00012057120022630546, "loss": 1.2089, "step": 3840 }, { "epoch": 0.5774278215223098, "grad_norm": 0.8307316899299622, "learning_rate": 0.00011985702947476424, "loss": 1.2035, "step": 3850 }, { "epoch": 0.5789276340457443, "grad_norm": 0.9299211502075195, "learning_rate": 0.00011914357006647877, "loss": 1.1933, "step": 3860 }, { "epoch": 0.5804274465691789, "grad_norm": 0.9003536105155945, "learning_rate": 0.00011843083883836084, "loss": 1.2093, "step": 3870 }, { "epoch": 0.5819272590926134, "grad_norm": 0.8364601731300354, "learning_rate": 0.0001177188526101381, "loss": 1.2051, "step": 3880 }, { "epoch": 0.583427071616048, "grad_norm": 0.877537190914154, "learning_rate": 0.00011700762818395682, "loss": 1.213, "step": 3890 }, { "epoch": 0.5849268841394826, "grad_norm": 0.820450484752655, "learning_rate": 0.0001162971823439856, "loss": 1.2025, "step": 3900 }, { "epoch": 0.5864266966629171, "grad_norm": 0.7949150800704956, "learning_rate": 0.00011558753185601922, "loss": 1.2006, "step": 3910 }, { "epoch": 0.5879265091863517, "grad_norm": 0.7832253575325012, "learning_rate": 0.00011487869346708289, "loss": 1.1894, "step": 3920 }, { "epoch": 0.5894263217097863, "grad_norm": 0.7712039351463318, "learning_rate": 0.00011417068390503716, "loss": 1.2076, "step": 3930 }, { "epoch": 0.5909261342332208, "grad_norm": 0.8228356838226318, "learning_rate": 0.00011346351987818307, "loss": 1.1907, "step": 3940 }, { "epoch": 0.5924259467566554, "grad_norm": 0.8279032111167908, "learning_rate": 0.00011275721807486805, "loss": 1.2132, "step": 3950 }, { "epoch": 0.59392575928009, "grad_norm": 0.8680859208106995, "learning_rate": 0.00011205179516309172, "loss": 1.199, "step": 3960 }, { "epoch": 0.5954255718035245, "grad_norm": 0.9166079759597778, "learning_rate": 0.00011134726779011288, "loss": 1.2, "step": 3970 }, { "epoch": 0.5969253843269592, "grad_norm": 0.8387262225151062, "learning_rate": 0.00011064365258205658, "loss": 1.1922, "step": 3980 }, { "epoch": 0.5984251968503937, "grad_norm": 0.7966826558113098, "learning_rate": 0.00010994096614352153, "loss": 1.2041, "step": 3990 }, { "epoch": 0.5999250093738283, "grad_norm": 0.8371879458427429, "learning_rate": 0.00010923922505718863, "loss": 1.2041, "step": 4000 }, { "epoch": 0.6002249718785152, "eval_loss": 1.2461844682693481, "eval_runtime": 34.5014, "eval_samples_per_second": 724.607, "eval_steps_per_second": 90.576, "step": 4002 }, { "epoch": 0.6014248218972629, "grad_norm": 0.8346044421195984, "learning_rate": 0.00010853844588342926, "loss": 1.1886, "step": 4010 }, { "epoch": 0.6029246344206974, "grad_norm": 0.7978150248527527, "learning_rate": 0.00010783864515991481, "loss": 1.2161, "step": 4020 }, { "epoch": 0.604424446944132, "grad_norm": 0.8575156331062317, "learning_rate": 0.00010713983940122617, "loss": 1.2171, "step": 4030 }, { "epoch": 0.6059242594675666, "grad_norm": 0.8124422430992126, "learning_rate": 0.00010644204509846398, "loss": 1.1864, "step": 4040 }, { "epoch": 0.6074240719910011, "grad_norm": 0.8295300006866455, "learning_rate": 0.00010574527871885977, "loss": 1.2312, "step": 4050 }, { "epoch": 0.6089238845144357, "grad_norm": 0.8077391982078552, "learning_rate": 0.00010504955670538699, "loss": 1.2036, "step": 4060 }, { "epoch": 0.6104236970378702, "grad_norm": 0.839034914970398, "learning_rate": 0.00010435489547637316, "loss": 1.2101, "step": 4070 }, { "epoch": 0.6119235095613048, "grad_norm": 0.7653098106384277, "learning_rate": 0.00010366131142511228, "loss": 1.2127, "step": 4080 }, { "epoch": 0.6134233220847394, "grad_norm": 0.7839916944503784, "learning_rate": 0.00010296882091947826, "loss": 1.1972, "step": 4090 }, { "epoch": 0.6149231346081739, "grad_norm": 0.8738676905632019, "learning_rate": 0.00010227744030153821, "loss": 1.1864, "step": 4100 }, { "epoch": 0.6164229471316085, "grad_norm": 0.8577001094818115, "learning_rate": 0.0001015871858871672, "loss": 1.2282, "step": 4110 }, { "epoch": 0.6179227596550432, "grad_norm": 0.8081793785095215, "learning_rate": 0.00010089807396566306, "loss": 1.2156, "step": 4120 }, { "epoch": 0.6194225721784777, "grad_norm": 0.8585608601570129, "learning_rate": 0.00010021012079936174, "loss": 1.1875, "step": 4130 }, { "epoch": 0.6209223847019123, "grad_norm": 0.8267788290977478, "learning_rate": 9.952334262325399e-05, "loss": 1.175, "step": 4140 }, { "epoch": 0.6224221972253469, "grad_norm": 0.840057373046875, "learning_rate": 9.883775564460193e-05, "loss": 1.1884, "step": 4150 }, { "epoch": 0.6239220097487814, "grad_norm": 0.8279968500137329, "learning_rate": 9.815337604255665e-05, "loss": 1.1902, "step": 4160 }, { "epoch": 0.625421822272216, "grad_norm": 0.7800420522689819, "learning_rate": 9.747021996777624e-05, "loss": 1.1982, "step": 4170 }, { "epoch": 0.6269216347956506, "grad_norm": 0.7733381390571594, "learning_rate": 9.678830354204504e-05, "loss": 1.2104, "step": 4180 }, { "epoch": 0.6284214473190851, "grad_norm": 0.8349838256835938, "learning_rate": 9.610764285789271e-05, "loss": 1.1976, "step": 4190 }, { "epoch": 0.6299212598425197, "grad_norm": 0.8445114493370056, "learning_rate": 9.542825397821485e-05, "loss": 1.1877, "step": 4200 }, { "epoch": 0.6314210723659542, "grad_norm": 0.8512496948242188, "learning_rate": 9.475015293589373e-05, "loss": 1.1979, "step": 4210 }, { "epoch": 0.6329208848893888, "grad_norm": 0.817348837852478, "learning_rate": 9.407335573341997e-05, "loss": 1.1888, "step": 4220 }, { "epoch": 0.6344206974128234, "grad_norm": 0.8680126070976257, "learning_rate": 9.339787834251489e-05, "loss": 1.1961, "step": 4230 }, { "epoch": 0.6359205099362579, "grad_norm": 0.82338547706604, "learning_rate": 9.272373670375362e-05, "loss": 1.196, "step": 4240 }, { "epoch": 0.6374203224596925, "grad_norm": 0.7532997131347656, "learning_rate": 9.205094672618889e-05, "loss": 1.1987, "step": 4250 }, { "epoch": 0.6389201349831272, "grad_norm": 0.8462244272232056, "learning_rate": 9.137952428697568e-05, "loss": 1.1696, "step": 4260 }, { "epoch": 0.6404199475065617, "grad_norm": 0.7773210406303406, "learning_rate": 9.070948523099643e-05, "loss": 1.1903, "step": 4270 }, { "epoch": 0.6419197600299963, "grad_norm": 0.8117038607597351, "learning_rate": 9.004084537048708e-05, "loss": 1.1968, "step": 4280 }, { "epoch": 0.6434195725534309, "grad_norm": 0.7861129641532898, "learning_rate": 8.937362048466404e-05, "loss": 1.1933, "step": 4290 }, { "epoch": 0.6449193850768654, "grad_norm": 0.777773916721344, "learning_rate": 8.870782631935184e-05, "loss": 1.2017, "step": 4300 }, { "epoch": 0.6464191976003, "grad_norm": 0.8288027048110962, "learning_rate": 8.804347858661131e-05, "loss": 1.1937, "step": 4310 }, { "epoch": 0.6479190101237345, "grad_norm": 0.8184522390365601, "learning_rate": 8.73805929643691e-05, "loss": 1.1911, "step": 4320 }, { "epoch": 0.6494188226471691, "grad_norm": 0.7932900190353394, "learning_rate": 8.67191850960475e-05, "loss": 1.1875, "step": 4330 }, { "epoch": 0.6509186351706037, "grad_norm": 0.8443998694419861, "learning_rate": 8.605927059019528e-05, "loss": 1.1897, "step": 4340 }, { "epoch": 0.6524184476940382, "grad_norm": 0.8229474425315857, "learning_rate": 8.540086502011935e-05, "loss": 1.2059, "step": 4350 }, { "epoch": 0.6539182602174728, "grad_norm": 0.7777069211006165, "learning_rate": 8.47439839235174e-05, "loss": 1.2047, "step": 4360 }, { "epoch": 0.6554180727409074, "grad_norm": 0.8484175205230713, "learning_rate": 8.408864280211115e-05, "loss": 1.1743, "step": 4370 }, { "epoch": 0.6569178852643419, "grad_norm": 0.8335903882980347, "learning_rate": 8.343485712128026e-05, "loss": 1.1826, "step": 4380 }, { "epoch": 0.6584176977877765, "grad_norm": 0.827379584312439, "learning_rate": 8.278264230969769e-05, "loss": 1.1924, "step": 4390 }, { "epoch": 0.6599175103112112, "grad_norm": 0.7646244168281555, "learning_rate": 8.213201375896563e-05, "loss": 1.1829, "step": 4400 }, { "epoch": 0.6614173228346457, "grad_norm": 0.7910548448562622, "learning_rate": 8.14829868232519e-05, "loss": 1.1861, "step": 4410 }, { "epoch": 0.6629171353580803, "grad_norm": 0.8238165974617004, "learning_rate": 8.083557681892797e-05, "loss": 1.1852, "step": 4420 }, { "epoch": 0.6644169478815148, "grad_norm": 0.8701585531234741, "learning_rate": 8.018979902420746e-05, "loss": 1.1935, "step": 4430 }, { "epoch": 0.6659167604049494, "grad_norm": 0.8514190316200256, "learning_rate": 7.954566867878538e-05, "loss": 1.1901, "step": 4440 }, { "epoch": 0.667416572928384, "grad_norm": 0.8281582593917847, "learning_rate": 7.890320098347861e-05, "loss": 1.1747, "step": 4450 }, { "epoch": 0.6689163854518185, "grad_norm": 0.8169477581977844, "learning_rate": 7.82624110998673e-05, "loss": 1.182, "step": 4460 }, { "epoch": 0.6704161979752531, "grad_norm": 0.9043864011764526, "learning_rate": 7.762331414993697e-05, "loss": 1.1811, "step": 4470 }, { "epoch": 0.6719160104986877, "grad_norm": 0.9023800492286682, "learning_rate": 7.698592521572155e-05, "loss": 1.182, "step": 4480 }, { "epoch": 0.6734158230221222, "grad_norm": 0.8538755178451538, "learning_rate": 7.635025933894747e-05, "loss": 1.2004, "step": 4490 }, { "epoch": 0.6749156355455568, "grad_norm": 0.8358992338180542, "learning_rate": 7.571633152067901e-05, "loss": 1.1949, "step": 4500 }, { "epoch": 0.6764154480689913, "grad_norm": 0.8562530279159546, "learning_rate": 7.508415672096389e-05, "loss": 1.1891, "step": 4510 }, { "epoch": 0.6779152605924259, "grad_norm": 0.867493212223053, "learning_rate": 7.445374985848035e-05, "loss": 1.172, "step": 4520 }, { "epoch": 0.6794150731158605, "grad_norm": 0.8086623549461365, "learning_rate": 7.382512581018514e-05, "loss": 1.2105, "step": 4530 }, { "epoch": 0.6809148856392951, "grad_norm": 0.8249533176422119, "learning_rate": 7.31982994109626e-05, "loss": 1.1874, "step": 4540 }, { "epoch": 0.6824146981627297, "grad_norm": 0.7889946103096008, "learning_rate": 7.25732854532741e-05, "loss": 1.1974, "step": 4550 }, { "epoch": 0.6839145106861643, "grad_norm": 0.8312809467315674, "learning_rate": 7.195009868680954e-05, "loss": 1.1875, "step": 4560 }, { "epoch": 0.6854143232095988, "grad_norm": 0.8431591987609863, "learning_rate": 7.13287538181387e-05, "loss": 1.1794, "step": 4570 }, { "epoch": 0.6869141357330334, "grad_norm": 0.8547993302345276, "learning_rate": 7.070926551036469e-05, "loss": 1.1723, "step": 4580 }, { "epoch": 0.688413948256468, "grad_norm": 0.8453377485275269, "learning_rate": 7.009164838277754e-05, "loss": 1.1835, "step": 4590 }, { "epoch": 0.6899137607799025, "grad_norm": 0.8473166823387146, "learning_rate": 6.947591701050932e-05, "loss": 1.2166, "step": 4600 }, { "epoch": 0.6914135733033371, "grad_norm": 0.882140040397644, "learning_rate": 6.886208592419043e-05, "loss": 1.1916, "step": 4610 }, { "epoch": 0.6929133858267716, "grad_norm": 0.7941833138465881, "learning_rate": 6.825016960960616e-05, "loss": 1.1955, "step": 4620 }, { "epoch": 0.6944131983502062, "grad_norm": 0.8405663967132568, "learning_rate": 6.764018250735532e-05, "loss": 1.1734, "step": 4630 }, { "epoch": 0.6959130108736408, "grad_norm": 0.8553961515426636, "learning_rate": 6.703213901250931e-05, "loss": 1.1743, "step": 4640 }, { "epoch": 0.6974128233970753, "grad_norm": 0.8203520178794861, "learning_rate": 6.64260534742723e-05, "loss": 1.181, "step": 4650 }, { "epoch": 0.6989126359205099, "grad_norm": 0.9138359427452087, "learning_rate": 6.582194019564266e-05, "loss": 1.1663, "step": 4660 }, { "epoch": 0.700262467191601, "eval_loss": 1.2265734672546387, "eval_runtime": 34.5391, "eval_samples_per_second": 723.818, "eval_steps_per_second": 90.477, "step": 4669 }, { "epoch": 0.7004124484439445, "grad_norm": 0.8695697784423828, "learning_rate": 6.521981343307554e-05, "loss": 1.186, "step": 4670 }, { "epoch": 0.7019122609673791, "grad_norm": 0.7855265140533447, "learning_rate": 6.461968739614639e-05, "loss": 1.1716, "step": 4680 }, { "epoch": 0.7034120734908137, "grad_norm": 0.8557237982749939, "learning_rate": 6.402157624721546e-05, "loss": 1.1854, "step": 4690 }, { "epoch": 0.7049118860142483, "grad_norm": 0.9032356142997742, "learning_rate": 6.342549410109372e-05, "loss": 1.1622, "step": 4700 }, { "epoch": 0.7064116985376828, "grad_norm": 0.845805287361145, "learning_rate": 6.283145502470976e-05, "loss": 1.1895, "step": 4710 }, { "epoch": 0.7079115110611174, "grad_norm": 0.8459937572479248, "learning_rate": 6.223947303677793e-05, "loss": 1.1948, "step": 4720 }, { "epoch": 0.709411323584552, "grad_norm": 0.7924135327339172, "learning_rate": 6.164956210746723e-05, "loss": 1.1826, "step": 4730 }, { "epoch": 0.7109111361079865, "grad_norm": 0.8918642997741699, "learning_rate": 6.106173615807186e-05, "loss": 1.1798, "step": 4740 }, { "epoch": 0.7124109486314211, "grad_norm": 0.7782163619995117, "learning_rate": 6.047600906068269e-05, "loss": 1.1677, "step": 4750 }, { "epoch": 0.7139107611548556, "grad_norm": 0.8455360531806946, "learning_rate": 5.989239463785971e-05, "loss": 1.1956, "step": 4760 }, { "epoch": 0.7154105736782902, "grad_norm": 0.8486021161079407, "learning_rate": 5.9310906662306125e-05, "loss": 1.1881, "step": 4770 }, { "epoch": 0.7169103862017248, "grad_norm": 0.8694136142730713, "learning_rate": 5.8731558856542935e-05, "loss": 1.1795, "step": 4780 }, { "epoch": 0.7184101987251593, "grad_norm": 0.8481091260910034, "learning_rate": 5.8154364892585574e-05, "loss": 1.1663, "step": 4790 }, { "epoch": 0.7199100112485939, "grad_norm": 0.7921723127365112, "learning_rate": 5.75793383916208e-05, "loss": 1.1648, "step": 4800 }, { "epoch": 0.7214098237720284, "grad_norm": 0.8086884021759033, "learning_rate": 5.70064929236855e-05, "loss": 1.181, "step": 4810 }, { "epoch": 0.7229096362954631, "grad_norm": 0.854877233505249, "learning_rate": 5.643584200734659e-05, "loss": 1.1877, "step": 4820 }, { "epoch": 0.7244094488188977, "grad_norm": 0.9068853259086609, "learning_rate": 5.586739910938161e-05, "loss": 1.184, "step": 4830 }, { "epoch": 0.7259092613423322, "grad_norm": 0.7965474724769592, "learning_rate": 5.5301177644461164e-05, "loss": 1.1629, "step": 4840 }, { "epoch": 0.7274090738657668, "grad_norm": 0.8608559966087341, "learning_rate": 5.4737190974832426e-05, "loss": 1.1807, "step": 4850 }, { "epoch": 0.7289088863892014, "grad_norm": 0.8948774337768555, "learning_rate": 5.417545241000353e-05, "loss": 1.1759, "step": 4860 }, { "epoch": 0.7304086989126359, "grad_norm": 0.9525447487831116, "learning_rate": 5.361597520642981e-05, "loss": 1.1643, "step": 4870 }, { "epoch": 0.7319085114360705, "grad_norm": 0.905668318271637, "learning_rate": 5.3058772567200595e-05, "loss": 1.1799, "step": 4880 }, { "epoch": 0.7334083239595051, "grad_norm": 0.8792032599449158, "learning_rate": 5.250385764172802e-05, "loss": 1.1766, "step": 4890 }, { "epoch": 0.7349081364829396, "grad_norm": 0.9094644784927368, "learning_rate": 5.195124352543636e-05, "loss": 1.192, "step": 4900 }, { "epoch": 0.7364079490063742, "grad_norm": 0.8372027277946472, "learning_rate": 5.140094325945323e-05, "loss": 1.1655, "step": 4910 }, { "epoch": 0.7379077615298087, "grad_norm": 0.9662985801696777, "learning_rate": 5.085296983030164e-05, "loss": 1.1926, "step": 4920 }, { "epoch": 0.7394075740532433, "grad_norm": 0.8691169023513794, "learning_rate": 5.030733616959384e-05, "loss": 1.1785, "step": 4930 }, { "epoch": 0.7409073865766779, "grad_norm": 0.8685835599899292, "learning_rate": 4.976405515372577e-05, "loss": 1.182, "step": 4940 }, { "epoch": 0.7424071991001124, "grad_norm": 0.8488807082176208, "learning_rate": 4.922313960357336e-05, "loss": 1.1782, "step": 4950 }, { "epoch": 0.7439070116235471, "grad_norm": 0.8728071451187134, "learning_rate": 4.868460228419003e-05, "loss": 1.1837, "step": 4960 }, { "epoch": 0.7454068241469817, "grad_norm": 0.8640130162239075, "learning_rate": 4.814845590450544e-05, "loss": 1.1762, "step": 4970 }, { "epoch": 0.7469066366704162, "grad_norm": 0.861436128616333, "learning_rate": 4.761471311702541e-05, "loss": 1.1605, "step": 4980 }, { "epoch": 0.7484064491938508, "grad_norm": 0.9322028756141663, "learning_rate": 4.70833865175334e-05, "loss": 1.1804, "step": 4990 }, { "epoch": 0.7499062617172854, "grad_norm": 0.8335092663764954, "learning_rate": 4.6554488644793555e-05, "loss": 1.1822, "step": 5000 }, { "epoch": 0.7514060742407199, "grad_norm": 0.8994686603546143, "learning_rate": 4.602803198025429e-05, "loss": 1.1805, "step": 5010 }, { "epoch": 0.7529058867641545, "grad_norm": 0.9040279388427734, "learning_rate": 4.550402894775408e-05, "loss": 1.1593, "step": 5020 }, { "epoch": 0.754405699287589, "grad_norm": 0.8711543083190918, "learning_rate": 4.49824919132283e-05, "loss": 1.1526, "step": 5030 }, { "epoch": 0.7559055118110236, "grad_norm": 0.8705037832260132, "learning_rate": 4.446343318441719e-05, "loss": 1.1688, "step": 5040 }, { "epoch": 0.7574053243344582, "grad_norm": 0.8655483722686768, "learning_rate": 4.394686501057553e-05, "loss": 1.1758, "step": 5050 }, { "epoch": 0.7589051368578927, "grad_norm": 0.813109815120697, "learning_rate": 4.343279958218352e-05, "loss": 1.1772, "step": 5060 }, { "epoch": 0.7604049493813273, "grad_norm": 0.8437618613243103, "learning_rate": 4.29212490306592e-05, "loss": 1.1725, "step": 5070 }, { "epoch": 0.7619047619047619, "grad_norm": 0.908439576625824, "learning_rate": 4.241222542807211e-05, "loss": 1.1821, "step": 5080 }, { "epoch": 0.7634045744281964, "grad_norm": 0.8363406658172607, "learning_rate": 4.19057407868583e-05, "loss": 1.1671, "step": 5090 }, { "epoch": 0.7649043869516311, "grad_norm": 0.8935152888298035, "learning_rate": 4.140180705953689e-05, "loss": 1.1831, "step": 5100 }, { "epoch": 0.7664041994750657, "grad_norm": 0.9219470620155334, "learning_rate": 4.090043613842823e-05, "loss": 1.1837, "step": 5110 }, { "epoch": 0.7679040119985002, "grad_norm": 0.8783389329910278, "learning_rate": 4.0401639855372884e-05, "loss": 1.1915, "step": 5120 }, { "epoch": 0.7694038245219348, "grad_norm": 0.8177280426025391, "learning_rate": 3.990542998145262e-05, "loss": 1.1598, "step": 5130 }, { "epoch": 0.7709036370453693, "grad_norm": 0.8821897506713867, "learning_rate": 3.941181822671273e-05, "loss": 1.1794, "step": 5140 }, { "epoch": 0.7724034495688039, "grad_norm": 0.8683632612228394, "learning_rate": 3.892081623988541e-05, "loss": 1.1904, "step": 5150 }, { "epoch": 0.7739032620922385, "grad_norm": 0.800815999507904, "learning_rate": 3.8432435608115e-05, "loss": 1.1648, "step": 5160 }, { "epoch": 0.775403074615673, "grad_norm": 0.8233063817024231, "learning_rate": 3.794668785668465e-05, "loss": 1.1718, "step": 5170 }, { "epoch": 0.7769028871391076, "grad_norm": 0.8883649110794067, "learning_rate": 3.7463584448744186e-05, "loss": 1.1682, "step": 5180 }, { "epoch": 0.7784026996625422, "grad_norm": 0.8076657652854919, "learning_rate": 3.6983136785039636e-05, "loss": 1.1635, "step": 5190 }, { "epoch": 0.7799025121859767, "grad_norm": 0.7962640523910522, "learning_rate": 3.650535620364407e-05, "loss": 1.1731, "step": 5200 }, { "epoch": 0.7814023247094113, "grad_norm": 0.8250430822372437, "learning_rate": 3.603025397969037e-05, "loss": 1.1603, "step": 5210 }, { "epoch": 0.7829021372328459, "grad_norm": 0.9766924977302551, "learning_rate": 3.555784132510472e-05, "loss": 1.1686, "step": 5220 }, { "epoch": 0.7844019497562804, "grad_norm": 0.9148427844047546, "learning_rate": 3.508812938834227e-05, "loss": 1.1703, "step": 5230 }, { "epoch": 0.7859017622797151, "grad_norm": 0.8292137980461121, "learning_rate": 3.4621129254124106e-05, "loss": 1.1565, "step": 5240 }, { "epoch": 0.7874015748031497, "grad_norm": 0.8619733452796936, "learning_rate": 3.415685194317539e-05, "loss": 1.1571, "step": 5250 }, { "epoch": 0.7889013873265842, "grad_norm": 0.8798223733901978, "learning_rate": 3.3695308411965564e-05, "loss": 1.1738, "step": 5260 }, { "epoch": 0.7904011998500188, "grad_norm": 0.8004907369613647, "learning_rate": 3.323650955244951e-05, "loss": 1.1769, "step": 5270 }, { "epoch": 0.7919010123734533, "grad_norm": 0.9049323201179504, "learning_rate": 3.2780466191810905e-05, "loss": 1.1641, "step": 5280 }, { "epoch": 0.7934008248968879, "grad_norm": 0.8505051136016846, "learning_rate": 3.232718909220631e-05, "loss": 1.1765, "step": 5290 }, { "epoch": 0.7949006374203225, "grad_norm": 0.878035843372345, "learning_rate": 3.187668895051135e-05, "loss": 1.1665, "step": 5300 }, { "epoch": 0.796400449943757, "grad_norm": 0.8694955706596375, "learning_rate": 3.14289763980683e-05, "loss": 1.1904, "step": 5310 }, { "epoch": 0.7979002624671916, "grad_norm": 0.917580783367157, "learning_rate": 3.0984062000435276e-05, "loss": 1.1738, "step": 5320 }, { "epoch": 0.7994000749906262, "grad_norm": 0.9352427124977112, "learning_rate": 3.054195625713668e-05, "loss": 1.1685, "step": 5330 }, { "epoch": 0.8002999625046869, "eval_loss": 1.210194706916809, "eval_runtime": 35.2273, "eval_samples_per_second": 709.676, "eval_steps_per_second": 88.71, "step": 5336 }, { "epoch": 0.8008998875140607, "grad_norm": 0.845475971698761, "learning_rate": 3.0102669601415575e-05, "loss": 1.1577, "step": 5340 }, { "epoch": 0.8023997000374953, "grad_norm": 0.8200697898864746, "learning_rate": 2.966621239998755e-05, "loss": 1.1577, "step": 5350 }, { "epoch": 0.8038995125609298, "grad_norm": 0.8095065355300903, "learning_rate": 2.9232594952795818e-05, "loss": 1.1589, "step": 5360 }, { "epoch": 0.8053993250843644, "grad_norm": 0.8357236385345459, "learning_rate": 2.8801827492768352e-05, "loss": 1.1808, "step": 5370 }, { "epoch": 0.8068991376077991, "grad_norm": 0.8971685171127319, "learning_rate": 2.8373920185576375e-05, "loss": 1.1649, "step": 5380 }, { "epoch": 0.8083989501312336, "grad_norm": 0.8829404711723328, "learning_rate": 2.7948883129394467e-05, "loss": 1.1626, "step": 5390 }, { "epoch": 0.8098987626546682, "grad_norm": 0.8451800346374512, "learning_rate": 2.7526726354662104e-05, "loss": 1.155, "step": 5400 }, { "epoch": 0.8113985751781028, "grad_norm": 0.8615455031394958, "learning_rate": 2.7107459823847106e-05, "loss": 1.1607, "step": 5410 }, { "epoch": 0.8128983877015373, "grad_norm": 0.9241278767585754, "learning_rate": 2.6691093431210596e-05, "loss": 1.1771, "step": 5420 }, { "epoch": 0.8143982002249719, "grad_norm": 0.8356271982192993, "learning_rate": 2.6277637002573288e-05, "loss": 1.1755, "step": 5430 }, { "epoch": 0.8158980127484065, "grad_norm": 0.8474392294883728, "learning_rate": 2.586710029508375e-05, "loss": 1.1652, "step": 5440 }, { "epoch": 0.817397825271841, "grad_norm": 0.9105897545814514, "learning_rate": 2.54594929969881e-05, "loss": 1.158, "step": 5450 }, { "epoch": 0.8188976377952756, "grad_norm": 0.8202654123306274, "learning_rate": 2.5054824727401502e-05, "loss": 1.1632, "step": 5460 }, { "epoch": 0.8203974503187101, "grad_norm": 0.8397065997123718, "learning_rate": 2.46531050360809e-05, "loss": 1.166, "step": 5470 }, { "epoch": 0.8218972628421447, "grad_norm": 0.8714698553085327, "learning_rate": 2.4254343403199945e-05, "loss": 1.1749, "step": 5480 }, { "epoch": 0.8233970753655793, "grad_norm": 0.8937433958053589, "learning_rate": 2.3858549239125034e-05, "loss": 1.1631, "step": 5490 }, { "epoch": 0.8248968878890138, "grad_norm": 0.8778429627418518, "learning_rate": 2.346573188419341e-05, "loss": 1.1661, "step": 5500 }, { "epoch": 0.8263967004124484, "grad_norm": 0.7826377153396606, "learning_rate": 2.3075900608492637e-05, "loss": 1.1686, "step": 5510 }, { "epoch": 0.8278965129358831, "grad_norm": 1.108136773109436, "learning_rate": 2.2689064611641794e-05, "loss": 1.1926, "step": 5520 }, { "epoch": 0.8293963254593176, "grad_norm": 0.8729405403137207, "learning_rate": 2.230523302257461e-05, "loss": 1.1554, "step": 5530 }, { "epoch": 0.8308961379827522, "grad_norm": 0.824364185333252, "learning_rate": 2.192441489932372e-05, "loss": 1.1715, "step": 5540 }, { "epoch": 0.8323959505061868, "grad_norm": 0.8968133330345154, "learning_rate": 2.154661922880708e-05, "loss": 1.1549, "step": 5550 }, { "epoch": 0.8338957630296213, "grad_norm": 0.9061416983604431, "learning_rate": 2.117185492661592e-05, "loss": 1.1516, "step": 5560 }, { "epoch": 0.8353955755530559, "grad_norm": 0.8204578161239624, "learning_rate": 2.0800130836804214e-05, "loss": 1.1637, "step": 5570 }, { "epoch": 0.8368953880764904, "grad_norm": 0.8293260931968689, "learning_rate": 2.043145573168003e-05, "loss": 1.1604, "step": 5580 }, { "epoch": 0.838395200599925, "grad_norm": 0.8907037377357483, "learning_rate": 2.0065838311598543e-05, "loss": 1.1783, "step": 5590 }, { "epoch": 0.8398950131233596, "grad_norm": 0.8842608332633972, "learning_rate": 1.9703287204756757e-05, "loss": 1.1576, "step": 5600 }, { "epoch": 0.8413948256467941, "grad_norm": 0.8065600991249084, "learning_rate": 1.9343810966989716e-05, "loss": 1.182, "step": 5610 }, { "epoch": 0.8428946381702287, "grad_norm": 0.8144896626472473, "learning_rate": 1.8987418081568683e-05, "loss": 1.1482, "step": 5620 }, { "epoch": 0.8443944506936633, "grad_norm": 0.8756045699119568, "learning_rate": 1.8634116959001106e-05, "loss": 1.1623, "step": 5630 }, { "epoch": 0.8458942632170978, "grad_norm": 0.9638755321502686, "learning_rate": 1.828391593683185e-05, "loss": 1.1479, "step": 5640 }, { "epoch": 0.8473940757405324, "grad_norm": 0.8920614123344421, "learning_rate": 1.7936823279446676e-05, "loss": 1.1531, "step": 5650 }, { "epoch": 0.8488938882639671, "grad_norm": 0.8344776630401611, "learning_rate": 1.7592847177877008e-05, "loss": 1.1642, "step": 5660 }, { "epoch": 0.8503937007874016, "grad_norm": 0.8521091341972351, "learning_rate": 1.725199574960689e-05, "loss": 1.1456, "step": 5670 }, { "epoch": 0.8518935133108362, "grad_norm": 0.8243123292922974, "learning_rate": 1.6914277038381145e-05, "loss": 1.1689, "step": 5680 }, { "epoch": 0.8533933258342707, "grad_norm": 0.8362585306167603, "learning_rate": 1.6579699014015783e-05, "loss": 1.1565, "step": 5690 }, { "epoch": 0.8548931383577053, "grad_norm": 0.8934555053710938, "learning_rate": 1.6248269572209716e-05, "loss": 1.158, "step": 5700 }, { "epoch": 0.8563929508811399, "grad_norm": 0.9755041599273682, "learning_rate": 1.5919996534358635e-05, "loss": 1.1412, "step": 5710 }, { "epoch": 0.8578927634045744, "grad_norm": 0.9437934160232544, "learning_rate": 1.5594887647370263e-05, "loss": 1.1528, "step": 5720 }, { "epoch": 0.859392575928009, "grad_norm": 0.8423501253128052, "learning_rate": 1.527295058348154e-05, "loss": 1.1374, "step": 5730 }, { "epoch": 0.8608923884514436, "grad_norm": 0.8530197739601135, "learning_rate": 1.4954192940077809e-05, "loss": 1.153, "step": 5740 }, { "epoch": 0.8623922009748781, "grad_norm": 0.9238439798355103, "learning_rate": 1.463862223951317e-05, "loss": 1.1658, "step": 5750 }, { "epoch": 0.8638920134983127, "grad_norm": 0.82196444272995, "learning_rate": 1.4326245928933178e-05, "loss": 1.1657, "step": 5760 }, { "epoch": 0.8653918260217472, "grad_norm": 0.8516616821289062, "learning_rate": 1.4017071380099132e-05, "loss": 1.1585, "step": 5770 }, { "epoch": 0.8668916385451818, "grad_norm": 0.8456748127937317, "learning_rate": 1.3711105889213908e-05, "loss": 1.1606, "step": 5780 }, { "epoch": 0.8683914510686164, "grad_norm": 0.8640986680984497, "learning_rate": 1.3408356676750043e-05, "loss": 1.1806, "step": 5790 }, { "epoch": 0.869891263592051, "grad_norm": 0.8690096735954285, "learning_rate": 1.310883088727902e-05, "loss": 1.1607, "step": 5800 }, { "epoch": 0.8713910761154856, "grad_norm": 0.8273411393165588, "learning_rate": 1.2812535589303024e-05, "loss": 1.1436, "step": 5810 }, { "epoch": 0.8728908886389202, "grad_norm": 0.7910122871398926, "learning_rate": 1.2519477775087805e-05, "loss": 1.1706, "step": 5820 }, { "epoch": 0.8743907011623547, "grad_norm": 0.9277138710021973, "learning_rate": 1.222966436049786e-05, "loss": 1.1801, "step": 5830 }, { "epoch": 0.8758905136857893, "grad_norm": 0.861088752746582, "learning_rate": 1.1943102184833165e-05, "loss": 1.1763, "step": 5840 }, { "epoch": 0.8773903262092239, "grad_norm": 0.8729887008666992, "learning_rate": 1.165979801066782e-05, "loss": 1.1571, "step": 5850 }, { "epoch": 0.8788901387326584, "grad_norm": 0.9203227162361145, "learning_rate": 1.1379758523690413e-05, "loss": 1.1756, "step": 5860 }, { "epoch": 0.880389951256093, "grad_norm": 0.8871293663978577, "learning_rate": 1.1102990332546175e-05, "loss": 1.1578, "step": 5870 }, { "epoch": 0.8818897637795275, "grad_norm": 0.8668881058692932, "learning_rate": 1.0829499968681204e-05, "loss": 1.1618, "step": 5880 }, { "epoch": 0.8833895763029621, "grad_norm": 0.8996675610542297, "learning_rate": 1.0559293886188246e-05, "loss": 1.1723, "step": 5890 }, { "epoch": 0.8848893888263967, "grad_norm": 0.892325758934021, "learning_rate": 1.029237846165426e-05, "loss": 1.1645, "step": 5900 }, { "epoch": 0.8863892013498312, "grad_norm": 0.8483745455741882, "learning_rate": 1.0028759994010071e-05, "loss": 1.1685, "step": 5910 }, { "epoch": 0.8878890138732658, "grad_norm": 0.8226345181465149, "learning_rate": 9.768444704381811e-06, "loss": 1.156, "step": 5920 }, { "epoch": 0.8893888263967004, "grad_norm": 0.9334031343460083, "learning_rate": 9.511438735943849e-06, "loss": 1.1732, "step": 5930 }, { "epoch": 0.890888638920135, "grad_norm": 0.9309195876121521, "learning_rate": 9.257748153773992e-06, "loss": 1.158, "step": 5940 }, { "epoch": 0.8923884514435696, "grad_norm": 0.899749219417572, "learning_rate": 9.007378944710431e-06, "loss": 1.1512, "step": 5950 }, { "epoch": 0.8938882639670042, "grad_norm": 0.8760167956352234, "learning_rate": 8.760337017210206e-06, "loss": 1.1453, "step": 5960 }, { "epoch": 0.8953880764904387, "grad_norm": 0.9029643535614014, "learning_rate": 8.516628201209985e-06, "loss": 1.1561, "step": 5970 }, { "epoch": 0.8968878890138733, "grad_norm": 0.8686094284057617, "learning_rate": 8.276258247988437e-06, "loss": 1.1569, "step": 5980 }, { "epoch": 0.8983877015373078, "grad_norm": 0.876531720161438, "learning_rate": 8.039232830030413e-06, "loss": 1.1651, "step": 5990 }, { "epoch": 0.8998875140607424, "grad_norm": 0.896596372127533, "learning_rate": 7.805557540893276e-06, "loss": 1.1709, "step": 6000 }, { "epoch": 0.9003374578177727, "eval_loss": 1.203166127204895, "eval_runtime": 35.8498, "eval_samples_per_second": 697.355, "eval_steps_per_second": 87.169, "step": 6003 }, { "epoch": 0.901387326584177, "grad_norm": 0.8913058638572693, "learning_rate": 7.575237895074637e-06, "loss": 1.1691, "step": 6010 }, { "epoch": 0.9028871391076115, "grad_norm": 0.8780670166015625, "learning_rate": 7.348279327882467e-06, "loss": 1.1685, "step": 6020 }, { "epoch": 0.9043869516310461, "grad_norm": 0.828803539276123, "learning_rate": 7.1246871953066666e-06, "loss": 1.1532, "step": 6030 }, { "epoch": 0.9058867641544807, "grad_norm": 0.8630168437957764, "learning_rate": 6.9044667738927365e-06, "loss": 1.1641, "step": 6040 }, { "epoch": 0.9073865766779152, "grad_norm": 0.8399310111999512, "learning_rate": 6.6876232606172255e-06, "loss": 1.1596, "step": 6050 }, { "epoch": 0.9088863892013498, "grad_norm": 0.8821493983268738, "learning_rate": 6.4741617727651626e-06, "loss": 1.1501, "step": 6060 }, { "epoch": 0.9103862017247843, "grad_norm": 0.948297381401062, "learning_rate": 6.264087347809188e-06, "loss": 1.1734, "step": 6070 }, { "epoch": 0.911886014248219, "grad_norm": 0.9395800232887268, "learning_rate": 6.0574049432907115e-06, "loss": 1.1738, "step": 6080 }, { "epoch": 0.9133858267716536, "grad_norm": 0.8580440878868103, "learning_rate": 5.854119436702976e-06, "loss": 1.1677, "step": 6090 }, { "epoch": 0.9148856392950881, "grad_norm": 0.9488980770111084, "learning_rate": 5.65423562537593e-06, "loss": 1.1638, "step": 6100 }, { "epoch": 0.9163854518185227, "grad_norm": 0.8783366680145264, "learning_rate": 5.4577582263629235e-06, "loss": 1.1676, "step": 6110 }, { "epoch": 0.9178852643419573, "grad_norm": 0.8629846572875977, "learning_rate": 5.264691876329474e-06, "loss": 1.1426, "step": 6120 }, { "epoch": 0.9193850768653918, "grad_norm": 0.8540226221084595, "learning_rate": 5.075041131443891e-06, "loss": 1.1582, "step": 6130 }, { "epoch": 0.9208848893888264, "grad_norm": 0.8809103965759277, "learning_rate": 4.88881046726966e-06, "loss": 1.1514, "step": 6140 }, { "epoch": 0.922384701912261, "grad_norm": 0.9011795520782471, "learning_rate": 4.706004278659831e-06, "loss": 1.1543, "step": 6150 }, { "epoch": 0.9238845144356955, "grad_norm": 0.8636093139648438, "learning_rate": 4.526626879653428e-06, "loss": 1.1694, "step": 6160 }, { "epoch": 0.9253843269591301, "grad_norm": 0.8122191429138184, "learning_rate": 4.350682503373437e-06, "loss": 1.1508, "step": 6170 }, { "epoch": 0.9268841394825647, "grad_norm": 0.9113324880599976, "learning_rate": 4.178175301927101e-06, "loss": 1.1767, "step": 6180 }, { "epoch": 0.9283839520059992, "grad_norm": 0.8934022784233093, "learning_rate": 4.009109346307792e-06, "loss": 1.162, "step": 6190 }, { "epoch": 0.9298837645294338, "grad_norm": 0.8794459700584412, "learning_rate": 3.8434886262991015e-06, "loss": 1.167, "step": 6200 }, { "epoch": 0.9313835770528683, "grad_norm": 0.9140746593475342, "learning_rate": 3.6813170503804834e-06, "loss": 1.1828, "step": 6210 }, { "epoch": 0.932883389576303, "grad_norm": 0.8573930859565735, "learning_rate": 3.522598445635172e-06, "loss": 1.138, "step": 6220 }, { "epoch": 0.9343832020997376, "grad_norm": 0.8401947021484375, "learning_rate": 3.3673365576598e-06, "loss": 1.1599, "step": 6230 }, { "epoch": 0.9358830146231721, "grad_norm": 0.9218401908874512, "learning_rate": 3.21553505047602e-06, "loss": 1.1699, "step": 6240 }, { "epoch": 0.9373828271466067, "grad_norm": 0.9082098603248596, "learning_rate": 3.067197506444058e-06, "loss": 1.1595, "step": 6250 }, { "epoch": 0.9388826396700413, "grad_norm": 0.9707618355751038, "learning_rate": 2.922327426178128e-06, "loss": 1.1417, "step": 6260 }, { "epoch": 0.9403824521934758, "grad_norm": 0.8763731718063354, "learning_rate": 2.7809282284638855e-06, "loss": 1.1839, "step": 6270 }, { "epoch": 0.9418822647169104, "grad_norm": 0.9058078527450562, "learning_rate": 2.643003250177672e-06, "loss": 1.147, "step": 6280 }, { "epoch": 0.943382077240345, "grad_norm": 0.817454993724823, "learning_rate": 2.5085557462078134e-06, "loss": 1.1457, "step": 6290 }, { "epoch": 0.9448818897637795, "grad_norm": 0.919282853603363, "learning_rate": 2.377588889377813e-06, "loss": 1.1738, "step": 6300 }, { "epoch": 0.9463817022872141, "grad_norm": 0.9078910946846008, "learning_rate": 2.2501057703714797e-06, "loss": 1.175, "step": 6310 }, { "epoch": 0.9478815148106486, "grad_norm": 0.894939661026001, "learning_rate": 2.1261093976599365e-06, "loss": 1.1704, "step": 6320 }, { "epoch": 0.9493813273340832, "grad_norm": 0.8727539777755737, "learning_rate": 2.005602697430675e-06, "loss": 1.1619, "step": 6330 }, { "epoch": 0.9508811398575178, "grad_norm": 0.8867236971855164, "learning_rate": 1.8885885135184963e-06, "loss": 1.1693, "step": 6340 }, { "epoch": 0.9523809523809523, "grad_norm": 0.8786768317222595, "learning_rate": 1.7750696073383974e-06, "loss": 1.1494, "step": 6350 }, { "epoch": 0.953880764904387, "grad_norm": 0.8469645380973816, "learning_rate": 1.6650486578203725e-06, "loss": 1.1619, "step": 6360 }, { "epoch": 0.9553805774278216, "grad_norm": 0.8775497674942017, "learning_rate": 1.558528261346248e-06, "loss": 1.1448, "step": 6370 }, { "epoch": 0.9568803899512561, "grad_norm": 0.9189411997795105, "learning_rate": 1.455510931688364e-06, "loss": 1.1539, "step": 6380 }, { "epoch": 0.9583802024746907, "grad_norm": 0.8403207659721375, "learning_rate": 1.3559990999502556e-06, "loss": 1.1644, "step": 6390 }, { "epoch": 0.9598800149981253, "grad_norm": 0.910416305065155, "learning_rate": 1.2599951145093157e-06, "loss": 1.1549, "step": 6400 }, { "epoch": 0.9613798275215598, "grad_norm": 0.8597015738487244, "learning_rate": 1.1675012409613715e-06, "loss": 1.1502, "step": 6410 }, { "epoch": 0.9628796400449944, "grad_norm": 0.8848598003387451, "learning_rate": 1.0785196620671455e-06, "loss": 1.1582, "step": 6420 }, { "epoch": 0.9643794525684289, "grad_norm": 0.8559622764587402, "learning_rate": 9.93052477700862e-07, "loss": 1.1679, "step": 6430 }, { "epoch": 0.9658792650918635, "grad_norm": 0.814378559589386, "learning_rate": 9.111017048005876e-07, "loss": 1.1639, "step": 6440 }, { "epoch": 0.9673790776152981, "grad_norm": 0.7913538217544556, "learning_rate": 8.326692773207189e-07, "loss": 1.1621, "step": 6450 }, { "epoch": 0.9688788901387326, "grad_norm": 0.947375476360321, "learning_rate": 7.577570461862359e-07, "loss": 1.158, "step": 6460 }, { "epoch": 0.9703787026621672, "grad_norm": 0.8357100486755371, "learning_rate": 6.863667792491534e-07, "loss": 1.1584, "step": 6470 }, { "epoch": 0.9718785151856018, "grad_norm": 0.880916953086853, "learning_rate": 6.185001612467044e-07, "loss": 1.1671, "step": 6480 }, { "epoch": 0.9733783277090363, "grad_norm": 0.8177213072776794, "learning_rate": 5.541587937616221e-07, "loss": 1.161, "step": 6490 }, { "epoch": 0.974878140232471, "grad_norm": 0.8296107053756714, "learning_rate": 4.933441951843198e-07, "loss": 1.1555, "step": 6500 }, { "epoch": 0.9763779527559056, "grad_norm": 0.8631340861320496, "learning_rate": 4.360578006770865e-07, "loss": 1.1624, "step": 6510 }, { "epoch": 0.9778777652793401, "grad_norm": 0.8406107425689697, "learning_rate": 3.82300962140214e-07, "loss": 1.1615, "step": 6520 }, { "epoch": 0.9793775778027747, "grad_norm": 0.9254620671272278, "learning_rate": 3.320749481800888e-07, "loss": 1.1597, "step": 6530 }, { "epoch": 0.9808773903262092, "grad_norm": 0.9091020822525024, "learning_rate": 2.8538094407919987e-07, "loss": 1.1537, "step": 6540 }, { "epoch": 0.9823772028496438, "grad_norm": 0.8602820634841919, "learning_rate": 2.4222005176829375e-07, "loss": 1.1485, "step": 6550 }, { "epoch": 0.9838770153730784, "grad_norm": 0.9516273736953735, "learning_rate": 2.025932898002458e-07, "loss": 1.1706, "step": 6560 }, { "epoch": 0.9853768278965129, "grad_norm": 0.8547878265380859, "learning_rate": 1.6650159332607939e-07, "loss": 1.1511, "step": 6570 }, { "epoch": 0.9868766404199475, "grad_norm": 0.9445357918739319, "learning_rate": 1.3394581407289996e-07, "loss": 1.1735, "step": 6580 }, { "epoch": 0.9883764529433821, "grad_norm": 0.9283078908920288, "learning_rate": 1.0492672032377803e-07, "loss": 1.1665, "step": 6590 }, { "epoch": 0.9898762654668166, "grad_norm": 0.9254633784294128, "learning_rate": 7.944499689961358e-08, "loss": 1.1533, "step": 6600 }, { "epoch": 0.9913760779902512, "grad_norm": 0.9120994806289673, "learning_rate": 5.7501245143015685e-08, "loss": 1.1616, "step": 6610 }, { "epoch": 0.9928758905136857, "grad_norm": 0.8868552446365356, "learning_rate": 3.9095982904080447e-08, "loss": 1.1591, "step": 6620 }, { "epoch": 0.9943757030371203, "grad_norm": 0.8874196410179138, "learning_rate": 2.4229644528150905e-08, "loss": 1.168, "step": 6630 }, { "epoch": 0.995875515560555, "grad_norm": 0.8636330366134644, "learning_rate": 1.290258084557516e-08, "loss": 1.1572, "step": 6640 }, { "epoch": 0.9973753280839895, "grad_norm": 0.8626487255096436, "learning_rate": 5.115059163496304e-09, "loss": 1.1482, "step": 6650 }, { "epoch": 0.9988751406074241, "grad_norm": 0.9289808869361877, "learning_rate": 8.672632594408646e-10, "loss": 1.1632, "step": 6660 } ], "logging_steps": 10, "max_steps": 6667, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 667, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2426610074517504.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }