| { | |
| "best_metric": 1.203166127204895, | |
| "best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/cds-llama85/checkpoint-6003", | |
| "epoch": 0.9999250093738282, | |
| "eval_steps": 667, | |
| "global_step": 6667, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0014998125234345708, | |
| "grad_norm": 4.941356658935547, | |
| "learning_rate": 1.4999999999999999e-05, | |
| "loss": 4.5697, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0029996250468691415, | |
| "grad_norm": 3.326807737350464, | |
| "learning_rate": 2.9999999999999997e-05, | |
| "loss": 4.207, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0044994375703037125, | |
| "grad_norm": 2.287435531616211, | |
| "learning_rate": 4.4999999999999996e-05, | |
| "loss": 3.7885, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005999250093738283, | |
| "grad_norm": 1.804822325706482, | |
| "learning_rate": 5.9999999999999995e-05, | |
| "loss": 3.5512, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0074990626171728535, | |
| "grad_norm": 1.4687652587890625, | |
| "learning_rate": 7.5e-05, | |
| "loss": 3.4201, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.008998875140607425, | |
| "grad_norm": 1.0859005451202393, | |
| "learning_rate": 8.999999999999999e-05, | |
| "loss": 3.2891, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.010498687664041995, | |
| "grad_norm": 0.9512425661087036, | |
| "learning_rate": 0.00010499999999999999, | |
| "loss": 3.1539, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.011998500187476566, | |
| "grad_norm": 1.2439535856246948, | |
| "learning_rate": 0.00011999999999999999, | |
| "loss": 2.919, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.013498312710911136, | |
| "grad_norm": 1.0196483135223389, | |
| "learning_rate": 0.000135, | |
| "loss": 2.728, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.014998125234345707, | |
| "grad_norm": 3.3797309398651123, | |
| "learning_rate": 0.00015, | |
| "loss": 2.5985, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.016497937757780277, | |
| "grad_norm": 1.4080921411514282, | |
| "learning_rate": 0.000165, | |
| "loss": 2.4991, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.01799775028121485, | |
| "grad_norm": 1.177751898765564, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 2.4234, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.01949756280464942, | |
| "grad_norm": 1.9201545715332031, | |
| "learning_rate": 0.000195, | |
| "loss": 2.36, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.02099737532808399, | |
| "grad_norm": 1.969179630279541, | |
| "learning_rate": 0.00020999999999999998, | |
| "loss": 2.3224, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.02249718785151856, | |
| "grad_norm": 1.7428991794586182, | |
| "learning_rate": 0.000225, | |
| "loss": 2.2435, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.023997000374953132, | |
| "grad_norm": 1.231548547744751, | |
| "learning_rate": 0.00023999999999999998, | |
| "loss": 2.1859, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0254968128983877, | |
| "grad_norm": 1.4751386642456055, | |
| "learning_rate": 0.00025499999999999996, | |
| "loss": 2.1358, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.02699662542182227, | |
| "grad_norm": 1.457726001739502, | |
| "learning_rate": 0.00027, | |
| "loss": 2.0645, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.028496437945256844, | |
| "grad_norm": 1.2510344982147217, | |
| "learning_rate": 0.000285, | |
| "loss": 2.0373, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.029996250468691414, | |
| "grad_norm": 1.5325850248336792, | |
| "learning_rate": 0.0003, | |
| "loss": 1.9925, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.031496062992125984, | |
| "grad_norm": 1.3607863187789917, | |
| "learning_rate": 0.0002999982300767559, | |
| "loss": 1.9542, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.032995875515560553, | |
| "grad_norm": 1.13303804397583, | |
| "learning_rate": 0.000299992920348792, | |
| "loss": 1.9144, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.03449568803899512, | |
| "grad_norm": 1.341699242591858, | |
| "learning_rate": 0.0002999840709414124, | |
| "loss": 1.9061, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0359955005624297, | |
| "grad_norm": 1.2910326719284058, | |
| "learning_rate": 0.0002999716820634541, | |
| "loss": 1.8744, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03749531308586427, | |
| "grad_norm": 1.3730792999267578, | |
| "learning_rate": 0.000299955754007282, | |
| "loss": 1.8369, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.03899512560929884, | |
| "grad_norm": 1.2474159002304077, | |
| "learning_rate": 0.00029993628714878185, | |
| "loss": 1.796, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.04049493813273341, | |
| "grad_norm": 1.2987885475158691, | |
| "learning_rate": 0.00029991328194735155, | |
| "loss": 1.8051, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.04199475065616798, | |
| "grad_norm": 1.0884069204330444, | |
| "learning_rate": 0.0002998867389458904, | |
| "loss": 1.7658, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.04349456317960255, | |
| "grad_norm": 1.3922066688537598, | |
| "learning_rate": 0.00029985665877078595, | |
| "loss": 1.7632, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.04499437570303712, | |
| "grad_norm": 1.1491564512252808, | |
| "learning_rate": 0.0002998230421318997, | |
| "loss": 1.7608, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.046494188226471694, | |
| "grad_norm": 1.2716453075408936, | |
| "learning_rate": 0.0002997858898225498, | |
| "loss": 1.7231, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.047994000749906264, | |
| "grad_norm": 1.3676432371139526, | |
| "learning_rate": 0.0002997452027194928, | |
| "loss": 1.7303, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.049493813273340834, | |
| "grad_norm": 1.1510239839553833, | |
| "learning_rate": 0.0002997009817829027, | |
| "loss": 1.7337, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0509936257967754, | |
| "grad_norm": 1.2083615064620972, | |
| "learning_rate": 0.0002996532280563483, | |
| "loss": 1.708, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.05249343832020997, | |
| "grad_norm": 1.0882530212402344, | |
| "learning_rate": 0.0002996019426667687, | |
| "loss": 1.6717, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.05399325084364454, | |
| "grad_norm": 1.2023850679397583, | |
| "learning_rate": 0.00029954712682444656, | |
| "loss": 1.6565, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.05549306336707911, | |
| "grad_norm": 1.0490646362304688, | |
| "learning_rate": 0.0002994887818229797, | |
| "loss": 1.639, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.05699287589051369, | |
| "grad_norm": 1.0732958316802979, | |
| "learning_rate": 0.0002994269090392505, | |
| "loss": 1.6685, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.05849268841394826, | |
| "grad_norm": 1.1127232313156128, | |
| "learning_rate": 0.00029936150993339325, | |
| "loss": 1.6365, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.05999250093738283, | |
| "grad_norm": 1.1355839967727661, | |
| "learning_rate": 0.0002992925860487599, | |
| "loss": 1.6495, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.0614923134608174, | |
| "grad_norm": 1.0531744956970215, | |
| "learning_rate": 0.0002992201390118837, | |
| "loss": 1.6279, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.06299212598425197, | |
| "grad_norm": 1.165711522102356, | |
| "learning_rate": 0.00029914417053244054, | |
| "loss": 1.6342, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.06449193850768654, | |
| "grad_norm": 1.1082898378372192, | |
| "learning_rate": 0.00029906468240320874, | |
| "loss": 1.6184, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.06599175103112111, | |
| "grad_norm": 1.07638680934906, | |
| "learning_rate": 0.00029898167650002676, | |
| "loss": 1.6078, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.06749156355455568, | |
| "grad_norm": 0.9482781887054443, | |
| "learning_rate": 0.0002988951547817491, | |
| "loss": 1.6024, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.06899137607799025, | |
| "grad_norm": 0.9475975036621094, | |
| "learning_rate": 0.00029880511929019965, | |
| "loss": 1.6056, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.07049118860142482, | |
| "grad_norm": 1.2477397918701172, | |
| "learning_rate": 0.0002987115721501239, | |
| "loss": 1.5891, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.0719910011248594, | |
| "grad_norm": 1.0981507301330566, | |
| "learning_rate": 0.00029861451556913865, | |
| "loss": 1.5971, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.07349081364829396, | |
| "grad_norm": 1.0291006565093994, | |
| "learning_rate": 0.00029851395183767983, | |
| "loss": 1.5748, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.07499062617172854, | |
| "grad_norm": 1.0406434535980225, | |
| "learning_rate": 0.00029840988332894864, | |
| "loss": 1.5739, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0764904386951631, | |
| "grad_norm": 0.9979759454727173, | |
| "learning_rate": 0.00029830231249885537, | |
| "loss": 1.5593, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.07799025121859768, | |
| "grad_norm": 1.1204349994659424, | |
| "learning_rate": 0.00029819124188596146, | |
| "loss": 1.5598, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.07949006374203224, | |
| "grad_norm": 1.0350697040557861, | |
| "learning_rate": 0.00029807667411141977, | |
| "loss": 1.5625, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.08098987626546682, | |
| "grad_norm": 1.0373060703277588, | |
| "learning_rate": 0.0002979586118789125, | |
| "loss": 1.5413, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.0824896887889014, | |
| "grad_norm": 0.9631483554840088, | |
| "learning_rate": 0.0002978370579745876, | |
| "loss": 1.5431, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.08398950131233596, | |
| "grad_norm": 0.9262831211090088, | |
| "learning_rate": 0.00029771201526699264, | |
| "loss": 1.5288, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.08548931383577053, | |
| "grad_norm": 1.0329458713531494, | |
| "learning_rate": 0.0002975834867070077, | |
| "loss": 1.5172, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.0869891263592051, | |
| "grad_norm": 0.9907341599464417, | |
| "learning_rate": 0.00029745147532777514, | |
| "loss": 1.5118, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.08848893888263967, | |
| "grad_norm": 0.9761767983436584, | |
| "learning_rate": 0.0002973159842446285, | |
| "loss": 1.5067, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.08998875140607424, | |
| "grad_norm": 0.9076977968215942, | |
| "learning_rate": 0.00029717701665501865, | |
| "loss": 1.5173, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.09148856392950881, | |
| "grad_norm": 0.9733080863952637, | |
| "learning_rate": 0.00029703457583843846, | |
| "loss": 1.5101, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.09298837645294339, | |
| "grad_norm": 0.965733528137207, | |
| "learning_rate": 0.00029688866515634546, | |
| "loss": 1.5192, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.09448818897637795, | |
| "grad_norm": 0.9676783680915833, | |
| "learning_rate": 0.00029673928805208237, | |
| "loss": 1.5123, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.09598800149981253, | |
| "grad_norm": 0.9860779047012329, | |
| "learning_rate": 0.00029658644805079606, | |
| "loss": 1.5205, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.09748781402324709, | |
| "grad_norm": 1.026779294013977, | |
| "learning_rate": 0.00029643014875935404, | |
| "loss": 1.5175, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.09898762654668167, | |
| "grad_norm": 0.9782848954200745, | |
| "learning_rate": 0.00029627039386625976, | |
| "loss": 1.4966, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.10003749531308587, | |
| "eval_loss": 1.540096402168274, | |
| "eval_runtime": 34.7434, | |
| "eval_samples_per_second": 719.561, | |
| "eval_steps_per_second": 89.945, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 0.10048743907011623, | |
| "grad_norm": 1.0175178050994873, | |
| "learning_rate": 0.0002961071871415651, | |
| "loss": 1.4742, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.1019872515935508, | |
| "grad_norm": 0.9086681008338928, | |
| "learning_rate": 0.00029594053243678175, | |
| "loss": 1.5138, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.10348706411698538, | |
| "grad_norm": 1.0384730100631714, | |
| "learning_rate": 0.00029577043368479017, | |
| "loss": 1.4645, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.10498687664041995, | |
| "grad_norm": 0.9737691283226013, | |
| "learning_rate": 0.0002955968948997469, | |
| "loss": 1.4812, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.10648668916385452, | |
| "grad_norm": 0.9272564053535461, | |
| "learning_rate": 0.00029541992017698956, | |
| "loss": 1.4489, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.10798650168728909, | |
| "grad_norm": 0.9667676687240601, | |
| "learning_rate": 0.0002952395136929406, | |
| "loss": 1.4726, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.10948631421072366, | |
| "grad_norm": 0.8968635201454163, | |
| "learning_rate": 0.00029505567970500833, | |
| "loss": 1.4585, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.11098612673415822, | |
| "grad_norm": 0.9180886745452881, | |
| "learning_rate": 0.0002948684225514868, | |
| "loss": 1.4507, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.1124859392575928, | |
| "grad_norm": 0.9335956573486328, | |
| "learning_rate": 0.0002946777466514531, | |
| "loss": 1.4707, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.11398575178102738, | |
| "grad_norm": 1.0144034624099731, | |
| "learning_rate": 0.00029448365650466336, | |
| "loss": 1.4697, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.11548556430446194, | |
| "grad_norm": 1.003219485282898, | |
| "learning_rate": 0.0002942861566914465, | |
| "loss": 1.455, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.11698537682789652, | |
| "grad_norm": 0.9664227962493896, | |
| "learning_rate": 0.0002940852518725959, | |
| "loss": 1.4472, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.11848518935133108, | |
| "grad_norm": 0.9362863898277283, | |
| "learning_rate": 0.0002938809467892596, | |
| "loss": 1.4411, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.11998500187476566, | |
| "grad_norm": 0.9592990279197693, | |
| "learning_rate": 0.0002936732462628287, | |
| "loss": 1.4459, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.12148481439820022, | |
| "grad_norm": 0.9843602180480957, | |
| "learning_rate": 0.0002934621551948229, | |
| "loss": 1.4439, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.1229846269216348, | |
| "grad_norm": 0.9243733286857605, | |
| "learning_rate": 0.0002932476785667754, | |
| "loss": 1.4312, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.12448443944506937, | |
| "grad_norm": 0.9494661092758179, | |
| "learning_rate": 0.00029302982144011514, | |
| "loss": 1.457, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.12598425196850394, | |
| "grad_norm": 0.8792492747306824, | |
| "learning_rate": 0.00029280858895604727, | |
| "loss": 1.4365, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.1274840644919385, | |
| "grad_norm": 0.9076453447341919, | |
| "learning_rate": 0.0002925839863354322, | |
| "loss": 1.4302, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.1289838770153731, | |
| "grad_norm": 0.9594305157661438, | |
| "learning_rate": 0.00029235601887866167, | |
| "loss": 1.4238, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.13048368953880765, | |
| "grad_norm": 0.8772804737091064, | |
| "learning_rate": 0.00029212469196553456, | |
| "loss": 1.4174, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.13198350206224221, | |
| "grad_norm": 0.9967095851898193, | |
| "learning_rate": 0.00029189001105512914, | |
| "loss": 1.4379, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.13348331458567678, | |
| "grad_norm": 0.9739612936973572, | |
| "learning_rate": 0.0002916519816856748, | |
| "loss": 1.4396, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.13498312710911137, | |
| "grad_norm": 0.9377513527870178, | |
| "learning_rate": 0.000291410609474421, | |
| "loss": 1.4512, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.13648293963254593, | |
| "grad_norm": 0.9376864433288574, | |
| "learning_rate": 0.0002911659001175049, | |
| "loss": 1.4093, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.1379827521559805, | |
| "grad_norm": 0.8760865330696106, | |
| "learning_rate": 0.000290917859389817, | |
| "loss": 1.4286, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.13948256467941508, | |
| "grad_norm": 1.0221220254898071, | |
| "learning_rate": 0.0002906664931448645, | |
| "loss": 1.4424, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.14098237720284965, | |
| "grad_norm": 0.8309196829795837, | |
| "learning_rate": 0.00029041180731463357, | |
| "loss": 1.426, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.1424821897262842, | |
| "grad_norm": 0.9327015280723572, | |
| "learning_rate": 0.00029015380790944916, | |
| "loss": 1.4279, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.1439820022497188, | |
| "grad_norm": 0.8863860964775085, | |
| "learning_rate": 0.0002898925010178332, | |
| "loss": 1.4184, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.14548181477315336, | |
| "grad_norm": 0.9390380382537842, | |
| "learning_rate": 0.00028962789280636083, | |
| "loss": 1.4131, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.14698162729658792, | |
| "grad_norm": 0.9246792197227478, | |
| "learning_rate": 0.00028935998951951515, | |
| "loss": 1.4148, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.1484814398200225, | |
| "grad_norm": 0.8679428696632385, | |
| "learning_rate": 0.00028908879747953955, | |
| "loss": 1.405, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.14998125234345708, | |
| "grad_norm": 0.9332796931266785, | |
| "learning_rate": 0.00028881432308628855, | |
| "loss": 1.3973, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.15148106486689164, | |
| "grad_norm": 0.9386698007583618, | |
| "learning_rate": 0.00028853657281707696, | |
| "loss": 1.4142, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.1529808773903262, | |
| "grad_norm": 0.8631579279899597, | |
| "learning_rate": 0.0002882555532265269, | |
| "loss": 1.4148, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.1544806899137608, | |
| "grad_norm": 0.892475962638855, | |
| "learning_rate": 0.0002879712709464131, | |
| "loss": 1.4077, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.15598050243719536, | |
| "grad_norm": 0.8318502306938171, | |
| "learning_rate": 0.0002876837326855064, | |
| "loss": 1.3898, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.15748031496062992, | |
| "grad_norm": 0.8452482223510742, | |
| "learning_rate": 0.00028739294522941555, | |
| "loss": 1.3882, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.15898012748406448, | |
| "grad_norm": 0.9173560738563538, | |
| "learning_rate": 0.00028709891544042687, | |
| "loss": 1.3831, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.16047994000749907, | |
| "grad_norm": 0.86043381690979, | |
| "learning_rate": 0.0002868016502573425, | |
| "loss": 1.3982, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.16197975253093364, | |
| "grad_norm": 0.8810198903083801, | |
| "learning_rate": 0.00028650115669531654, | |
| "loss": 1.3806, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.1634795650543682, | |
| "grad_norm": 0.966907799243927, | |
| "learning_rate": 0.00028619744184568946, | |
| "loss": 1.3766, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.1649793775778028, | |
| "grad_norm": 0.8428529500961304, | |
| "learning_rate": 0.00028589051287582093, | |
| "loss": 1.3873, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.16647919010123735, | |
| "grad_norm": 0.9754992723464966, | |
| "learning_rate": 0.0002855803770289206, | |
| "loss": 1.3956, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.1679790026246719, | |
| "grad_norm": 0.9287955164909363, | |
| "learning_rate": 0.0002852670416238769, | |
| "loss": 1.3714, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.16947881514810648, | |
| "grad_norm": 0.9051069617271423, | |
| "learning_rate": 0.0002849505140550848, | |
| "loss": 1.3866, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.17097862767154107, | |
| "grad_norm": 0.8417872190475464, | |
| "learning_rate": 0.00028463080179227105, | |
| "loss": 1.3827, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.17247844019497563, | |
| "grad_norm": 1.0358750820159912, | |
| "learning_rate": 0.00028430791238031775, | |
| "loss": 1.4054, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.1739782527184102, | |
| "grad_norm": 0.8023399710655212, | |
| "learning_rate": 0.00028398185343908464, | |
| "loss": 1.3819, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.17547806524184478, | |
| "grad_norm": 0.887250542640686, | |
| "learning_rate": 0.000283652632663229, | |
| "loss": 1.3997, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.17697787776527935, | |
| "grad_norm": 0.8158445358276367, | |
| "learning_rate": 0.0002833202578220242, | |
| "loss": 1.3762, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.1784776902887139, | |
| "grad_norm": 0.799800455570221, | |
| "learning_rate": 0.0002829847367591764, | |
| "loss": 1.3974, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.17997750281214847, | |
| "grad_norm": 0.8806690573692322, | |
| "learning_rate": 0.0002826460773926393, | |
| "loss": 1.3694, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.18147731533558306, | |
| "grad_norm": 0.9353827834129333, | |
| "learning_rate": 0.00028230428771442725, | |
| "loss": 1.3646, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.18297712785901762, | |
| "grad_norm": 0.8729731440544128, | |
| "learning_rate": 0.000281959375790427, | |
| "loss": 1.3693, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.1844769403824522, | |
| "grad_norm": 0.8897218108177185, | |
| "learning_rate": 0.0002816113497602069, | |
| "loss": 1.3737, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.18597675290588678, | |
| "grad_norm": 0.8456818461418152, | |
| "learning_rate": 0.0002812602178368251, | |
| "loss": 1.3628, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.18747656542932134, | |
| "grad_norm": 0.8601028323173523, | |
| "learning_rate": 0.00028090598830663566, | |
| "loss": 1.3691, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.1889763779527559, | |
| "grad_norm": 0.9002561569213867, | |
| "learning_rate": 0.00028054866952909296, | |
| "loss": 1.3955, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 0.8424673080444336, | |
| "learning_rate": 0.00028018826993655445, | |
| "loss": 1.3606, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.19197600299962506, | |
| "grad_norm": 0.8275784254074097, | |
| "learning_rate": 0.00027982479803408166, | |
| "loss": 1.3566, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.19347581552305962, | |
| "grad_norm": 0.8737898468971252, | |
| "learning_rate": 0.00027945826239923955, | |
| "loss": 1.3677, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.19497562804649418, | |
| "grad_norm": 0.901017963886261, | |
| "learning_rate": 0.000279088671681894, | |
| "loss": 1.3691, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.19647544056992877, | |
| "grad_norm": 0.9295936822891235, | |
| "learning_rate": 0.0002787160346040076, | |
| "loss": 1.3403, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.19797525309336333, | |
| "grad_norm": 0.9055348038673401, | |
| "learning_rate": 0.00027834035995943413, | |
| "loss": 1.3562, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.1994750656167979, | |
| "grad_norm": 0.8951241970062256, | |
| "learning_rate": 0.00027796165661371074, | |
| "loss": 1.3415, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.20007499062617173, | |
| "eval_loss": 1.4021942615509033, | |
| "eval_runtime": 34.3883, | |
| "eval_samples_per_second": 726.991, | |
| "eval_steps_per_second": 90.874, | |
| "step": 1334 | |
| }, | |
| { | |
| "epoch": 0.20097487814023246, | |
| "grad_norm": 0.8198707699775696, | |
| "learning_rate": 0.00027757993350384873, | |
| "loss": 1.3479, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.20247469066366705, | |
| "grad_norm": 0.8340823650360107, | |
| "learning_rate": 0.00027719519963812286, | |
| "loss": 1.3498, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.2039745031871016, | |
| "grad_norm": 0.8691400289535522, | |
| "learning_rate": 0.00027680746409585865, | |
| "loss": 1.3531, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.20547431571053618, | |
| "grad_norm": 0.8769707679748535, | |
| "learning_rate": 0.00027641673602721805, | |
| "loss": 1.3337, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.20697412823397077, | |
| "grad_norm": 0.8912369012832642, | |
| "learning_rate": 0.00027602302465298367, | |
| "loss": 1.3398, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.20847394075740533, | |
| "grad_norm": 0.8747676014900208, | |
| "learning_rate": 0.0002756263392643409, | |
| "loss": 1.3427, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.2099737532808399, | |
| "grad_norm": 0.8138441443443298, | |
| "learning_rate": 0.0002752266892226591, | |
| "loss": 1.3397, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.21147356580427445, | |
| "grad_norm": 0.8546535968780518, | |
| "learning_rate": 0.0002748240839592701, | |
| "loss": 1.3423, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.21297337832770905, | |
| "grad_norm": 0.907558023929596, | |
| "learning_rate": 0.00027441853297524615, | |
| "loss": 1.3753, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.2144731908511436, | |
| "grad_norm": 0.8942850232124329, | |
| "learning_rate": 0.00027401004584117535, | |
| "loss": 1.3413, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.21597300337457817, | |
| "grad_norm": 0.8546082377433777, | |
| "learning_rate": 0.00027359863219693614, | |
| "loss": 1.3349, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.21747281589801276, | |
| "grad_norm": 0.8140226006507874, | |
| "learning_rate": 0.00027318430175146934, | |
| "loss": 1.3418, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.21897262842144732, | |
| "grad_norm": 0.8695687651634216, | |
| "learning_rate": 0.00027276706428254965, | |
| "loss": 1.345, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.2204724409448819, | |
| "grad_norm": 0.9050869941711426, | |
| "learning_rate": 0.00027234692963655407, | |
| "loss": 1.3443, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.22197225346831645, | |
| "grad_norm": 0.8348211646080017, | |
| "learning_rate": 0.00027192390772823045, | |
| "loss": 1.3453, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.22347206599175104, | |
| "grad_norm": 0.9440985918045044, | |
| "learning_rate": 0.00027149800854046283, | |
| "loss": 1.3336, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.2249718785151856, | |
| "grad_norm": 0.8988614678382874, | |
| "learning_rate": 0.0002710692421240362, | |
| "loss": 1.3411, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.22647169103862017, | |
| "grad_norm": 0.8702236413955688, | |
| "learning_rate": 0.0002706376185973991, | |
| "loss": 1.3423, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.22797150356205476, | |
| "grad_norm": 0.8790213465690613, | |
| "learning_rate": 0.0002702031481464252, | |
| "loss": 1.3192, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.22947131608548932, | |
| "grad_norm": 0.8360859155654907, | |
| "learning_rate": 0.00026976584102417233, | |
| "loss": 1.3411, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.23097112860892388, | |
| "grad_norm": 0.8287367820739746, | |
| "learning_rate": 0.0002693257075506411, | |
| "loss": 1.3423, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.23247094113235844, | |
| "grad_norm": 0.8342249989509583, | |
| "learning_rate": 0.00026888275811253105, | |
| "loss": 1.3485, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.23397075365579303, | |
| "grad_norm": 0.8485095500946045, | |
| "learning_rate": 0.00026843700316299564, | |
| "loss": 1.328, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.2354705661792276, | |
| "grad_norm": 0.8335973620414734, | |
| "learning_rate": 0.0002679884532213954, | |
| "loss": 1.3002, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.23697037870266216, | |
| "grad_norm": 0.8893833160400391, | |
| "learning_rate": 0.00026753711887304995, | |
| "loss": 1.3364, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.23847019122609675, | |
| "grad_norm": 0.9313494563102722, | |
| "learning_rate": 0.000267083010768988, | |
| "loss": 1.3419, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.2399700037495313, | |
| "grad_norm": 0.7982856631278992, | |
| "learning_rate": 0.0002666261396256961, | |
| "loss": 1.3219, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.24146981627296588, | |
| "grad_norm": 0.8474456071853638, | |
| "learning_rate": 0.0002661665162248656, | |
| "loss": 1.329, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.24296962879640044, | |
| "grad_norm": 0.8094434142112732, | |
| "learning_rate": 0.0002657041514131385, | |
| "loss": 1.3344, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.24446944131983503, | |
| "grad_norm": 0.8186250925064087, | |
| "learning_rate": 0.000265239056101851, | |
| "loss": 1.3266, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.2459692538432696, | |
| "grad_norm": 0.8790938854217529, | |
| "learning_rate": 0.0002647712412667765, | |
| "loss": 1.3141, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.24746906636670415, | |
| "grad_norm": 0.8377759456634521, | |
| "learning_rate": 0.00026430071794786644, | |
| "loss": 1.3285, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.24896887889013875, | |
| "grad_norm": 0.9131957292556763, | |
| "learning_rate": 0.00026382749724898955, | |
| "loss": 1.3029, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.2504686914135733, | |
| "grad_norm": 0.8385202884674072, | |
| "learning_rate": 0.00026335159033766996, | |
| "loss": 1.3329, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.25196850393700787, | |
| "grad_norm": 0.7972739338874817, | |
| "learning_rate": 0.0002628730084448239, | |
| "loss": 1.3253, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.25346831646044243, | |
| "grad_norm": 0.8282127380371094, | |
| "learning_rate": 0.000262391762864494, | |
| "loss": 1.3327, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.254968128983877, | |
| "grad_norm": 0.8004878163337708, | |
| "learning_rate": 0.00026190786495358366, | |
| "loss": 1.3186, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.25646794150731156, | |
| "grad_norm": 0.825681746006012, | |
| "learning_rate": 0.0002614213261315883, | |
| "loss": 1.3129, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.2579677540307462, | |
| "grad_norm": 0.8196373581886292, | |
| "learning_rate": 0.0002609321578803261, | |
| "loss": 1.3185, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.25946756655418074, | |
| "grad_norm": 0.8522502779960632, | |
| "learning_rate": 0.00026044037174366734, | |
| "loss": 1.3107, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.2609673790776153, | |
| "grad_norm": 0.8095912933349609, | |
| "learning_rate": 0.00025994597932726135, | |
| "loss": 1.3218, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.26246719160104987, | |
| "grad_norm": 0.8493536710739136, | |
| "learning_rate": 0.0002594489922982633, | |
| "loss": 1.3234, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.26396700412448443, | |
| "grad_norm": 0.8144869208335876, | |
| "learning_rate": 0.0002589494223850584, | |
| "loss": 1.3, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.265466816647919, | |
| "grad_norm": 0.8083682060241699, | |
| "learning_rate": 0.00025844728137698543, | |
| "loss": 1.3283, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.26696662917135355, | |
| "grad_norm": 0.8459505438804626, | |
| "learning_rate": 0.0002579425811240582, | |
| "loss": 1.319, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.26846644169478817, | |
| "grad_norm": 0.8262299299240112, | |
| "learning_rate": 0.00025743533353668626, | |
| "loss": 1.3089, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.26996625421822273, | |
| "grad_norm": 0.8823822736740112, | |
| "learning_rate": 0.0002569255505853934, | |
| "loss": 1.3157, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2714660667416573, | |
| "grad_norm": 0.8654087781906128, | |
| "learning_rate": 0.0002564132443005356, | |
| "loss": 1.3113, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.27296587926509186, | |
| "grad_norm": 0.8660631775856018, | |
| "learning_rate": 0.00025589842677201693, | |
| "loss": 1.303, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.2744656917885264, | |
| "grad_norm": 0.8453037142753601, | |
| "learning_rate": 0.0002553811101490042, | |
| "loss": 1.3036, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.275965504311961, | |
| "grad_norm": 0.7789093255996704, | |
| "learning_rate": 0.00025486130663964016, | |
| "loss": 1.3064, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.27746531683539555, | |
| "grad_norm": 0.8085753321647644, | |
| "learning_rate": 0.00025433902851075584, | |
| "loss": 1.3135, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.27896512935883017, | |
| "grad_norm": 0.8380696773529053, | |
| "learning_rate": 0.0002538142880875805, | |
| "loss": 1.2949, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.28046494188226473, | |
| "grad_norm": 0.8569440245628357, | |
| "learning_rate": 0.00025328709775345105, | |
| "loss": 1.3174, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.2819647544056993, | |
| "grad_norm": 0.7963806390762329, | |
| "learning_rate": 0.0002527574699495199, | |
| "loss": 1.3079, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.28346456692913385, | |
| "grad_norm": 0.8502215147018433, | |
| "learning_rate": 0.00025222541717446117, | |
| "loss": 1.3019, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.2849643794525684, | |
| "grad_norm": 0.8192076086997986, | |
| "learning_rate": 0.00025169095198417584, | |
| "loss": 1.2963, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.286464191976003, | |
| "grad_norm": 0.8132408261299133, | |
| "learning_rate": 0.00025115408699149546, | |
| "loss": 1.3122, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.2879640044994376, | |
| "grad_norm": 0.9454010128974915, | |
| "learning_rate": 0.00025061483486588435, | |
| "loss": 1.3203, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.28946381702287216, | |
| "grad_norm": 0.7738835215568542, | |
| "learning_rate": 0.00025007320833314085, | |
| "loss": 1.2868, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.2909636295463067, | |
| "grad_norm": 0.7828739881515503, | |
| "learning_rate": 0.00024952922017509687, | |
| "loss": 1.3065, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.2924634420697413, | |
| "grad_norm": 0.8362312316894531, | |
| "learning_rate": 0.00024898288322931615, | |
| "loss": 1.2927, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.29396325459317585, | |
| "grad_norm": 0.80490642786026, | |
| "learning_rate": 0.00024843421038879147, | |
| "loss": 1.2976, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.2954630671166104, | |
| "grad_norm": 0.8396286368370056, | |
| "learning_rate": 0.0002478832146016404, | |
| "loss": 1.3003, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.296962879640045, | |
| "grad_norm": 0.8507101535797119, | |
| "learning_rate": 0.0002473299088707996, | |
| "loss": 1.298, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.2984626921634796, | |
| "grad_norm": 0.886080801486969, | |
| "learning_rate": 0.00024677430625371803, | |
| "loss": 1.3002, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.29996250468691416, | |
| "grad_norm": 0.8753901124000549, | |
| "learning_rate": 0.0002462164198620489, | |
| "loss": 1.3057, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.3001124859392576, | |
| "eval_loss": 1.337799072265625, | |
| "eval_runtime": 34.4016, | |
| "eval_samples_per_second": 726.711, | |
| "eval_steps_per_second": 90.839, | |
| "step": 2001 | |
| }, | |
| { | |
| "epoch": 0.3014623172103487, | |
| "grad_norm": 0.8367862701416016, | |
| "learning_rate": 0.00024565626286134003, | |
| "loss": 1.2853, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.3029621297337833, | |
| "grad_norm": 0.8323600888252258, | |
| "learning_rate": 0.0002450938484707234, | |
| "loss": 1.2831, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.30446194225721784, | |
| "grad_norm": 0.8133791089057922, | |
| "learning_rate": 0.0002445291899626031, | |
| "loss": 1.2837, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.3059617547806524, | |
| "grad_norm": 0.8445649743080139, | |
| "learning_rate": 0.000243962300662342, | |
| "loss": 1.3076, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.30746156730408697, | |
| "grad_norm": 0.8368006944656372, | |
| "learning_rate": 0.00024339319394794742, | |
| "loss": 1.3018, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.3089613798275216, | |
| "grad_norm": 0.8313194513320923, | |
| "learning_rate": 0.00024282188324975534, | |
| "loss": 1.2931, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.31046119235095615, | |
| "grad_norm": 0.8213202357292175, | |
| "learning_rate": 0.0002422483820501136, | |
| "loss": 1.2962, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.3119610048743907, | |
| "grad_norm": 0.8784091472625732, | |
| "learning_rate": 0.00024167270388306366, | |
| "loss": 1.288, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.3134608173978253, | |
| "grad_norm": 0.7992005348205566, | |
| "learning_rate": 0.00024109486233402102, | |
| "loss": 1.29, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.31496062992125984, | |
| "grad_norm": 0.7810459136962891, | |
| "learning_rate": 0.00024051487103945486, | |
| "loss": 1.2769, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3164604424446944, | |
| "grad_norm": 0.7913289666175842, | |
| "learning_rate": 0.00023993274368656618, | |
| "loss": 1.2822, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.31796025496812896, | |
| "grad_norm": 0.7812384366989136, | |
| "learning_rate": 0.00023934849401296472, | |
| "loss": 1.2962, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.3194600674915636, | |
| "grad_norm": 0.8585249185562134, | |
| "learning_rate": 0.0002387621358063449, | |
| "loss": 1.2842, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.32095988001499814, | |
| "grad_norm": 0.8342397212982178, | |
| "learning_rate": 0.00023817368290416036, | |
| "loss": 1.287, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.3224596925384327, | |
| "grad_norm": 0.7857896089553833, | |
| "learning_rate": 0.00023758314919329726, | |
| "loss": 1.3053, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.32395950506186727, | |
| "grad_norm": 0.7923012971878052, | |
| "learning_rate": 0.00023699054860974682, | |
| "loss": 1.2731, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.32545931758530183, | |
| "grad_norm": 0.7796991467475891, | |
| "learning_rate": 0.00023639589513827636, | |
| "loss": 1.2716, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.3269591301087364, | |
| "grad_norm": 0.8867438435554504, | |
| "learning_rate": 0.0002357992028120993, | |
| "loss": 1.2908, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.32845894263217096, | |
| "grad_norm": 0.8547908067703247, | |
| "learning_rate": 0.00023520048571254378, | |
| "loss": 1.2772, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.3299587551556056, | |
| "grad_norm": 0.8480901122093201, | |
| "learning_rate": 0.00023459975796872063, | |
| "loss": 1.2716, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.33145856767904014, | |
| "grad_norm": 0.8187602758407593, | |
| "learning_rate": 0.0002339970337571899, | |
| "loss": 1.2724, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.3329583802024747, | |
| "grad_norm": 0.8206058740615845, | |
| "learning_rate": 0.000233392327301626, | |
| "loss": 1.3034, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.33445819272590926, | |
| "grad_norm": 0.76264888048172, | |
| "learning_rate": 0.0002327856528724825, | |
| "loss": 1.2576, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.3359580052493438, | |
| "grad_norm": 0.8335399627685547, | |
| "learning_rate": 0.0002321770247866551, | |
| "loss": 1.2857, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.3374578177727784, | |
| "grad_norm": 0.7656426429748535, | |
| "learning_rate": 0.00023156645740714368, | |
| "loss": 1.2978, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.33895763029621295, | |
| "grad_norm": 0.7743305563926697, | |
| "learning_rate": 0.00023095396514271355, | |
| "loss": 1.2803, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.34045744281964757, | |
| "grad_norm": 0.7768455147743225, | |
| "learning_rate": 0.0002303395624475553, | |
| "loss": 1.2978, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.34195725534308213, | |
| "grad_norm": 0.8204723596572876, | |
| "learning_rate": 0.00022972326382094378, | |
| "loss": 1.2708, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.3434570678665167, | |
| "grad_norm": 0.8377450108528137, | |
| "learning_rate": 0.00022910508380689584, | |
| "loss": 1.276, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.34495688038995126, | |
| "grad_norm": 0.7735800743103027, | |
| "learning_rate": 0.00022848503699382717, | |
| "loss": 1.2987, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.3464566929133858, | |
| "grad_norm": 0.8727670907974243, | |
| "learning_rate": 0.00022786313801420794, | |
| "loss": 1.267, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.3479565054368204, | |
| "grad_norm": 0.7944260835647583, | |
| "learning_rate": 0.0002272394015442177, | |
| "loss": 1.2937, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.34945631796025495, | |
| "grad_norm": 0.8214771747589111, | |
| "learning_rate": 0.0002266138423033987, | |
| "loss": 1.2879, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.35095613048368957, | |
| "grad_norm": 0.7794116139411926, | |
| "learning_rate": 0.00022598647505430895, | |
| "loss": 1.2599, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.35245594300712413, | |
| "grad_norm": 0.76594078540802, | |
| "learning_rate": 0.0002253573146021733, | |
| "loss": 1.2613, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3539557555305587, | |
| "grad_norm": 0.8062904477119446, | |
| "learning_rate": 0.0002247263757945347, | |
| "loss": 1.2959, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.35545556805399325, | |
| "grad_norm": 0.8257420063018799, | |
| "learning_rate": 0.00022409367352090322, | |
| "loss": 1.2567, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.3569553805774278, | |
| "grad_norm": 0.8322898149490356, | |
| "learning_rate": 0.00022345922271240496, | |
| "loss": 1.2684, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.3584551931008624, | |
| "grad_norm": 0.8116471171379089, | |
| "learning_rate": 0.00022282303834142978, | |
| "loss": 1.2643, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.35995500562429694, | |
| "grad_norm": 0.8192791938781738, | |
| "learning_rate": 0.0002221851354212777, | |
| "loss": 1.2586, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.36145481814773156, | |
| "grad_norm": 0.7919474244117737, | |
| "learning_rate": 0.0002215455290058048, | |
| "loss": 1.2869, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.3629546306711661, | |
| "grad_norm": 0.8426802158355713, | |
| "learning_rate": 0.000220904234189068, | |
| "loss": 1.2589, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.3644544431946007, | |
| "grad_norm": 0.851420521736145, | |
| "learning_rate": 0.00022026126610496852, | |
| "loss": 1.2569, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.36595425571803525, | |
| "grad_norm": 0.8213547468185425, | |
| "learning_rate": 0.0002196166399268952, | |
| "loss": 1.2698, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.3674540682414698, | |
| "grad_norm": 0.7695969343185425, | |
| "learning_rate": 0.00021897037086736614, | |
| "loss": 1.2668, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.3689538807649044, | |
| "grad_norm": 0.7834669351577759, | |
| "learning_rate": 0.0002183224741776697, | |
| "loss": 1.2662, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.37045369328833894, | |
| "grad_norm": 0.7951564788818359, | |
| "learning_rate": 0.00021767296514750472, | |
| "loss": 1.2661, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.37195350581177355, | |
| "grad_norm": 0.7705678939819336, | |
| "learning_rate": 0.00021702185910461958, | |
| "loss": 1.2623, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.3734533183352081, | |
| "grad_norm": 0.850374162197113, | |
| "learning_rate": 0.00021636917141445056, | |
| "loss": 1.2386, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.3749531308586427, | |
| "grad_norm": 0.795702338218689, | |
| "learning_rate": 0.00021571491747975917, | |
| "loss": 1.2604, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.37645294338207724, | |
| "grad_norm": 0.8536216020584106, | |
| "learning_rate": 0.0002150591127402687, | |
| "loss": 1.2497, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.3779527559055118, | |
| "grad_norm": 0.813890278339386, | |
| "learning_rate": 0.00021440177267229984, | |
| "loss": 1.2505, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.37945256842894637, | |
| "grad_norm": 0.8229677081108093, | |
| "learning_rate": 0.00021374291278840546, | |
| "loss": 1.2634, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 0.868016242980957, | |
| "learning_rate": 0.00021308254863700452, | |
| "loss": 1.2537, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.38245219347581555, | |
| "grad_norm": 0.8083469271659851, | |
| "learning_rate": 0.00021242069580201524, | |
| "loss": 1.2702, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.3839520059992501, | |
| "grad_norm": 0.7700805068016052, | |
| "learning_rate": 0.00021175736990248714, | |
| "loss": 1.2755, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.3854518185226847, | |
| "grad_norm": 0.787372350692749, | |
| "learning_rate": 0.00021109258659223254, | |
| "loss": 1.2581, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.38695163104611924, | |
| "grad_norm": 0.8132008910179138, | |
| "learning_rate": 0.00021042636155945723, | |
| "loss": 1.2408, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.3884514435695538, | |
| "grad_norm": 0.7871599793434143, | |
| "learning_rate": 0.00020975871052639024, | |
| "loss": 1.2622, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.38995125609298836, | |
| "grad_norm": 0.841528594493866, | |
| "learning_rate": 0.00020908964924891256, | |
| "loss": 1.2382, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.3914510686164229, | |
| "grad_norm": 0.7999451756477356, | |
| "learning_rate": 0.0002084191935161857, | |
| "loss": 1.2771, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.39295088113985754, | |
| "grad_norm": 0.7474108934402466, | |
| "learning_rate": 0.0002077473591502788, | |
| "loss": 1.2656, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.3944506936632921, | |
| "grad_norm": 0.8236092329025269, | |
| "learning_rate": 0.00020707416200579524, | |
| "loss": 1.2576, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.39595050618672667, | |
| "grad_norm": 0.8108281493186951, | |
| "learning_rate": 0.00020639961796949877, | |
| "loss": 1.2534, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.39745031871016123, | |
| "grad_norm": 0.8653910756111145, | |
| "learning_rate": 0.00020572374295993822, | |
| "loss": 1.2666, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.3989501312335958, | |
| "grad_norm": 0.8193902969360352, | |
| "learning_rate": 0.00020504655292707223, | |
| "loss": 1.2528, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.40014998125234347, | |
| "eval_loss": 1.300592303276062, | |
| "eval_runtime": 34.4278, | |
| "eval_samples_per_second": 726.158, | |
| "eval_steps_per_second": 90.77, | |
| "step": 2668 | |
| }, | |
| { | |
| "epoch": 0.40044994375703036, | |
| "grad_norm": 0.8101006150245667, | |
| "learning_rate": 0.00020436806385189246, | |
| "loss": 1.2646, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.4019497562804649, | |
| "grad_norm": 0.8838757276535034, | |
| "learning_rate": 0.00020368829174604667, | |
| "loss": 1.2667, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.40344956880389954, | |
| "grad_norm": 0.7894991636276245, | |
| "learning_rate": 0.00020300725265146093, | |
| "loss": 1.266, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.4049493813273341, | |
| "grad_norm": 0.8235330581665039, | |
| "learning_rate": 0.00020232496263996092, | |
| "loss": 1.2503, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.40644919385076866, | |
| "grad_norm": 0.7787916660308838, | |
| "learning_rate": 0.00020164143781289256, | |
| "loss": 1.2521, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.4079490063742032, | |
| "grad_norm": 0.8202505707740784, | |
| "learning_rate": 0.00020095669430074235, | |
| "loss": 1.2717, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.4094488188976378, | |
| "grad_norm": 0.7967453598976135, | |
| "learning_rate": 0.00020027074826275629, | |
| "loss": 1.2636, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.41094863142107235, | |
| "grad_norm": 0.7384628653526306, | |
| "learning_rate": 0.00019958361588655888, | |
| "loss": 1.2531, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.4124484439445069, | |
| "grad_norm": 0.8367340564727783, | |
| "learning_rate": 0.00019889531338777112, | |
| "loss": 1.262, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.41394825646794153, | |
| "grad_norm": 0.8179819583892822, | |
| "learning_rate": 0.0001982058570096274, | |
| "loss": 1.2453, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.4154480689913761, | |
| "grad_norm": 0.7929909825325012, | |
| "learning_rate": 0.00019751526302259271, | |
| "loss": 1.2332, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.41694788151481066, | |
| "grad_norm": 0.803432285785675, | |
| "learning_rate": 0.00019682354772397842, | |
| "loss": 1.2376, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.4184476940382452, | |
| "grad_norm": 0.7572347521781921, | |
| "learning_rate": 0.00019613072743755755, | |
| "loss": 1.2535, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.4199475065616798, | |
| "grad_norm": 0.7730807662010193, | |
| "learning_rate": 0.00019543681851317998, | |
| "loss": 1.2457, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.42144731908511435, | |
| "grad_norm": 0.8271188735961914, | |
| "learning_rate": 0.00019474183732638608, | |
| "loss": 1.2525, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.4229471316085489, | |
| "grad_norm": 0.8117753863334656, | |
| "learning_rate": 0.0001940458002780206, | |
| "loss": 1.2321, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.42444694413198353, | |
| "grad_norm": 0.8418020606040955, | |
| "learning_rate": 0.00019334872379384556, | |
| "loss": 1.2501, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.4259467566554181, | |
| "grad_norm": 0.8009008765220642, | |
| "learning_rate": 0.0001926506243241526, | |
| "loss": 1.2398, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.42744656917885265, | |
| "grad_norm": 0.7942110300064087, | |
| "learning_rate": 0.00019195151834337473, | |
| "loss": 1.244, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.4289463817022872, | |
| "grad_norm": 0.8147196173667908, | |
| "learning_rate": 0.00019125142234969762, | |
| "loss": 1.2499, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.4304461942257218, | |
| "grad_norm": 0.843659520149231, | |
| "learning_rate": 0.00019055035286467034, | |
| "loss": 1.2424, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.43194600674915634, | |
| "grad_norm": 0.8289938569068909, | |
| "learning_rate": 0.00018984832643281513, | |
| "loss": 1.2529, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.4334458192725909, | |
| "grad_norm": 0.8647093772888184, | |
| "learning_rate": 0.00018914535962123735, | |
| "loss": 1.2579, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.4349456317960255, | |
| "grad_norm": 0.7539538145065308, | |
| "learning_rate": 0.00018844146901923436, | |
| "loss": 1.2408, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.4364454443194601, | |
| "grad_norm": 0.8231214284896851, | |
| "learning_rate": 0.000187736671237904, | |
| "loss": 1.2346, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.43794525684289465, | |
| "grad_norm": 0.7767258882522583, | |
| "learning_rate": 0.0001870309829097526, | |
| "loss": 1.2379, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.4394450693663292, | |
| "grad_norm": 0.8100400567054749, | |
| "learning_rate": 0.00018632442068830244, | |
| "loss": 1.2248, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.4409448818897638, | |
| "grad_norm": 0.7970197796821594, | |
| "learning_rate": 0.00018561700124769892, | |
| "loss": 1.2312, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.44244469441319834, | |
| "grad_norm": 0.8453084826469421, | |
| "learning_rate": 0.0001849087412823168, | |
| "loss": 1.2379, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.4439445069366329, | |
| "grad_norm": 0.7835219502449036, | |
| "learning_rate": 0.00018419965750636645, | |
| "loss": 1.2377, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.4454443194600675, | |
| "grad_norm": 0.9176828861236572, | |
| "learning_rate": 0.00018348976665349932, | |
| "loss": 1.2322, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.4469441319835021, | |
| "grad_norm": 0.8404021859169006, | |
| "learning_rate": 0.00018277908547641294, | |
| "loss": 1.2364, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.44844394450693664, | |
| "grad_norm": 0.8273976445198059, | |
| "learning_rate": 0.00018206763074645588, | |
| "loss": 1.2242, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.4499437570303712, | |
| "grad_norm": 0.7463936805725098, | |
| "learning_rate": 0.0001813554192532316, | |
| "loss": 1.2459, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.45144356955380577, | |
| "grad_norm": 0.7790806293487549, | |
| "learning_rate": 0.00018064246780420245, | |
| "loss": 1.2473, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.45294338207724033, | |
| "grad_norm": 0.766952633857727, | |
| "learning_rate": 0.000179928793224293, | |
| "loss": 1.219, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.4544431946006749, | |
| "grad_norm": 0.8558096289634705, | |
| "learning_rate": 0.00017921441235549295, | |
| "loss": 1.2413, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.4559430071241095, | |
| "grad_norm": 0.8494218587875366, | |
| "learning_rate": 0.00017849934205645967, | |
| "loss": 1.2492, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.4574428196475441, | |
| "grad_norm": 0.8279352784156799, | |
| "learning_rate": 0.00017778359920212047, | |
| "loss": 1.2509, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.45894263217097864, | |
| "grad_norm": 0.7695614695549011, | |
| "learning_rate": 0.0001770672006832741, | |
| "loss": 1.2375, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.4604424446944132, | |
| "grad_norm": 0.8258097767829895, | |
| "learning_rate": 0.00017635016340619255, | |
| "loss": 1.2286, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.46194225721784776, | |
| "grad_norm": 0.7904211282730103, | |
| "learning_rate": 0.00017563250429222173, | |
| "loss": 1.2527, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.4634420697412823, | |
| "grad_norm": 0.8494943976402283, | |
| "learning_rate": 0.00017491424027738216, | |
| "loss": 1.2484, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.4649418822647169, | |
| "grad_norm": 0.8655872941017151, | |
| "learning_rate": 0.0001741953883119696, | |
| "loss": 1.2172, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.4664416947881515, | |
| "grad_norm": 0.8473703861236572, | |
| "learning_rate": 0.00017347596536015472, | |
| "loss": 1.2376, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.46794150731158607, | |
| "grad_norm": 0.8274358510971069, | |
| "learning_rate": 0.00017275598839958296, | |
| "loss": 1.2458, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.46944131983502063, | |
| "grad_norm": 0.8414213061332703, | |
| "learning_rate": 0.00017203547442097369, | |
| "loss": 1.233, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.4709411323584552, | |
| "grad_norm": 0.8282588720321655, | |
| "learning_rate": 0.0001713144404277195, | |
| "loss": 1.2398, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.47244094488188976, | |
| "grad_norm": 0.8157975077629089, | |
| "learning_rate": 0.0001705929034354846, | |
| "loss": 1.2236, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.4739407574053243, | |
| "grad_norm": 0.8301715850830078, | |
| "learning_rate": 0.0001698708804718037, | |
| "loss": 1.2214, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.4754405699287589, | |
| "grad_norm": 0.7506479620933533, | |
| "learning_rate": 0.00016914838857567979, | |
| "loss": 1.2332, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.4769403824521935, | |
| "grad_norm": 0.8075399994850159, | |
| "learning_rate": 0.00016842544479718215, | |
| "loss": 1.2344, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.47844019497562806, | |
| "grad_norm": 0.9165489673614502, | |
| "learning_rate": 0.00016770206619704412, | |
| "loss": 1.2393, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.4799400074990626, | |
| "grad_norm": 0.7626012563705444, | |
| "learning_rate": 0.0001669782698462603, | |
| "loss": 1.2273, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4814398200224972, | |
| "grad_norm": 0.8293296694755554, | |
| "learning_rate": 0.00016625407282568394, | |
| "loss": 1.2378, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.48293963254593175, | |
| "grad_norm": 0.7572868466377258, | |
| "learning_rate": 0.00016552949222562352, | |
| "loss": 1.2449, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.4844394450693663, | |
| "grad_norm": 0.7891345024108887, | |
| "learning_rate": 0.00016480454514543962, | |
| "loss": 1.2336, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.4859392575928009, | |
| "grad_norm": 0.7002502083778381, | |
| "learning_rate": 0.00016407924869314144, | |
| "loss": 1.2249, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.4874390701162355, | |
| "grad_norm": 0.811424732208252, | |
| "learning_rate": 0.00016335361998498296, | |
| "loss": 1.2053, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.48893888263967006, | |
| "grad_norm": 0.8211063146591187, | |
| "learning_rate": 0.00016262767614505912, | |
| "loss": 1.2139, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.4904386951631046, | |
| "grad_norm": 0.8250209093093872, | |
| "learning_rate": 0.00016190143430490152, | |
| "loss": 1.2163, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.4919385076865392, | |
| "grad_norm": 0.7996110320091248, | |
| "learning_rate": 0.00016117491160307445, | |
| "loss": 1.2361, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.49343832020997375, | |
| "grad_norm": 0.8566946983337402, | |
| "learning_rate": 0.00016044812518477007, | |
| "loss": 1.2353, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.4949381327334083, | |
| "grad_norm": 0.7891775369644165, | |
| "learning_rate": 0.00015972109220140402, | |
| "loss": 1.2174, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.49643794525684287, | |
| "grad_norm": 0.769114077091217, | |
| "learning_rate": 0.0001589938298102108, | |
| "loss": 1.2293, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.4979377577802775, | |
| "grad_norm": 0.8417636156082153, | |
| "learning_rate": 0.0001582663551738384, | |
| "loss": 1.2303, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.49943757030371205, | |
| "grad_norm": 0.7648106813430786, | |
| "learning_rate": 0.00015753868545994378, | |
| "loss": 1.2287, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.5001874765654293, | |
| "eval_loss": 1.2699334621429443, | |
| "eval_runtime": 34.8375, | |
| "eval_samples_per_second": 717.618, | |
| "eval_steps_per_second": 89.702, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 0.5009373828271466, | |
| "grad_norm": 0.8185369372367859, | |
| "learning_rate": 0.00015681083784078748, | |
| "loss": 1.2221, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.5024371953505812, | |
| "grad_norm": 0.8825401067733765, | |
| "learning_rate": 0.00015608282949282844, | |
| "loss": 1.2339, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.5039370078740157, | |
| "grad_norm": 0.8161605596542358, | |
| "learning_rate": 0.00015535467759631862, | |
| "loss": 1.2352, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.5054368203974503, | |
| "grad_norm": 0.8088217973709106, | |
| "learning_rate": 0.00015462639933489753, | |
| "loss": 1.2212, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.5069366329208849, | |
| "grad_norm": 0.8628859519958496, | |
| "learning_rate": 0.00015389801189518693, | |
| "loss": 1.2222, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.5084364454443194, | |
| "grad_norm": 0.8813005089759827, | |
| "learning_rate": 0.00015316953246638482, | |
| "loss": 1.2165, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.509936257967754, | |
| "grad_norm": 0.7920242547988892, | |
| "learning_rate": 0.00015244097823986023, | |
| "loss": 1.2197, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5114360704911886, | |
| "grad_norm": 0.8255246877670288, | |
| "learning_rate": 0.0001517123664087473, | |
| "loss": 1.2359, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.5129358830146231, | |
| "grad_norm": 0.8508349657058716, | |
| "learning_rate": 0.00015098371416753963, | |
| "loss": 1.2222, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.5144356955380578, | |
| "grad_norm": 0.870098352432251, | |
| "learning_rate": 0.00015025503871168432, | |
| "loss": 1.2107, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.5159355080614924, | |
| "grad_norm": 0.7887611985206604, | |
| "learning_rate": 0.00014952635723717642, | |
| "loss": 1.2469, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.5174353205849269, | |
| "grad_norm": 0.8463996052742004, | |
| "learning_rate": 0.000148797686940153, | |
| "loss": 1.2251, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.5189351331083615, | |
| "grad_norm": 0.8258456587791443, | |
| "learning_rate": 0.0001480690450164873, | |
| "loss": 1.2241, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.520434945631796, | |
| "grad_norm": 0.8315289616584778, | |
| "learning_rate": 0.00014734044866138312, | |
| "loss": 1.1998, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.5219347581552306, | |
| "grad_norm": 0.8053680062294006, | |
| "learning_rate": 0.00014661191506896867, | |
| "loss": 1.2062, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.5234345706786652, | |
| "grad_norm": 0.7926360368728638, | |
| "learning_rate": 0.0001458834614318912, | |
| "loss": 1.2092, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.5249343832020997, | |
| "grad_norm": 0.7700707912445068, | |
| "learning_rate": 0.00014515510494091102, | |
| "loss": 1.2077, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.5264341957255343, | |
| "grad_norm": 0.8049729466438293, | |
| "learning_rate": 0.00014442686278449588, | |
| "loss": 1.2134, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.5279340082489689, | |
| "grad_norm": 0.8229398727416992, | |
| "learning_rate": 0.00014369875214841548, | |
| "loss": 1.2235, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.5294338207724034, | |
| "grad_norm": 0.7805364727973938, | |
| "learning_rate": 0.0001429707902153355, | |
| "loss": 1.2315, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.530933633295838, | |
| "grad_norm": 0.7947820425033569, | |
| "learning_rate": 0.0001422429941644127, | |
| "loss": 1.2166, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.5324334458192725, | |
| "grad_norm": 0.7772228121757507, | |
| "learning_rate": 0.000141515381170889, | |
| "loss": 1.2267, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.5339332583427071, | |
| "grad_norm": 0.7693078517913818, | |
| "learning_rate": 0.00014078796840568647, | |
| "loss": 1.2173, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.5354330708661418, | |
| "grad_norm": 0.8406286239624023, | |
| "learning_rate": 0.0001400607730350018, | |
| "loss": 1.2161, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.5369328833895763, | |
| "grad_norm": 0.7883846163749695, | |
| "learning_rate": 0.0001393338122199016, | |
| "loss": 1.23, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.5384326959130109, | |
| "grad_norm": 0.7825552821159363, | |
| "learning_rate": 0.00013860710311591713, | |
| "loss": 1.2165, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.5399325084364455, | |
| "grad_norm": 0.9384158253669739, | |
| "learning_rate": 0.00013788066287263946, | |
| "loss": 1.2197, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.54143232095988, | |
| "grad_norm": 0.888299286365509, | |
| "learning_rate": 0.00013715450863331495, | |
| "loss": 1.2082, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.5429321334833146, | |
| "grad_norm": 0.8535633087158203, | |
| "learning_rate": 0.00013642865753444043, | |
| "loss": 1.212, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.5444319460067492, | |
| "grad_norm": 0.8287407159805298, | |
| "learning_rate": 0.000135703126705359, | |
| "loss": 1.2009, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.5459317585301837, | |
| "grad_norm": 0.7710711359977722, | |
| "learning_rate": 0.00013497793326785573, | |
| "loss": 1.2205, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.5474315710536183, | |
| "grad_norm": 0.7808111310005188, | |
| "learning_rate": 0.00013425309433575365, | |
| "loss": 1.2149, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.5489313835770528, | |
| "grad_norm": 0.7702746987342834, | |
| "learning_rate": 0.0001335286270145096, | |
| "loss": 1.197, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.5504311961004874, | |
| "grad_norm": 0.832952618598938, | |
| "learning_rate": 0.00013280454840081105, | |
| "loss": 1.2075, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.551931008623922, | |
| "grad_norm": 0.7749794125556946, | |
| "learning_rate": 0.0001320808755821722, | |
| "loss": 1.2136, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.5534308211473565, | |
| "grad_norm": 0.8737798929214478, | |
| "learning_rate": 0.00013135762563653097, | |
| "loss": 1.2017, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.5549306336707911, | |
| "grad_norm": 0.8434765934944153, | |
| "learning_rate": 0.00013063481563184589, | |
| "loss": 1.1912, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5564304461942258, | |
| "grad_norm": 0.7952425479888916, | |
| "learning_rate": 0.00012991246262569327, | |
| "loss": 1.2148, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.5579302587176603, | |
| "grad_norm": 0.8273133039474487, | |
| "learning_rate": 0.00012919058366486492, | |
| "loss": 1.219, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.5594300712410949, | |
| "grad_norm": 0.86753249168396, | |
| "learning_rate": 0.00012846919578496545, | |
| "loss": 1.1893, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.5609298837645295, | |
| "grad_norm": 0.8196529746055603, | |
| "learning_rate": 0.00012774831601001054, | |
| "loss": 1.2166, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.562429696287964, | |
| "grad_norm": 0.9246373772621155, | |
| "learning_rate": 0.00012702796135202518, | |
| "loss": 1.2296, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.5639295088113986, | |
| "grad_norm": 0.8339414000511169, | |
| "learning_rate": 0.00012630814881064206, | |
| "loss": 1.2164, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.5654293213348331, | |
| "grad_norm": 0.8195322155952454, | |
| "learning_rate": 0.00012558889537270048, | |
| "loss": 1.2031, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.5669291338582677, | |
| "grad_norm": 0.8975194096565247, | |
| "learning_rate": 0.0001248702180118455, | |
| "loss": 1.2236, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.5684289463817023, | |
| "grad_norm": 0.7996919751167297, | |
| "learning_rate": 0.00012415213368812731, | |
| "loss": 1.1993, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.5699287589051368, | |
| "grad_norm": 0.7755063772201538, | |
| "learning_rate": 0.00012343465934760102, | |
| "loss": 1.2084, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.8294559717178345, | |
| "learning_rate": 0.00012271781192192688, | |
| "loss": 1.2175, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.572928383952006, | |
| "grad_norm": 0.8047690391540527, | |
| "learning_rate": 0.00012200160832797046, | |
| "loss": 1.1986, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.5744281964754405, | |
| "grad_norm": 0.7778790593147278, | |
| "learning_rate": 0.0001212860654674036, | |
| "loss": 1.2211, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.5759280089988752, | |
| "grad_norm": 0.8431829810142517, | |
| "learning_rate": 0.00012057120022630546, | |
| "loss": 1.2089, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.5774278215223098, | |
| "grad_norm": 0.8307316899299622, | |
| "learning_rate": 0.00011985702947476424, | |
| "loss": 1.2035, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5789276340457443, | |
| "grad_norm": 0.9299211502075195, | |
| "learning_rate": 0.00011914357006647877, | |
| "loss": 1.1933, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.5804274465691789, | |
| "grad_norm": 0.9003536105155945, | |
| "learning_rate": 0.00011843083883836084, | |
| "loss": 1.2093, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.5819272590926134, | |
| "grad_norm": 0.8364601731300354, | |
| "learning_rate": 0.0001177188526101381, | |
| "loss": 1.2051, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.583427071616048, | |
| "grad_norm": 0.877537190914154, | |
| "learning_rate": 0.00011700762818395682, | |
| "loss": 1.213, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.5849268841394826, | |
| "grad_norm": 0.820450484752655, | |
| "learning_rate": 0.0001162971823439856, | |
| "loss": 1.2025, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5864266966629171, | |
| "grad_norm": 0.7949150800704956, | |
| "learning_rate": 0.00011558753185601922, | |
| "loss": 1.2006, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.5879265091863517, | |
| "grad_norm": 0.7832253575325012, | |
| "learning_rate": 0.00011487869346708289, | |
| "loss": 1.1894, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.5894263217097863, | |
| "grad_norm": 0.7712039351463318, | |
| "learning_rate": 0.00011417068390503716, | |
| "loss": 1.2076, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.5909261342332208, | |
| "grad_norm": 0.8228356838226318, | |
| "learning_rate": 0.00011346351987818307, | |
| "loss": 1.1907, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.5924259467566554, | |
| "grad_norm": 0.8279032111167908, | |
| "learning_rate": 0.00011275721807486805, | |
| "loss": 1.2132, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.59392575928009, | |
| "grad_norm": 0.8680859208106995, | |
| "learning_rate": 0.00011205179516309172, | |
| "loss": 1.199, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.5954255718035245, | |
| "grad_norm": 0.9166079759597778, | |
| "learning_rate": 0.00011134726779011288, | |
| "loss": 1.2, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.5969253843269592, | |
| "grad_norm": 0.8387262225151062, | |
| "learning_rate": 0.00011064365258205658, | |
| "loss": 1.1922, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.5984251968503937, | |
| "grad_norm": 0.7966826558113098, | |
| "learning_rate": 0.00010994096614352153, | |
| "loss": 1.2041, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.5999250093738283, | |
| "grad_norm": 0.8371879458427429, | |
| "learning_rate": 0.00010923922505718863, | |
| "loss": 1.2041, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6002249718785152, | |
| "eval_loss": 1.2461844682693481, | |
| "eval_runtime": 34.5014, | |
| "eval_samples_per_second": 724.607, | |
| "eval_steps_per_second": 90.576, | |
| "step": 4002 | |
| }, | |
| { | |
| "epoch": 0.6014248218972629, | |
| "grad_norm": 0.8346044421195984, | |
| "learning_rate": 0.00010853844588342926, | |
| "loss": 1.1886, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.6029246344206974, | |
| "grad_norm": 0.7978150248527527, | |
| "learning_rate": 0.00010783864515991481, | |
| "loss": 1.2161, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.604424446944132, | |
| "grad_norm": 0.8575156331062317, | |
| "learning_rate": 0.00010713983940122617, | |
| "loss": 1.2171, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.6059242594675666, | |
| "grad_norm": 0.8124422430992126, | |
| "learning_rate": 0.00010644204509846398, | |
| "loss": 1.1864, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.6074240719910011, | |
| "grad_norm": 0.8295300006866455, | |
| "learning_rate": 0.00010574527871885977, | |
| "loss": 1.2312, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.6089238845144357, | |
| "grad_norm": 0.8077391982078552, | |
| "learning_rate": 0.00010504955670538699, | |
| "loss": 1.2036, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.6104236970378702, | |
| "grad_norm": 0.839034914970398, | |
| "learning_rate": 0.00010435489547637316, | |
| "loss": 1.2101, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.6119235095613048, | |
| "grad_norm": 0.7653098106384277, | |
| "learning_rate": 0.00010366131142511228, | |
| "loss": 1.2127, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.6134233220847394, | |
| "grad_norm": 0.7839916944503784, | |
| "learning_rate": 0.00010296882091947826, | |
| "loss": 1.1972, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.6149231346081739, | |
| "grad_norm": 0.8738676905632019, | |
| "learning_rate": 0.00010227744030153821, | |
| "loss": 1.1864, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.6164229471316085, | |
| "grad_norm": 0.8577001094818115, | |
| "learning_rate": 0.0001015871858871672, | |
| "loss": 1.2282, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.6179227596550432, | |
| "grad_norm": 0.8081793785095215, | |
| "learning_rate": 0.00010089807396566306, | |
| "loss": 1.2156, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.6194225721784777, | |
| "grad_norm": 0.8585608601570129, | |
| "learning_rate": 0.00010021012079936174, | |
| "loss": 1.1875, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.6209223847019123, | |
| "grad_norm": 0.8267788290977478, | |
| "learning_rate": 9.952334262325399e-05, | |
| "loss": 1.175, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.6224221972253469, | |
| "grad_norm": 0.840057373046875, | |
| "learning_rate": 9.883775564460193e-05, | |
| "loss": 1.1884, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.6239220097487814, | |
| "grad_norm": 0.8279968500137329, | |
| "learning_rate": 9.815337604255665e-05, | |
| "loss": 1.1902, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.625421822272216, | |
| "grad_norm": 0.7800420522689819, | |
| "learning_rate": 9.747021996777624e-05, | |
| "loss": 1.1982, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.6269216347956506, | |
| "grad_norm": 0.7733381390571594, | |
| "learning_rate": 9.678830354204504e-05, | |
| "loss": 1.2104, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.6284214473190851, | |
| "grad_norm": 0.8349838256835938, | |
| "learning_rate": 9.610764285789271e-05, | |
| "loss": 1.1976, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.6299212598425197, | |
| "grad_norm": 0.8445114493370056, | |
| "learning_rate": 9.542825397821485e-05, | |
| "loss": 1.1877, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6314210723659542, | |
| "grad_norm": 0.8512496948242188, | |
| "learning_rate": 9.475015293589373e-05, | |
| "loss": 1.1979, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.6329208848893888, | |
| "grad_norm": 0.817348837852478, | |
| "learning_rate": 9.407335573341997e-05, | |
| "loss": 1.1888, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.6344206974128234, | |
| "grad_norm": 0.8680126070976257, | |
| "learning_rate": 9.339787834251489e-05, | |
| "loss": 1.1961, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.6359205099362579, | |
| "grad_norm": 0.82338547706604, | |
| "learning_rate": 9.272373670375362e-05, | |
| "loss": 1.196, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.6374203224596925, | |
| "grad_norm": 0.7532997131347656, | |
| "learning_rate": 9.205094672618889e-05, | |
| "loss": 1.1987, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.6389201349831272, | |
| "grad_norm": 0.8462244272232056, | |
| "learning_rate": 9.137952428697568e-05, | |
| "loss": 1.1696, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.6404199475065617, | |
| "grad_norm": 0.7773210406303406, | |
| "learning_rate": 9.070948523099643e-05, | |
| "loss": 1.1903, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.6419197600299963, | |
| "grad_norm": 0.8117038607597351, | |
| "learning_rate": 9.004084537048708e-05, | |
| "loss": 1.1968, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.6434195725534309, | |
| "grad_norm": 0.7861129641532898, | |
| "learning_rate": 8.937362048466404e-05, | |
| "loss": 1.1933, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.6449193850768654, | |
| "grad_norm": 0.777773916721344, | |
| "learning_rate": 8.870782631935184e-05, | |
| "loss": 1.2017, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6464191976003, | |
| "grad_norm": 0.8288027048110962, | |
| "learning_rate": 8.804347858661131e-05, | |
| "loss": 1.1937, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.6479190101237345, | |
| "grad_norm": 0.8184522390365601, | |
| "learning_rate": 8.73805929643691e-05, | |
| "loss": 1.1911, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.6494188226471691, | |
| "grad_norm": 0.7932900190353394, | |
| "learning_rate": 8.67191850960475e-05, | |
| "loss": 1.1875, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.6509186351706037, | |
| "grad_norm": 0.8443998694419861, | |
| "learning_rate": 8.605927059019528e-05, | |
| "loss": 1.1897, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.6524184476940382, | |
| "grad_norm": 0.8229474425315857, | |
| "learning_rate": 8.540086502011935e-05, | |
| "loss": 1.2059, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.6539182602174728, | |
| "grad_norm": 0.7777069211006165, | |
| "learning_rate": 8.47439839235174e-05, | |
| "loss": 1.2047, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.6554180727409074, | |
| "grad_norm": 0.8484175205230713, | |
| "learning_rate": 8.408864280211115e-05, | |
| "loss": 1.1743, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.6569178852643419, | |
| "grad_norm": 0.8335903882980347, | |
| "learning_rate": 8.343485712128026e-05, | |
| "loss": 1.1826, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.6584176977877765, | |
| "grad_norm": 0.827379584312439, | |
| "learning_rate": 8.278264230969769e-05, | |
| "loss": 1.1924, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.6599175103112112, | |
| "grad_norm": 0.7646244168281555, | |
| "learning_rate": 8.213201375896563e-05, | |
| "loss": 1.1829, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6614173228346457, | |
| "grad_norm": 0.7910548448562622, | |
| "learning_rate": 8.14829868232519e-05, | |
| "loss": 1.1861, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.6629171353580803, | |
| "grad_norm": 0.8238165974617004, | |
| "learning_rate": 8.083557681892797e-05, | |
| "loss": 1.1852, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.6644169478815148, | |
| "grad_norm": 0.8701585531234741, | |
| "learning_rate": 8.018979902420746e-05, | |
| "loss": 1.1935, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.6659167604049494, | |
| "grad_norm": 0.8514190316200256, | |
| "learning_rate": 7.954566867878538e-05, | |
| "loss": 1.1901, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.667416572928384, | |
| "grad_norm": 0.8281582593917847, | |
| "learning_rate": 7.890320098347861e-05, | |
| "loss": 1.1747, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.6689163854518185, | |
| "grad_norm": 0.8169477581977844, | |
| "learning_rate": 7.82624110998673e-05, | |
| "loss": 1.182, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.6704161979752531, | |
| "grad_norm": 0.9043864011764526, | |
| "learning_rate": 7.762331414993697e-05, | |
| "loss": 1.1811, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.6719160104986877, | |
| "grad_norm": 0.9023800492286682, | |
| "learning_rate": 7.698592521572155e-05, | |
| "loss": 1.182, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.6734158230221222, | |
| "grad_norm": 0.8538755178451538, | |
| "learning_rate": 7.635025933894747e-05, | |
| "loss": 1.2004, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.6749156355455568, | |
| "grad_norm": 0.8358992338180542, | |
| "learning_rate": 7.571633152067901e-05, | |
| "loss": 1.1949, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6764154480689913, | |
| "grad_norm": 0.8562530279159546, | |
| "learning_rate": 7.508415672096389e-05, | |
| "loss": 1.1891, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.6779152605924259, | |
| "grad_norm": 0.867493212223053, | |
| "learning_rate": 7.445374985848035e-05, | |
| "loss": 1.172, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.6794150731158605, | |
| "grad_norm": 0.8086623549461365, | |
| "learning_rate": 7.382512581018514e-05, | |
| "loss": 1.2105, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.6809148856392951, | |
| "grad_norm": 0.8249533176422119, | |
| "learning_rate": 7.31982994109626e-05, | |
| "loss": 1.1874, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.6824146981627297, | |
| "grad_norm": 0.7889946103096008, | |
| "learning_rate": 7.25732854532741e-05, | |
| "loss": 1.1974, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.6839145106861643, | |
| "grad_norm": 0.8312809467315674, | |
| "learning_rate": 7.195009868680954e-05, | |
| "loss": 1.1875, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.6854143232095988, | |
| "grad_norm": 0.8431591987609863, | |
| "learning_rate": 7.13287538181387e-05, | |
| "loss": 1.1794, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.6869141357330334, | |
| "grad_norm": 0.8547993302345276, | |
| "learning_rate": 7.070926551036469e-05, | |
| "loss": 1.1723, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.688413948256468, | |
| "grad_norm": 0.8453377485275269, | |
| "learning_rate": 7.009164838277754e-05, | |
| "loss": 1.1835, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.6899137607799025, | |
| "grad_norm": 0.8473166823387146, | |
| "learning_rate": 6.947591701050932e-05, | |
| "loss": 1.2166, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.6914135733033371, | |
| "grad_norm": 0.882140040397644, | |
| "learning_rate": 6.886208592419043e-05, | |
| "loss": 1.1916, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.6929133858267716, | |
| "grad_norm": 0.7941833138465881, | |
| "learning_rate": 6.825016960960616e-05, | |
| "loss": 1.1955, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.6944131983502062, | |
| "grad_norm": 0.8405663967132568, | |
| "learning_rate": 6.764018250735532e-05, | |
| "loss": 1.1734, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.6959130108736408, | |
| "grad_norm": 0.8553961515426636, | |
| "learning_rate": 6.703213901250931e-05, | |
| "loss": 1.1743, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.6974128233970753, | |
| "grad_norm": 0.8203520178794861, | |
| "learning_rate": 6.64260534742723e-05, | |
| "loss": 1.181, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.6989126359205099, | |
| "grad_norm": 0.9138359427452087, | |
| "learning_rate": 6.582194019564266e-05, | |
| "loss": 1.1663, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.700262467191601, | |
| "eval_loss": 1.2265734672546387, | |
| "eval_runtime": 34.5391, | |
| "eval_samples_per_second": 723.818, | |
| "eval_steps_per_second": 90.477, | |
| "step": 4669 | |
| }, | |
| { | |
| "epoch": 0.7004124484439445, | |
| "grad_norm": 0.8695697784423828, | |
| "learning_rate": 6.521981343307554e-05, | |
| "loss": 1.186, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.7019122609673791, | |
| "grad_norm": 0.7855265140533447, | |
| "learning_rate": 6.461968739614639e-05, | |
| "loss": 1.1716, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.7034120734908137, | |
| "grad_norm": 0.8557237982749939, | |
| "learning_rate": 6.402157624721546e-05, | |
| "loss": 1.1854, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.7049118860142483, | |
| "grad_norm": 0.9032356142997742, | |
| "learning_rate": 6.342549410109372e-05, | |
| "loss": 1.1622, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7064116985376828, | |
| "grad_norm": 0.845805287361145, | |
| "learning_rate": 6.283145502470976e-05, | |
| "loss": 1.1895, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.7079115110611174, | |
| "grad_norm": 0.8459937572479248, | |
| "learning_rate": 6.223947303677793e-05, | |
| "loss": 1.1948, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.709411323584552, | |
| "grad_norm": 0.7924135327339172, | |
| "learning_rate": 6.164956210746723e-05, | |
| "loss": 1.1826, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.7109111361079865, | |
| "grad_norm": 0.8918642997741699, | |
| "learning_rate": 6.106173615807186e-05, | |
| "loss": 1.1798, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.7124109486314211, | |
| "grad_norm": 0.7782163619995117, | |
| "learning_rate": 6.047600906068269e-05, | |
| "loss": 1.1677, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.7139107611548556, | |
| "grad_norm": 0.8455360531806946, | |
| "learning_rate": 5.989239463785971e-05, | |
| "loss": 1.1956, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.7154105736782902, | |
| "grad_norm": 0.8486021161079407, | |
| "learning_rate": 5.9310906662306125e-05, | |
| "loss": 1.1881, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.7169103862017248, | |
| "grad_norm": 0.8694136142730713, | |
| "learning_rate": 5.8731558856542935e-05, | |
| "loss": 1.1795, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.7184101987251593, | |
| "grad_norm": 0.8481091260910034, | |
| "learning_rate": 5.8154364892585574e-05, | |
| "loss": 1.1663, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.7199100112485939, | |
| "grad_norm": 0.7921723127365112, | |
| "learning_rate": 5.75793383916208e-05, | |
| "loss": 1.1648, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7214098237720284, | |
| "grad_norm": 0.8086884021759033, | |
| "learning_rate": 5.70064929236855e-05, | |
| "loss": 1.181, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.7229096362954631, | |
| "grad_norm": 0.854877233505249, | |
| "learning_rate": 5.643584200734659e-05, | |
| "loss": 1.1877, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.7244094488188977, | |
| "grad_norm": 0.9068853259086609, | |
| "learning_rate": 5.586739910938161e-05, | |
| "loss": 1.184, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.7259092613423322, | |
| "grad_norm": 0.7965474724769592, | |
| "learning_rate": 5.5301177644461164e-05, | |
| "loss": 1.1629, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.7274090738657668, | |
| "grad_norm": 0.8608559966087341, | |
| "learning_rate": 5.4737190974832426e-05, | |
| "loss": 1.1807, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.7289088863892014, | |
| "grad_norm": 0.8948774337768555, | |
| "learning_rate": 5.417545241000353e-05, | |
| "loss": 1.1759, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.7304086989126359, | |
| "grad_norm": 0.9525447487831116, | |
| "learning_rate": 5.361597520642981e-05, | |
| "loss": 1.1643, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.7319085114360705, | |
| "grad_norm": 0.905668318271637, | |
| "learning_rate": 5.3058772567200595e-05, | |
| "loss": 1.1799, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.7334083239595051, | |
| "grad_norm": 0.8792032599449158, | |
| "learning_rate": 5.250385764172802e-05, | |
| "loss": 1.1766, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.7349081364829396, | |
| "grad_norm": 0.9094644784927368, | |
| "learning_rate": 5.195124352543636e-05, | |
| "loss": 1.192, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7364079490063742, | |
| "grad_norm": 0.8372027277946472, | |
| "learning_rate": 5.140094325945323e-05, | |
| "loss": 1.1655, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.7379077615298087, | |
| "grad_norm": 0.9662985801696777, | |
| "learning_rate": 5.085296983030164e-05, | |
| "loss": 1.1926, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.7394075740532433, | |
| "grad_norm": 0.8691169023513794, | |
| "learning_rate": 5.030733616959384e-05, | |
| "loss": 1.1785, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.7409073865766779, | |
| "grad_norm": 0.8685835599899292, | |
| "learning_rate": 4.976405515372577e-05, | |
| "loss": 1.182, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.7424071991001124, | |
| "grad_norm": 0.8488807082176208, | |
| "learning_rate": 4.922313960357336e-05, | |
| "loss": 1.1782, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.7439070116235471, | |
| "grad_norm": 0.8728071451187134, | |
| "learning_rate": 4.868460228419003e-05, | |
| "loss": 1.1837, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.7454068241469817, | |
| "grad_norm": 0.8640130162239075, | |
| "learning_rate": 4.814845590450544e-05, | |
| "loss": 1.1762, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.7469066366704162, | |
| "grad_norm": 0.861436128616333, | |
| "learning_rate": 4.761471311702541e-05, | |
| "loss": 1.1605, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.7484064491938508, | |
| "grad_norm": 0.9322028756141663, | |
| "learning_rate": 4.70833865175334e-05, | |
| "loss": 1.1804, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.7499062617172854, | |
| "grad_norm": 0.8335092663764954, | |
| "learning_rate": 4.6554488644793555e-05, | |
| "loss": 1.1822, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7514060742407199, | |
| "grad_norm": 0.8994686603546143, | |
| "learning_rate": 4.602803198025429e-05, | |
| "loss": 1.1805, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.7529058867641545, | |
| "grad_norm": 0.9040279388427734, | |
| "learning_rate": 4.550402894775408e-05, | |
| "loss": 1.1593, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.754405699287589, | |
| "grad_norm": 0.8711543083190918, | |
| "learning_rate": 4.49824919132283e-05, | |
| "loss": 1.1526, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.7559055118110236, | |
| "grad_norm": 0.8705037832260132, | |
| "learning_rate": 4.446343318441719e-05, | |
| "loss": 1.1688, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.7574053243344582, | |
| "grad_norm": 0.8655483722686768, | |
| "learning_rate": 4.394686501057553e-05, | |
| "loss": 1.1758, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.7589051368578927, | |
| "grad_norm": 0.813109815120697, | |
| "learning_rate": 4.343279958218352e-05, | |
| "loss": 1.1772, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.7604049493813273, | |
| "grad_norm": 0.8437618613243103, | |
| "learning_rate": 4.29212490306592e-05, | |
| "loss": 1.1725, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 0.908439576625824, | |
| "learning_rate": 4.241222542807211e-05, | |
| "loss": 1.1821, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.7634045744281964, | |
| "grad_norm": 0.8363406658172607, | |
| "learning_rate": 4.19057407868583e-05, | |
| "loss": 1.1671, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.7649043869516311, | |
| "grad_norm": 0.8935152888298035, | |
| "learning_rate": 4.140180705953689e-05, | |
| "loss": 1.1831, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.7664041994750657, | |
| "grad_norm": 0.9219470620155334, | |
| "learning_rate": 4.090043613842823e-05, | |
| "loss": 1.1837, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.7679040119985002, | |
| "grad_norm": 0.8783389329910278, | |
| "learning_rate": 4.0401639855372884e-05, | |
| "loss": 1.1915, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.7694038245219348, | |
| "grad_norm": 0.8177280426025391, | |
| "learning_rate": 3.990542998145262e-05, | |
| "loss": 1.1598, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.7709036370453693, | |
| "grad_norm": 0.8821897506713867, | |
| "learning_rate": 3.941181822671273e-05, | |
| "loss": 1.1794, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.7724034495688039, | |
| "grad_norm": 0.8683632612228394, | |
| "learning_rate": 3.892081623988541e-05, | |
| "loss": 1.1904, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.7739032620922385, | |
| "grad_norm": 0.800815999507904, | |
| "learning_rate": 3.8432435608115e-05, | |
| "loss": 1.1648, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.775403074615673, | |
| "grad_norm": 0.8233063817024231, | |
| "learning_rate": 3.794668785668465e-05, | |
| "loss": 1.1718, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.7769028871391076, | |
| "grad_norm": 0.8883649110794067, | |
| "learning_rate": 3.7463584448744186e-05, | |
| "loss": 1.1682, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.7784026996625422, | |
| "grad_norm": 0.8076657652854919, | |
| "learning_rate": 3.6983136785039636e-05, | |
| "loss": 1.1635, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.7799025121859767, | |
| "grad_norm": 0.7962640523910522, | |
| "learning_rate": 3.650535620364407e-05, | |
| "loss": 1.1731, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.7814023247094113, | |
| "grad_norm": 0.8250430822372437, | |
| "learning_rate": 3.603025397969037e-05, | |
| "loss": 1.1603, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.7829021372328459, | |
| "grad_norm": 0.9766924977302551, | |
| "learning_rate": 3.555784132510472e-05, | |
| "loss": 1.1686, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.7844019497562804, | |
| "grad_norm": 0.9148427844047546, | |
| "learning_rate": 3.508812938834227e-05, | |
| "loss": 1.1703, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.7859017622797151, | |
| "grad_norm": 0.8292137980461121, | |
| "learning_rate": 3.4621129254124106e-05, | |
| "loss": 1.1565, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.7874015748031497, | |
| "grad_norm": 0.8619733452796936, | |
| "learning_rate": 3.415685194317539e-05, | |
| "loss": 1.1571, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.7889013873265842, | |
| "grad_norm": 0.8798223733901978, | |
| "learning_rate": 3.3695308411965564e-05, | |
| "loss": 1.1738, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.7904011998500188, | |
| "grad_norm": 0.8004907369613647, | |
| "learning_rate": 3.323650955244951e-05, | |
| "loss": 1.1769, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.7919010123734533, | |
| "grad_norm": 0.9049323201179504, | |
| "learning_rate": 3.2780466191810905e-05, | |
| "loss": 1.1641, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.7934008248968879, | |
| "grad_norm": 0.8505051136016846, | |
| "learning_rate": 3.232718909220631e-05, | |
| "loss": 1.1765, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.7949006374203225, | |
| "grad_norm": 0.878035843372345, | |
| "learning_rate": 3.187668895051135e-05, | |
| "loss": 1.1665, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.796400449943757, | |
| "grad_norm": 0.8694955706596375, | |
| "learning_rate": 3.14289763980683e-05, | |
| "loss": 1.1904, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.7979002624671916, | |
| "grad_norm": 0.917580783367157, | |
| "learning_rate": 3.0984062000435276e-05, | |
| "loss": 1.1738, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.7994000749906262, | |
| "grad_norm": 0.9352427124977112, | |
| "learning_rate": 3.054195625713668e-05, | |
| "loss": 1.1685, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.8002999625046869, | |
| "eval_loss": 1.210194706916809, | |
| "eval_runtime": 35.2273, | |
| "eval_samples_per_second": 709.676, | |
| "eval_steps_per_second": 88.71, | |
| "step": 5336 | |
| }, | |
| { | |
| "epoch": 0.8008998875140607, | |
| "grad_norm": 0.845475971698761, | |
| "learning_rate": 3.0102669601415575e-05, | |
| "loss": 1.1577, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.8023997000374953, | |
| "grad_norm": 0.8200697898864746, | |
| "learning_rate": 2.966621239998755e-05, | |
| "loss": 1.1577, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.8038995125609298, | |
| "grad_norm": 0.8095065355300903, | |
| "learning_rate": 2.9232594952795818e-05, | |
| "loss": 1.1589, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.8053993250843644, | |
| "grad_norm": 0.8357236385345459, | |
| "learning_rate": 2.8801827492768352e-05, | |
| "loss": 1.1808, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.8068991376077991, | |
| "grad_norm": 0.8971685171127319, | |
| "learning_rate": 2.8373920185576375e-05, | |
| "loss": 1.1649, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.8083989501312336, | |
| "grad_norm": 0.8829404711723328, | |
| "learning_rate": 2.7948883129394467e-05, | |
| "loss": 1.1626, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.8098987626546682, | |
| "grad_norm": 0.8451800346374512, | |
| "learning_rate": 2.7526726354662104e-05, | |
| "loss": 1.155, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.8113985751781028, | |
| "grad_norm": 0.8615455031394958, | |
| "learning_rate": 2.7107459823847106e-05, | |
| "loss": 1.1607, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.8128983877015373, | |
| "grad_norm": 0.9241278767585754, | |
| "learning_rate": 2.6691093431210596e-05, | |
| "loss": 1.1771, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.8143982002249719, | |
| "grad_norm": 0.8356271982192993, | |
| "learning_rate": 2.6277637002573288e-05, | |
| "loss": 1.1755, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.8158980127484065, | |
| "grad_norm": 0.8474392294883728, | |
| "learning_rate": 2.586710029508375e-05, | |
| "loss": 1.1652, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.817397825271841, | |
| "grad_norm": 0.9105897545814514, | |
| "learning_rate": 2.54594929969881e-05, | |
| "loss": 1.158, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.8188976377952756, | |
| "grad_norm": 0.8202654123306274, | |
| "learning_rate": 2.5054824727401502e-05, | |
| "loss": 1.1632, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.8203974503187101, | |
| "grad_norm": 0.8397065997123718, | |
| "learning_rate": 2.46531050360809e-05, | |
| "loss": 1.166, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.8218972628421447, | |
| "grad_norm": 0.8714698553085327, | |
| "learning_rate": 2.4254343403199945e-05, | |
| "loss": 1.1749, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.8233970753655793, | |
| "grad_norm": 0.8937433958053589, | |
| "learning_rate": 2.3858549239125034e-05, | |
| "loss": 1.1631, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.8248968878890138, | |
| "grad_norm": 0.8778429627418518, | |
| "learning_rate": 2.346573188419341e-05, | |
| "loss": 1.1661, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8263967004124484, | |
| "grad_norm": 0.7826377153396606, | |
| "learning_rate": 2.3075900608492637e-05, | |
| "loss": 1.1686, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.8278965129358831, | |
| "grad_norm": 1.108136773109436, | |
| "learning_rate": 2.2689064611641794e-05, | |
| "loss": 1.1926, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.8293963254593176, | |
| "grad_norm": 0.8729405403137207, | |
| "learning_rate": 2.230523302257461e-05, | |
| "loss": 1.1554, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.8308961379827522, | |
| "grad_norm": 0.824364185333252, | |
| "learning_rate": 2.192441489932372e-05, | |
| "loss": 1.1715, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.8323959505061868, | |
| "grad_norm": 0.8968133330345154, | |
| "learning_rate": 2.154661922880708e-05, | |
| "loss": 1.1549, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.8338957630296213, | |
| "grad_norm": 0.9061416983604431, | |
| "learning_rate": 2.117185492661592e-05, | |
| "loss": 1.1516, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.8353955755530559, | |
| "grad_norm": 0.8204578161239624, | |
| "learning_rate": 2.0800130836804214e-05, | |
| "loss": 1.1637, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.8368953880764904, | |
| "grad_norm": 0.8293260931968689, | |
| "learning_rate": 2.043145573168003e-05, | |
| "loss": 1.1604, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.838395200599925, | |
| "grad_norm": 0.8907037377357483, | |
| "learning_rate": 2.0065838311598543e-05, | |
| "loss": 1.1783, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.8398950131233596, | |
| "grad_norm": 0.8842608332633972, | |
| "learning_rate": 1.9703287204756757e-05, | |
| "loss": 1.1576, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8413948256467941, | |
| "grad_norm": 0.8065600991249084, | |
| "learning_rate": 1.9343810966989716e-05, | |
| "loss": 1.182, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.8428946381702287, | |
| "grad_norm": 0.8144896626472473, | |
| "learning_rate": 1.8987418081568683e-05, | |
| "loss": 1.1482, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.8443944506936633, | |
| "grad_norm": 0.8756045699119568, | |
| "learning_rate": 1.8634116959001106e-05, | |
| "loss": 1.1623, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.8458942632170978, | |
| "grad_norm": 0.9638755321502686, | |
| "learning_rate": 1.828391593683185e-05, | |
| "loss": 1.1479, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.8473940757405324, | |
| "grad_norm": 0.8920614123344421, | |
| "learning_rate": 1.7936823279446676e-05, | |
| "loss": 1.1531, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.8488938882639671, | |
| "grad_norm": 0.8344776630401611, | |
| "learning_rate": 1.7592847177877008e-05, | |
| "loss": 1.1642, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.8503937007874016, | |
| "grad_norm": 0.8521091341972351, | |
| "learning_rate": 1.725199574960689e-05, | |
| "loss": 1.1456, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.8518935133108362, | |
| "grad_norm": 0.8243123292922974, | |
| "learning_rate": 1.6914277038381145e-05, | |
| "loss": 1.1689, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.8533933258342707, | |
| "grad_norm": 0.8362585306167603, | |
| "learning_rate": 1.6579699014015783e-05, | |
| "loss": 1.1565, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.8548931383577053, | |
| "grad_norm": 0.8934555053710938, | |
| "learning_rate": 1.6248269572209716e-05, | |
| "loss": 1.158, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.8563929508811399, | |
| "grad_norm": 0.9755041599273682, | |
| "learning_rate": 1.5919996534358635e-05, | |
| "loss": 1.1412, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.8578927634045744, | |
| "grad_norm": 0.9437934160232544, | |
| "learning_rate": 1.5594887647370263e-05, | |
| "loss": 1.1528, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.859392575928009, | |
| "grad_norm": 0.8423501253128052, | |
| "learning_rate": 1.527295058348154e-05, | |
| "loss": 1.1374, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.8608923884514436, | |
| "grad_norm": 0.8530197739601135, | |
| "learning_rate": 1.4954192940077809e-05, | |
| "loss": 1.153, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.8623922009748781, | |
| "grad_norm": 0.9238439798355103, | |
| "learning_rate": 1.463862223951317e-05, | |
| "loss": 1.1658, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.8638920134983127, | |
| "grad_norm": 0.82196444272995, | |
| "learning_rate": 1.4326245928933178e-05, | |
| "loss": 1.1657, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.8653918260217472, | |
| "grad_norm": 0.8516616821289062, | |
| "learning_rate": 1.4017071380099132e-05, | |
| "loss": 1.1585, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.8668916385451818, | |
| "grad_norm": 0.8456748127937317, | |
| "learning_rate": 1.3711105889213908e-05, | |
| "loss": 1.1606, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.8683914510686164, | |
| "grad_norm": 0.8640986680984497, | |
| "learning_rate": 1.3408356676750043e-05, | |
| "loss": 1.1806, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.869891263592051, | |
| "grad_norm": 0.8690096735954285, | |
| "learning_rate": 1.310883088727902e-05, | |
| "loss": 1.1607, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.8713910761154856, | |
| "grad_norm": 0.8273411393165588, | |
| "learning_rate": 1.2812535589303024e-05, | |
| "loss": 1.1436, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.8728908886389202, | |
| "grad_norm": 0.7910122871398926, | |
| "learning_rate": 1.2519477775087805e-05, | |
| "loss": 1.1706, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.8743907011623547, | |
| "grad_norm": 0.9277138710021973, | |
| "learning_rate": 1.222966436049786e-05, | |
| "loss": 1.1801, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.8758905136857893, | |
| "grad_norm": 0.861088752746582, | |
| "learning_rate": 1.1943102184833165e-05, | |
| "loss": 1.1763, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.8773903262092239, | |
| "grad_norm": 0.8729887008666992, | |
| "learning_rate": 1.165979801066782e-05, | |
| "loss": 1.1571, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.8788901387326584, | |
| "grad_norm": 0.9203227162361145, | |
| "learning_rate": 1.1379758523690413e-05, | |
| "loss": 1.1756, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.880389951256093, | |
| "grad_norm": 0.8871293663978577, | |
| "learning_rate": 1.1102990332546175e-05, | |
| "loss": 1.1578, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.8818897637795275, | |
| "grad_norm": 0.8668881058692932, | |
| "learning_rate": 1.0829499968681204e-05, | |
| "loss": 1.1618, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.8833895763029621, | |
| "grad_norm": 0.8996675610542297, | |
| "learning_rate": 1.0559293886188246e-05, | |
| "loss": 1.1723, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.8848893888263967, | |
| "grad_norm": 0.892325758934021, | |
| "learning_rate": 1.029237846165426e-05, | |
| "loss": 1.1645, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.8863892013498312, | |
| "grad_norm": 0.8483745455741882, | |
| "learning_rate": 1.0028759994010071e-05, | |
| "loss": 1.1685, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.8878890138732658, | |
| "grad_norm": 0.8226345181465149, | |
| "learning_rate": 9.768444704381811e-06, | |
| "loss": 1.156, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.8893888263967004, | |
| "grad_norm": 0.9334031343460083, | |
| "learning_rate": 9.511438735943849e-06, | |
| "loss": 1.1732, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.890888638920135, | |
| "grad_norm": 0.9309195876121521, | |
| "learning_rate": 9.257748153773992e-06, | |
| "loss": 1.158, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.8923884514435696, | |
| "grad_norm": 0.899749219417572, | |
| "learning_rate": 9.007378944710431e-06, | |
| "loss": 1.1512, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.8938882639670042, | |
| "grad_norm": 0.8760167956352234, | |
| "learning_rate": 8.760337017210206e-06, | |
| "loss": 1.1453, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.8953880764904387, | |
| "grad_norm": 0.9029643535614014, | |
| "learning_rate": 8.516628201209985e-06, | |
| "loss": 1.1561, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.8968878890138733, | |
| "grad_norm": 0.8686094284057617, | |
| "learning_rate": 8.276258247988437e-06, | |
| "loss": 1.1569, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.8983877015373078, | |
| "grad_norm": 0.876531720161438, | |
| "learning_rate": 8.039232830030413e-06, | |
| "loss": 1.1651, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.8998875140607424, | |
| "grad_norm": 0.896596372127533, | |
| "learning_rate": 7.805557540893276e-06, | |
| "loss": 1.1709, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9003374578177727, | |
| "eval_loss": 1.203166127204895, | |
| "eval_runtime": 35.8498, | |
| "eval_samples_per_second": 697.355, | |
| "eval_steps_per_second": 87.169, | |
| "step": 6003 | |
| }, | |
| { | |
| "epoch": 0.901387326584177, | |
| "grad_norm": 0.8913058638572693, | |
| "learning_rate": 7.575237895074637e-06, | |
| "loss": 1.1691, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.9028871391076115, | |
| "grad_norm": 0.8780670166015625, | |
| "learning_rate": 7.348279327882467e-06, | |
| "loss": 1.1685, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.9043869516310461, | |
| "grad_norm": 0.828803539276123, | |
| "learning_rate": 7.1246871953066666e-06, | |
| "loss": 1.1532, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.9058867641544807, | |
| "grad_norm": 0.8630168437957764, | |
| "learning_rate": 6.9044667738927365e-06, | |
| "loss": 1.1641, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.9073865766779152, | |
| "grad_norm": 0.8399310111999512, | |
| "learning_rate": 6.6876232606172255e-06, | |
| "loss": 1.1596, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.9088863892013498, | |
| "grad_norm": 0.8821493983268738, | |
| "learning_rate": 6.4741617727651626e-06, | |
| "loss": 1.1501, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.9103862017247843, | |
| "grad_norm": 0.948297381401062, | |
| "learning_rate": 6.264087347809188e-06, | |
| "loss": 1.1734, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.911886014248219, | |
| "grad_norm": 0.9395800232887268, | |
| "learning_rate": 6.0574049432907115e-06, | |
| "loss": 1.1738, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.9133858267716536, | |
| "grad_norm": 0.8580440878868103, | |
| "learning_rate": 5.854119436702976e-06, | |
| "loss": 1.1677, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.9148856392950881, | |
| "grad_norm": 0.9488980770111084, | |
| "learning_rate": 5.65423562537593e-06, | |
| "loss": 1.1638, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.9163854518185227, | |
| "grad_norm": 0.8783366680145264, | |
| "learning_rate": 5.4577582263629235e-06, | |
| "loss": 1.1676, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.9178852643419573, | |
| "grad_norm": 0.8629846572875977, | |
| "learning_rate": 5.264691876329474e-06, | |
| "loss": 1.1426, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.9193850768653918, | |
| "grad_norm": 0.8540226221084595, | |
| "learning_rate": 5.075041131443891e-06, | |
| "loss": 1.1582, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.9208848893888264, | |
| "grad_norm": 0.8809103965759277, | |
| "learning_rate": 4.88881046726966e-06, | |
| "loss": 1.1514, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.922384701912261, | |
| "grad_norm": 0.9011795520782471, | |
| "learning_rate": 4.706004278659831e-06, | |
| "loss": 1.1543, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.9238845144356955, | |
| "grad_norm": 0.8636093139648438, | |
| "learning_rate": 4.526626879653428e-06, | |
| "loss": 1.1694, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.9253843269591301, | |
| "grad_norm": 0.8122191429138184, | |
| "learning_rate": 4.350682503373437e-06, | |
| "loss": 1.1508, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.9268841394825647, | |
| "grad_norm": 0.9113324880599976, | |
| "learning_rate": 4.178175301927101e-06, | |
| "loss": 1.1767, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.9283839520059992, | |
| "grad_norm": 0.8934022784233093, | |
| "learning_rate": 4.009109346307792e-06, | |
| "loss": 1.162, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.9298837645294338, | |
| "grad_norm": 0.8794459700584412, | |
| "learning_rate": 3.8434886262991015e-06, | |
| "loss": 1.167, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9313835770528683, | |
| "grad_norm": 0.9140746593475342, | |
| "learning_rate": 3.6813170503804834e-06, | |
| "loss": 1.1828, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.932883389576303, | |
| "grad_norm": 0.8573930859565735, | |
| "learning_rate": 3.522598445635172e-06, | |
| "loss": 1.138, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.9343832020997376, | |
| "grad_norm": 0.8401947021484375, | |
| "learning_rate": 3.3673365576598e-06, | |
| "loss": 1.1599, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.9358830146231721, | |
| "grad_norm": 0.9218401908874512, | |
| "learning_rate": 3.21553505047602e-06, | |
| "loss": 1.1699, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.9373828271466067, | |
| "grad_norm": 0.9082098603248596, | |
| "learning_rate": 3.067197506444058e-06, | |
| "loss": 1.1595, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.9388826396700413, | |
| "grad_norm": 0.9707618355751038, | |
| "learning_rate": 2.922327426178128e-06, | |
| "loss": 1.1417, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.9403824521934758, | |
| "grad_norm": 0.8763731718063354, | |
| "learning_rate": 2.7809282284638855e-06, | |
| "loss": 1.1839, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.9418822647169104, | |
| "grad_norm": 0.9058078527450562, | |
| "learning_rate": 2.643003250177672e-06, | |
| "loss": 1.147, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.943382077240345, | |
| "grad_norm": 0.817454993724823, | |
| "learning_rate": 2.5085557462078134e-06, | |
| "loss": 1.1457, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.9448818897637795, | |
| "grad_norm": 0.919282853603363, | |
| "learning_rate": 2.377588889377813e-06, | |
| "loss": 1.1738, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9463817022872141, | |
| "grad_norm": 0.9078910946846008, | |
| "learning_rate": 2.2501057703714797e-06, | |
| "loss": 1.175, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.9478815148106486, | |
| "grad_norm": 0.894939661026001, | |
| "learning_rate": 2.1261093976599365e-06, | |
| "loss": 1.1704, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.9493813273340832, | |
| "grad_norm": 0.8727539777755737, | |
| "learning_rate": 2.005602697430675e-06, | |
| "loss": 1.1619, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.9508811398575178, | |
| "grad_norm": 0.8867236971855164, | |
| "learning_rate": 1.8885885135184963e-06, | |
| "loss": 1.1693, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 0.8786768317222595, | |
| "learning_rate": 1.7750696073383974e-06, | |
| "loss": 1.1494, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.953880764904387, | |
| "grad_norm": 0.8469645380973816, | |
| "learning_rate": 1.6650486578203725e-06, | |
| "loss": 1.1619, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.9553805774278216, | |
| "grad_norm": 0.8775497674942017, | |
| "learning_rate": 1.558528261346248e-06, | |
| "loss": 1.1448, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.9568803899512561, | |
| "grad_norm": 0.9189411997795105, | |
| "learning_rate": 1.455510931688364e-06, | |
| "loss": 1.1539, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.9583802024746907, | |
| "grad_norm": 0.8403207659721375, | |
| "learning_rate": 1.3559990999502556e-06, | |
| "loss": 1.1644, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.9598800149981253, | |
| "grad_norm": 0.910416305065155, | |
| "learning_rate": 1.2599951145093157e-06, | |
| "loss": 1.1549, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.9613798275215598, | |
| "grad_norm": 0.8597015738487244, | |
| "learning_rate": 1.1675012409613715e-06, | |
| "loss": 1.1502, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.9628796400449944, | |
| "grad_norm": 0.8848598003387451, | |
| "learning_rate": 1.0785196620671455e-06, | |
| "loss": 1.1582, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.9643794525684289, | |
| "grad_norm": 0.8559622764587402, | |
| "learning_rate": 9.93052477700862e-07, | |
| "loss": 1.1679, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.9658792650918635, | |
| "grad_norm": 0.814378559589386, | |
| "learning_rate": 9.111017048005876e-07, | |
| "loss": 1.1639, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.9673790776152981, | |
| "grad_norm": 0.7913538217544556, | |
| "learning_rate": 8.326692773207189e-07, | |
| "loss": 1.1621, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.9688788901387326, | |
| "grad_norm": 0.947375476360321, | |
| "learning_rate": 7.577570461862359e-07, | |
| "loss": 1.158, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.9703787026621672, | |
| "grad_norm": 0.8357100486755371, | |
| "learning_rate": 6.863667792491534e-07, | |
| "loss": 1.1584, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 0.9718785151856018, | |
| "grad_norm": 0.880916953086853, | |
| "learning_rate": 6.185001612467044e-07, | |
| "loss": 1.1671, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.9733783277090363, | |
| "grad_norm": 0.8177213072776794, | |
| "learning_rate": 5.541587937616221e-07, | |
| "loss": 1.161, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 0.974878140232471, | |
| "grad_norm": 0.8296107053756714, | |
| "learning_rate": 4.933441951843198e-07, | |
| "loss": 1.1555, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9763779527559056, | |
| "grad_norm": 0.8631340861320496, | |
| "learning_rate": 4.360578006770865e-07, | |
| "loss": 1.1624, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 0.9778777652793401, | |
| "grad_norm": 0.8406107425689697, | |
| "learning_rate": 3.82300962140214e-07, | |
| "loss": 1.1615, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.9793775778027747, | |
| "grad_norm": 0.9254620671272278, | |
| "learning_rate": 3.320749481800888e-07, | |
| "loss": 1.1597, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 0.9808773903262092, | |
| "grad_norm": 0.9091020822525024, | |
| "learning_rate": 2.8538094407919987e-07, | |
| "loss": 1.1537, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.9823772028496438, | |
| "grad_norm": 0.8602820634841919, | |
| "learning_rate": 2.4222005176829375e-07, | |
| "loss": 1.1485, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.9838770153730784, | |
| "grad_norm": 0.9516273736953735, | |
| "learning_rate": 2.025932898002458e-07, | |
| "loss": 1.1706, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 0.9853768278965129, | |
| "grad_norm": 0.8547878265380859, | |
| "learning_rate": 1.6650159332607939e-07, | |
| "loss": 1.1511, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 0.9868766404199475, | |
| "grad_norm": 0.9445357918739319, | |
| "learning_rate": 1.3394581407289996e-07, | |
| "loss": 1.1735, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 0.9883764529433821, | |
| "grad_norm": 0.9283078908920288, | |
| "learning_rate": 1.0492672032377803e-07, | |
| "loss": 1.1665, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 0.9898762654668166, | |
| "grad_norm": 0.9254633784294128, | |
| "learning_rate": 7.944499689961358e-08, | |
| "loss": 1.1533, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.9913760779902512, | |
| "grad_norm": 0.9120994806289673, | |
| "learning_rate": 5.7501245143015685e-08, | |
| "loss": 1.1616, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 0.9928758905136857, | |
| "grad_norm": 0.8868552446365356, | |
| "learning_rate": 3.9095982904080447e-08, | |
| "loss": 1.1591, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 0.9943757030371203, | |
| "grad_norm": 0.8874196410179138, | |
| "learning_rate": 2.4229644528150905e-08, | |
| "loss": 1.168, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 0.995875515560555, | |
| "grad_norm": 0.8636330366134644, | |
| "learning_rate": 1.290258084557516e-08, | |
| "loss": 1.1572, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 0.9973753280839895, | |
| "grad_norm": 0.8626487255096436, | |
| "learning_rate": 5.115059163496304e-09, | |
| "loss": 1.1482, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.9988751406074241, | |
| "grad_norm": 0.9289808869361877, | |
| "learning_rate": 8.672632594408646e-10, | |
| "loss": 1.1632, | |
| "step": 6660 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6667, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 667, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2426610074517504.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |