{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1552, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012890750886239123, "grad_norm": 1.7974083423614502, "learning_rate": 1.153846153846154e-05, "loss": 2.5309, "step": 10 }, { "epoch": 0.025781501772478246, "grad_norm": 0.9546365737915039, "learning_rate": 2.435897435897436e-05, "loss": 2.2805, "step": 20 }, { "epoch": 0.03867225265871737, "grad_norm": 0.6726247072219849, "learning_rate": 3.717948717948718e-05, "loss": 1.9329, "step": 30 }, { "epoch": 0.05156300354495649, "grad_norm": 0.5635246634483337, "learning_rate": 5e-05, "loss": 1.7205, "step": 40 }, { "epoch": 0.06445375443119562, "grad_norm": 0.48018646240234375, "learning_rate": 6.282051282051282e-05, "loss": 1.6227, "step": 50 }, { "epoch": 0.07734450531743474, "grad_norm": 0.4588874578475952, "learning_rate": 7.564102564102564e-05, "loss": 1.565, "step": 60 }, { "epoch": 0.09023525620367387, "grad_norm": 0.5382615923881531, "learning_rate": 8.846153846153847e-05, "loss": 1.5151, "step": 70 }, { "epoch": 0.10312600708991299, "grad_norm": 0.5467047691345215, "learning_rate": 0.00010128205128205129, "loss": 1.4629, "step": 80 }, { "epoch": 0.11601675797615212, "grad_norm": 0.5823647379875183, "learning_rate": 0.0001141025641025641, "loss": 1.457, "step": 90 }, { "epoch": 0.12890750886239125, "grad_norm": 0.6547259092330933, "learning_rate": 0.00012692307692307693, "loss": 1.4192, "step": 100 }, { "epoch": 0.14179825974863036, "grad_norm": 0.5140829086303711, "learning_rate": 0.00013974358974358974, "loss": 1.3869, "step": 110 }, { "epoch": 0.15468901063486948, "grad_norm": 0.6194220185279846, "learning_rate": 0.00015256410256410255, "loss": 1.365, "step": 120 }, { "epoch": 0.1675797615211086, "grad_norm": 0.6340938806533813, "learning_rate": 0.0001653846153846154, "loss": 1.3559, "step": 130 }, { "epoch": 0.18047051240734774, "grad_norm": 0.5935661792755127, "learning_rate": 0.00017820512820512823, "loss": 1.3447, "step": 140 }, { "epoch": 0.19336126329358685, "grad_norm": 0.5271995663642883, "learning_rate": 0.00019102564102564104, "loss": 1.3229, "step": 150 }, { "epoch": 0.20625201417982597, "grad_norm": 0.5766036510467529, "learning_rate": 0.00019999772102388783, "loss": 1.3238, "step": 160 }, { "epoch": 0.2191427650660651, "grad_norm": 0.4888124167919159, "learning_rate": 0.0001999572087826214, "loss": 1.3371, "step": 170 }, { "epoch": 0.23203351595230423, "grad_norm": 0.5345163345336914, "learning_rate": 0.00019986607624302306, "loss": 1.3455, "step": 180 }, { "epoch": 0.24492426683854335, "grad_norm": 0.4851045310497284, "learning_rate": 0.00019972436955640487, "loss": 1.3176, "step": 190 }, { "epoch": 0.2578150177247825, "grad_norm": 0.47015997767448425, "learning_rate": 0.0001995321604858227, "loss": 1.3131, "step": 200 }, { "epoch": 0.2707057686110216, "grad_norm": 0.5002295970916748, "learning_rate": 0.00019928954636973373, "loss": 1.2997, "step": 210 }, { "epoch": 0.2835965194972607, "grad_norm": 0.46887412667274475, "learning_rate": 0.00019899665007270265, "loss": 1.279, "step": 220 }, { "epoch": 0.2964872703834998, "grad_norm": 0.4887183606624603, "learning_rate": 0.00019865361992318033, "loss": 1.2773, "step": 230 }, { "epoch": 0.30937802126973896, "grad_norm": 0.49127480387687683, "learning_rate": 0.00019826062963838753, "loss": 1.2674, "step": 240 }, { "epoch": 0.3222687721559781, "grad_norm": 0.49845078587532043, "learning_rate": 0.0001978178782363411, "loss": 1.2723, "step": 250 }, { "epoch": 0.3351595230422172, "grad_norm": 0.4858417809009552, "learning_rate": 0.00019732558993506723, "loss": 1.273, "step": 260 }, { "epoch": 0.34805027392845633, "grad_norm": 0.47399166226387024, "learning_rate": 0.00019678401403905307, "loss": 1.2629, "step": 270 }, { "epoch": 0.3609410248146955, "grad_norm": 0.47289639711380005, "learning_rate": 0.0001961934248129941, "loss": 1.2408, "step": 280 }, { "epoch": 0.37383177570093457, "grad_norm": 0.48805466294288635, "learning_rate": 0.00019555412134290102, "loss": 1.2421, "step": 290 }, { "epoch": 0.3867225265871737, "grad_norm": 0.47072649002075195, "learning_rate": 0.00019486642738463674, "loss": 1.2583, "step": 300 }, { "epoch": 0.39961327747341285, "grad_norm": 0.4364190101623535, "learning_rate": 0.00019413069119995994, "loss": 1.2377, "step": 310 }, { "epoch": 0.41250402835965194, "grad_norm": 0.4522920250892639, "learning_rate": 0.0001933472853801586, "loss": 1.2331, "step": 320 }, { "epoch": 0.4253947792458911, "grad_norm": 0.46202462911605835, "learning_rate": 0.0001925166066573624, "loss": 1.2277, "step": 330 }, { "epoch": 0.4382855301321302, "grad_norm": 0.43247494101524353, "learning_rate": 0.00019163907570362963, "loss": 1.2369, "step": 340 }, { "epoch": 0.4511762810183693, "grad_norm": 0.47149762511253357, "learning_rate": 0.00019071513691791079, "loss": 1.2394, "step": 350 }, { "epoch": 0.46406703190460846, "grad_norm": 0.44876205921173096, "learning_rate": 0.00018974525820099607, "loss": 1.2318, "step": 360 }, { "epoch": 0.47695778279084755, "grad_norm": 0.44522625207901, "learning_rate": 0.00018872993071856114, "loss": 1.2214, "step": 370 }, { "epoch": 0.4898485336770867, "grad_norm": 0.45371952652931213, "learning_rate": 0.0001876696686524314, "loss": 1.2152, "step": 380 }, { "epoch": 0.5027392845633258, "grad_norm": 0.45091456174850464, "learning_rate": 0.00018656500894018987, "loss": 1.2025, "step": 390 }, { "epoch": 0.515630035449565, "grad_norm": 0.47521355748176575, "learning_rate": 0.00018541651100326175, "loss": 1.2107, "step": 400 }, { "epoch": 0.528520786335804, "grad_norm": 0.4451088607311249, "learning_rate": 0.0001842247564636121, "loss": 1.1974, "step": 410 }, { "epoch": 0.5414115372220432, "grad_norm": 0.4933655858039856, "learning_rate": 0.0001829903488492013, "loss": 1.1847, "step": 420 }, { "epoch": 0.5543022881082823, "grad_norm": 0.4385446608066559, "learning_rate": 0.00018171391328834638, "loss": 1.1925, "step": 430 }, { "epoch": 0.5671930389945214, "grad_norm": 0.46878549456596375, "learning_rate": 0.0001803960961931439, "loss": 1.176, "step": 440 }, { "epoch": 0.5800837898807606, "grad_norm": 0.45197296142578125, "learning_rate": 0.000179037564932114, "loss": 1.1888, "step": 450 }, { "epoch": 0.5929745407669996, "grad_norm": 0.4496542811393738, "learning_rate": 0.00017763900749223196, "loss": 1.1823, "step": 460 }, { "epoch": 0.6058652916532388, "grad_norm": 0.4356934428215027, "learning_rate": 0.00017620113213051797, "loss": 1.1877, "step": 470 }, { "epoch": 0.6187560425394779, "grad_norm": 0.46125900745391846, "learning_rate": 0.00017472466701536193, "loss": 1.1797, "step": 480 }, { "epoch": 0.6316467934257171, "grad_norm": 0.45797720551490784, "learning_rate": 0.0001732103598577645, "loss": 1.1777, "step": 490 }, { "epoch": 0.6445375443119562, "grad_norm": 0.4385370910167694, "learning_rate": 0.0001716589775326817, "loss": 1.1562, "step": 500 }, { "epoch": 0.6574282951981953, "grad_norm": 0.46976742148399353, "learning_rate": 0.00017007130569066413, "loss": 1.1677, "step": 510 }, { "epoch": 0.6703190460844344, "grad_norm": 0.46103382110595703, "learning_rate": 0.00016844814835998825, "loss": 1.1776, "step": 520 }, { "epoch": 0.6832097969706735, "grad_norm": 0.5240194201469421, "learning_rate": 0.00016679032753948056, "loss": 1.1489, "step": 530 }, { "epoch": 0.6961005478569127, "grad_norm": 0.4437713027000427, "learning_rate": 0.00016509868278224125, "loss": 1.1721, "step": 540 }, { "epoch": 0.7089912987431518, "grad_norm": 0.4481850862503052, "learning_rate": 0.00016337407077047784, "loss": 1.1613, "step": 550 }, { "epoch": 0.721882049629391, "grad_norm": 0.46106868982315063, "learning_rate": 0.00016161736488166462, "loss": 1.155, "step": 560 }, { "epoch": 0.73477280051563, "grad_norm": 0.4505125880241394, "learning_rate": 0.000159829454746247, "loss": 1.159, "step": 570 }, { "epoch": 0.7476635514018691, "grad_norm": 0.4489360749721527, "learning_rate": 0.00015801124579711525, "loss": 1.1794, "step": 580 }, { "epoch": 0.7605543022881083, "grad_norm": 0.4428081214427948, "learning_rate": 0.0001561636588110753, "loss": 1.1524, "step": 590 }, { "epoch": 0.7734450531743474, "grad_norm": 0.4341617226600647, "learning_rate": 0.00015428762944254932, "loss": 1.1497, "step": 600 }, { "epoch": 0.7863358040605866, "grad_norm": 0.43959757685661316, "learning_rate": 0.00015238410774974186, "loss": 1.1364, "step": 610 }, { "epoch": 0.7992265549468257, "grad_norm": 0.456399530172348, "learning_rate": 0.00015045405771351193, "loss": 1.1266, "step": 620 }, { "epoch": 0.8121173058330647, "grad_norm": 0.4572042226791382, "learning_rate": 0.00014849845674919364, "loss": 1.1417, "step": 630 }, { "epoch": 0.8250080567193039, "grad_norm": 0.4575168788433075, "learning_rate": 0.00014651829521161424, "loss": 1.1532, "step": 640 }, { "epoch": 0.837898807605543, "grad_norm": 0.4344440996646881, "learning_rate": 0.00014451457589355872, "loss": 1.137, "step": 650 }, { "epoch": 0.8507895584917822, "grad_norm": 0.4362059235572815, "learning_rate": 0.00014248831351793592, "loss": 1.1408, "step": 660 }, { "epoch": 0.8636803093780213, "grad_norm": 0.46403059363365173, "learning_rate": 0.0001404405342239028, "loss": 1.1227, "step": 670 }, { "epoch": 0.8765710602642603, "grad_norm": 0.47248131036758423, "learning_rate": 0.0001383722750472074, "loss": 1.1363, "step": 680 }, { "epoch": 0.8894618111504995, "grad_norm": 0.4404482841491699, "learning_rate": 0.00013628458339501348, "loss": 1.1179, "step": 690 }, { "epoch": 0.9023525620367386, "grad_norm": 0.4384572207927704, "learning_rate": 0.00013417851651547307, "loss": 1.1066, "step": 700 }, { "epoch": 0.9152433129229778, "grad_norm": 0.4435371458530426, "learning_rate": 0.0001320551409623149, "loss": 1.12, "step": 710 }, { "epoch": 0.9281340638092169, "grad_norm": 0.5222971439361572, "learning_rate": 0.00012991553205472092, "loss": 1.1164, "step": 720 }, { "epoch": 0.941024814695456, "grad_norm": 0.45858514308929443, "learning_rate": 0.00012776077333276326, "loss": 1.1182, "step": 730 }, { "epoch": 0.9539155655816951, "grad_norm": 0.45299065113067627, "learning_rate": 0.0001255919560086783, "loss": 1.1174, "step": 740 }, { "epoch": 0.9668063164679342, "grad_norm": 0.4541768729686737, "learning_rate": 0.0001234101784142553, "loss": 1.1221, "step": 750 }, { "epoch": 0.9796970673541734, "grad_norm": 0.4411795139312744, "learning_rate": 0.0001212165454446196, "loss": 1.1113, "step": 760 }, { "epoch": 0.9925878182404125, "grad_norm": 0.4533502161502838, "learning_rate": 0.00011901216799869188, "loss": 1.1122, "step": 770 }, { "epoch": 1.0, "eval_accuracy": 0.6748553734614313, "eval_first_token_accuracy": 0.8449519230769231, "eval_loss": 1.118744134902954, "eval_runtime": 727.7965, "eval_samples_per_second": 6.851, "eval_steps_per_second": 0.429, "step": 776 }, { "epoch": 1.0051563003544957, "grad_norm": 0.4265609085559845, "learning_rate": 0.0001167981624166072, "loss": 1.0876, "step": 780 }, { "epoch": 1.0180470512407347, "grad_norm": 0.48834115266799927, "learning_rate": 0.00011457564991437823, "loss": 1.0156, "step": 790 }, { "epoch": 1.030937802126974, "grad_norm": 0.4989129602909088, "learning_rate": 0.00011234575601608957, "loss": 1.0072, "step": 800 }, { "epoch": 1.043828553013213, "grad_norm": 0.4992086887359619, "learning_rate": 0.00011010960998391002, "loss": 1.0185, "step": 810 }, { "epoch": 1.0567193038994522, "grad_norm": 0.5244722366333008, "learning_rate": 0.00010786834424621211, "loss": 1.013, "step": 820 }, { "epoch": 1.0696100547856913, "grad_norm": 0.5480940937995911, "learning_rate": 0.000105623093824088, "loss": 1.0116, "step": 830 }, { "epoch": 1.0825008056719303, "grad_norm": 0.49391430616378784, "learning_rate": 0.00010337499575655249, "loss": 0.9852, "step": 840 }, { "epoch": 1.0953915565581696, "grad_norm": 0.48984748125076294, "learning_rate": 0.00010112518852472414, "loss": 1.0052, "step": 850 }, { "epoch": 1.1082823074444086, "grad_norm": 0.5330230593681335, "learning_rate": 9.887481147527588e-05, "loss": 1.0058, "step": 860 }, { "epoch": 1.1211730583306478, "grad_norm": 0.4981665313243866, "learning_rate": 9.66250042434475e-05, "loss": 1.0037, "step": 870 }, { "epoch": 1.1340638092168869, "grad_norm": 0.5282273888587952, "learning_rate": 9.437690617591202e-05, "loss": 0.9962, "step": 880 }, { "epoch": 1.1469545601031261, "grad_norm": 0.5284985303878784, "learning_rate": 9.213165575378793e-05, "loss": 1.0142, "step": 890 }, { "epoch": 1.1598453109893652, "grad_norm": 0.5249062776565552, "learning_rate": 8.989039001609e-05, "loss": 1.0031, "step": 900 }, { "epoch": 1.1727360618756042, "grad_norm": 0.5115840435028076, "learning_rate": 8.765424398391047e-05, "loss": 1.0149, "step": 910 }, { "epoch": 1.1856268127618435, "grad_norm": 0.5123730897903442, "learning_rate": 8.54243500856218e-05, "loss": 0.9993, "step": 920 }, { "epoch": 1.1985175636480825, "grad_norm": 0.5423418879508972, "learning_rate": 8.320183758339284e-05, "loss": 0.9911, "step": 930 }, { "epoch": 1.2114083145343217, "grad_norm": 0.5046626925468445, "learning_rate": 8.098783200130813e-05, "loss": 0.9909, "step": 940 }, { "epoch": 1.2242990654205608, "grad_norm": 0.5180662870407104, "learning_rate": 7.878345455538043e-05, "loss": 0.9852, "step": 950 }, { "epoch": 1.2371898163067998, "grad_norm": 0.5029054880142212, "learning_rate": 7.65898215857447e-05, "loss": 0.984, "step": 960 }, { "epoch": 1.250080567193039, "grad_norm": 0.5463522672653198, "learning_rate": 7.440804399132173e-05, "loss": 0.994, "step": 970 }, { "epoch": 1.262971318079278, "grad_norm": 0.517109215259552, "learning_rate": 7.223922666723678e-05, "loss": 0.9825, "step": 980 }, { "epoch": 1.2758620689655173, "grad_norm": 0.5118828415870667, "learning_rate": 7.00844679452791e-05, "loss": 0.9941, "step": 990 }, { "epoch": 1.2887528198517564, "grad_norm": 0.5242361426353455, "learning_rate": 6.794485903768513e-05, "loss": 0.9787, "step": 1000 }, { "epoch": 1.3016435707379954, "grad_norm": 0.5021597743034363, "learning_rate": 6.5821483484527e-05, "loss": 0.9871, "step": 1010 }, { "epoch": 1.3145343216242347, "grad_norm": 0.5013926029205322, "learning_rate": 6.371541660498652e-05, "loss": 0.9859, "step": 1020 }, { "epoch": 1.3274250725104737, "grad_norm": 0.5455625057220459, "learning_rate": 6.162772495279265e-05, "loss": 0.9731, "step": 1030 }, { "epoch": 1.340315823396713, "grad_norm": 0.5966176986694336, "learning_rate": 5.955946577609721e-05, "loss": 1.0026, "step": 1040 }, { "epoch": 1.353206574282952, "grad_norm": 0.5418857336044312, "learning_rate": 5.75116864820641e-05, "loss": 0.9798, "step": 1050 }, { "epoch": 1.366097325169191, "grad_norm": 0.5456270575523376, "learning_rate": 5.548542410644132e-05, "loss": 0.9737, "step": 1060 }, { "epoch": 1.3789880760554303, "grad_norm": 0.5252659916877747, "learning_rate": 5.3481704788385786e-05, "loss": 0.9775, "step": 1070 }, { "epoch": 1.3918788269416693, "grad_norm": 0.5047706961631775, "learning_rate": 5.150154325080636e-05, "loss": 0.9755, "step": 1080 }, { "epoch": 1.4047695778279086, "grad_norm": 0.5461093187332153, "learning_rate": 4.954594228648807e-05, "loss": 0.9716, "step": 1090 }, { "epoch": 1.4176603287141476, "grad_norm": 0.5116148591041565, "learning_rate": 4.761589225025811e-05, "loss": 0.9821, "step": 1100 }, { "epoch": 1.4305510796003866, "grad_norm": 0.5131816267967224, "learning_rate": 4.571237055745074e-05, "loss": 0.9857, "step": 1110 }, { "epoch": 1.4434418304866259, "grad_norm": 0.5409175157546997, "learning_rate": 4.383634118892472e-05, "loss": 0.9768, "step": 1120 }, { "epoch": 1.456332581372865, "grad_norm": 0.5277559757232666, "learning_rate": 4.1988754202884775e-05, "loss": 0.965, "step": 1130 }, { "epoch": 1.4692233322591042, "grad_norm": 0.522885799407959, "learning_rate": 4.0170545253752986e-05, "loss": 0.9576, "step": 1140 }, { "epoch": 1.4821140831453432, "grad_norm": 0.5148813128471375, "learning_rate": 3.838263511833542e-05, "loss": 0.9676, "step": 1150 }, { "epoch": 1.4950048340315822, "grad_norm": 0.5167680978775024, "learning_rate": 3.662592922952218e-05, "loss": 0.9694, "step": 1160 }, { "epoch": 1.5078955849178215, "grad_norm": 0.5245492458343506, "learning_rate": 3.4901317217758764e-05, "loss": 0.9603, "step": 1170 }, { "epoch": 1.5207863358040608, "grad_norm": 0.51069176197052, "learning_rate": 3.3209672460519425e-05, "loss": 0.9687, "step": 1180 }, { "epoch": 1.5336770866902998, "grad_norm": 0.5380834341049194, "learning_rate": 3.155185164001176e-05, "loss": 0.9653, "step": 1190 }, { "epoch": 1.5465678375765388, "grad_norm": 0.5268071889877319, "learning_rate": 2.9928694309335914e-05, "loss": 0.9785, "step": 1200 }, { "epoch": 1.5594585884627779, "grad_norm": 0.5809852480888367, "learning_rate": 2.8341022467318335e-05, "loss": 0.9628, "step": 1210 }, { "epoch": 1.572349339349017, "grad_norm": 0.5032438635826111, "learning_rate": 2.678964014223553e-05, "loss": 0.9641, "step": 1220 }, { "epoch": 1.5852400902352564, "grad_norm": 0.5497922897338867, "learning_rate": 2.52753329846381e-05, "loss": 0.969, "step": 1230 }, { "epoch": 1.5981308411214954, "grad_norm": 0.5357956886291504, "learning_rate": 2.3798867869482044e-05, "loss": 0.9598, "step": 1240 }, { "epoch": 1.6110215920077344, "grad_norm": 0.5199883580207825, "learning_rate": 2.2360992507768052e-05, "loss": 0.9733, "step": 1250 }, { "epoch": 1.6239123428939735, "grad_norm": 0.5282153487205505, "learning_rate": 2.096243506788602e-05, "loss": 0.9565, "step": 1260 }, { "epoch": 1.6368030937802127, "grad_norm": 0.5048473477363586, "learning_rate": 1.9603903806856106e-05, "loss": 0.9506, "step": 1270 }, { "epoch": 1.649693844666452, "grad_norm": 0.5167466998100281, "learning_rate": 1.8286086711653605e-05, "loss": 0.9618, "step": 1280 }, { "epoch": 1.662584595552691, "grad_norm": 0.5307776927947998, "learning_rate": 1.7009651150798712e-05, "loss": 0.9708, "step": 1290 }, { "epoch": 1.67547534643893, "grad_norm": 0.5388936400413513, "learning_rate": 1.577524353638791e-05, "loss": 0.9672, "step": 1300 }, { "epoch": 1.688366097325169, "grad_norm": 0.5254285931587219, "learning_rate": 1.4583488996738293e-05, "loss": 0.9615, "step": 1310 }, { "epoch": 1.7012568482114083, "grad_norm": 0.5476789474487305, "learning_rate": 1.3434991059810153e-05, "loss": 0.9772, "step": 1320 }, { "epoch": 1.7141475990976476, "grad_norm": 0.5789813995361328, "learning_rate": 1.2330331347568636e-05, "loss": 0.9613, "step": 1330 }, { "epoch": 1.7270383499838866, "grad_norm": 0.5327284336090088, "learning_rate": 1.1270069281438866e-05, "loss": 0.96, "step": 1340 }, { "epoch": 1.7399291008701256, "grad_norm": 0.5267541408538818, "learning_rate": 1.0254741799003976e-05, "loss": 0.9668, "step": 1350 }, { "epoch": 1.7528198517563647, "grad_norm": 0.5468846559524536, "learning_rate": 9.284863082089224e-06, "loss": 0.9686, "step": 1360 }, { "epoch": 1.765710602642604, "grad_norm": 0.6118249297142029, "learning_rate": 8.360924296370376e-06, "loss": 0.9679, "step": 1370 }, { "epoch": 1.7786013535288432, "grad_norm": 0.535074770450592, "learning_rate": 7.4833933426376345e-06, "loss": 0.957, "step": 1380 }, { "epoch": 1.7914921044150822, "grad_norm": 0.5529636144638062, "learning_rate": 6.652714619841405e-06, "loss": 0.9498, "step": 1390 }, { "epoch": 1.8043828553013213, "grad_norm": 0.5263587236404419, "learning_rate": 5.869308800040074e-06, "loss": 0.9551, "step": 1400 }, { "epoch": 1.8172736061875603, "grad_norm": 0.5330998301506042, "learning_rate": 5.133572615363269e-06, "loss": 0.9545, "step": 1410 }, { "epoch": 1.8301643570737995, "grad_norm": 0.5292595624923706, "learning_rate": 4.445878657098978e-06, "loss": 0.958, "step": 1420 }, { "epoch": 1.8430551079600388, "grad_norm": 0.536263644695282, "learning_rate": 3.806575187005901e-06, "loss": 0.96, "step": 1430 }, { "epoch": 1.8559458588462778, "grad_norm": 0.5184710025787354, "learning_rate": 3.2159859609469436e-06, "loss": 0.9556, "step": 1440 }, { "epoch": 1.8688366097325169, "grad_norm": 0.5328414440155029, "learning_rate": 2.6744100649327974e-06, "loss": 0.9576, "step": 1450 }, { "epoch": 1.881727360618756, "grad_norm": 0.5515352487564087, "learning_rate": 2.1821217636589176e-06, "loss": 0.9769, "step": 1460 }, { "epoch": 1.8946181115049952, "grad_norm": 0.5316532850265503, "learning_rate": 1.7393703616124802e-06, "loss": 0.9587, "step": 1470 }, { "epoch": 1.9075088623912344, "grad_norm": 0.521430492401123, "learning_rate": 1.3463800768196866e-06, "loss": 0.9687, "step": 1480 }, { "epoch": 1.9203996132774734, "grad_norm": 0.5210091471672058, "learning_rate": 1.0033499272973701e-06, "loss": 0.9604, "step": 1490 }, { "epoch": 1.9332903641637125, "grad_norm": 0.530375063419342, "learning_rate": 7.104536302662834e-07, "loss": 0.9459, "step": 1500 }, { "epoch": 1.9461811150499515, "grad_norm": 0.5609223246574402, "learning_rate": 4.678395141773373e-07, "loss": 0.9529, "step": 1510 }, { "epoch": 1.9590718659361908, "grad_norm": 0.5311603546142578, "learning_rate": 2.756304435951429e-07, "loss": 0.9477, "step": 1520 }, { "epoch": 1.97196261682243, "grad_norm": 0.561936616897583, "learning_rate": 1.3392375697696136e-07, "loss": 0.9538, "step": 1530 }, { "epoch": 1.984853367708669, "grad_norm": 0.5377177000045776, "learning_rate": 4.279121737859049e-08, "loss": 0.9633, "step": 1540 }, { "epoch": 1.997744118594908, "grad_norm": 0.5239057540893555, "learning_rate": 2.2789761121688735e-09, "loss": 0.9598, "step": 1550 }, { "epoch": 2.0, "eval_accuracy": 0.6875442401796923, "eval_first_token_accuracy": 0.8729967948717948, "eval_loss": 1.0595165491104126, "eval_runtime": 684.7375, "eval_samples_per_second": 7.282, "eval_steps_per_second": 0.456, "step": 1552 } ], "logging_steps": 10, "max_steps": 1552, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0014393801042297e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }