{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 1552,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012890750886239123,
      "grad_norm": 1.7974083423614502,
      "learning_rate": 1.153846153846154e-05,
      "loss": 2.5309,
      "step": 10
    },
    {
      "epoch": 0.025781501772478246,
      "grad_norm": 0.9546365737915039,
      "learning_rate": 2.435897435897436e-05,
      "loss": 2.2805,
      "step": 20
    },
    {
      "epoch": 0.03867225265871737,
      "grad_norm": 0.6726247072219849,
      "learning_rate": 3.717948717948718e-05,
      "loss": 1.9329,
      "step": 30
    },
    {
      "epoch": 0.05156300354495649,
      "grad_norm": 0.5635246634483337,
      "learning_rate": 5e-05,
      "loss": 1.7205,
      "step": 40
    },
    {
      "epoch": 0.06445375443119562,
      "grad_norm": 0.48018646240234375,
      "learning_rate": 6.282051282051282e-05,
      "loss": 1.6227,
      "step": 50
    },
    {
      "epoch": 0.07734450531743474,
      "grad_norm": 0.4588874578475952,
      "learning_rate": 7.564102564102564e-05,
      "loss": 1.565,
      "step": 60
    },
    {
      "epoch": 0.09023525620367387,
      "grad_norm": 0.5382615923881531,
      "learning_rate": 8.846153846153847e-05,
      "loss": 1.5151,
      "step": 70
    },
    {
      "epoch": 0.10312600708991299,
      "grad_norm": 0.5467047691345215,
      "learning_rate": 0.00010128205128205129,
      "loss": 1.4629,
      "step": 80
    },
    {
      "epoch": 0.11601675797615212,
      "grad_norm": 0.5823647379875183,
      "learning_rate": 0.0001141025641025641,
      "loss": 1.457,
      "step": 90
    },
    {
      "epoch": 0.12890750886239125,
      "grad_norm": 0.6547259092330933,
      "learning_rate": 0.00012692307692307693,
      "loss": 1.4192,
      "step": 100
    },
    {
      "epoch": 0.14179825974863036,
      "grad_norm": 0.5140829086303711,
      "learning_rate": 0.00013974358974358974,
      "loss": 1.3869,
      "step": 110
    },
    {
      "epoch": 0.15468901063486948,
      "grad_norm": 0.6194220185279846,
      "learning_rate": 0.00015256410256410255,
      "loss": 1.365,
      "step": 120
    },
    {
      "epoch": 0.1675797615211086,
      "grad_norm": 0.6340938806533813,
      "learning_rate": 0.0001653846153846154,
      "loss": 1.3559,
      "step": 130
    },
    {
      "epoch": 0.18047051240734774,
      "grad_norm": 0.5935661792755127,
      "learning_rate": 0.00017820512820512823,
      "loss": 1.3447,
      "step": 140
    },
    {
      "epoch": 0.19336126329358685,
      "grad_norm": 0.5271995663642883,
      "learning_rate": 0.00019102564102564104,
      "loss": 1.3229,
      "step": 150
    },
    {
      "epoch": 0.20625201417982597,
      "grad_norm": 0.5766036510467529,
      "learning_rate": 0.00019999772102388783,
      "loss": 1.3238,
      "step": 160
    },
    {
      "epoch": 0.2191427650660651,
      "grad_norm": 0.4888124167919159,
      "learning_rate": 0.0001999572087826214,
      "loss": 1.3371,
      "step": 170
    },
    {
      "epoch": 0.23203351595230423,
      "grad_norm": 0.5345163345336914,
      "learning_rate": 0.00019986607624302306,
      "loss": 1.3455,
      "step": 180
    },
    {
      "epoch": 0.24492426683854335,
      "grad_norm": 0.4851045310497284,
      "learning_rate": 0.00019972436955640487,
      "loss": 1.3176,
      "step": 190
    },
    {
      "epoch": 0.2578150177247825,
      "grad_norm": 0.47015997767448425,
      "learning_rate": 0.0001995321604858227,
      "loss": 1.3131,
      "step": 200
    },
    {
      "epoch": 0.2707057686110216,
      "grad_norm": 0.5002295970916748,
      "learning_rate": 0.00019928954636973373,
      "loss": 1.2997,
      "step": 210
    },
    {
      "epoch": 0.2835965194972607,
      "grad_norm": 0.46887412667274475,
      "learning_rate": 0.00019899665007270265,
      "loss": 1.279,
      "step": 220
    },
    {
      "epoch": 0.2964872703834998,
      "grad_norm": 0.4887183606624603,
      "learning_rate": 0.00019865361992318033,
      "loss": 1.2773,
      "step": 230
    },
    {
      "epoch": 0.30937802126973896,
      "grad_norm": 0.49127480387687683,
      "learning_rate": 0.00019826062963838753,
      "loss": 1.2674,
      "step": 240
    },
    {
      "epoch": 0.3222687721559781,
      "grad_norm": 0.49845078587532043,
      "learning_rate": 0.0001978178782363411,
      "loss": 1.2723,
      "step": 250
    },
    {
      "epoch": 0.3351595230422172,
      "grad_norm": 0.4858417809009552,
      "learning_rate": 0.00019732558993506723,
      "loss": 1.273,
      "step": 260
    },
    {
      "epoch": 0.34805027392845633,
      "grad_norm": 0.47399166226387024,
      "learning_rate": 0.00019678401403905307,
      "loss": 1.2629,
      "step": 270
    },
    {
      "epoch": 0.3609410248146955,
      "grad_norm": 0.47289639711380005,
      "learning_rate": 0.0001961934248129941,
      "loss": 1.2408,
      "step": 280
    },
    {
      "epoch": 0.37383177570093457,
      "grad_norm": 0.48805466294288635,
      "learning_rate": 0.00019555412134290102,
      "loss": 1.2421,
      "step": 290
    },
    {
      "epoch": 0.3867225265871737,
      "grad_norm": 0.47072649002075195,
      "learning_rate": 0.00019486642738463674,
      "loss": 1.2583,
      "step": 300
    },
    {
      "epoch": 0.39961327747341285,
      "grad_norm": 0.4364190101623535,
      "learning_rate": 0.00019413069119995994,
      "loss": 1.2377,
      "step": 310
    },
    {
      "epoch": 0.41250402835965194,
      "grad_norm": 0.4522920250892639,
      "learning_rate": 0.0001933472853801586,
      "loss": 1.2331,
      "step": 320
    },
    {
      "epoch": 0.4253947792458911,
      "grad_norm": 0.46202462911605835,
      "learning_rate": 0.0001925166066573624,
      "loss": 1.2277,
      "step": 330
    },
    {
      "epoch": 0.4382855301321302,
      "grad_norm": 0.43247494101524353,
      "learning_rate": 0.00019163907570362963,
      "loss": 1.2369,
      "step": 340
    },
    {
      "epoch": 0.4511762810183693,
      "grad_norm": 0.47149762511253357,
      "learning_rate": 0.00019071513691791079,
      "loss": 1.2394,
      "step": 350
    },
    {
      "epoch": 0.46406703190460846,
      "grad_norm": 0.44876205921173096,
      "learning_rate": 0.00018974525820099607,
      "loss": 1.2318,
      "step": 360
    },
    {
      "epoch": 0.47695778279084755,
      "grad_norm": 0.44522625207901,
      "learning_rate": 0.00018872993071856114,
      "loss": 1.2214,
      "step": 370
    },
    {
      "epoch": 0.4898485336770867,
      "grad_norm": 0.45371952652931213,
      "learning_rate": 0.0001876696686524314,
      "loss": 1.2152,
      "step": 380
    },
    {
      "epoch": 0.5027392845633258,
      "grad_norm": 0.45091456174850464,
      "learning_rate": 0.00018656500894018987,
      "loss": 1.2025,
      "step": 390
    },
    {
      "epoch": 0.515630035449565,
      "grad_norm": 0.47521355748176575,
      "learning_rate": 0.00018541651100326175,
      "loss": 1.2107,
      "step": 400
    },
    {
      "epoch": 0.528520786335804,
      "grad_norm": 0.4451088607311249,
      "learning_rate": 0.0001842247564636121,
      "loss": 1.1974,
      "step": 410
    },
    {
      "epoch": 0.5414115372220432,
      "grad_norm": 0.4933655858039856,
      "learning_rate": 0.0001829903488492013,
      "loss": 1.1847,
      "step": 420
    },
    {
      "epoch": 0.5543022881082823,
      "grad_norm": 0.4385446608066559,
      "learning_rate": 0.00018171391328834638,
      "loss": 1.1925,
      "step": 430
    },
    {
      "epoch": 0.5671930389945214,
      "grad_norm": 0.46878549456596375,
      "learning_rate": 0.0001803960961931439,
      "loss": 1.176,
      "step": 440
    },
    {
      "epoch": 0.5800837898807606,
      "grad_norm": 0.45197296142578125,
      "learning_rate": 0.000179037564932114,
      "loss": 1.1888,
      "step": 450
    },
    {
      "epoch": 0.5929745407669996,
      "grad_norm": 0.4496542811393738,
      "learning_rate": 0.00017763900749223196,
      "loss": 1.1823,
      "step": 460
    },
    {
      "epoch": 0.6058652916532388,
      "grad_norm": 0.4356934428215027,
      "learning_rate": 0.00017620113213051797,
      "loss": 1.1877,
      "step": 470
    },
    {
      "epoch": 0.6187560425394779,
      "grad_norm": 0.46125900745391846,
      "learning_rate": 0.00017472466701536193,
      "loss": 1.1797,
      "step": 480
    },
    {
      "epoch": 0.6316467934257171,
      "grad_norm": 0.45797720551490784,
      "learning_rate": 0.0001732103598577645,
      "loss": 1.1777,
      "step": 490
    },
    {
      "epoch": 0.6445375443119562,
      "grad_norm": 0.4385370910167694,
      "learning_rate": 0.0001716589775326817,
      "loss": 1.1562,
      "step": 500
    },
    {
      "epoch": 0.6574282951981953,
      "grad_norm": 0.46976742148399353,
      "learning_rate": 0.00017007130569066413,
      "loss": 1.1677,
      "step": 510
    },
    {
      "epoch": 0.6703190460844344,
      "grad_norm": 0.46103382110595703,
      "learning_rate": 0.00016844814835998825,
      "loss": 1.1776,
      "step": 520
    },
    {
      "epoch": 0.6832097969706735,
      "grad_norm": 0.5240194201469421,
      "learning_rate": 0.00016679032753948056,
      "loss": 1.1489,
      "step": 530
    },
    {
      "epoch": 0.6961005478569127,
      "grad_norm": 0.4437713027000427,
      "learning_rate": 0.00016509868278224125,
      "loss": 1.1721,
      "step": 540
    },
    {
      "epoch": 0.7089912987431518,
      "grad_norm": 0.4481850862503052,
      "learning_rate": 0.00016337407077047784,
      "loss": 1.1613,
      "step": 550
    },
    {
      "epoch": 0.721882049629391,
      "grad_norm": 0.46106868982315063,
      "learning_rate": 0.00016161736488166462,
      "loss": 1.155,
      "step": 560
    },
    {
      "epoch": 0.73477280051563,
      "grad_norm": 0.4505125880241394,
      "learning_rate": 0.000159829454746247,
      "loss": 1.159,
      "step": 570
    },
    {
      "epoch": 0.7476635514018691,
      "grad_norm": 0.4489360749721527,
      "learning_rate": 0.00015801124579711525,
      "loss": 1.1794,
      "step": 580
    },
    {
      "epoch": 0.7605543022881083,
      "grad_norm": 0.4428081214427948,
      "learning_rate": 0.0001561636588110753,
      "loss": 1.1524,
      "step": 590
    },
    {
      "epoch": 0.7734450531743474,
      "grad_norm": 0.4341617226600647,
      "learning_rate": 0.00015428762944254932,
      "loss": 1.1497,
      "step": 600
    },
    {
      "epoch": 0.7863358040605866,
      "grad_norm": 0.43959757685661316,
      "learning_rate": 0.00015238410774974186,
      "loss": 1.1364,
      "step": 610
    },
    {
      "epoch": 0.7992265549468257,
      "grad_norm": 0.456399530172348,
      "learning_rate": 0.00015045405771351193,
      "loss": 1.1266,
      "step": 620
    },
    {
      "epoch": 0.8121173058330647,
      "grad_norm": 0.4572042226791382,
      "learning_rate": 0.00014849845674919364,
      "loss": 1.1417,
      "step": 630
    },
    {
      "epoch": 0.8250080567193039,
      "grad_norm": 0.4575168788433075,
      "learning_rate": 0.00014651829521161424,
      "loss": 1.1532,
      "step": 640
    },
    {
      "epoch": 0.837898807605543,
      "grad_norm": 0.4344440996646881,
      "learning_rate": 0.00014451457589355872,
      "loss": 1.137,
      "step": 650
    },
    {
      "epoch": 0.8507895584917822,
      "grad_norm": 0.4362059235572815,
      "learning_rate": 0.00014248831351793592,
      "loss": 1.1408,
      "step": 660
    },
    {
      "epoch": 0.8636803093780213,
      "grad_norm": 0.46403059363365173,
      "learning_rate": 0.0001404405342239028,
      "loss": 1.1227,
      "step": 670
    },
    {
      "epoch": 0.8765710602642603,
      "grad_norm": 0.47248131036758423,
      "learning_rate": 0.0001383722750472074,
      "loss": 1.1363,
      "step": 680
    },
    {
      "epoch": 0.8894618111504995,
      "grad_norm": 0.4404482841491699,
      "learning_rate": 0.00013628458339501348,
      "loss": 1.1179,
      "step": 690
    },
    {
      "epoch": 0.9023525620367386,
      "grad_norm": 0.4384572207927704,
      "learning_rate": 0.00013417851651547307,
      "loss": 1.1066,
      "step": 700
    },
    {
      "epoch": 0.9152433129229778,
      "grad_norm": 0.4435371458530426,
      "learning_rate": 0.0001320551409623149,
      "loss": 1.12,
      "step": 710
    },
    {
      "epoch": 0.9281340638092169,
      "grad_norm": 0.5222971439361572,
      "learning_rate": 0.00012991553205472092,
      "loss": 1.1164,
      "step": 720
    },
    {
      "epoch": 0.941024814695456,
      "grad_norm": 0.45858514308929443,
      "learning_rate": 0.00012776077333276326,
      "loss": 1.1182,
      "step": 730
    },
    {
      "epoch": 0.9539155655816951,
      "grad_norm": 0.45299065113067627,
      "learning_rate": 0.0001255919560086783,
      "loss": 1.1174,
      "step": 740
    },
    {
      "epoch": 0.9668063164679342,
      "grad_norm": 0.4541768729686737,
      "learning_rate": 0.0001234101784142553,
      "loss": 1.1221,
      "step": 750
    },
    {
      "epoch": 0.9796970673541734,
      "grad_norm": 0.4411795139312744,
      "learning_rate": 0.0001212165454446196,
      "loss": 1.1113,
      "step": 760
    },
    {
      "epoch": 0.9925878182404125,
      "grad_norm": 0.4533502161502838,
      "learning_rate": 0.00011901216799869188,
      "loss": 1.1122,
      "step": 770
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.6748553734614313,
      "eval_first_token_accuracy": 0.8449519230769231,
      "eval_loss": 1.118744134902954,
      "eval_runtime": 727.7965,
      "eval_samples_per_second": 6.851,
      "eval_steps_per_second": 0.429,
      "step": 776
    },
    {
      "epoch": 1.0051563003544957,
      "grad_norm": 0.4265609085559845,
      "learning_rate": 0.0001167981624166072,
      "loss": 1.0876,
      "step": 780
    },
    {
      "epoch": 1.0180470512407347,
      "grad_norm": 0.48834115266799927,
      "learning_rate": 0.00011457564991437823,
      "loss": 1.0156,
      "step": 790
    },
    {
      "epoch": 1.030937802126974,
      "grad_norm": 0.4989129602909088,
      "learning_rate": 0.00011234575601608957,
      "loss": 1.0072,
      "step": 800
    },
    {
      "epoch": 1.043828553013213,
      "grad_norm": 0.4992086887359619,
      "learning_rate": 0.00011010960998391002,
      "loss": 1.0185,
      "step": 810
    },
    {
      "epoch": 1.0567193038994522,
      "grad_norm": 0.5244722366333008,
      "learning_rate": 0.00010786834424621211,
      "loss": 1.013,
      "step": 820
    },
    {
      "epoch": 1.0696100547856913,
      "grad_norm": 0.5480940937995911,
      "learning_rate": 0.000105623093824088,
      "loss": 1.0116,
      "step": 830
    },
    {
      "epoch": 1.0825008056719303,
      "grad_norm": 0.49391430616378784,
      "learning_rate": 0.00010337499575655249,
      "loss": 0.9852,
      "step": 840
    },
    {
      "epoch": 1.0953915565581696,
      "grad_norm": 0.48984748125076294,
      "learning_rate": 0.00010112518852472414,
      "loss": 1.0052,
      "step": 850
    },
    {
      "epoch": 1.1082823074444086,
      "grad_norm": 0.5330230593681335,
      "learning_rate": 9.887481147527588e-05,
      "loss": 1.0058,
      "step": 860
    },
    {
      "epoch": 1.1211730583306478,
      "grad_norm": 0.4981665313243866,
      "learning_rate": 9.66250042434475e-05,
      "loss": 1.0037,
      "step": 870
    },
    {
      "epoch": 1.1340638092168869,
      "grad_norm": 0.5282273888587952,
      "learning_rate": 9.437690617591202e-05,
      "loss": 0.9962,
      "step": 880
    },
    {
      "epoch": 1.1469545601031261,
      "grad_norm": 0.5284985303878784,
      "learning_rate": 9.213165575378793e-05,
      "loss": 1.0142,
      "step": 890
    },
    {
      "epoch": 1.1598453109893652,
      "grad_norm": 0.5249062776565552,
      "learning_rate": 8.989039001609e-05,
      "loss": 1.0031,
      "step": 900
    },
    {
      "epoch": 1.1727360618756042,
      "grad_norm": 0.5115840435028076,
      "learning_rate": 8.765424398391047e-05,
      "loss": 1.0149,
      "step": 910
    },
    {
      "epoch": 1.1856268127618435,
      "grad_norm": 0.5123730897903442,
      "learning_rate": 8.54243500856218e-05,
      "loss": 0.9993,
      "step": 920
    },
    {
      "epoch": 1.1985175636480825,
      "grad_norm": 0.5423418879508972,
      "learning_rate": 8.320183758339284e-05,
      "loss": 0.9911,
      "step": 930
    },
    {
      "epoch": 1.2114083145343217,
      "grad_norm": 0.5046626925468445,
      "learning_rate": 8.098783200130813e-05,
      "loss": 0.9909,
      "step": 940
    },
    {
      "epoch": 1.2242990654205608,
      "grad_norm": 0.5180662870407104,
      "learning_rate": 7.878345455538043e-05,
      "loss": 0.9852,
      "step": 950
    },
    {
      "epoch": 1.2371898163067998,
      "grad_norm": 0.5029054880142212,
      "learning_rate": 7.65898215857447e-05,
      "loss": 0.984,
      "step": 960
    },
    {
      "epoch": 1.250080567193039,
      "grad_norm": 0.5463522672653198,
      "learning_rate": 7.440804399132173e-05,
      "loss": 0.994,
      "step": 970
    },
    {
      "epoch": 1.262971318079278,
      "grad_norm": 0.517109215259552,
      "learning_rate": 7.223922666723678e-05,
      "loss": 0.9825,
      "step": 980
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.5118828415870667,
      "learning_rate": 7.00844679452791e-05,
      "loss": 0.9941,
      "step": 990
    },
    {
      "epoch": 1.2887528198517564,
      "grad_norm": 0.5242361426353455,
      "learning_rate": 6.794485903768513e-05,
      "loss": 0.9787,
      "step": 1000
    },
    {
      "epoch": 1.3016435707379954,
      "grad_norm": 0.5021597743034363,
      "learning_rate": 6.5821483484527e-05,
      "loss": 0.9871,
      "step": 1010
    },
    {
      "epoch": 1.3145343216242347,
      "grad_norm": 0.5013926029205322,
      "learning_rate": 6.371541660498652e-05,
      "loss": 0.9859,
      "step": 1020
    },
    {
      "epoch": 1.3274250725104737,
      "grad_norm": 0.5455625057220459,
      "learning_rate": 6.162772495279265e-05,
      "loss": 0.9731,
      "step": 1030
    },
    {
      "epoch": 1.340315823396713,
      "grad_norm": 0.5966176986694336,
      "learning_rate": 5.955946577609721e-05,
      "loss": 1.0026,
      "step": 1040
    },
    {
      "epoch": 1.353206574282952,
      "grad_norm": 0.5418857336044312,
      "learning_rate": 5.75116864820641e-05,
      "loss": 0.9798,
      "step": 1050
    },
    {
      "epoch": 1.366097325169191,
      "grad_norm": 0.5456270575523376,
      "learning_rate": 5.548542410644132e-05,
      "loss": 0.9737,
      "step": 1060
    },
    {
      "epoch": 1.3789880760554303,
      "grad_norm": 0.5252659916877747,
      "learning_rate": 5.3481704788385786e-05,
      "loss": 0.9775,
      "step": 1070
    },
    {
      "epoch": 1.3918788269416693,
      "grad_norm": 0.5047706961631775,
      "learning_rate": 5.150154325080636e-05,
      "loss": 0.9755,
      "step": 1080
    },
    {
      "epoch": 1.4047695778279086,
      "grad_norm": 0.5461093187332153,
      "learning_rate": 4.954594228648807e-05,
      "loss": 0.9716,
      "step": 1090
    },
    {
      "epoch": 1.4176603287141476,
      "grad_norm": 0.5116148591041565,
      "learning_rate": 4.761589225025811e-05,
      "loss": 0.9821,
      "step": 1100
    },
    {
      "epoch": 1.4305510796003866,
      "grad_norm": 0.5131816267967224,
      "learning_rate": 4.571237055745074e-05,
      "loss": 0.9857,
      "step": 1110
    },
    {
      "epoch": 1.4434418304866259,
      "grad_norm": 0.5409175157546997,
      "learning_rate": 4.383634118892472e-05,
      "loss": 0.9768,
      "step": 1120
    },
    {
      "epoch": 1.456332581372865,
      "grad_norm": 0.5277559757232666,
      "learning_rate": 4.1988754202884775e-05,
      "loss": 0.965,
      "step": 1130
    },
    {
      "epoch": 1.4692233322591042,
      "grad_norm": 0.522885799407959,
      "learning_rate": 4.0170545253752986e-05,
      "loss": 0.9576,
      "step": 1140
    },
    {
      "epoch": 1.4821140831453432,
      "grad_norm": 0.5148813128471375,
      "learning_rate": 3.838263511833542e-05,
      "loss": 0.9676,
      "step": 1150
    },
    {
      "epoch": 1.4950048340315822,
      "grad_norm": 0.5167680978775024,
      "learning_rate": 3.662592922952218e-05,
      "loss": 0.9694,
      "step": 1160
    },
    {
      "epoch": 1.5078955849178215,
      "grad_norm": 0.5245492458343506,
      "learning_rate": 3.4901317217758764e-05,
      "loss": 0.9603,
      "step": 1170
    },
    {
      "epoch": 1.5207863358040608,
      "grad_norm": 0.51069176197052,
      "learning_rate": 3.3209672460519425e-05,
      "loss": 0.9687,
      "step": 1180
    },
    {
      "epoch": 1.5336770866902998,
      "grad_norm": 0.5380834341049194,
      "learning_rate": 3.155185164001176e-05,
      "loss": 0.9653,
      "step": 1190
    },
    {
      "epoch": 1.5465678375765388,
      "grad_norm": 0.5268071889877319,
      "learning_rate": 2.9928694309335914e-05,
      "loss": 0.9785,
      "step": 1200
    },
    {
      "epoch": 1.5594585884627779,
      "grad_norm": 0.5809852480888367,
      "learning_rate": 2.8341022467318335e-05,
      "loss": 0.9628,
      "step": 1210
    },
    {
      "epoch": 1.572349339349017,
      "grad_norm": 0.5032438635826111,
      "learning_rate": 2.678964014223553e-05,
      "loss": 0.9641,
      "step": 1220
    },
    {
      "epoch": 1.5852400902352564,
      "grad_norm": 0.5497922897338867,
      "learning_rate": 2.52753329846381e-05,
      "loss": 0.969,
      "step": 1230
    },
    {
      "epoch": 1.5981308411214954,
      "grad_norm": 0.5357956886291504,
      "learning_rate": 2.3798867869482044e-05,
      "loss": 0.9598,
      "step": 1240
    },
    {
      "epoch": 1.6110215920077344,
      "grad_norm": 0.5199883580207825,
      "learning_rate": 2.2360992507768052e-05,
      "loss": 0.9733,
      "step": 1250
    },
    {
      "epoch": 1.6239123428939735,
      "grad_norm": 0.5282153487205505,
      "learning_rate": 2.096243506788602e-05,
      "loss": 0.9565,
      "step": 1260
    },
    {
      "epoch": 1.6368030937802127,
      "grad_norm": 0.5048473477363586,
      "learning_rate": 1.9603903806856106e-05,
      "loss": 0.9506,
      "step": 1270
    },
    {
      "epoch": 1.649693844666452,
      "grad_norm": 0.5167466998100281,
      "learning_rate": 1.8286086711653605e-05,
      "loss": 0.9618,
      "step": 1280
    },
    {
      "epoch": 1.662584595552691,
      "grad_norm": 0.5307776927947998,
      "learning_rate": 1.7009651150798712e-05,
      "loss": 0.9708,
      "step": 1290
    },
    {
      "epoch": 1.67547534643893,
      "grad_norm": 0.5388936400413513,
      "learning_rate": 1.577524353638791e-05,
      "loss": 0.9672,
      "step": 1300
    },
    {
      "epoch": 1.688366097325169,
      "grad_norm": 0.5254285931587219,
      "learning_rate": 1.4583488996738293e-05,
      "loss": 0.9615,
      "step": 1310
    },
    {
      "epoch": 1.7012568482114083,
      "grad_norm": 0.5476789474487305,
      "learning_rate": 1.3434991059810153e-05,
      "loss": 0.9772,
      "step": 1320
    },
    {
      "epoch": 1.7141475990976476,
      "grad_norm": 0.5789813995361328,
      "learning_rate": 1.2330331347568636e-05,
      "loss": 0.9613,
      "step": 1330
    },
    {
      "epoch": 1.7270383499838866,
      "grad_norm": 0.5327284336090088,
      "learning_rate": 1.1270069281438866e-05,
      "loss": 0.96,
      "step": 1340
    },
    {
      "epoch": 1.7399291008701256,
      "grad_norm": 0.5267541408538818,
      "learning_rate": 1.0254741799003976e-05,
      "loss": 0.9668,
      "step": 1350
    },
    {
      "epoch": 1.7528198517563647,
      "grad_norm": 0.5468846559524536,
      "learning_rate": 9.284863082089224e-06,
      "loss": 0.9686,
      "step": 1360
    },
    {
      "epoch": 1.765710602642604,
      "grad_norm": 0.6118249297142029,
      "learning_rate": 8.360924296370376e-06,
      "loss": 0.9679,
      "step": 1370
    },
    {
      "epoch": 1.7786013535288432,
      "grad_norm": 0.535074770450592,
      "learning_rate": 7.4833933426376345e-06,
      "loss": 0.957,
      "step": 1380
    },
    {
      "epoch": 1.7914921044150822,
      "grad_norm": 0.5529636144638062,
      "learning_rate": 6.652714619841405e-06,
      "loss": 0.9498,
      "step": 1390
    },
    {
      "epoch": 1.8043828553013213,
      "grad_norm": 0.5263587236404419,
      "learning_rate": 5.869308800040074e-06,
      "loss": 0.9551,
      "step": 1400
    },
    {
      "epoch": 1.8172736061875603,
      "grad_norm": 0.5330998301506042,
      "learning_rate": 5.133572615363269e-06,
      "loss": 0.9545,
      "step": 1410
    },
    {
      "epoch": 1.8301643570737995,
      "grad_norm": 0.5292595624923706,
      "learning_rate": 4.445878657098978e-06,
      "loss": 0.958,
      "step": 1420
    },
    {
      "epoch": 1.8430551079600388,
      "grad_norm": 0.536263644695282,
      "learning_rate": 3.806575187005901e-06,
      "loss": 0.96,
      "step": 1430
    },
    {
      "epoch": 1.8559458588462778,
      "grad_norm": 0.5184710025787354,
      "learning_rate": 3.2159859609469436e-06,
      "loss": 0.9556,
      "step": 1440
    },
    {
      "epoch": 1.8688366097325169,
      "grad_norm": 0.5328414440155029,
      "learning_rate": 2.6744100649327974e-06,
      "loss": 0.9576,
      "step": 1450
    },
    {
      "epoch": 1.881727360618756,
      "grad_norm": 0.5515352487564087,
      "learning_rate": 2.1821217636589176e-06,
      "loss": 0.9769,
      "step": 1460
    },
    {
      "epoch": 1.8946181115049952,
      "grad_norm": 0.5316532850265503,
      "learning_rate": 1.7393703616124802e-06,
      "loss": 0.9587,
      "step": 1470
    },
    {
      "epoch": 1.9075088623912344,
      "grad_norm": 0.521430492401123,
      "learning_rate": 1.3463800768196866e-06,
      "loss": 0.9687,
      "step": 1480
    },
    {
      "epoch": 1.9203996132774734,
      "grad_norm": 0.5210091471672058,
      "learning_rate": 1.0033499272973701e-06,
      "loss": 0.9604,
      "step": 1490
    },
    {
      "epoch": 1.9332903641637125,
      "grad_norm": 0.530375063419342,
      "learning_rate": 7.104536302662834e-07,
      "loss": 0.9459,
      "step": 1500
    },
    {
      "epoch": 1.9461811150499515,
      "grad_norm": 0.5609223246574402,
      "learning_rate": 4.678395141773373e-07,
      "loss": 0.9529,
      "step": 1510
    },
    {
      "epoch": 1.9590718659361908,
      "grad_norm": 0.5311603546142578,
      "learning_rate": 2.756304435951429e-07,
      "loss": 0.9477,
      "step": 1520
    },
    {
      "epoch": 1.97196261682243,
      "grad_norm": 0.561936616897583,
      "learning_rate": 1.3392375697696136e-07,
      "loss": 0.9538,
      "step": 1530
    },
    {
      "epoch": 1.984853367708669,
      "grad_norm": 0.5377177000045776,
      "learning_rate": 4.279121737859049e-08,
      "loss": 0.9633,
      "step": 1540
    },
    {
      "epoch": 1.997744118594908,
      "grad_norm": 0.5239057540893555,
      "learning_rate": 2.2789761121688735e-09,
      "loss": 0.9598,
      "step": 1550
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.6875442401796923,
      "eval_first_token_accuracy": 0.8729967948717948,
      "eval_loss": 1.0595165491104126,
      "eval_runtime": 684.7375,
      "eval_samples_per_second": 7.282,
      "eval_steps_per_second": 0.456,
      "step": 1552
    }
  ],
  "logging_steps": 10,
  "max_steps": 1552,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0014393801042297e+19,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}