{
"best_metric": 1.4602320194244385,
"best_model_checkpoint": "miner_id_24/checkpoint-800",
"epoch": 0.8365492344080667,
"eval_steps": 100,
"global_step": 840,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000995891945723889,
"grad_norm": 16.56658172607422,
"learning_rate": 2.5e-05,
"loss": 27.5358,
"step": 1
},
{
"epoch": 0.000995891945723889,
"eval_loss": 3.4175312519073486,
"eval_runtime": 287.5927,
"eval_samples_per_second": 4.656,
"eval_steps_per_second": 1.165,
"step": 1
},
{
"epoch": 0.001991783891447778,
"grad_norm": 16.847103118896484,
"learning_rate": 5e-05,
"loss": 28.6136,
"step": 2
},
{
"epoch": 0.002987675837171667,
"grad_norm": 14.910314559936523,
"learning_rate": 7.5e-05,
"loss": 26.2465,
"step": 3
},
{
"epoch": 0.003983567782895556,
"grad_norm": 14.995889663696289,
"learning_rate": 0.0001,
"loss": 23.7459,
"step": 4
},
{
"epoch": 0.004979459728619445,
"grad_norm": 18.5001220703125,
"learning_rate": 0.000125,
"loss": 24.4332,
"step": 5
},
{
"epoch": 0.005975351674343334,
"grad_norm": 13.621232986450195,
"learning_rate": 0.00015,
"loss": 22.1308,
"step": 6
},
{
"epoch": 0.006971243620067223,
"grad_norm": 12.984804153442383,
"learning_rate": 0.000175,
"loss": 19.3733,
"step": 7
},
{
"epoch": 0.007967135565791112,
"grad_norm": 13.488744735717773,
"learning_rate": 0.0002,
"loss": 16.8823,
"step": 8
},
{
"epoch": 0.008963027511515,
"grad_norm": 16.1417293548584,
"learning_rate": 0.00022500000000000002,
"loss": 16.2773,
"step": 9
},
{
"epoch": 0.00995891945723889,
"grad_norm": 13.883444786071777,
"learning_rate": 0.00025,
"loss": 13.4328,
"step": 10
},
{
"epoch": 0.010954811402962778,
"grad_norm": 21.88507843017578,
"learning_rate": 0.00024999910458769255,
"loss": 15.1014,
"step": 11
},
{
"epoch": 0.011950703348686668,
"grad_norm": 18.264280319213867,
"learning_rate": 0.0002499964183635983,
"loss": 13.3915,
"step": 12
},
{
"epoch": 0.012946595294410557,
"grad_norm": 21.488842010498047,
"learning_rate": 0.0002499919413662018,
"loss": 14.5928,
"step": 13
},
{
"epoch": 0.013942487240134445,
"grad_norm": 11.669401168823242,
"learning_rate": 0.00024998567365964314,
"loss": 13.6025,
"step": 14
},
{
"epoch": 0.014938379185858334,
"grad_norm": 12.027138710021973,
"learning_rate": 0.0002499776153337172,
"loss": 14.022,
"step": 15
},
{
"epoch": 0.015934271131582224,
"grad_norm": 13.013602256774902,
"learning_rate": 0.00024996776650387245,
"loss": 13.9936,
"step": 16
},
{
"epoch": 0.01693016307730611,
"grad_norm": 11.030466079711914,
"learning_rate": 0.000249956127311209,
"loss": 13.786,
"step": 17
},
{
"epoch": 0.01792605502303,
"grad_norm": 10.779586791992188,
"learning_rate": 0.000249942697922477,
"loss": 12.499,
"step": 18
},
{
"epoch": 0.01892194696875389,
"grad_norm": 11.274680137634277,
"learning_rate": 0.00024992747853007374,
"loss": 12.219,
"step": 19
},
{
"epoch": 0.01991783891447778,
"grad_norm": 9.957880020141602,
"learning_rate": 0.00024991046935204144,
"loss": 13.1162,
"step": 20
},
{
"epoch": 0.02091373086020167,
"grad_norm": 8.227347373962402,
"learning_rate": 0.0002498916706320637,
"loss": 13.0072,
"step": 21
},
{
"epoch": 0.021909622805925556,
"grad_norm": 7.3595051765441895,
"learning_rate": 0.00024987108263946215,
"loss": 12.4328,
"step": 22
},
{
"epoch": 0.022905514751649446,
"grad_norm": 10.374775886535645,
"learning_rate": 0.00024984870566919273,
"loss": 13.644,
"step": 23
},
{
"epoch": 0.023901406697373336,
"grad_norm": 7.763638496398926,
"learning_rate": 0.00024982454004184127,
"loss": 12.0825,
"step": 24
},
{
"epoch": 0.024897298643097223,
"grad_norm": 8.642582893371582,
"learning_rate": 0.0002497985861036189,
"loss": 13.2585,
"step": 25
},
{
"epoch": 0.025893190588821113,
"grad_norm": 7.242372035980225,
"learning_rate": 0.0002497708442263573,
"loss": 11.6953,
"step": 26
},
{
"epoch": 0.026889082534545,
"grad_norm": 7.458141326904297,
"learning_rate": 0.0002497413148075032,
"loss": 12.0056,
"step": 27
},
{
"epoch": 0.02788497448026889,
"grad_norm": 7.2677836418151855,
"learning_rate": 0.0002497099982701126,
"loss": 11.1914,
"step": 28
},
{
"epoch": 0.02888086642599278,
"grad_norm": 7.0768022537231445,
"learning_rate": 0.0002496768950628449,
"loss": 12.0455,
"step": 29
},
{
"epoch": 0.029876758371716668,
"grad_norm": 7.336495399475098,
"learning_rate": 0.0002496420056599565,
"loss": 13.927,
"step": 30
},
{
"epoch": 0.030872650317440558,
"grad_norm": 6.7654829025268555,
"learning_rate": 0.00024960533056129374,
"loss": 11.885,
"step": 31
},
{
"epoch": 0.03186854226316445,
"grad_norm": 7.940896511077881,
"learning_rate": 0.000249566870292286,
"loss": 13.0075,
"step": 32
},
{
"epoch": 0.032864434208888335,
"grad_norm": 6.816346168518066,
"learning_rate": 0.000249526625403938,
"loss": 12.771,
"step": 33
},
{
"epoch": 0.03386032615461222,
"grad_norm": 6.953782081604004,
"learning_rate": 0.0002494845964728221,
"loss": 13.2487,
"step": 34
},
{
"epoch": 0.034856218100336116,
"grad_norm": 8.73217487335205,
"learning_rate": 0.0002494407841010699,
"loss": 13.9326,
"step": 35
},
{
"epoch": 0.03585211004606,
"grad_norm": 6.887628078460693,
"learning_rate": 0.0002493951889163634,
"loss": 11.7964,
"step": 36
},
{
"epoch": 0.03684800199178389,
"grad_norm": 6.188755035400391,
"learning_rate": 0.00024934781157192666,
"loss": 12.3109,
"step": 37
},
{
"epoch": 0.03784389393750778,
"grad_norm": 6.760220527648926,
"learning_rate": 0.00024929865274651565,
"loss": 12.6023,
"step": 38
},
{
"epoch": 0.03883978588323167,
"grad_norm": 7.21213436126709,
"learning_rate": 0.0002492477131444091,
"loss": 13.9952,
"step": 39
},
{
"epoch": 0.03983567782895556,
"grad_norm": 7.4104485511779785,
"learning_rate": 0.00024919499349539815,
"loss": 13.8991,
"step": 40
},
{
"epoch": 0.04083156977467945,
"grad_norm": 6.497596740722656,
"learning_rate": 0.0002491404945547759,
"loss": 12.7215,
"step": 41
},
{
"epoch": 0.04182746172040334,
"grad_norm": 7.254849433898926,
"learning_rate": 0.0002490842171033268,
"loss": 12.7342,
"step": 42
},
{
"epoch": 0.042823353666127224,
"grad_norm": 6.134991645812988,
"learning_rate": 0.0002490261619473152,
"loss": 11.3577,
"step": 43
},
{
"epoch": 0.04381924561185111,
"grad_norm": 6.685878276824951,
"learning_rate": 0.0002489663299184738,
"loss": 12.3739,
"step": 44
},
{
"epoch": 0.044815137557575005,
"grad_norm": 7.548392295837402,
"learning_rate": 0.00024890472187399216,
"loss": 12.5839,
"step": 45
},
{
"epoch": 0.04581102950329889,
"grad_norm": 6.434916973114014,
"learning_rate": 0.00024884133869650376,
"loss": 12.3762,
"step": 46
},
{
"epoch": 0.04680692144902278,
"grad_norm": 7.479133605957031,
"learning_rate": 0.00024877618129407386,
"loss": 13.4078,
"step": 47
},
{
"epoch": 0.04780281339474667,
"grad_norm": 6.191986083984375,
"learning_rate": 0.00024870925060018633,
"loss": 11.6391,
"step": 48
},
{
"epoch": 0.04879870534047056,
"grad_norm": 6.074136257171631,
"learning_rate": 0.00024864054757373023,
"loss": 11.4169,
"step": 49
},
{
"epoch": 0.049794597286194446,
"grad_norm": 6.535557270050049,
"learning_rate": 0.0002485700731989861,
"loss": 12.7648,
"step": 50
},
{
"epoch": 0.05079048923191834,
"grad_norm": 7.5086517333984375,
"learning_rate": 0.0002484978284856119,
"loss": 13.1785,
"step": 51
},
{
"epoch": 0.05178638117764223,
"grad_norm": 6.3925395011901855,
"learning_rate": 0.00024842381446862856,
"loss": 12.2323,
"step": 52
},
{
"epoch": 0.052782273123366114,
"grad_norm": 6.091940879821777,
"learning_rate": 0.000248348032208405,
"loss": 11.4875,
"step": 53
},
{
"epoch": 0.05377816506909,
"grad_norm": 6.079466342926025,
"learning_rate": 0.0002482704827906432,
"loss": 11.7412,
"step": 54
},
{
"epoch": 0.054774057014813894,
"grad_norm": 7.269871234893799,
"learning_rate": 0.0002481911673263624,
"loss": 13.4444,
"step": 55
},
{
"epoch": 0.05576994896053778,
"grad_norm": 6.833920955657959,
"learning_rate": 0.00024811008695188326,
"loss": 13.2974,
"step": 56
},
{
"epoch": 0.05676584090626167,
"grad_norm": 6.509270668029785,
"learning_rate": 0.0002480272428288116,
"loss": 13.3969,
"step": 57
},
{
"epoch": 0.05776173285198556,
"grad_norm": 6.10461950302124,
"learning_rate": 0.00024794263614402176,
"loss": 13.0233,
"step": 58
},
{
"epoch": 0.05875762479770945,
"grad_norm": 6.248282432556152,
"learning_rate": 0.0002478562681096397,
"loss": 11.664,
"step": 59
},
{
"epoch": 0.059753516743433335,
"grad_norm": 5.46411657333374,
"learning_rate": 0.0002477681399630253,
"loss": 10.8694,
"step": 60
},
{
"epoch": 0.06074940868915723,
"grad_norm": 6.836920738220215,
"learning_rate": 0.00024767825296675516,
"loss": 12.7564,
"step": 61
},
{
"epoch": 0.061745300634881116,
"grad_norm": 8.421375274658203,
"learning_rate": 0.000247586608408604,
"loss": 12.9136,
"step": 62
},
{
"epoch": 0.06274119258060501,
"grad_norm": 5.612268924713135,
"learning_rate": 0.00024749320760152635,
"loss": 11.2727,
"step": 63
},
{
"epoch": 0.0637370845263289,
"grad_norm": 7.760491371154785,
"learning_rate": 0.00024739805188363803,
"loss": 13.7674,
"step": 64
},
{
"epoch": 0.06473297647205278,
"grad_norm": 8.422126770019531,
"learning_rate": 0.00024730114261819656,
"loss": 12.9104,
"step": 65
},
{
"epoch": 0.06572886841777667,
"grad_norm": 7.5595831871032715,
"learning_rate": 0.0002472024811935821,
"loss": 12.0906,
"step": 66
},
{
"epoch": 0.06672476036350056,
"grad_norm": 7.558265209197998,
"learning_rate": 0.0002471020690232769,
"loss": 11.7225,
"step": 67
},
{
"epoch": 0.06772065230922444,
"grad_norm": 8.700068473815918,
"learning_rate": 0.00024699990754584584,
"loss": 12.7639,
"step": 68
},
{
"epoch": 0.06871654425494834,
"grad_norm": 6.340944766998291,
"learning_rate": 0.0002468959982249151,
"loss": 12.6408,
"step": 69
},
{
"epoch": 0.06971243620067223,
"grad_norm": 5.689936637878418,
"learning_rate": 0.0002467903425491517,
"loss": 11.4365,
"step": 70
},
{
"epoch": 0.07070832814639612,
"grad_norm": 6.519106864929199,
"learning_rate": 0.00024668294203224184,
"loss": 11.9108,
"step": 71
},
{
"epoch": 0.07170422009212,
"grad_norm": 5.775708198547363,
"learning_rate": 0.00024657379821286954,
"loss": 11.2813,
"step": 72
},
{
"epoch": 0.07270011203784389,
"grad_norm": 6.3621296882629395,
"learning_rate": 0.00024646291265469425,
"loss": 11.5422,
"step": 73
},
{
"epoch": 0.07369600398356778,
"grad_norm": 6.622462749481201,
"learning_rate": 0.0002463502869463287,
"loss": 12.9495,
"step": 74
},
{
"epoch": 0.07469189592929167,
"grad_norm": 7.1500163078308105,
"learning_rate": 0.0002462359227013159,
"loss": 13.1289,
"step": 75
},
{
"epoch": 0.07568778787501557,
"grad_norm": 6.552374362945557,
"learning_rate": 0.0002461198215581064,
"loss": 11.7408,
"step": 76
},
{
"epoch": 0.07668367982073945,
"grad_norm": 6.086798191070557,
"learning_rate": 0.00024600198518003453,
"loss": 11.8846,
"step": 77
},
{
"epoch": 0.07767957176646334,
"grad_norm": 7.924241065979004,
"learning_rate": 0.00024588241525529445,
"loss": 13.0962,
"step": 78
},
{
"epoch": 0.07867546371218723,
"grad_norm": 6.5235090255737305,
"learning_rate": 0.0002457611134969164,
"loss": 11.482,
"step": 79
},
{
"epoch": 0.07967135565791111,
"grad_norm": 6.921808242797852,
"learning_rate": 0.0002456380816427417,
"loss": 12.5718,
"step": 80
},
{
"epoch": 0.080667247603635,
"grad_norm": 6.6802077293396,
"learning_rate": 0.0002455133214553981,
"loss": 11.822,
"step": 81
},
{
"epoch": 0.0816631395493589,
"grad_norm": 6.431028366088867,
"learning_rate": 0.0002453868347222746,
"loss": 13.067,
"step": 82
},
{
"epoch": 0.08265903149508279,
"grad_norm": 6.637655735015869,
"learning_rate": 0.0002452586232554956,
"loss": 13.5535,
"step": 83
},
{
"epoch": 0.08365492344080667,
"grad_norm": 6.012635231018066,
"learning_rate": 0.0002451286888918951,
"loss": 12.0727,
"step": 84
},
{
"epoch": 0.08465081538653056,
"grad_norm": 6.7169508934021,
"learning_rate": 0.00024499703349299034,
"loss": 12.3954,
"step": 85
},
{
"epoch": 0.08564670733225445,
"grad_norm": 6.19224214553833,
"learning_rate": 0.0002448636589449552,
"loss": 12.1299,
"step": 86
},
{
"epoch": 0.08664259927797834,
"grad_norm": 5.419031143188477,
"learning_rate": 0.0002447285671585931,
"loss": 10.0219,
"step": 87
},
{
"epoch": 0.08763849122370222,
"grad_norm": 6.183884143829346,
"learning_rate": 0.00024459176006930947,
"loss": 13.1559,
"step": 88
},
{
"epoch": 0.08863438316942612,
"grad_norm": 5.832536697387695,
"learning_rate": 0.0002444532396370844,
"loss": 11.4276,
"step": 89
},
{
"epoch": 0.08963027511515001,
"grad_norm": 6.158980846405029,
"learning_rate": 0.0002443130078464444,
"loss": 12.6743,
"step": 90
},
{
"epoch": 0.0906261670608739,
"grad_norm": 7.851455211639404,
"learning_rate": 0.0002441710667064337,
"loss": 15.0347,
"step": 91
},
{
"epoch": 0.09162205900659778,
"grad_norm": 6.618966102600098,
"learning_rate": 0.00024402741825058576,
"loss": 12.8038,
"step": 92
},
{
"epoch": 0.09261795095232167,
"grad_norm": 7.711861610412598,
"learning_rate": 0.0002438820645368942,
"loss": 14.378,
"step": 93
},
{
"epoch": 0.09361384289804556,
"grad_norm": 6.327939510345459,
"learning_rate": 0.00024373500764778307,
"loss": 11.8801,
"step": 94
},
{
"epoch": 0.09460973484376944,
"grad_norm": 6.685515880584717,
"learning_rate": 0.00024358624969007705,
"loss": 12.5321,
"step": 95
},
{
"epoch": 0.09560562678949334,
"grad_norm": 6.379256725311279,
"learning_rate": 0.0002434357927949716,
"loss": 12.7246,
"step": 96
},
{
"epoch": 0.09660151873521723,
"grad_norm": 5.819457054138184,
"learning_rate": 0.00024328363911800183,
"loss": 11.6533,
"step": 97
},
{
"epoch": 0.09759741068094112,
"grad_norm": 6.651020050048828,
"learning_rate": 0.00024312979083901227,
"loss": 13.6643,
"step": 98
},
{
"epoch": 0.098593302626665,
"grad_norm": 6.834366321563721,
"learning_rate": 0.00024297425016212517,
"loss": 12.5779,
"step": 99
},
{
"epoch": 0.09958919457238889,
"grad_norm": 5.578615188598633,
"learning_rate": 0.0002428170193157091,
"loss": 12.1336,
"step": 100
},
{
"epoch": 0.09958919457238889,
"eval_loss": 1.5528450012207031,
"eval_runtime": 289.8128,
"eval_samples_per_second": 4.62,
"eval_steps_per_second": 1.156,
"step": 100
},
{
"epoch": 0.10058508651811278,
"grad_norm": 6.1246747970581055,
"learning_rate": 0.000242658100552347,
"loss": 12.9692,
"step": 101
},
{
"epoch": 0.10158097846383668,
"grad_norm": 6.364856719970703,
"learning_rate": 0.00024249749614880397,
"loss": 10.5347,
"step": 102
},
{
"epoch": 0.10257687040956057,
"grad_norm": 6.769341945648193,
"learning_rate": 0.00024233520840599457,
"loss": 12.3651,
"step": 103
},
{
"epoch": 0.10357276235528445,
"grad_norm": 7.491860389709473,
"learning_rate": 0.00024217123964894986,
"loss": 12.3008,
"step": 104
},
{
"epoch": 0.10456865430100834,
"grad_norm": 6.12730073928833,
"learning_rate": 0.00024200559222678408,
"loss": 11.3047,
"step": 105
},
{
"epoch": 0.10556454624673223,
"grad_norm": 7.921679496765137,
"learning_rate": 0.00024183826851266116,
"loss": 13.0155,
"step": 106
},
{
"epoch": 0.10656043819245611,
"grad_norm": 6.370965480804443,
"learning_rate": 0.00024166927090376052,
"loss": 12.2079,
"step": 107
},
{
"epoch": 0.10755633013818,
"grad_norm": 7.240699768066406,
"learning_rate": 0.00024149860182124267,
"loss": 13.1831,
"step": 108
},
{
"epoch": 0.1085522220839039,
"grad_norm": 6.247211456298828,
"learning_rate": 0.0002413262637102148,
"loss": 12.2088,
"step": 109
},
{
"epoch": 0.10954811402962779,
"grad_norm": 6.4018168449401855,
"learning_rate": 0.00024115225903969568,
"loss": 12.0962,
"step": 110
},
{
"epoch": 0.11054400597535168,
"grad_norm": 6.562201976776123,
"learning_rate": 0.00024097659030257993,
"loss": 11.6653,
"step": 111
},
{
"epoch": 0.11153989792107556,
"grad_norm": 7.297281742095947,
"learning_rate": 0.0002407992600156028,
"loss": 12.4297,
"step": 112
},
{
"epoch": 0.11253578986679945,
"grad_norm": 6.0937299728393555,
"learning_rate": 0.00024062027071930386,
"loss": 12.2878,
"step": 113
},
{
"epoch": 0.11353168181252334,
"grad_norm": 6.412242889404297,
"learning_rate": 0.0002404396249779906,
"loss": 12.9066,
"step": 114
},
{
"epoch": 0.11452757375824724,
"grad_norm": 6.969156742095947,
"learning_rate": 0.00024025732537970168,
"loss": 11.7735,
"step": 115
},
{
"epoch": 0.11552346570397112,
"grad_norm": 6.48976993560791,
"learning_rate": 0.00024007337453617005,
"loss": 11.5464,
"step": 116
},
{
"epoch": 0.11651935764969501,
"grad_norm": 6.79917049407959,
"learning_rate": 0.00023988777508278524,
"loss": 14.0385,
"step": 117
},
{
"epoch": 0.1175152495954189,
"grad_norm": 5.524596691131592,
"learning_rate": 0.00023970052967855587,
"loss": 12.4122,
"step": 118
},
{
"epoch": 0.11851114154114278,
"grad_norm": 8.637755393981934,
"learning_rate": 0.00023951164100607128,
"loss": 12.9614,
"step": 119
},
{
"epoch": 0.11950703348686667,
"grad_norm": 7.299292087554932,
"learning_rate": 0.00023932111177146342,
"loss": 12.6715,
"step": 120
},
{
"epoch": 0.12050292543259056,
"grad_norm": 6.199335098266602,
"learning_rate": 0.0002391289447043678,
"loss": 11.9946,
"step": 121
},
{
"epoch": 0.12149881737831446,
"grad_norm": 5.948755741119385,
"learning_rate": 0.0002389351425578845,
"loss": 12.3719,
"step": 122
},
{
"epoch": 0.12249470932403834,
"grad_norm": 6.023370265960693,
"learning_rate": 0.00023873970810853884,
"loss": 11.4807,
"step": 123
},
{
"epoch": 0.12349060126976223,
"grad_norm": 5.60730504989624,
"learning_rate": 0.00023854264415624135,
"loss": 11.9841,
"step": 124
},
{
"epoch": 0.12448649321548612,
"grad_norm": 5.656817436218262,
"learning_rate": 0.0002383439535242478,
"loss": 11.3724,
"step": 125
},
{
"epoch": 0.12548238516121002,
"grad_norm": 6.920670509338379,
"learning_rate": 0.0002381436390591189,
"loss": 12.0686,
"step": 126
},
{
"epoch": 0.1264782771069339,
"grad_norm": 7.383021831512451,
"learning_rate": 0.00023794170363067914,
"loss": 14.0681,
"step": 127
},
{
"epoch": 0.1274741690526578,
"grad_norm": 8.584571838378906,
"learning_rate": 0.00023773815013197608,
"loss": 15.4667,
"step": 128
},
{
"epoch": 0.12847006099838168,
"grad_norm": 12.736078262329102,
"learning_rate": 0.00023753298147923858,
"loss": 12.1753,
"step": 129
},
{
"epoch": 0.12946595294410557,
"grad_norm": 6.229617118835449,
"learning_rate": 0.0002373262006118353,
"loss": 13.8618,
"step": 130
},
{
"epoch": 0.13046184488982945,
"grad_norm": 5.858534336090088,
"learning_rate": 0.0002371178104922323,
"loss": 12.3115,
"step": 131
},
{
"epoch": 0.13145773683555334,
"grad_norm": 6.3285603523254395,
"learning_rate": 0.00023690781410595085,
"loss": 12.8249,
"step": 132
},
{
"epoch": 0.13245362878127723,
"grad_norm": 5.9630632400512695,
"learning_rate": 0.00023669621446152463,
"loss": 13.0259,
"step": 133
},
{
"epoch": 0.13344952072700111,
"grad_norm": 6.730586528778076,
"learning_rate": 0.0002364830145904563,
"loss": 12.6343,
"step": 134
},
{
"epoch": 0.134445412672725,
"grad_norm": 7.248769760131836,
"learning_rate": 0.0002362682175471746,
"loss": 11.8677,
"step": 135
},
{
"epoch": 0.1354413046184489,
"grad_norm": 6.6009979248046875,
"learning_rate": 0.0002360518264089901,
"loss": 12.3887,
"step": 136
},
{
"epoch": 0.13643719656417277,
"grad_norm": 5.64854097366333,
"learning_rate": 0.00023583384427605146,
"loss": 10.898,
"step": 137
},
{
"epoch": 0.1374330885098967,
"grad_norm": 7.965289115905762,
"learning_rate": 0.00023561427427130083,
"loss": 13.6869,
"step": 138
},
{
"epoch": 0.13842898045562058,
"grad_norm": 8.878471374511719,
"learning_rate": 0.00023539311954042912,
"loss": 12.4927,
"step": 139
},
{
"epoch": 0.13942487240134446,
"grad_norm": 6.698742866516113,
"learning_rate": 0.0002351703832518311,
"loss": 11.4779,
"step": 140
},
{
"epoch": 0.14042076434706835,
"grad_norm": 8.585515975952148,
"learning_rate": 0.00023494606859655976,
"loss": 13.2428,
"step": 141
},
{
"epoch": 0.14141665629279224,
"grad_norm": 6.803221225738525,
"learning_rate": 0.00023472017878828073,
"loss": 11.7988,
"step": 142
},
{
"epoch": 0.14241254823851612,
"grad_norm": 7.021427154541016,
"learning_rate": 0.00023449271706322622,
"loss": 12.4274,
"step": 143
},
{
"epoch": 0.14340844018424,
"grad_norm": 5.856128692626953,
"learning_rate": 0.00023426368668014874,
"loss": 12.2856,
"step": 144
},
{
"epoch": 0.1444043321299639,
"grad_norm": 6.2399210929870605,
"learning_rate": 0.00023403309092027424,
"loss": 12.307,
"step": 145
},
{
"epoch": 0.14540022407568778,
"grad_norm": 6.5773210525512695,
"learning_rate": 0.00023380093308725518,
"loss": 12.4102,
"step": 146
},
{
"epoch": 0.14639611602141167,
"grad_norm": 7.773036479949951,
"learning_rate": 0.00023356721650712338,
"loss": 14.0709,
"step": 147
},
{
"epoch": 0.14739200796713556,
"grad_norm": 5.980135917663574,
"learning_rate": 0.00023333194452824195,
"loss": 11.8756,
"step": 148
},
{
"epoch": 0.14838789991285944,
"grad_norm": 6.181486129760742,
"learning_rate": 0.00023309512052125775,
"loss": 12.3727,
"step": 149
},
{
"epoch": 0.14938379185858333,
"grad_norm": 13.033953666687012,
"learning_rate": 0.00023285674787905286,
"loss": 13.3292,
"step": 150
},
{
"epoch": 0.15037968380430725,
"grad_norm": 6.594436168670654,
"learning_rate": 0.00023261683001669602,
"loss": 13.3059,
"step": 151
},
{
"epoch": 0.15137557575003113,
"grad_norm": 7.247442722320557,
"learning_rate": 0.00023237537037139384,
"loss": 12.0958,
"step": 152
},
{
"epoch": 0.15237146769575502,
"grad_norm": 6.844114303588867,
"learning_rate": 0.0002321323724024412,
"loss": 12.2067,
"step": 153
},
{
"epoch": 0.1533673596414789,
"grad_norm": 5.828722953796387,
"learning_rate": 0.0002318878395911721,
"loss": 11.2265,
"step": 154
},
{
"epoch": 0.1543632515872028,
"grad_norm": 7.751287460327148,
"learning_rate": 0.00023164177544090958,
"loss": 14.7071,
"step": 155
},
{
"epoch": 0.15535914353292668,
"grad_norm": 6.924627780914307,
"learning_rate": 0.00023139418347691555,
"loss": 11.9454,
"step": 156
},
{
"epoch": 0.15635503547865057,
"grad_norm": 6.240747451782227,
"learning_rate": 0.0002311450672463402,
"loss": 12.9397,
"step": 157
},
{
"epoch": 0.15735092742437445,
"grad_norm": 6.814810752868652,
"learning_rate": 0.00023089443031817147,
"loss": 11.8282,
"step": 158
},
{
"epoch": 0.15834681937009834,
"grad_norm": 7.39363956451416,
"learning_rate": 0.0002306422762831835,
"loss": 14.9512,
"step": 159
},
{
"epoch": 0.15934271131582223,
"grad_norm": 6.947161674499512,
"learning_rate": 0.00023038860875388556,
"loss": 12.2441,
"step": 160
},
{
"epoch": 0.16033860326154611,
"grad_norm": 6.200173854827881,
"learning_rate": 0.00023013343136447006,
"loss": 11.8266,
"step": 161
},
{
"epoch": 0.16133449520727,
"grad_norm": 6.987513065338135,
"learning_rate": 0.00022987674777076068,
"loss": 12.7667,
"step": 162
},
{
"epoch": 0.1623303871529939,
"grad_norm": 6.824809551239014,
"learning_rate": 0.0002296185616501597,
"loss": 12.4046,
"step": 163
},
{
"epoch": 0.1633262790987178,
"grad_norm": 6.129730701446533,
"learning_rate": 0.00022935887670159566,
"loss": 12.8159,
"step": 164
},
{
"epoch": 0.1643221710444417,
"grad_norm": 7.575254917144775,
"learning_rate": 0.00022909769664547014,
"loss": 11.9914,
"step": 165
},
{
"epoch": 0.16531806299016558,
"grad_norm": 6.266209602355957,
"learning_rate": 0.0002288350252236045,
"loss": 11.6365,
"step": 166
},
{
"epoch": 0.16631395493588946,
"grad_norm": 7.802381992340088,
"learning_rate": 0.00022857086619918634,
"loss": 12.7036,
"step": 167
},
{
"epoch": 0.16730984688161335,
"grad_norm": 7.222281455993652,
"learning_rate": 0.00022830522335671555,
"loss": 11.8082,
"step": 168
},
{
"epoch": 0.16830573882733724,
"grad_norm": 7.55634069442749,
"learning_rate": 0.00022803810050195004,
"loss": 12.8177,
"step": 169
},
{
"epoch": 0.16930163077306112,
"grad_norm": 6.894867420196533,
"learning_rate": 0.00022776950146185127,
"loss": 12.4942,
"step": 170
},
{
"epoch": 0.170297522718785,
"grad_norm": 7.815672874450684,
"learning_rate": 0.0002274994300845294,
"loss": 13.8819,
"step": 171
},
{
"epoch": 0.1712934146645089,
"grad_norm": 6.255967140197754,
"learning_rate": 0.00022722789023918823,
"loss": 11.8453,
"step": 172
},
{
"epoch": 0.17228930661023278,
"grad_norm": 7.725053310394287,
"learning_rate": 0.0002269548858160697,
"loss": 12.0805,
"step": 173
},
{
"epoch": 0.17328519855595667,
"grad_norm": 8.110404968261719,
"learning_rate": 0.00022668042072639805,
"loss": 11.1405,
"step": 174
},
{
"epoch": 0.17428109050168056,
"grad_norm": 6.437393665313721,
"learning_rate": 0.00022640449890232403,
"loss": 11.7487,
"step": 175
},
{
"epoch": 0.17527698244740444,
"grad_norm": 6.381730556488037,
"learning_rate": 0.00022612712429686844,
"loss": 12.7643,
"step": 176
},
{
"epoch": 0.17627287439312836,
"grad_norm": 5.764153480529785,
"learning_rate": 0.00022584830088386539,
"loss": 11.6039,
"step": 177
},
{
"epoch": 0.17726876633885225,
"grad_norm": 5.821381568908691,
"learning_rate": 0.00022556803265790553,
"loss": 12.7432,
"step": 178
},
{
"epoch": 0.17826465828457613,
"grad_norm": 5.812039375305176,
"learning_rate": 0.00022528632363427882,
"loss": 11.2202,
"step": 179
},
{
"epoch": 0.17926055023030002,
"grad_norm": 6.043585300445557,
"learning_rate": 0.00022500317784891684,
"loss": 11.7178,
"step": 180
},
{
"epoch": 0.1802564421760239,
"grad_norm": 6.763357639312744,
"learning_rate": 0.0002247185993583351,
"loss": 13.0897,
"step": 181
},
{
"epoch": 0.1812523341217478,
"grad_norm": 6.222050666809082,
"learning_rate": 0.00022443259223957498,
"loss": 12.9561,
"step": 182
},
{
"epoch": 0.18224822606747168,
"grad_norm": 6.555530071258545,
"learning_rate": 0.00022414516059014516,
"loss": 13.3252,
"step": 183
},
{
"epoch": 0.18324411801319557,
"grad_norm": 6.633552551269531,
"learning_rate": 0.00022385630852796306,
"loss": 12.5151,
"step": 184
},
{
"epoch": 0.18424000995891945,
"grad_norm": 5.674712657928467,
"learning_rate": 0.00022356604019129573,
"loss": 10.8516,
"step": 185
},
{
"epoch": 0.18523590190464334,
"grad_norm": 6.16930627822876,
"learning_rate": 0.00022327435973870058,
"loss": 12.2844,
"step": 186
},
{
"epoch": 0.18623179385036723,
"grad_norm": 6.338482856750488,
"learning_rate": 0.00022298127134896595,
"loss": 13.021,
"step": 187
},
{
"epoch": 0.18722768579609111,
"grad_norm": 5.933238506317139,
"learning_rate": 0.000222686779221051,
"loss": 11.8773,
"step": 188
},
{
"epoch": 0.188223577741815,
"grad_norm": 6.159973621368408,
"learning_rate": 0.00022239088757402582,
"loss": 11.4913,
"step": 189
},
{
"epoch": 0.1892194696875389,
"grad_norm": 7.25203800201416,
"learning_rate": 0.0002220936006470107,
"loss": 13.7217,
"step": 190
},
{
"epoch": 0.1902153616332628,
"grad_norm": 6.028827667236328,
"learning_rate": 0.00022179492269911564,
"loss": 11.364,
"step": 191
},
{
"epoch": 0.1912112535789867,
"grad_norm": 6.040252685546875,
"learning_rate": 0.00022149485800937918,
"loss": 12.5145,
"step": 192
},
{
"epoch": 0.19220714552471058,
"grad_norm": 5.860878944396973,
"learning_rate": 0.00022119341087670723,
"loss": 11.5606,
"step": 193
},
{
"epoch": 0.19320303747043446,
"grad_norm": 6.734287261962891,
"learning_rate": 0.00022089058561981128,
"loss": 12.0882,
"step": 194
},
{
"epoch": 0.19419892941615835,
"grad_norm": 6.507574558258057,
"learning_rate": 0.00022058638657714683,
"loss": 13.0287,
"step": 195
},
{
"epoch": 0.19519482136188224,
"grad_norm": 5.8352179527282715,
"learning_rate": 0.00022028081810685084,
"loss": 11.9273,
"step": 196
},
{
"epoch": 0.19619071330760612,
"grad_norm": 6.327966690063477,
"learning_rate": 0.00021997388458667972,
"loss": 11.9534,
"step": 197
},
{
"epoch": 0.19718660525333,
"grad_norm": 7.161978721618652,
"learning_rate": 0.00021966559041394619,
"loss": 14.3134,
"step": 198
},
{
"epoch": 0.1981824971990539,
"grad_norm": 7.602406978607178,
"learning_rate": 0.00021935594000545663,
"loss": 13.8131,
"step": 199
},
{
"epoch": 0.19917838914477778,
"grad_norm": 6.5593767166137695,
"learning_rate": 0.00021904493779744766,
"loss": 12.1681,
"step": 200
},
{
"epoch": 0.19917838914477778,
"eval_loss": 1.545896291732788,
"eval_runtime": 289.9141,
"eval_samples_per_second": 4.619,
"eval_steps_per_second": 1.156,
"step": 200
},
{
"epoch": 0.20017428109050167,
"grad_norm": 7.11292839050293,
"learning_rate": 0.00021873258824552257,
"loss": 11.3805,
"step": 201
},
{
"epoch": 0.20117017303622556,
"grad_norm": 5.791740417480469,
"learning_rate": 0.0002184188958245874,
"loss": 11.4427,
"step": 202
},
{
"epoch": 0.20216606498194944,
"grad_norm": 6.359834671020508,
"learning_rate": 0.0002181038650287871,
"loss": 12.8047,
"step": 203
},
{
"epoch": 0.20316195692767336,
"grad_norm": 6.810841083526611,
"learning_rate": 0.00021778750037144086,
"loss": 12.9342,
"step": 204
},
{
"epoch": 0.20415784887339725,
"grad_norm": 6.595353126525879,
"learning_rate": 0.0002174698063849776,
"loss": 11.8879,
"step": 205
},
{
"epoch": 0.20515374081912113,
"grad_norm": 6.707181453704834,
"learning_rate": 0.00021715078762087108,
"loss": 12.4952,
"step": 206
},
{
"epoch": 0.20614963276484502,
"grad_norm": 5.755362033843994,
"learning_rate": 0.00021683044864957444,
"loss": 12.742,
"step": 207
},
{
"epoch": 0.2071455247105689,
"grad_norm": 5.790565490722656,
"learning_rate": 0.00021650879406045508,
"loss": 10.6683,
"step": 208
},
{
"epoch": 0.2081414166562928,
"grad_norm": 6.435606002807617,
"learning_rate": 0.0002161858284617286,
"loss": 12.8725,
"step": 209
},
{
"epoch": 0.20913730860201668,
"grad_norm": 6.840058326721191,
"learning_rate": 0.00021586155648039296,
"loss": 11.7874,
"step": 210
},
{
"epoch": 0.21013320054774057,
"grad_norm": 6.608837604522705,
"learning_rate": 0.00021553598276216217,
"loss": 13.3114,
"step": 211
},
{
"epoch": 0.21112909249346445,
"grad_norm": 5.996092319488525,
"learning_rate": 0.00021520911197139958,
"loss": 11.8956,
"step": 212
},
{
"epoch": 0.21212498443918834,
"grad_norm": 5.224332332611084,
"learning_rate": 0.00021488094879105134,
"loss": 10.4107,
"step": 213
},
{
"epoch": 0.21312087638491223,
"grad_norm": 5.718519687652588,
"learning_rate": 0.000214551497922579,
"loss": 11.4343,
"step": 214
},
{
"epoch": 0.21411676833063611,
"grad_norm": 6.161787033081055,
"learning_rate": 0.00021422076408589237,
"loss": 11.9857,
"step": 215
},
{
"epoch": 0.21511266027636,
"grad_norm": 6.545485019683838,
"learning_rate": 0.00021388875201928183,
"loss": 13.6983,
"step": 216
},
{
"epoch": 0.21610855222208392,
"grad_norm": 6.377624988555908,
"learning_rate": 0.0002135554664793504,
"loss": 12.8974,
"step": 217
},
{
"epoch": 0.2171044441678078,
"grad_norm": 6.514633655548096,
"learning_rate": 0.0002132209122409457,
"loss": 12.2978,
"step": 218
},
{
"epoch": 0.2181003361135317,
"grad_norm": 5.409609794616699,
"learning_rate": 0.00021288509409709148,
"loss": 10.8475,
"step": 219
},
{
"epoch": 0.21909622805925558,
"grad_norm": 5.895532131195068,
"learning_rate": 0.00021254801685891887,
"loss": 10.9371,
"step": 220
},
{
"epoch": 0.22009212000497946,
"grad_norm": 5.818058013916016,
"learning_rate": 0.0002122096853555976,
"loss": 12.6627,
"step": 221
},
{
"epoch": 0.22108801195070335,
"grad_norm": 5.7109375,
"learning_rate": 0.00021187010443426675,
"loss": 12.3313,
"step": 222
},
{
"epoch": 0.22208390389642724,
"grad_norm": 6.082381248474121,
"learning_rate": 0.0002115292789599653,
"loss": 11.4319,
"step": 223
},
{
"epoch": 0.22307979584215112,
"grad_norm": 5.8958048820495605,
"learning_rate": 0.00021118721381556245,
"loss": 11.6646,
"step": 224
},
{
"epoch": 0.224075687787875,
"grad_norm": 5.592798233032227,
"learning_rate": 0.00021084391390168764,
"loss": 11.1113,
"step": 225
},
{
"epoch": 0.2250715797335989,
"grad_norm": 5.9306182861328125,
"learning_rate": 0.00021049938413666037,
"loss": 12.4602,
"step": 226
},
{
"epoch": 0.22606747167932278,
"grad_norm": 5.919130325317383,
"learning_rate": 0.0002101536294564197,
"loss": 11.9998,
"step": 227
},
{
"epoch": 0.22706336362504667,
"grad_norm": 6.082193374633789,
"learning_rate": 0.00020980665481445355,
"loss": 11.6682,
"step": 228
},
{
"epoch": 0.22805925557077056,
"grad_norm": 6.038330078125,
"learning_rate": 0.00020945846518172776,
"loss": 11.8126,
"step": 229
},
{
"epoch": 0.22905514751649447,
"grad_norm": 6.054876327514648,
"learning_rate": 0.00020910906554661484,
"loss": 11.6183,
"step": 230
},
{
"epoch": 0.23005103946221836,
"grad_norm": 6.542704105377197,
"learning_rate": 0.0002087584609148226,
"loss": 12.4262,
"step": 231
},
{
"epoch": 0.23104693140794225,
"grad_norm": 5.895127296447754,
"learning_rate": 0.00020840665630932225,
"loss": 11.9887,
"step": 232
},
{
"epoch": 0.23204282335366613,
"grad_norm": 6.9951863288879395,
"learning_rate": 0.00020805365677027646,
"loss": 12.2263,
"step": 233
},
{
"epoch": 0.23303871529939002,
"grad_norm": 6.330830097198486,
"learning_rate": 0.0002076994673549675,
"loss": 13.8628,
"step": 234
},
{
"epoch": 0.2340346072451139,
"grad_norm": 5.970463752746582,
"learning_rate": 0.00020734409313772424,
"loss": 11.4963,
"step": 235
},
{
"epoch": 0.2350304991908378,
"grad_norm": 6.236823081970215,
"learning_rate": 0.00020698753920984987,
"loss": 12.8625,
"step": 236
},
{
"epoch": 0.23602639113656168,
"grad_norm": 5.732664108276367,
"learning_rate": 0.00020662981067954883,
"loss": 11.9416,
"step": 237
},
{
"epoch": 0.23702228308228557,
"grad_norm": 6.58101224899292,
"learning_rate": 0.00020627091267185355,
"loss": 11.9936,
"step": 238
},
{
"epoch": 0.23801817502800945,
"grad_norm": 6.880969047546387,
"learning_rate": 0.0002059108503285511,
"loss": 13.5075,
"step": 239
},
{
"epoch": 0.23901406697373334,
"grad_norm": 5.696409702301025,
"learning_rate": 0.00020554962880810963,
"loss": 11.9267,
"step": 240
},
{
"epoch": 0.24000995891945723,
"grad_norm": 5.385843276977539,
"learning_rate": 0.00020518725328560417,
"loss": 10.0967,
"step": 241
},
{
"epoch": 0.24100585086518111,
"grad_norm": 6.039820671081543,
"learning_rate": 0.00020482372895264282,
"loss": 10.6372,
"step": 242
},
{
"epoch": 0.24200174281090503,
"grad_norm": 5.8607916831970215,
"learning_rate": 0.00020445906101729212,
"loss": 11.0905,
"step": 243
},
{
"epoch": 0.24299763475662892,
"grad_norm": 6.985530376434326,
"learning_rate": 0.00020409325470400263,
"loss": 11.455,
"step": 244
},
{
"epoch": 0.2439935267023528,
"grad_norm": 5.882106304168701,
"learning_rate": 0.0002037263152535339,
"loss": 12.6405,
"step": 245
},
{
"epoch": 0.2449894186480767,
"grad_norm": 5.3988776206970215,
"learning_rate": 0.0002033582479228796,
"loss": 11.5435,
"step": 246
},
{
"epoch": 0.24598531059380058,
"grad_norm": 7.481232166290283,
"learning_rate": 0.00020298905798519197,
"loss": 13.4326,
"step": 247
},
{
"epoch": 0.24698120253952446,
"grad_norm": 6.147839546203613,
"learning_rate": 0.00020261875072970643,
"loss": 11.5964,
"step": 248
},
{
"epoch": 0.24797709448524835,
"grad_norm": 8.947158813476562,
"learning_rate": 0.0002022473314616658,
"loss": 12.0333,
"step": 249
},
{
"epoch": 0.24897298643097224,
"grad_norm": 6.649617671966553,
"learning_rate": 0.00020187480550224422,
"loss": 13.5379,
"step": 250
},
{
"epoch": 0.24996887837669612,
"grad_norm": 6.397286891937256,
"learning_rate": 0.00020150117818847088,
"loss": 11.1719,
"step": 251
},
{
"epoch": 0.25096477032242004,
"grad_norm": 5.593772888183594,
"learning_rate": 0.0002011264548731538,
"loss": 11.7092,
"step": 252
},
{
"epoch": 0.2519606622681439,
"grad_norm": 6.753810405731201,
"learning_rate": 0.00020075064092480284,
"loss": 13.9164,
"step": 253
},
{
"epoch": 0.2529565542138678,
"grad_norm": 5.851145267486572,
"learning_rate": 0.0002003737417275529,
"loss": 12.3499,
"step": 254
},
{
"epoch": 0.25395244615959167,
"grad_norm": 5.922454833984375,
"learning_rate": 0.00019999576268108694,
"loss": 12.029,
"step": 255
},
{
"epoch": 0.2549483381053156,
"grad_norm": 5.876878261566162,
"learning_rate": 0.0001996167092005584,
"loss": 11.8283,
"step": 256
},
{
"epoch": 0.25594423005103945,
"grad_norm": 6.563518047332764,
"learning_rate": 0.00019923658671651363,
"loss": 13.6434,
"step": 257
},
{
"epoch": 0.25694012199676336,
"grad_norm": 5.781143665313721,
"learning_rate": 0.00019885540067481427,
"loss": 11.871,
"step": 258
},
{
"epoch": 0.2579360139424872,
"grad_norm": 6.5753912925720215,
"learning_rate": 0.00019847315653655914,
"loss": 12.0575,
"step": 259
},
{
"epoch": 0.25893190588821113,
"grad_norm": 5.838126182556152,
"learning_rate": 0.0001980898597780059,
"loss": 12.2294,
"step": 260
},
{
"epoch": 0.259927797833935,
"grad_norm": 5.333262920379639,
"learning_rate": 0.00019770551589049268,
"loss": 11.7186,
"step": 261
},
{
"epoch": 0.2609236897796589,
"grad_norm": 6.755197525024414,
"learning_rate": 0.00019732013038035952,
"loss": 12.5457,
"step": 262
},
{
"epoch": 0.2619195817253828,
"grad_norm": 6.212986946105957,
"learning_rate": 0.00019693370876886916,
"loss": 12.6501,
"step": 263
},
{
"epoch": 0.2629154736711067,
"grad_norm": 5.702981948852539,
"learning_rate": 0.00019654625659212835,
"loss": 12.1024,
"step": 264
},
{
"epoch": 0.2639113656168306,
"grad_norm": 5.915462970733643,
"learning_rate": 0.00019615777940100825,
"loss": 11.2701,
"step": 265
},
{
"epoch": 0.26490725756255445,
"grad_norm": 6.289284706115723,
"learning_rate": 0.00019576828276106497,
"loss": 12.4175,
"step": 266
},
{
"epoch": 0.26590314950827837,
"grad_norm": 6.77176570892334,
"learning_rate": 0.0001953777722524599,
"loss": 12.4483,
"step": 267
},
{
"epoch": 0.26689904145400223,
"grad_norm": 6.126654148101807,
"learning_rate": 0.00019498625346987963,
"loss": 11.3449,
"step": 268
},
{
"epoch": 0.26789493339972614,
"grad_norm": 5.859162330627441,
"learning_rate": 0.000194593732022456,
"loss": 11.2103,
"step": 269
},
{
"epoch": 0.26889082534545,
"grad_norm": 6.313859939575195,
"learning_rate": 0.00019420021353368556,
"loss": 11.3182,
"step": 270
},
{
"epoch": 0.2698867172911739,
"grad_norm": 6.366731643676758,
"learning_rate": 0.0001938057036413491,
"loss": 12.3075,
"step": 271
},
{
"epoch": 0.2708826092368978,
"grad_norm": 6.110815525054932,
"learning_rate": 0.00019341020799743075,
"loss": 12.6064,
"step": 272
},
{
"epoch": 0.2718785011826217,
"grad_norm": 6.0649094581604,
"learning_rate": 0.00019301373226803727,
"loss": 12.5147,
"step": 273
},
{
"epoch": 0.27287439312834555,
"grad_norm": 7.83512020111084,
"learning_rate": 0.00019261628213331655,
"loss": 12.0182,
"step": 274
},
{
"epoch": 0.27387028507406946,
"grad_norm": 6.209272861480713,
"learning_rate": 0.00019221786328737651,
"loss": 13.2367,
"step": 275
},
{
"epoch": 0.2748661770197934,
"grad_norm": 5.425859451293945,
"learning_rate": 0.00019181848143820336,
"loss": 11.2941,
"step": 276
},
{
"epoch": 0.27586206896551724,
"grad_norm": 6.430884838104248,
"learning_rate": 0.00019141814230757987,
"loss": 11.9494,
"step": 277
},
{
"epoch": 0.27685796091124115,
"grad_norm": 5.499374866485596,
"learning_rate": 0.00019101685163100342,
"loss": 11.3679,
"step": 278
},
{
"epoch": 0.277853852856965,
"grad_norm": 6.815554141998291,
"learning_rate": 0.00019061461515760368,
"loss": 12.3956,
"step": 279
},
{
"epoch": 0.2788497448026889,
"grad_norm": 5.373319149017334,
"learning_rate": 0.00019021143865006058,
"loss": 11.1815,
"step": 280
},
{
"epoch": 0.2798456367484128,
"grad_norm": 5.54528284072876,
"learning_rate": 0.00018980732788452138,
"loss": 11.2036,
"step": 281
},
{
"epoch": 0.2808415286941367,
"grad_norm": 5.694971084594727,
"learning_rate": 0.00018940228865051812,
"loss": 11.9744,
"step": 282
},
{
"epoch": 0.28183742063986056,
"grad_norm": 6.034758567810059,
"learning_rate": 0.0001889963267508847,
"loss": 12.2473,
"step": 283
},
{
"epoch": 0.2828333125855845,
"grad_norm": 5.718782901763916,
"learning_rate": 0.0001885894480016736,
"loss": 11.7797,
"step": 284
},
{
"epoch": 0.28382920453130833,
"grad_norm": 5.70759391784668,
"learning_rate": 0.00018818165823207263,
"loss": 12.6039,
"step": 285
},
{
"epoch": 0.28482509647703225,
"grad_norm": 5.922822952270508,
"learning_rate": 0.00018777296328432143,
"loss": 12.0609,
"step": 286
},
{
"epoch": 0.2858209884227561,
"grad_norm": 6.213382720947266,
"learning_rate": 0.00018736336901362783,
"loss": 12.721,
"step": 287
},
{
"epoch": 0.28681688036848,
"grad_norm": 5.525908946990967,
"learning_rate": 0.00018695288128808376,
"loss": 10.897,
"step": 288
},
{
"epoch": 0.28781277231420394,
"grad_norm": 5.966858386993408,
"learning_rate": 0.00018654150598858152,
"loss": 11.5072,
"step": 289
},
{
"epoch": 0.2888086642599278,
"grad_norm": 6.100443363189697,
"learning_rate": 0.00018612924900872916,
"loss": 12.3079,
"step": 290
},
{
"epoch": 0.2898045562056517,
"grad_norm": 5.905527591705322,
"learning_rate": 0.00018571611625476625,
"loss": 11.6011,
"step": 291
},
{
"epoch": 0.29080044815137557,
"grad_norm": 18.4097900390625,
"learning_rate": 0.0001853021136454792,
"loss": 12.0392,
"step": 292
},
{
"epoch": 0.2917963400970995,
"grad_norm": 6.539964199066162,
"learning_rate": 0.0001848872471121166,
"loss": 12.8208,
"step": 293
},
{
"epoch": 0.29279223204282334,
"grad_norm": 5.856533527374268,
"learning_rate": 0.00018447152259830398,
"loss": 11.9306,
"step": 294
},
{
"epoch": 0.29378812398854726,
"grad_norm": 6.079041957855225,
"learning_rate": 0.00018405494605995887,
"loss": 11.4677,
"step": 295
},
{
"epoch": 0.2947840159342711,
"grad_norm": 7.1472039222717285,
"learning_rate": 0.00018363752346520548,
"loss": 12.2707,
"step": 296
},
{
"epoch": 0.29577990787999503,
"grad_norm": 6.571793556213379,
"learning_rate": 0.00018321926079428903,
"loss": 12.2654,
"step": 297
},
{
"epoch": 0.2967757998257189,
"grad_norm": 6.160271644592285,
"learning_rate": 0.00018280016403949024,
"loss": 12.8305,
"step": 298
},
{
"epoch": 0.2977716917714428,
"grad_norm": 6.2567572593688965,
"learning_rate": 0.00018238023920503935,
"loss": 12.1013,
"step": 299
},
{
"epoch": 0.29876758371716666,
"grad_norm": 7.734886646270752,
"learning_rate": 0.00018195949230703022,
"loss": 13.1276,
"step": 300
},
{
"epoch": 0.29876758371716666,
"eval_loss": 1.516517162322998,
"eval_runtime": 289.8399,
"eval_samples_per_second": 4.62,
"eval_steps_per_second": 1.156,
"step": 300
},
{
"epoch": 0.2997634756628906,
"grad_norm": 7.519866943359375,
"learning_rate": 0.00018153792937333405,
"loss": 12.56,
"step": 301
},
{
"epoch": 0.3007593676086145,
"grad_norm": 6.709288120269775,
"learning_rate": 0.00018111555644351307,
"loss": 12.5308,
"step": 302
},
{
"epoch": 0.30175525955433835,
"grad_norm": 5.9070281982421875,
"learning_rate": 0.000180692379568734,
"loss": 10.9037,
"step": 303
},
{
"epoch": 0.30275115150006227,
"grad_norm": 7.148046493530273,
"learning_rate": 0.00018026840481168138,
"loss": 12.2679,
"step": 304
},
{
"epoch": 0.3037470434457861,
"grad_norm": 5.894509315490723,
"learning_rate": 0.00017984363824647065,
"loss": 11.3378,
"step": 305
},
{
"epoch": 0.30474293539151004,
"grad_norm": 5.6303229331970215,
"learning_rate": 0.00017941808595856113,
"loss": 10.7154,
"step": 306
},
{
"epoch": 0.3057388273372339,
"grad_norm": 5.428180694580078,
"learning_rate": 0.00017899175404466897,
"loss": 10.747,
"step": 307
},
{
"epoch": 0.3067347192829578,
"grad_norm": 5.827486038208008,
"learning_rate": 0.0001785646486126796,
"loss": 12.2957,
"step": 308
},
{
"epoch": 0.30773061122868167,
"grad_norm": 6.739930152893066,
"learning_rate": 0.0001781367757815604,
"loss": 13.2347,
"step": 309
},
{
"epoch": 0.3087265031744056,
"grad_norm": 6.382988929748535,
"learning_rate": 0.000177708141681273,
"loss": 12.367,
"step": 310
},
{
"epoch": 0.30972239512012945,
"grad_norm": 6.265737533569336,
"learning_rate": 0.00017727875245268534,
"loss": 12.4424,
"step": 311
},
{
"epoch": 0.31071828706585336,
"grad_norm": 6.454976558685303,
"learning_rate": 0.00017684861424748386,
"loss": 12.2459,
"step": 312
},
{
"epoch": 0.3117141790115772,
"grad_norm": 5.488022804260254,
"learning_rate": 0.00017641773322808518,
"loss": 10.8191,
"step": 313
},
{
"epoch": 0.31271007095730113,
"grad_norm": 9.193635940551758,
"learning_rate": 0.00017598611556754804,
"loss": 13.3586,
"step": 314
},
{
"epoch": 0.31370596290302505,
"grad_norm": 6.968887805938721,
"learning_rate": 0.0001755537674494846,
"loss": 13.9658,
"step": 315
},
{
"epoch": 0.3147018548487489,
"grad_norm": 5.535203456878662,
"learning_rate": 0.00017512069506797224,
"loss": 12.2494,
"step": 316
},
{
"epoch": 0.3156977467944728,
"grad_norm": 5.723720073699951,
"learning_rate": 0.00017468690462746426,
"loss": 12.0819,
"step": 317
},
{
"epoch": 0.3166936387401967,
"grad_norm": 6.4840850830078125,
"learning_rate": 0.00017425240234270148,
"loss": 11.7859,
"step": 318
},
{
"epoch": 0.3176895306859206,
"grad_norm": 6.271676063537598,
"learning_rate": 0.00017381719443862305,
"loss": 13.0953,
"step": 319
},
{
"epoch": 0.31868542263164445,
"grad_norm": 6.508022785186768,
"learning_rate": 0.00017338128715027717,
"loss": 11.8154,
"step": 320
},
{
"epoch": 0.31968131457736837,
"grad_norm": 5.159334659576416,
"learning_rate": 0.00017294468672273178,
"loss": 11.3996,
"step": 321
},
{
"epoch": 0.32067720652309223,
"grad_norm": 5.714421272277832,
"learning_rate": 0.00017250739941098532,
"loss": 12.8778,
"step": 322
},
{
"epoch": 0.32167309846881614,
"grad_norm": 6.049591064453125,
"learning_rate": 0.00017206943147987677,
"loss": 12.7078,
"step": 323
},
{
"epoch": 0.32266899041454,
"grad_norm": 5.64936637878418,
"learning_rate": 0.00017163078920399616,
"loss": 12.0689,
"step": 324
},
{
"epoch": 0.3236648823602639,
"grad_norm": 5.128944396972656,
"learning_rate": 0.00017119147886759462,
"loss": 11.2092,
"step": 325
},
{
"epoch": 0.3246607743059878,
"grad_norm": 5.202014923095703,
"learning_rate": 0.00017075150676449418,
"loss": 10.8206,
"step": 326
},
{
"epoch": 0.3256566662517117,
"grad_norm": 5.825450420379639,
"learning_rate": 0.00017031087919799792,
"loss": 11.1645,
"step": 327
},
{
"epoch": 0.3266525581974356,
"grad_norm": 6.169241428375244,
"learning_rate": 0.0001698696024807993,
"loss": 12.5059,
"step": 328
},
{
"epoch": 0.32764845014315946,
"grad_norm": 5.669341087341309,
"learning_rate": 0.00016942768293489198,
"loss": 12.112,
"step": 329
},
{
"epoch": 0.3286443420888834,
"grad_norm": 5.91945219039917,
"learning_rate": 0.00016898512689147912,
"loss": 12.5704,
"step": 330
},
{
"epoch": 0.32964023403460724,
"grad_norm": 5.3270134925842285,
"learning_rate": 0.0001685419406908829,
"loss": 11.1185,
"step": 331
},
{
"epoch": 0.33063612598033115,
"grad_norm": 6.189324378967285,
"learning_rate": 0.0001680981306824533,
"loss": 13.594,
"step": 332
},
{
"epoch": 0.331632017926055,
"grad_norm": 5.268672466278076,
"learning_rate": 0.00016765370322447738,
"loss": 11.815,
"step": 333
},
{
"epoch": 0.3326279098717789,
"grad_norm": 6.1424407958984375,
"learning_rate": 0.0001672086646840883,
"loss": 12.4271,
"step": 334
},
{
"epoch": 0.3336238018175028,
"grad_norm": 6.57133674621582,
"learning_rate": 0.00016676302143717376,
"loss": 11.8458,
"step": 335
},
{
"epoch": 0.3346196937632267,
"grad_norm": 6.405190467834473,
"learning_rate": 0.0001663167798682849,
"loss": 13.4903,
"step": 336
},
{
"epoch": 0.33561558570895056,
"grad_norm": 5.354515075683594,
"learning_rate": 0.00016586994637054486,
"loss": 10.9491,
"step": 337
},
{
"epoch": 0.3366114776546745,
"grad_norm": 5.805549144744873,
"learning_rate": 0.00016542252734555706,
"loss": 12.0369,
"step": 338
},
{
"epoch": 0.33760736960039833,
"grad_norm": 7.639823913574219,
"learning_rate": 0.0001649745292033135,
"loss": 13.4881,
"step": 339
},
{
"epoch": 0.33860326154612225,
"grad_norm": 6.16525936126709,
"learning_rate": 0.000164525958362103,
"loss": 11.7652,
"step": 340
},
{
"epoch": 0.33959915349184616,
"grad_norm": 6.666656017303467,
"learning_rate": 0.00016407682124841916,
"loss": 12.8449,
"step": 341
},
{
"epoch": 0.34059504543757,
"grad_norm": 7.354873180389404,
"learning_rate": 0.00016362712429686844,
"loss": 10.8402,
"step": 342
},
{
"epoch": 0.34159093738329394,
"grad_norm": 6.7231526374816895,
"learning_rate": 0.00016317687395007774,
"loss": 13.0444,
"step": 343
},
{
"epoch": 0.3425868293290178,
"grad_norm": 6.6920485496521,
"learning_rate": 0.0001627260766586023,
"loss": 12.295,
"step": 344
},
{
"epoch": 0.3435827212747417,
"grad_norm": 5.398272514343262,
"learning_rate": 0.00016227473888083318,
"loss": 11.9999,
"step": 345
},
{
"epoch": 0.34457861322046557,
"grad_norm": 6.592508316040039,
"learning_rate": 0.00016182286708290485,
"loss": 11.0985,
"step": 346
},
{
"epoch": 0.3455745051661895,
"grad_norm": 6.421080589294434,
"learning_rate": 0.00016137046773860242,
"loss": 12.2588,
"step": 347
},
{
"epoch": 0.34657039711191334,
"grad_norm": 7.451521396636963,
"learning_rate": 0.0001609175473292689,
"loss": 11.8329,
"step": 348
},
{
"epoch": 0.34756628905763726,
"grad_norm": 7.403144836425781,
"learning_rate": 0.00016046411234371249,
"loss": 12.3238,
"step": 349
},
{
"epoch": 0.3485621810033611,
"grad_norm": 6.073176383972168,
"learning_rate": 0.0001600101692781134,
"loss": 11.3786,
"step": 350
},
{
"epoch": 0.34955807294908503,
"grad_norm": 5.982321262359619,
"learning_rate": 0.00015955572463593093,
"loss": 12.0612,
"step": 351
},
{
"epoch": 0.3505539648948089,
"grad_norm": 6.197265148162842,
"learning_rate": 0.00015910078492781038,
"loss": 13.0901,
"step": 352
},
{
"epoch": 0.3515498568405328,
"grad_norm": 5.7054443359375,
"learning_rate": 0.00015864535667148953,
"loss": 11.8859,
"step": 353
},
{
"epoch": 0.3525457487862567,
"grad_norm": 6.113707065582275,
"learning_rate": 0.00015818944639170538,
"loss": 12.4638,
"step": 354
},
{
"epoch": 0.3535416407319806,
"grad_norm": 5.500970840454102,
"learning_rate": 0.00015773306062010083,
"loss": 12.5954,
"step": 355
},
{
"epoch": 0.3545375326777045,
"grad_norm": 6.778495788574219,
"learning_rate": 0.00015727620589513084,
"loss": 12.1178,
"step": 356
},
{
"epoch": 0.35553342462342835,
"grad_norm": 5.658638954162598,
"learning_rate": 0.0001568188887619689,
"loss": 12.3659,
"step": 357
},
{
"epoch": 0.35652931656915227,
"grad_norm": 5.579876899719238,
"learning_rate": 0.0001563611157724132,
"loss": 11.3356,
"step": 358
},
{
"epoch": 0.3575252085148761,
"grad_norm": 5.567670822143555,
"learning_rate": 0.0001559028934847929,
"loss": 11.3109,
"step": 359
},
{
"epoch": 0.35852110046060004,
"grad_norm": 5.965104579925537,
"learning_rate": 0.00015544422846387398,
"loss": 11.5413,
"step": 360
},
{
"epoch": 0.3595169924063239,
"grad_norm": 5.479038715362549,
"learning_rate": 0.00015498512728076536,
"loss": 11.1271,
"step": 361
},
{
"epoch": 0.3605128843520478,
"grad_norm": 6.937322616577148,
"learning_rate": 0.0001545255965128246,
"loss": 12.4041,
"step": 362
},
{
"epoch": 0.3615087762977717,
"grad_norm": 6.604974269866943,
"learning_rate": 0.00015406564274356377,
"loss": 13.2388,
"step": 363
},
{
"epoch": 0.3625046682434956,
"grad_norm": 5.381167411804199,
"learning_rate": 0.00015360527256255517,
"loss": 12.4161,
"step": 364
},
{
"epoch": 0.36350056018921945,
"grad_norm": 6.176576137542725,
"learning_rate": 0.00015314449256533677,
"loss": 11.5011,
"step": 365
},
{
"epoch": 0.36449645213494336,
"grad_norm": 6.568368434906006,
"learning_rate": 0.00015268330935331787,
"loss": 12.6016,
"step": 366
},
{
"epoch": 0.3654923440806673,
"grad_norm": 6.896481513977051,
"learning_rate": 0.00015222172953368446,
"loss": 13.5208,
"step": 367
},
{
"epoch": 0.36648823602639113,
"grad_norm": 5.2163262367248535,
"learning_rate": 0.0001517597597193046,
"loss": 10.7047,
"step": 368
},
{
"epoch": 0.36748412797211505,
"grad_norm": 5.657341480255127,
"learning_rate": 0.00015129740652863354,
"loss": 11.5574,
"step": 369
},
{
"epoch": 0.3684800199178389,
"grad_norm": 6.786725997924805,
"learning_rate": 0.0001508346765856191,
"loss": 12.2539,
"step": 370
},
{
"epoch": 0.3694759118635628,
"grad_norm": 5.714908599853516,
"learning_rate": 0.00015037157651960677,
"loss": 11.4831,
"step": 371
},
{
"epoch": 0.3704718038092867,
"grad_norm": 6.115590572357178,
"learning_rate": 0.0001499081129652443,
"loss": 11.983,
"step": 372
},
{
"epoch": 0.3714676957550106,
"grad_norm": 5.770305633544922,
"learning_rate": 0.00014944429256238742,
"loss": 12.0888,
"step": 373
},
{
"epoch": 0.37246358770073446,
"grad_norm": 6.408126354217529,
"learning_rate": 0.000148980121956004,
"loss": 12.551,
"step": 374
},
{
"epoch": 0.37345947964645837,
"grad_norm": 5.341275215148926,
"learning_rate": 0.00014851560779607922,
"loss": 10.8883,
"step": 375
},
{
"epoch": 0.37445537159218223,
"grad_norm": 5.945551872253418,
"learning_rate": 0.00014805075673752022,
"loss": 11.5723,
"step": 376
},
{
"epoch": 0.37545126353790614,
"grad_norm": 5.617563724517822,
"learning_rate": 0.0001475855754400608,
"loss": 11.3891,
"step": 377
},
{
"epoch": 0.37644715548363,
"grad_norm": 5.866842269897461,
"learning_rate": 0.00014712007056816583,
"loss": 11.773,
"step": 378
},
{
"epoch": 0.3774430474293539,
"grad_norm": 5.4498796463012695,
"learning_rate": 0.00014665424879093598,
"loss": 11.6255,
"step": 379
},
{
"epoch": 0.3784389393750778,
"grad_norm": 6.97362756729126,
"learning_rate": 0.0001461881167820121,
"loss": 11.3264,
"step": 380
},
{
"epoch": 0.3794348313208017,
"grad_norm": 6.620206832885742,
"learning_rate": 0.0001457216812194796,
"loss": 12.2684,
"step": 381
},
{
"epoch": 0.3804307232665256,
"grad_norm": 6.0084228515625,
"learning_rate": 0.00014525494878577278,
"loss": 12.1192,
"step": 382
},
{
"epoch": 0.38142661521224946,
"grad_norm": 6.702908515930176,
"learning_rate": 0.00014478792616757908,
"loss": 11.2421,
"step": 383
},
{
"epoch": 0.3824225071579734,
"grad_norm": 5.771916389465332,
"learning_rate": 0.00014432062005574332,
"loss": 12.0659,
"step": 384
},
{
"epoch": 0.38341839910369724,
"grad_norm": 6.246331691741943,
"learning_rate": 0.00014385303714517175,
"loss": 12.3364,
"step": 385
},
{
"epoch": 0.38441429104942115,
"grad_norm": 6.237067699432373,
"learning_rate": 0.00014338518413473632,
"loss": 10.9562,
"step": 386
},
{
"epoch": 0.385410182995145,
"grad_norm": 5.690719127655029,
"learning_rate": 0.00014291706772717847,
"loss": 11.8223,
"step": 387
},
{
"epoch": 0.3864060749408689,
"grad_norm": 6.914798259735107,
"learning_rate": 0.00014244869462901331,
"loss": 12.0813,
"step": 388
},
{
"epoch": 0.3874019668865928,
"grad_norm": 6.975170612335205,
"learning_rate": 0.00014198007155043343,
"loss": 10.7795,
"step": 389
},
{
"epoch": 0.3883978588323167,
"grad_norm": 5.881531715393066,
"learning_rate": 0.00014151120520521283,
"loss": 11.8511,
"step": 390
},
{
"epoch": 0.38939375077804056,
"grad_norm": 5.874497413635254,
"learning_rate": 0.00014104210231061053,
"loss": 11.0519,
"step": 391
},
{
"epoch": 0.3903896427237645,
"grad_norm": 6.757877826690674,
"learning_rate": 0.00014057276958727468,
"loss": 12.7317,
"step": 392
},
{
"epoch": 0.39138553466948833,
"grad_norm": 7.4160637855529785,
"learning_rate": 0.0001401032137591461,
"loss": 11.9433,
"step": 393
},
{
"epoch": 0.39238142661521225,
"grad_norm": 5.124035835266113,
"learning_rate": 0.00013963344155336178,
"loss": 10.2948,
"step": 394
},
{
"epoch": 0.39337731856093616,
"grad_norm": 6.715900897979736,
"learning_rate": 0.00013916345970015875,
"loss": 12.0971,
"step": 395
},
{
"epoch": 0.39437321050666,
"grad_norm": 6.183508396148682,
"learning_rate": 0.00013869327493277762,
"loss": 12.2012,
"step": 396
},
{
"epoch": 0.39536910245238394,
"grad_norm": 6.416903018951416,
"learning_rate": 0.000138222893987366,
"loss": 12.4187,
"step": 397
},
{
"epoch": 0.3963649943981078,
"grad_norm": 6.140800476074219,
"learning_rate": 0.00013775232360288214,
"loss": 11.6096,
"step": 398
},
{
"epoch": 0.3973608863438317,
"grad_norm": 6.360043048858643,
"learning_rate": 0.00013728157052099823,
"loss": 12.4352,
"step": 399
},
{
"epoch": 0.39835677828955557,
"grad_norm": 5.971815586090088,
"learning_rate": 0.00013681064148600392,
"loss": 10.6359,
"step": 400
},
{
"epoch": 0.39835677828955557,
"eval_loss": 1.5056407451629639,
"eval_runtime": 289.8388,
"eval_samples_per_second": 4.62,
"eval_steps_per_second": 1.156,
"step": 400
},
{
"epoch": 0.3993526702352795,
"grad_norm": 6.695412635803223,
"learning_rate": 0.00013633954324470968,
"loss": 11.4601,
"step": 401
},
{
"epoch": 0.40034856218100334,
"grad_norm": 6.6551337242126465,
"learning_rate": 0.00013586828254635015,
"loss": 12.2329,
"step": 402
},
{
"epoch": 0.40134445412672726,
"grad_norm": 6.626735687255859,
"learning_rate": 0.0001353968661424873,
"loss": 12.1324,
"step": 403
},
{
"epoch": 0.4023403460724511,
"grad_norm": 6.322345733642578,
"learning_rate": 0.00013492530078691403,
"loss": 12.0786,
"step": 404
},
{
"epoch": 0.40333623801817503,
"grad_norm": 4.97235631942749,
"learning_rate": 0.00013445359323555712,
"loss": 10.4874,
"step": 405
},
{
"epoch": 0.4043321299638989,
"grad_norm": 6.183226108551025,
"learning_rate": 0.0001339817502463804,
"loss": 12.173,
"step": 406
},
{
"epoch": 0.4053280219096228,
"grad_norm": 6.04211950302124,
"learning_rate": 0.00013350977857928836,
"loss": 11.1317,
"step": 407
},
{
"epoch": 0.4063239138553467,
"grad_norm": 5.817178726196289,
"learning_rate": 0.0001330376849960287,
"loss": 12.1543,
"step": 408
},
{
"epoch": 0.4073198058010706,
"grad_norm": 7.050534248352051,
"learning_rate": 0.0001325654762600959,
"loss": 12.6231,
"step": 409
},
{
"epoch": 0.4083156977467945,
"grad_norm": 6.723251819610596,
"learning_rate": 0.00013209315913663427,
"loss": 12.7058,
"step": 410
},
{
"epoch": 0.40931158969251835,
"grad_norm": 4.917559623718262,
"learning_rate": 0.0001316207403923408,
"loss": 10.9655,
"step": 411
},
{
"epoch": 0.41030748163824227,
"grad_norm": 6.4542107582092285,
"learning_rate": 0.00013114822679536836,
"loss": 13.6992,
"step": 412
},
{
"epoch": 0.4113033735839661,
"grad_norm": 6.2309346199035645,
"learning_rate": 0.0001306756251152289,
"loss": 12.3255,
"step": 413
},
{
"epoch": 0.41229926552969004,
"grad_norm": 6.741308689117432,
"learning_rate": 0.00013020294212269615,
"loss": 13.0598,
"step": 414
},
{
"epoch": 0.4132951574754139,
"grad_norm": 6.19504976272583,
"learning_rate": 0.0001297301845897088,
"loss": 14.3294,
"step": 415
},
{
"epoch": 0.4142910494211378,
"grad_norm": 6.095429420471191,
"learning_rate": 0.0001292573592892735,
"loss": 12.5742,
"step": 416
},
{
"epoch": 0.4152869413668617,
"grad_norm": 5.67078161239624,
"learning_rate": 0.00012878447299536768,
"loss": 12.8696,
"step": 417
},
{
"epoch": 0.4162828333125856,
"grad_norm": 5.176650524139404,
"learning_rate": 0.00012831153248284272,
"loss": 12.2691,
"step": 418
},
{
"epoch": 0.41727872525830945,
"grad_norm": 6.270726680755615,
"learning_rate": 0.00012783854452732668,
"loss": 11.5467,
"step": 419
},
{
"epoch": 0.41827461720403336,
"grad_norm": 6.120121002197266,
"learning_rate": 0.00012736551590512737,
"loss": 12.1084,
"step": 420
},
{
"epoch": 0.4192705091497573,
"grad_norm": 5.2254862785339355,
"learning_rate": 0.00012689245339313521,
"loss": 12.1979,
"step": 421
},
{
"epoch": 0.42026640109548113,
"grad_norm": 5.770168781280518,
"learning_rate": 0.00012641936376872606,
"loss": 12.6844,
"step": 422
},
{
"epoch": 0.42126229304120505,
"grad_norm": 6.377243995666504,
"learning_rate": 0.00012594625380966436,
"loss": 12.6859,
"step": 423
},
{
"epoch": 0.4222581849869289,
"grad_norm": 5.389707565307617,
"learning_rate": 0.00012547313029400567,
"loss": 11.773,
"step": 424
},
{
"epoch": 0.4232540769326528,
"grad_norm": 5.801501750946045,
"learning_rate": 0.000125,
"loss": 12.0606,
"step": 425
},
{
"epoch": 0.4242499688783767,
"grad_norm": 6.649625301361084,
"learning_rate": 0.0001245268697059943,
"loss": 12.1181,
"step": 426
},
{
"epoch": 0.4252458608241006,
"grad_norm": 6.166293144226074,
"learning_rate": 0.0001240537461903357,
"loss": 12.4382,
"step": 427
},
{
"epoch": 0.42624175276982446,
"grad_norm": 5.566735744476318,
"learning_rate": 0.00012358063623127394,
"loss": 11.7602,
"step": 428
},
{
"epoch": 0.42723764471554837,
"grad_norm": 6.631081581115723,
"learning_rate": 0.00012310754660686482,
"loss": 12.4029,
"step": 429
},
{
"epoch": 0.42823353666127223,
"grad_norm": 5.083254814147949,
"learning_rate": 0.00012263448409487266,
"loss": 11.5772,
"step": 430
},
{
"epoch": 0.42922942860699614,
"grad_norm": 5.555343151092529,
"learning_rate": 0.00012216145547267333,
"loss": 10.4996,
"step": 431
},
{
"epoch": 0.43022532055272,
"grad_norm": 4.766176223754883,
"learning_rate": 0.00012168846751715729,
"loss": 11.4977,
"step": 432
},
{
"epoch": 0.4312212124984439,
"grad_norm": 5.567246437072754,
"learning_rate": 0.00012121552700463235,
"loss": 11.6256,
"step": 433
},
{
"epoch": 0.43221710444416783,
"grad_norm": 6.13475227355957,
"learning_rate": 0.00012074264071072653,
"loss": 12.4283,
"step": 434
},
{
"epoch": 0.4332129963898917,
"grad_norm": 5.974978923797607,
"learning_rate": 0.00012026981541029122,
"loss": 12.2817,
"step": 435
},
{
"epoch": 0.4342088883356156,
"grad_norm": 5.512807846069336,
"learning_rate": 0.00011979705787730388,
"loss": 11.7294,
"step": 436
},
{
"epoch": 0.43520478028133947,
"grad_norm": 5.102151393890381,
"learning_rate": 0.00011932437488477113,
"loss": 9.8982,
"step": 437
},
{
"epoch": 0.4362006722270634,
"grad_norm": 6.0491623878479,
"learning_rate": 0.00011885177320463165,
"loss": 11.6918,
"step": 438
},
{
"epoch": 0.43719656417278724,
"grad_norm": 6.371288299560547,
"learning_rate": 0.00011837925960765928,
"loss": 12.9814,
"step": 439
},
{
"epoch": 0.43819245611851115,
"grad_norm": 5.732490062713623,
"learning_rate": 0.00011790684086336576,
"loss": 12.1104,
"step": 440
},
{
"epoch": 0.439188348064235,
"grad_norm": 6.025941848754883,
"learning_rate": 0.00011743452373990408,
"loss": 12.2868,
"step": 441
},
{
"epoch": 0.4401842400099589,
"grad_norm": 7.7239603996276855,
"learning_rate": 0.00011696231500397135,
"loss": 12.834,
"step": 442
},
{
"epoch": 0.4411801319556828,
"grad_norm": 6.518691062927246,
"learning_rate": 0.00011649022142071167,
"loss": 12.9192,
"step": 443
},
{
"epoch": 0.4421760239014067,
"grad_norm": 5.347179889678955,
"learning_rate": 0.00011601824975361959,
"loss": 10.4923,
"step": 444
},
{
"epoch": 0.44317191584713056,
"grad_norm": 6.918272495269775,
"learning_rate": 0.00011554640676444295,
"loss": 11.6119,
"step": 445
},
{
"epoch": 0.4441678077928545,
"grad_norm": 6.190727710723877,
"learning_rate": 0.00011507469921308598,
"loss": 13.9115,
"step": 446
},
{
"epoch": 0.4451636997385784,
"grad_norm": 6.001166343688965,
"learning_rate": 0.0001146031338575127,
"loss": 11.5416,
"step": 447
},
{
"epoch": 0.44615959168430225,
"grad_norm": 9.23908519744873,
"learning_rate": 0.00011413171745364992,
"loss": 13.9503,
"step": 448
},
{
"epoch": 0.44715548363002616,
"grad_norm": 6.2685723304748535,
"learning_rate": 0.00011366045675529033,
"loss": 10.9264,
"step": 449
},
{
"epoch": 0.44815137557575,
"grad_norm": 5.3125901222229,
"learning_rate": 0.0001131893585139961,
"loss": 11.8189,
"step": 450
},
{
"epoch": 0.44914726752147394,
"grad_norm": 5.640747547149658,
"learning_rate": 0.00011271842947900179,
"loss": 10.8639,
"step": 451
},
{
"epoch": 0.4501431594671978,
"grad_norm": 6.120314598083496,
"learning_rate": 0.00011224767639711789,
"loss": 13.9796,
"step": 452
},
{
"epoch": 0.4511390514129217,
"grad_norm": 5.786348342895508,
"learning_rate": 0.000111777106012634,
"loss": 12.8427,
"step": 453
},
{
"epoch": 0.45213494335864557,
"grad_norm": 5.406647682189941,
"learning_rate": 0.00011130672506722242,
"loss": 11.3131,
"step": 454
},
{
"epoch": 0.4531308353043695,
"grad_norm": 6.1720099449157715,
"learning_rate": 0.00011083654029984128,
"loss": 12.1748,
"step": 455
},
{
"epoch": 0.45412672725009334,
"grad_norm": 6.570352554321289,
"learning_rate": 0.00011036655844663824,
"loss": 12.7982,
"step": 456
},
{
"epoch": 0.45512261919581726,
"grad_norm": 5.541633129119873,
"learning_rate": 0.00010989678624085394,
"loss": 11.4866,
"step": 457
},
{
"epoch": 0.4561185111415411,
"grad_norm": 6.466024875640869,
"learning_rate": 0.00010942723041272531,
"loss": 12.9477,
"step": 458
},
{
"epoch": 0.45711440308726503,
"grad_norm": 4.921136379241943,
"learning_rate": 0.00010895789768938948,
"loss": 10.5616,
"step": 459
},
{
"epoch": 0.45811029503298895,
"grad_norm": 7.542169094085693,
"learning_rate": 0.00010848879479478724,
"loss": 13.1838,
"step": 460
},
{
"epoch": 0.4591061869787128,
"grad_norm": 6.347965240478516,
"learning_rate": 0.00010801992844956659,
"loss": 12.3061,
"step": 461
},
{
"epoch": 0.4601020789244367,
"grad_norm": 5.8513689041137695,
"learning_rate": 0.00010755130537098669,
"loss": 11.7932,
"step": 462
},
{
"epoch": 0.4610979708701606,
"grad_norm": 6.4326677322387695,
"learning_rate": 0.00010708293227282158,
"loss": 11.9086,
"step": 463
},
{
"epoch": 0.4620938628158845,
"grad_norm": 5.559072494506836,
"learning_rate": 0.00010661481586526371,
"loss": 11.826,
"step": 464
},
{
"epoch": 0.46308975476160835,
"grad_norm": 5.315805435180664,
"learning_rate": 0.00010614696285482828,
"loss": 12.2405,
"step": 465
},
{
"epoch": 0.46408564670733227,
"grad_norm": 5.958090305328369,
"learning_rate": 0.00010567937994425675,
"loss": 10.8293,
"step": 466
},
{
"epoch": 0.4650815386530561,
"grad_norm": 5.258885860443115,
"learning_rate": 0.00010521207383242094,
"loss": 10.2966,
"step": 467
},
{
"epoch": 0.46607743059878004,
"grad_norm": 5.901886940002441,
"learning_rate": 0.00010474505121422722,
"loss": 11.8841,
"step": 468
},
{
"epoch": 0.4670733225445039,
"grad_norm": 5.9542341232299805,
"learning_rate": 0.00010427831878052043,
"loss": 13.1809,
"step": 469
},
{
"epoch": 0.4680692144902278,
"grad_norm": 5.833163738250732,
"learning_rate": 0.00010381188321798792,
"loss": 11.8099,
"step": 470
},
{
"epoch": 0.4690651064359517,
"grad_norm": 5.479732513427734,
"learning_rate": 0.00010334575120906404,
"loss": 11.7125,
"step": 471
},
{
"epoch": 0.4700609983816756,
"grad_norm": 5.714720726013184,
"learning_rate": 0.00010287992943183422,
"loss": 11.6581,
"step": 472
},
{
"epoch": 0.4710568903273995,
"grad_norm": 5.256033420562744,
"learning_rate": 0.00010241442455993925,
"loss": 10.8862,
"step": 473
},
{
"epoch": 0.47205278227312336,
"grad_norm": 5.788590908050537,
"learning_rate": 0.00010194924326247976,
"loss": 12.4129,
"step": 474
},
{
"epoch": 0.4730486742188473,
"grad_norm": 6.205699443817139,
"learning_rate": 0.00010148439220392081,
"loss": 12.2752,
"step": 475
},
{
"epoch": 0.47404456616457114,
"grad_norm": 5.725231170654297,
"learning_rate": 0.00010101987804399601,
"loss": 12.1739,
"step": 476
},
{
"epoch": 0.47504045811029505,
"grad_norm": 5.984923362731934,
"learning_rate": 0.00010055570743761256,
"loss": 10.9013,
"step": 477
},
{
"epoch": 0.4760363500560189,
"grad_norm": 6.258663654327393,
"learning_rate": 0.00010009188703475571,
"loss": 11.7148,
"step": 478
},
{
"epoch": 0.4770322420017428,
"grad_norm": 5.729332447052002,
"learning_rate": 9.962842348039328e-05,
"loss": 11.2752,
"step": 479
},
{
"epoch": 0.4780281339474667,
"grad_norm": 5.4574995040893555,
"learning_rate": 9.916532341438088e-05,
"loss": 12.89,
"step": 480
},
{
"epoch": 0.4790240258931906,
"grad_norm": 6.573443412780762,
"learning_rate": 9.87025934713665e-05,
"loss": 12.5011,
"step": 481
},
{
"epoch": 0.48001991783891446,
"grad_norm": 6.030612468719482,
"learning_rate": 9.824024028069541e-05,
"loss": 12.3289,
"step": 482
},
{
"epoch": 0.48101580978463837,
"grad_norm": 6.0058698654174805,
"learning_rate": 9.777827046631553e-05,
"loss": 13.7869,
"step": 483
},
{
"epoch": 0.48201170173036223,
"grad_norm": 5.5297040939331055,
"learning_rate": 9.731669064668217e-05,
"loss": 12.0592,
"step": 484
},
{
"epoch": 0.48300759367608614,
"grad_norm": 5.423988342285156,
"learning_rate": 9.685550743466325e-05,
"loss": 10.2873,
"step": 485
},
{
"epoch": 0.48400348562181006,
"grad_norm": 5.130181789398193,
"learning_rate": 9.639472743744486e-05,
"loss": 9.5329,
"step": 486
},
{
"epoch": 0.4849993775675339,
"grad_norm": 6.554098606109619,
"learning_rate": 9.593435725643623e-05,
"loss": 13.0928,
"step": 487
},
{
"epoch": 0.48599526951325783,
"grad_norm": 5.409817218780518,
"learning_rate": 9.547440348717542e-05,
"loss": 11.0439,
"step": 488
},
{
"epoch": 0.4869911614589817,
"grad_norm": 5.266180038452148,
"learning_rate": 9.501487271923463e-05,
"loss": 10.6856,
"step": 489
},
{
"epoch": 0.4879870534047056,
"grad_norm": 6.0554633140563965,
"learning_rate": 9.455577153612602e-05,
"loss": 11.6784,
"step": 490
},
{
"epoch": 0.48898294535042947,
"grad_norm": 5.316808223724365,
"learning_rate": 9.40971065152071e-05,
"loss": 9.9632,
"step": 491
},
{
"epoch": 0.4899788372961534,
"grad_norm": 5.960623264312744,
"learning_rate": 9.363888422758678e-05,
"loss": 11.169,
"step": 492
},
{
"epoch": 0.49097472924187724,
"grad_norm": 6.7030253410339355,
"learning_rate": 9.318111123803113e-05,
"loss": 12.5777,
"step": 493
},
{
"epoch": 0.49197062118760115,
"grad_norm": 5.716718673706055,
"learning_rate": 9.272379410486917e-05,
"loss": 12.1007,
"step": 494
},
{
"epoch": 0.492966513133325,
"grad_norm": 5.541021823883057,
"learning_rate": 9.226693937989916e-05,
"loss": 11.0471,
"step": 495
},
{
"epoch": 0.4939624050790489,
"grad_norm": 6.103461742401123,
"learning_rate": 9.181055360829463e-05,
"loss": 11.6358,
"step": 496
},
{
"epoch": 0.4949582970247728,
"grad_norm": 5.171212196350098,
"learning_rate": 9.135464332851049e-05,
"loss": 11.737,
"step": 497
},
{
"epoch": 0.4959541889704967,
"grad_norm": 6.258893013000488,
"learning_rate": 9.089921507218962e-05,
"loss": 12.2711,
"step": 498
},
{
"epoch": 0.4969500809162206,
"grad_norm": 5.942646026611328,
"learning_rate": 9.044427536406909e-05,
"loss": 12.4533,
"step": 499
},
{
"epoch": 0.4979459728619445,
"grad_norm": 5.429832458496094,
"learning_rate": 8.998983072188663e-05,
"loss": 11.4921,
"step": 500
},
{
"epoch": 0.4979459728619445,
"eval_loss": 1.483846664428711,
"eval_runtime": 289.886,
"eval_samples_per_second": 4.619,
"eval_steps_per_second": 1.156,
"step": 500
},
{
"epoch": 0.4989418648076684,
"grad_norm": 6.279896259307861,
"learning_rate": 8.953588765628753e-05,
"loss": 12.5236,
"step": 501
},
{
"epoch": 0.49993775675339225,
"grad_norm": 7.839783668518066,
"learning_rate": 8.90824526707311e-05,
"loss": 11.226,
"step": 502
},
{
"epoch": 0.5009336486991162,
"grad_norm": 5.284313678741455,
"learning_rate": 8.86295322613976e-05,
"loss": 10.7881,
"step": 503
},
{
"epoch": 0.5019295406448401,
"grad_norm": 5.617724418640137,
"learning_rate": 8.817713291709513e-05,
"loss": 12.3775,
"step": 504
},
{
"epoch": 0.5029254325905639,
"grad_norm": 5.214620113372803,
"learning_rate": 8.772526111916685e-05,
"loss": 11.2123,
"step": 505
},
{
"epoch": 0.5039213245362878,
"grad_norm": 6.912374973297119,
"learning_rate": 8.727392334139771e-05,
"loss": 13.137,
"step": 506
},
{
"epoch": 0.5049172164820117,
"grad_norm": 6.51848840713501,
"learning_rate": 8.682312604992227e-05,
"loss": 11.9434,
"step": 507
},
{
"epoch": 0.5059131084277356,
"grad_norm": 5.211724758148193,
"learning_rate": 8.637287570313158e-05,
"loss": 10.2712,
"step": 508
},
{
"epoch": 0.5069090003734594,
"grad_norm": 5.400852203369141,
"learning_rate": 8.592317875158085e-05,
"loss": 10.2705,
"step": 509
},
{
"epoch": 0.5079048923191833,
"grad_norm": 5.548620223999023,
"learning_rate": 8.5474041637897e-05,
"loss": 10.4555,
"step": 510
},
{
"epoch": 0.5089007842649073,
"grad_norm": 6.38204288482666,
"learning_rate": 8.502547079668653e-05,
"loss": 11.8434,
"step": 511
},
{
"epoch": 0.5098966762106312,
"grad_norm": 5.735969543457031,
"learning_rate": 8.457747265444296e-05,
"loss": 10.9467,
"step": 512
},
{
"epoch": 0.5108925681563551,
"grad_norm": 5.540611743927002,
"learning_rate": 8.413005362945512e-05,
"loss": 12.3345,
"step": 513
},
{
"epoch": 0.5118884601020789,
"grad_norm": 6.0366692543029785,
"learning_rate": 8.368322013171513e-05,
"loss": 11.6241,
"step": 514
},
{
"epoch": 0.5128843520478028,
"grad_norm": 5.270688056945801,
"learning_rate": 8.323697856282627e-05,
"loss": 10.389,
"step": 515
},
{
"epoch": 0.5138802439935267,
"grad_norm": 6.122136116027832,
"learning_rate": 8.27913353159117e-05,
"loss": 10.98,
"step": 516
},
{
"epoch": 0.5148761359392506,
"grad_norm": 6.037084579467773,
"learning_rate": 8.234629677552263e-05,
"loss": 12.568,
"step": 517
},
{
"epoch": 0.5158720278849744,
"grad_norm": 7.1887125968933105,
"learning_rate": 8.190186931754673e-05,
"loss": 12.7303,
"step": 518
},
{
"epoch": 0.5168679198306984,
"grad_norm": 5.168606758117676,
"learning_rate": 8.14580593091171e-05,
"loss": 11.5019,
"step": 519
},
{
"epoch": 0.5178638117764223,
"grad_norm": 5.976624965667725,
"learning_rate": 8.101487310852087e-05,
"loss": 12.8172,
"step": 520
},
{
"epoch": 0.5188597037221462,
"grad_norm": 5.563510417938232,
"learning_rate": 8.057231706510807e-05,
"loss": 11.5814,
"step": 521
},
{
"epoch": 0.51985559566787,
"grad_norm": 5.869482040405273,
"learning_rate": 8.013039751920074e-05,
"loss": 12.1924,
"step": 522
},
{
"epoch": 0.5208514876135939,
"grad_norm": 6.094331741333008,
"learning_rate": 7.968912080200211e-05,
"loss": 11.5143,
"step": 523
},
{
"epoch": 0.5218473795593178,
"grad_norm": 5.655307769775391,
"learning_rate": 7.924849323550581e-05,
"loss": 11.5072,
"step": 524
},
{
"epoch": 0.5228432715050417,
"grad_norm": 5.7834367752075195,
"learning_rate": 7.880852113240539e-05,
"loss": 11.97,
"step": 525
},
{
"epoch": 0.5238391634507656,
"grad_norm": 7.464441776275635,
"learning_rate": 7.836921079600384e-05,
"loss": 12.2958,
"step": 526
},
{
"epoch": 0.5248350553964894,
"grad_norm": 5.94898796081543,
"learning_rate": 7.793056852012324e-05,
"loss": 12.3025,
"step": 527
},
{
"epoch": 0.5258309473422134,
"grad_norm": 5.301365852355957,
"learning_rate": 7.749260058901467e-05,
"loss": 10.7913,
"step": 528
},
{
"epoch": 0.5268268392879373,
"grad_norm": 5.742236137390137,
"learning_rate": 7.705531327726824e-05,
"loss": 11.6252,
"step": 529
},
{
"epoch": 0.5278227312336612,
"grad_norm": 5.871049404144287,
"learning_rate": 7.661871284972286e-05,
"loss": 12.7432,
"step": 530
},
{
"epoch": 0.528818623179385,
"grad_norm": 5.6256866455078125,
"learning_rate": 7.618280556137697e-05,
"loss": 11.5549,
"step": 531
},
{
"epoch": 0.5298145151251089,
"grad_norm": 6.3018903732299805,
"learning_rate": 7.574759765729853e-05,
"loss": 11.8701,
"step": 532
},
{
"epoch": 0.5308104070708328,
"grad_norm": 5.372267246246338,
"learning_rate": 7.531309537253574e-05,
"loss": 10.7988,
"step": 533
},
{
"epoch": 0.5318062990165567,
"grad_norm": 5.44589376449585,
"learning_rate": 7.487930493202783e-05,
"loss": 12.7583,
"step": 534
},
{
"epoch": 0.5328021909622805,
"grad_norm": 6.493985176086426,
"learning_rate": 7.444623255051538e-05,
"loss": 12.9094,
"step": 535
},
{
"epoch": 0.5337980829080045,
"grad_norm": 5.483232021331787,
"learning_rate": 7.4013884432452e-05,
"loss": 12.4428,
"step": 536
},
{
"epoch": 0.5347939748537284,
"grad_norm": 6.090009689331055,
"learning_rate": 7.358226677191488e-05,
"loss": 13.0753,
"step": 537
},
{
"epoch": 0.5357898667994523,
"grad_norm": 5.5943522453308105,
"learning_rate": 7.315138575251617e-05,
"loss": 10.7217,
"step": 538
},
{
"epoch": 0.5367857587451762,
"grad_norm": 5.108747482299805,
"learning_rate": 7.272124754731468e-05,
"loss": 10.47,
"step": 539
},
{
"epoch": 0.5377816506909,
"grad_norm": 7.087966442108154,
"learning_rate": 7.229185831872701e-05,
"loss": 11.7998,
"step": 540
},
{
"epoch": 0.5387775426366239,
"grad_norm": 5.637360095977783,
"learning_rate": 7.18632242184396e-05,
"loss": 11.9396,
"step": 541
},
{
"epoch": 0.5397734345823478,
"grad_norm": 5.862568378448486,
"learning_rate": 7.143535138732044e-05,
"loss": 11.211,
"step": 542
},
{
"epoch": 0.5407693265280717,
"grad_norm": 5.098068714141846,
"learning_rate": 7.100824595533109e-05,
"loss": 11.7145,
"step": 543
},
{
"epoch": 0.5417652184737956,
"grad_norm": 5.469205856323242,
"learning_rate": 7.05819140414389e-05,
"loss": 11.3163,
"step": 544
},
{
"epoch": 0.5427611104195195,
"grad_norm": 4.86387825012207,
"learning_rate": 7.015636175352935e-05,
"loss": 12.6151,
"step": 545
},
{
"epoch": 0.5437570023652434,
"grad_norm": 5.209597110748291,
"learning_rate": 6.973159518831865e-05,
"loss": 10.7285,
"step": 546
},
{
"epoch": 0.5447528943109673,
"grad_norm": 6.904428958892822,
"learning_rate": 6.930762043126598e-05,
"loss": 12.6723,
"step": 547
},
{
"epoch": 0.5457487862566911,
"grad_norm": 5.574467658996582,
"learning_rate": 6.888444355648694e-05,
"loss": 11.2369,
"step": 548
},
{
"epoch": 0.546744678202415,
"grad_norm": 5.7822184562683105,
"learning_rate": 6.846207062666598e-05,
"loss": 11.7722,
"step": 549
},
{
"epoch": 0.5477405701481389,
"grad_norm": 5.367188930511475,
"learning_rate": 6.804050769296982e-05,
"loss": 11.7805,
"step": 550
},
{
"epoch": 0.5487364620938628,
"grad_norm": 4.832945823669434,
"learning_rate": 6.761976079496069e-05,
"loss": 10.9897,
"step": 551
},
{
"epoch": 0.5497323540395868,
"grad_norm": 6.3430962562561035,
"learning_rate": 6.719983596050979e-05,
"loss": 13.1188,
"step": 552
},
{
"epoch": 0.5507282459853106,
"grad_norm": 5.310800075531006,
"learning_rate": 6.678073920571101e-05,
"loss": 11.8806,
"step": 553
},
{
"epoch": 0.5517241379310345,
"grad_norm": 5.725086688995361,
"learning_rate": 6.636247653479451e-05,
"loss": 12.9238,
"step": 554
},
{
"epoch": 0.5527200298767584,
"grad_norm": 6.0460920333862305,
"learning_rate": 6.594505394004116e-05,
"loss": 12.2236,
"step": 555
},
{
"epoch": 0.5537159218224823,
"grad_norm": 5.62431526184082,
"learning_rate": 6.552847740169603e-05,
"loss": 12.1332,
"step": 556
},
{
"epoch": 0.5547118137682061,
"grad_norm": 5.583924770355225,
"learning_rate": 6.511275288788341e-05,
"loss": 11.2317,
"step": 557
},
{
"epoch": 0.55570770571393,
"grad_norm": 5.76798677444458,
"learning_rate": 6.469788635452085e-05,
"loss": 11.5159,
"step": 558
},
{
"epoch": 0.5567035976596539,
"grad_norm": 4.825177192687988,
"learning_rate": 6.428388374523377e-05,
"loss": 10.9255,
"step": 559
},
{
"epoch": 0.5576994896053779,
"grad_norm": 5.40618896484375,
"learning_rate": 6.387075099127085e-05,
"loss": 12.1328,
"step": 560
},
{
"epoch": 0.5586953815511017,
"grad_norm": 5.495939254760742,
"learning_rate": 6.345849401141848e-05,
"loss": 12.011,
"step": 561
},
{
"epoch": 0.5596912734968256,
"grad_norm": 4.913009166717529,
"learning_rate": 6.304711871191624e-05,
"loss": 11.018,
"step": 562
},
{
"epoch": 0.5606871654425495,
"grad_norm": 6.538085460662842,
"learning_rate": 6.263663098637223e-05,
"loss": 12.6505,
"step": 563
},
{
"epoch": 0.5616830573882734,
"grad_norm": 5.621396064758301,
"learning_rate": 6.222703671567862e-05,
"loss": 11.4574,
"step": 564
},
{
"epoch": 0.5626789493339973,
"grad_norm": 5.406931400299072,
"learning_rate": 6.181834176792743e-05,
"loss": 11.9075,
"step": 565
},
{
"epoch": 0.5636748412797211,
"grad_norm": 5.263576030731201,
"learning_rate": 6.14105519983264e-05,
"loss": 12.0203,
"step": 566
},
{
"epoch": 0.564670733225445,
"grad_norm": 5.000280857086182,
"learning_rate": 6.100367324911534e-05,
"loss": 11.9682,
"step": 567
},
{
"epoch": 0.565666625171169,
"grad_norm": 4.999345779418945,
"learning_rate": 6.059771134948186e-05,
"loss": 11.1666,
"step": 568
},
{
"epoch": 0.5666625171168929,
"grad_norm": 4.500539779663086,
"learning_rate": 6.019267211547863e-05,
"loss": 10.7409,
"step": 569
},
{
"epoch": 0.5676584090626167,
"grad_norm": 4.902435779571533,
"learning_rate": 5.978856134993944e-05,
"loss": 10.0251,
"step": 570
},
{
"epoch": 0.5686543010083406,
"grad_norm": 4.8086066246032715,
"learning_rate": 5.938538484239635e-05,
"loss": 10.9553,
"step": 571
},
{
"epoch": 0.5696501929540645,
"grad_norm": 4.757142543792725,
"learning_rate": 5.898314836899664e-05,
"loss": 12.2329,
"step": 572
},
{
"epoch": 0.5706460848997884,
"grad_norm": 5.331392288208008,
"learning_rate": 5.8581857692420166e-05,
"loss": 11.5873,
"step": 573
},
{
"epoch": 0.5716419768455122,
"grad_norm": 6.0972723960876465,
"learning_rate": 5.8181518561796684e-05,
"loss": 13.181,
"step": 574
},
{
"epoch": 0.5726378687912361,
"grad_norm": 5.213257312774658,
"learning_rate": 5.7782136712623484e-05,
"loss": 12.3342,
"step": 575
},
{
"epoch": 0.57363376073696,
"grad_norm": 5.662869453430176,
"learning_rate": 5.7383717866683506e-05,
"loss": 12.8475,
"step": 576
},
{
"epoch": 0.574629652682684,
"grad_norm": 5.137444972991943,
"learning_rate": 5.6986267731962766e-05,
"loss": 12.0747,
"step": 577
},
{
"epoch": 0.5756255446284079,
"grad_norm": 5.8213677406311035,
"learning_rate": 5.6589792002569264e-05,
"loss": 11.4558,
"step": 578
},
{
"epoch": 0.5766214365741317,
"grad_norm": 5.571473598480225,
"learning_rate": 5.6194296358650935e-05,
"loss": 11.3248,
"step": 579
},
{
"epoch": 0.5776173285198556,
"grad_norm": 6.315218925476074,
"learning_rate": 5.5799786466314435e-05,
"loss": 14.3275,
"step": 580
},
{
"epoch": 0.5786132204655795,
"grad_norm": 5.514521598815918,
"learning_rate": 5.5406267977544e-05,
"loss": 11.3491,
"step": 581
},
{
"epoch": 0.5796091124113034,
"grad_norm": 5.521172523498535,
"learning_rate": 5.501374653012038e-05,
"loss": 12.0774,
"step": 582
},
{
"epoch": 0.5806050043570272,
"grad_norm": 4.984793663024902,
"learning_rate": 5.462222774754014e-05,
"loss": 11.1853,
"step": 583
},
{
"epoch": 0.5816008963027511,
"grad_norm": 5.918780326843262,
"learning_rate": 5.423171723893501e-05,
"loss": 11.6143,
"step": 584
},
{
"epoch": 0.582596788248475,
"grad_norm": 5.09220027923584,
"learning_rate": 5.384222059899178e-05,
"loss": 11.6102,
"step": 585
},
{
"epoch": 0.583592680194199,
"grad_norm": 5.182462692260742,
"learning_rate": 5.345374340787168e-05,
"loss": 11.4592,
"step": 586
},
{
"epoch": 0.5845885721399228,
"grad_norm": 5.29030179977417,
"learning_rate": 5.306629123113084e-05,
"loss": 10.9399,
"step": 587
},
{
"epoch": 0.5855844640856467,
"grad_norm": 5.2069807052612305,
"learning_rate": 5.2679869619640555e-05,
"loss": 11.6267,
"step": 588
},
{
"epoch": 0.5865803560313706,
"grad_norm": 5.4248809814453125,
"learning_rate": 5.229448410950732e-05,
"loss": 10.9395,
"step": 589
},
{
"epoch": 0.5875762479770945,
"grad_norm": 5.220003128051758,
"learning_rate": 5.1910140221994114e-05,
"loss": 12.5726,
"step": 590
},
{
"epoch": 0.5885721399228184,
"grad_norm": 5.750555038452148,
"learning_rate": 5.152684346344087e-05,
"loss": 11.9805,
"step": 591
},
{
"epoch": 0.5895680318685422,
"grad_norm": 5.346360683441162,
"learning_rate": 5.114459932518573e-05,
"loss": 11.6031,
"step": 592
},
{
"epoch": 0.5905639238142661,
"grad_norm": 5.487764358520508,
"learning_rate": 5.0763413283486394e-05,
"loss": 11.916,
"step": 593
},
{
"epoch": 0.5915598157599901,
"grad_norm": 6.088346004486084,
"learning_rate": 5.038329079944165e-05,
"loss": 11.9875,
"step": 594
},
{
"epoch": 0.592555707705714,
"grad_norm": 5.190619468688965,
"learning_rate": 5.000423731891307e-05,
"loss": 12.271,
"step": 595
},
{
"epoch": 0.5935515996514378,
"grad_norm": 5.797019004821777,
"learning_rate": 4.962625827244707e-05,
"loss": 12.3821,
"step": 596
},
{
"epoch": 0.5945474915971617,
"grad_norm": 5.294320106506348,
"learning_rate": 4.9249359075197204e-05,
"loss": 11.1323,
"step": 597
},
{
"epoch": 0.5955433835428856,
"grad_norm": 5.5578413009643555,
"learning_rate": 4.8873545126846195e-05,
"loss": 12.2285,
"step": 598
},
{
"epoch": 0.5965392754886095,
"grad_norm": 6.470888137817383,
"learning_rate": 4.849882181152911e-05,
"loss": 13.0034,
"step": 599
},
{
"epoch": 0.5975351674343333,
"grad_norm": 5.115138053894043,
"learning_rate": 4.81251944977558e-05,
"loss": 11.1868,
"step": 600
},
{
"epoch": 0.5975351674343333,
"eval_loss": 1.469951868057251,
"eval_runtime": 289.9264,
"eval_samples_per_second": 4.618,
"eval_steps_per_second": 1.155,
"step": 600
},
{
"epoch": 0.5985310593800572,
"grad_norm": 6.162230014801025,
"learning_rate": 4.775266853833421e-05,
"loss": 12.9121,
"step": 601
},
{
"epoch": 0.5995269513257812,
"grad_norm": 5.4764275550842285,
"learning_rate": 4.738124927029358e-05,
"loss": 10.9161,
"step": 602
},
{
"epoch": 0.6005228432715051,
"grad_norm": 5.016139507293701,
"learning_rate": 4.7010942014808056e-05,
"loss": 11.407,
"step": 603
},
{
"epoch": 0.601518735217229,
"grad_norm": 5.546393394470215,
"learning_rate": 4.664175207712043e-05,
"loss": 12.3953,
"step": 604
},
{
"epoch": 0.6025146271629528,
"grad_norm": 5.607834815979004,
"learning_rate": 4.627368474646608e-05,
"loss": 13.2932,
"step": 605
},
{
"epoch": 0.6035105191086767,
"grad_norm": 5.625347137451172,
"learning_rate": 4.590674529599742e-05,
"loss": 10.9166,
"step": 606
},
{
"epoch": 0.6045064110544006,
"grad_norm": 5.0566301345825195,
"learning_rate": 4.554093898270788e-05,
"loss": 12.1924,
"step": 607
},
{
"epoch": 0.6055023030001245,
"grad_norm": 4.960422515869141,
"learning_rate": 4.5176271047357196e-05,
"loss": 12.0646,
"step": 608
},
{
"epoch": 0.6064981949458483,
"grad_norm": 5.206432819366455,
"learning_rate": 4.4812746714395866e-05,
"loss": 11.2001,
"step": 609
},
{
"epoch": 0.6074940868915722,
"grad_norm": 6.366247177124023,
"learning_rate": 4.4450371191890365e-05,
"loss": 12.0904,
"step": 610
},
{
"epoch": 0.6084899788372962,
"grad_norm": 6.073464393615723,
"learning_rate": 4.408914967144888e-05,
"loss": 11.4838,
"step": 611
},
{
"epoch": 0.6094858707830201,
"grad_norm": 5.903442859649658,
"learning_rate": 4.372908732814647e-05,
"loss": 11.7165,
"step": 612
},
{
"epoch": 0.6104817627287439,
"grad_norm": 6.564481735229492,
"learning_rate": 4.3370189320451195e-05,
"loss": 12.9723,
"step": 613
},
{
"epoch": 0.6114776546744678,
"grad_norm": 6.052420139312744,
"learning_rate": 4.301246079015016e-05,
"loss": 12.628,
"step": 614
},
{
"epoch": 0.6124735466201917,
"grad_norm": 5.082505226135254,
"learning_rate": 4.26559068622758e-05,
"loss": 11.0174,
"step": 615
},
{
"epoch": 0.6134694385659156,
"grad_norm": 6.022928714752197,
"learning_rate": 4.230053264503256e-05,
"loss": 13.0683,
"step": 616
},
{
"epoch": 0.6144653305116395,
"grad_norm": 5.392704963684082,
"learning_rate": 4.1946343229723514e-05,
"loss": 12.6063,
"step": 617
},
{
"epoch": 0.6154612224573633,
"grad_norm": 6.246056079864502,
"learning_rate": 4.159334369067781e-05,
"loss": 13.3403,
"step": 618
},
{
"epoch": 0.6164571144030873,
"grad_norm": 7.040495872497559,
"learning_rate": 4.124153908517739e-05,
"loss": 11.5163,
"step": 619
},
{
"epoch": 0.6174530063488112,
"grad_norm": 5.689401626586914,
"learning_rate": 4.089093445338514e-05,
"loss": 12.6618,
"step": 620
},
{
"epoch": 0.6184488982945351,
"grad_norm": 5.199860572814941,
"learning_rate": 4.054153481827226e-05,
"loss": 11.5894,
"step": 621
},
{
"epoch": 0.6194447902402589,
"grad_norm": 4.949602127075195,
"learning_rate": 4.019334518554649e-05,
"loss": 11.6394,
"step": 622
},
{
"epoch": 0.6204406821859828,
"grad_norm": 5.066486358642578,
"learning_rate": 3.984637054358034e-05,
"loss": 10.7524,
"step": 623
},
{
"epoch": 0.6214365741317067,
"grad_norm": 5.321683406829834,
"learning_rate": 3.950061586333967e-05,
"loss": 12.0487,
"step": 624
},
{
"epoch": 0.6224324660774306,
"grad_norm": 5.394808769226074,
"learning_rate": 3.9156086098312395e-05,
"loss": 11.734,
"step": 625
},
{
"epoch": 0.6234283580231544,
"grad_norm": 5.522308826446533,
"learning_rate": 3.881278618443754e-05,
"loss": 10.2733,
"step": 626
},
{
"epoch": 0.6244242499688784,
"grad_norm": 4.8424787521362305,
"learning_rate": 3.847072104003474e-05,
"loss": 11.1419,
"step": 627
},
{
"epoch": 0.6254201419146023,
"grad_norm": 5.075899600982666,
"learning_rate": 3.812989556573327e-05,
"loss": 11.4723,
"step": 628
},
{
"epoch": 0.6264160338603262,
"grad_norm": 5.3835835456848145,
"learning_rate": 3.779031464440241e-05,
"loss": 11.468,
"step": 629
},
{
"epoch": 0.6274119258060501,
"grad_norm": 6.307840347290039,
"learning_rate": 3.7451983141081184e-05,
"loss": 10.6013,
"step": 630
},
{
"epoch": 0.6284078177517739,
"grad_norm": 5.412378311157227,
"learning_rate": 3.711490590290853e-05,
"loss": 11.0776,
"step": 631
},
{
"epoch": 0.6294037096974978,
"grad_norm": 5.071283340454102,
"learning_rate": 3.67790877590543e-05,
"loss": 11.9251,
"step": 632
},
{
"epoch": 0.6303996016432217,
"grad_norm": 5.967265605926514,
"learning_rate": 3.6444533520649594e-05,
"loss": 11.9519,
"step": 633
},
{
"epoch": 0.6313954935889456,
"grad_norm": 5.716352939605713,
"learning_rate": 3.611124798071819e-05,
"loss": 11.6726,
"step": 634
},
{
"epoch": 0.6323913855346694,
"grad_norm": 5.153800964355469,
"learning_rate": 3.5779235914107616e-05,
"loss": 11.6163,
"step": 635
},
{
"epoch": 0.6333872774803934,
"grad_norm": 4.7233710289001465,
"learning_rate": 3.544850207742104e-05,
"loss": 10.8191,
"step": 636
},
{
"epoch": 0.6343831694261173,
"grad_norm": 5.4549384117126465,
"learning_rate": 3.511905120894869e-05,
"loss": 12.1209,
"step": 637
},
{
"epoch": 0.6353790613718412,
"grad_norm": 4.882791996002197,
"learning_rate": 3.4790888028600406e-05,
"loss": 11.3777,
"step": 638
},
{
"epoch": 0.636374953317565,
"grad_norm": 6.203968524932861,
"learning_rate": 3.4464017237837874e-05,
"loss": 12.2292,
"step": 639
},
{
"epoch": 0.6373708452632889,
"grad_norm": 5.759111404418945,
"learning_rate": 3.413844351960703e-05,
"loss": 11.9604,
"step": 640
},
{
"epoch": 0.6383667372090128,
"grad_norm": 5.6707763671875,
"learning_rate": 3.3814171538271406e-05,
"loss": 11.5694,
"step": 641
},
{
"epoch": 0.6393626291547367,
"grad_norm": 5.256777286529541,
"learning_rate": 3.349120593954494e-05,
"loss": 11.3768,
"step": 642
},
{
"epoch": 0.6403585211004607,
"grad_norm": 4.985710620880127,
"learning_rate": 3.316955135042557e-05,
"loss": 11.9468,
"step": 643
},
{
"epoch": 0.6413544130461845,
"grad_norm": 6.197348117828369,
"learning_rate": 3.284921237912897e-05,
"loss": 11.4821,
"step": 644
},
{
"epoch": 0.6423503049919084,
"grad_norm": 5.89221715927124,
"learning_rate": 3.253019361502242e-05,
"loss": 11.8649,
"step": 645
},
{
"epoch": 0.6433461969376323,
"grad_norm": 4.958759784698486,
"learning_rate": 3.2212499628559183e-05,
"loss": 11.3049,
"step": 646
},
{
"epoch": 0.6443420888833562,
"grad_norm": 6.130683422088623,
"learning_rate": 3.189613497121291e-05,
"loss": 11.6303,
"step": 647
},
{
"epoch": 0.64533798082908,
"grad_norm": 5.429010391235352,
"learning_rate": 3.158110417541264e-05,
"loss": 11.1867,
"step": 648
},
{
"epoch": 0.6463338727748039,
"grad_norm": 5.844196796417236,
"learning_rate": 3.126741175447746e-05,
"loss": 11.8687,
"step": 649
},
{
"epoch": 0.6473297647205278,
"grad_norm": 5.631326675415039,
"learning_rate": 3.0955062202552344e-05,
"loss": 12.6221,
"step": 650
},
{
"epoch": 0.6483256566662517,
"grad_norm": 4.835573673248291,
"learning_rate": 3.0644059994543364e-05,
"loss": 11.0964,
"step": 651
},
{
"epoch": 0.6493215486119756,
"grad_norm": 5.213573455810547,
"learning_rate": 3.0334409586053822e-05,
"loss": 11.2015,
"step": 652
},
{
"epoch": 0.6503174405576995,
"grad_norm": 5.156842231750488,
"learning_rate": 3.0026115413320316e-05,
"loss": 11.8143,
"step": 653
},
{
"epoch": 0.6513133325034234,
"grad_norm": 5.973851680755615,
"learning_rate": 2.971918189314915e-05,
"loss": 10.897,
"step": 654
},
{
"epoch": 0.6523092244491473,
"grad_norm": 4.918872356414795,
"learning_rate": 2.9413613422853187e-05,
"loss": 11.468,
"step": 655
},
{
"epoch": 0.6533051163948712,
"grad_norm": 5.078589916229248,
"learning_rate": 2.9109414380188673e-05,
"loss": 11.5197,
"step": 656
},
{
"epoch": 0.654301008340595,
"grad_norm": 5.085513114929199,
"learning_rate": 2.88065891232928e-05,
"loss": 11.2995,
"step": 657
},
{
"epoch": 0.6552969002863189,
"grad_norm": 5.631070613861084,
"learning_rate": 2.850514199062085e-05,
"loss": 10.8211,
"step": 658
},
{
"epoch": 0.6562927922320428,
"grad_norm": 6.195810794830322,
"learning_rate": 2.8205077300884367e-05,
"loss": 11.3694,
"step": 659
},
{
"epoch": 0.6572886841777668,
"grad_norm": 5.141655445098877,
"learning_rate": 2.790639935298933e-05,
"loss": 11.9514,
"step": 660
},
{
"epoch": 0.6582845761234906,
"grad_norm": 4.97012186050415,
"learning_rate": 2.760911242597418e-05,
"loss": 10.6117,
"step": 661
},
{
"epoch": 0.6592804680692145,
"grad_norm": 5.144041061401367,
"learning_rate": 2.7313220778948974e-05,
"loss": 10.5493,
"step": 662
},
{
"epoch": 0.6602763600149384,
"grad_norm": 5.334214210510254,
"learning_rate": 2.7018728651034052e-05,
"loss": 11.3385,
"step": 663
},
{
"epoch": 0.6612722519606623,
"grad_norm": 5.036880016326904,
"learning_rate": 2.6725640261299408e-05,
"loss": 12.6018,
"step": 664
},
{
"epoch": 0.6622681439063861,
"grad_norm": 5.173761367797852,
"learning_rate": 2.643395980870428e-05,
"loss": 10.3298,
"step": 665
},
{
"epoch": 0.66326403585211,
"grad_norm": 5.325448513031006,
"learning_rate": 2.6143691472036934e-05,
"loss": 11.6448,
"step": 666
},
{
"epoch": 0.6642599277978339,
"grad_norm": 5.270651340484619,
"learning_rate": 2.5854839409854837e-05,
"loss": 10.9496,
"step": 667
},
{
"epoch": 0.6652558197435579,
"grad_norm": 5.0304999351501465,
"learning_rate": 2.5567407760425002e-05,
"loss": 12.0285,
"step": 668
},
{
"epoch": 0.6662517116892818,
"grad_norm": 5.503153324127197,
"learning_rate": 2.5281400641664925e-05,
"loss": 11.4235,
"step": 669
},
{
"epoch": 0.6672476036350056,
"grad_norm": 6.423144340515137,
"learning_rate": 2.499682215108319e-05,
"loss": 12.1289,
"step": 670
},
{
"epoch": 0.6682434955807295,
"grad_norm": 6.0244879722595215,
"learning_rate": 2.4713676365721194e-05,
"loss": 12.4105,
"step": 671
},
{
"epoch": 0.6692393875264534,
"grad_norm": 5.558983325958252,
"learning_rate": 2.4431967342094465e-05,
"loss": 11.8495,
"step": 672
},
{
"epoch": 0.6702352794721773,
"grad_norm": 4.985186576843262,
"learning_rate": 2.415169911613463e-05,
"loss": 11.2055,
"step": 673
},
{
"epoch": 0.6712311714179011,
"grad_norm": 6.300296306610107,
"learning_rate": 2.3872875703131582e-05,
"loss": 11.8234,
"step": 674
},
{
"epoch": 0.672227063363625,
"grad_norm": 5.401225566864014,
"learning_rate": 2.359550109767597e-05,
"loss": 11.3489,
"step": 675
},
{
"epoch": 0.673222955309349,
"grad_norm": 5.588122367858887,
"learning_rate": 2.331957927360198e-05,
"loss": 11.3187,
"step": 676
},
{
"epoch": 0.6742188472550729,
"grad_norm": 5.249821662902832,
"learning_rate": 2.3045114183930307e-05,
"loss": 12.5959,
"step": 677
},
{
"epoch": 0.6752147392007967,
"grad_norm": 6.340845584869385,
"learning_rate": 2.2772109760811786e-05,
"loss": 11.3688,
"step": 678
},
{
"epoch": 0.6762106311465206,
"grad_norm": 4.663110256195068,
"learning_rate": 2.2500569915470588e-05,
"loss": 11.021,
"step": 679
},
{
"epoch": 0.6772065230922445,
"grad_norm": 6.123586654663086,
"learning_rate": 2.223049853814875e-05,
"loss": 11.7617,
"step": 680
},
{
"epoch": 0.6782024150379684,
"grad_norm": 5.436185836791992,
"learning_rate": 2.1961899498049997e-05,
"loss": 11.8642,
"step": 681
},
{
"epoch": 0.6791983069836923,
"grad_norm": 5.2976765632629395,
"learning_rate": 2.1694776643284453e-05,
"loss": 11.2544,
"step": 682
},
{
"epoch": 0.6801941989294161,
"grad_norm": 5.813897609710693,
"learning_rate": 2.1429133800813654e-05,
"loss": 11.6754,
"step": 683
},
{
"epoch": 0.68119009087514,
"grad_norm": 5.2720184326171875,
"learning_rate": 2.116497477639552e-05,
"loss": 12.3032,
"step": 684
},
{
"epoch": 0.682185982820864,
"grad_norm": 4.996230125427246,
"learning_rate": 2.090230335452989e-05,
"loss": 11.7477,
"step": 685
},
{
"epoch": 0.6831818747665879,
"grad_norm": 6.202296733856201,
"learning_rate": 2.064112329840437e-05,
"loss": 11.789,
"step": 686
},
{
"epoch": 0.6841777667123117,
"grad_norm": 5.982651710510254,
"learning_rate": 2.0381438349840326e-05,
"loss": 11.3074,
"step": 687
},
{
"epoch": 0.6851736586580356,
"grad_norm": 5.608039379119873,
"learning_rate": 2.0123252229239357e-05,
"loss": 11.9267,
"step": 688
},
{
"epoch": 0.6861695506037595,
"grad_norm": 5.310582637786865,
"learning_rate": 1.986656863552992e-05,
"loss": 12.5416,
"step": 689
},
{
"epoch": 0.6871654425494834,
"grad_norm": 5.351050853729248,
"learning_rate": 1.9611391246114468e-05,
"loss": 12.0263,
"step": 690
},
{
"epoch": 0.6881613344952072,
"grad_norm": 5.318876266479492,
"learning_rate": 1.93577237168165e-05,
"loss": 11.4462,
"step": 691
},
{
"epoch": 0.6891572264409311,
"grad_norm": 7.177599906921387,
"learning_rate": 1.910556968182854e-05,
"loss": 12.3857,
"step": 692
},
{
"epoch": 0.690153118386655,
"grad_norm": 4.861364841461182,
"learning_rate": 1.8854932753659778e-05,
"loss": 10.3558,
"step": 693
},
{
"epoch": 0.691149010332379,
"grad_norm": 4.699554443359375,
"learning_rate": 1.8605816523084462e-05,
"loss": 11.7694,
"step": 694
},
{
"epoch": 0.6921449022781029,
"grad_norm": 5.567951202392578,
"learning_rate": 1.8358224559090418e-05,
"loss": 11.883,
"step": 695
},
{
"epoch": 0.6931407942238267,
"grad_norm": 7.005056858062744,
"learning_rate": 1.8112160408827906e-05,
"loss": 12.0574,
"step": 696
},
{
"epoch": 0.6941366861695506,
"grad_norm": 4.946389198303223,
"learning_rate": 1.786762759755882e-05,
"loss": 10.6037,
"step": 697
},
{
"epoch": 0.6951325781152745,
"grad_norm": 6.21707820892334,
"learning_rate": 1.762462962860617e-05,
"loss": 12.1973,
"step": 698
},
{
"epoch": 0.6961284700609984,
"grad_norm": 5.530759334564209,
"learning_rate": 1.738316998330397e-05,
"loss": 12.2209,
"step": 699
},
{
"epoch": 0.6971243620067222,
"grad_norm": 5.81333065032959,
"learning_rate": 1.7143252120947138e-05,
"loss": 11.2614,
"step": 700
},
{
"epoch": 0.6971243620067222,
"eval_loss": 1.4631643295288086,
"eval_runtime": 289.9209,
"eval_samples_per_second": 4.619,
"eval_steps_per_second": 1.155,
"step": 700
},
{
"epoch": 0.6981202539524461,
"grad_norm": 5.98025369644165,
"learning_rate": 1.6904879478742253e-05,
"loss": 12.586,
"step": 701
},
{
"epoch": 0.6991161458981701,
"grad_norm": 4.925411701202393,
"learning_rate": 1.6668055471758064e-05,
"loss": 11.1688,
"step": 702
},
{
"epoch": 0.700112037843894,
"grad_norm": 5.7626566886901855,
"learning_rate": 1.6432783492876634e-05,
"loss": 12.3594,
"step": 703
},
{
"epoch": 0.7011079297896178,
"grad_norm": 6.179996967315674,
"learning_rate": 1.6199066912744793e-05,
"loss": 13.2897,
"step": 704
},
{
"epoch": 0.7021038217353417,
"grad_norm": 5.263505458831787,
"learning_rate": 1.5966909079725783e-05,
"loss": 11.9709,
"step": 705
},
{
"epoch": 0.7030997136810656,
"grad_norm": 5.473526954650879,
"learning_rate": 1.5736313319851285e-05,
"loss": 12.4221,
"step": 706
},
{
"epoch": 0.7040956056267895,
"grad_norm": 5.03949499130249,
"learning_rate": 1.5507282936773768e-05,
"loss": 11.0594,
"step": 707
},
{
"epoch": 0.7050914975725134,
"grad_norm": 5.114665508270264,
"learning_rate": 1.5279821211719307e-05,
"loss": 10.9889,
"step": 708
},
{
"epoch": 0.7060873895182372,
"grad_norm": 6.260105133056641,
"learning_rate": 1.5053931403440275e-05,
"loss": 12.1907,
"step": 709
},
{
"epoch": 0.7070832814639612,
"grad_norm": 5.071542263031006,
"learning_rate": 1.4829616748168891e-05,
"loss": 11.6752,
"step": 710
},
{
"epoch": 0.7080791734096851,
"grad_norm": 5.426812171936035,
"learning_rate": 1.4606880459570884e-05,
"loss": 12.0812,
"step": 711
},
{
"epoch": 0.709075065355409,
"grad_norm": 5.589237689971924,
"learning_rate": 1.4385725728699187e-05,
"loss": 12.6791,
"step": 712
},
{
"epoch": 0.7100709573011328,
"grad_norm": 5.184196472167969,
"learning_rate": 1.4166155723948554e-05,
"loss": 9.6767,
"step": 713
},
{
"epoch": 0.7110668492468567,
"grad_norm": 5.003568172454834,
"learning_rate": 1.3948173591009916e-05,
"loss": 10.6604,
"step": 714
},
{
"epoch": 0.7120627411925806,
"grad_norm": 5.232964038848877,
"learning_rate": 1.3731782452825428e-05,
"loss": 12.3529,
"step": 715
},
{
"epoch": 0.7130586331383045,
"grad_norm": 5.2426629066467285,
"learning_rate": 1.35169854095437e-05,
"loss": 10.9877,
"step": 716
},
{
"epoch": 0.7140545250840283,
"grad_norm": 5.008747100830078,
"learning_rate": 1.3303785538475403e-05,
"loss": 11.3483,
"step": 717
},
{
"epoch": 0.7150504170297522,
"grad_norm": 5.193508148193359,
"learning_rate": 1.3092185894049133e-05,
"loss": 11.6414,
"step": 718
},
{
"epoch": 0.7160463089754762,
"grad_norm": 4.987440586090088,
"learning_rate": 1.2882189507767705e-05,
"loss": 11.1112,
"step": 719
},
{
"epoch": 0.7170422009212001,
"grad_norm": 4.958714485168457,
"learning_rate": 1.267379938816475e-05,
"loss": 11.5351,
"step": 720
},
{
"epoch": 0.718038092866924,
"grad_norm": 5.384908676147461,
"learning_rate": 1.2467018520761416e-05,
"loss": 11.5047,
"step": 721
},
{
"epoch": 0.7190339848126478,
"grad_norm": 5.524514675140381,
"learning_rate": 1.2261849868023936e-05,
"loss": 11.5071,
"step": 722
},
{
"epoch": 0.7200298767583717,
"grad_norm": 4.934844970703125,
"learning_rate": 1.2058296369320848e-05,
"loss": 11.9821,
"step": 723
},
{
"epoch": 0.7210257687040956,
"grad_norm": 5.9136457443237305,
"learning_rate": 1.1856360940881106e-05,
"loss": 11.1714,
"step": 724
},
{
"epoch": 0.7220216606498195,
"grad_norm": 5.626174449920654,
"learning_rate": 1.1656046475752178e-05,
"loss": 11.0057,
"step": 725
},
{
"epoch": 0.7230175525955433,
"grad_norm": 6.411077499389648,
"learning_rate": 1.1457355843758663e-05,
"loss": 11.485,
"step": 726
},
{
"epoch": 0.7240134445412673,
"grad_norm": 5.232149600982666,
"learning_rate": 1.126029189146116e-05,
"loss": 11.9511,
"step": 727
},
{
"epoch": 0.7250093364869912,
"grad_norm": 6.392588138580322,
"learning_rate": 1.1064857442115468e-05,
"loss": 12.3394,
"step": 728
},
{
"epoch": 0.7260052284327151,
"grad_norm": 4.912868499755859,
"learning_rate": 1.0871055295632232e-05,
"loss": 10.2057,
"step": 729
},
{
"epoch": 0.7270011203784389,
"grad_norm": 4.6938652992248535,
"learning_rate": 1.067888822853659e-05,
"loss": 10.9687,
"step": 730
},
{
"epoch": 0.7279970123241628,
"grad_norm": 7.743927955627441,
"learning_rate": 1.048835899392872e-05,
"loss": 12.2057,
"step": 731
},
{
"epoch": 0.7289929042698867,
"grad_norm": 7.151386260986328,
"learning_rate": 1.0299470321444168e-05,
"loss": 12.7718,
"step": 732
},
{
"epoch": 0.7299887962156106,
"grad_norm": 5.987144470214844,
"learning_rate": 1.0112224917214738e-05,
"loss": 11.8501,
"step": 733
},
{
"epoch": 0.7309846881613346,
"grad_norm": 7.631872653961182,
"learning_rate": 9.926625463829942e-06,
"loss": 12.742,
"step": 734
},
{
"epoch": 0.7319805801070584,
"grad_norm": 5.5182085037231445,
"learning_rate": 9.742674620298305e-06,
"loss": 11.0,
"step": 735
},
{
"epoch": 0.7329764720527823,
"grad_norm": 4.892227649688721,
"learning_rate": 9.560375022009418e-06,
"loss": 11.4375,
"step": 736
},
{
"epoch": 0.7339723639985062,
"grad_norm": 5.391102313995361,
"learning_rate": 9.37972928069615e-06,
"loss": 11.0945,
"step": 737
},
{
"epoch": 0.7349682559442301,
"grad_norm": 5.407810688018799,
"learning_rate": 9.200739984397202e-06,
"loss": 11.0342,
"step": 738
},
{
"epoch": 0.7359641478899539,
"grad_norm": 5.871718883514404,
"learning_rate": 9.02340969742009e-06,
"loss": 11.9766,
"step": 739
},
{
"epoch": 0.7369600398356778,
"grad_norm": 5.485849857330322,
"learning_rate": 8.847740960304357e-06,
"loss": 11.4418,
"step": 740
},
{
"epoch": 0.7379559317814017,
"grad_norm": 6.5848164558410645,
"learning_rate": 8.673736289785197e-06,
"loss": 13.1044,
"step": 741
},
{
"epoch": 0.7389518237271256,
"grad_norm": 6.7905473709106445,
"learning_rate": 8.50139817875735e-06,
"loss": 10.9534,
"step": 742
},
{
"epoch": 0.7399477156728494,
"grad_norm": 4.94359827041626,
"learning_rate": 8.330729096239539e-06,
"loss": 11.1527,
"step": 743
},
{
"epoch": 0.7409436076185734,
"grad_norm": 5.690808296203613,
"learning_rate": 8.161731487338827e-06,
"loss": 12.6257,
"step": 744
},
{
"epoch": 0.7419394995642973,
"grad_norm": 5.709432125091553,
"learning_rate": 7.994407773215903e-06,
"loss": 11.0885,
"step": 745
},
{
"epoch": 0.7429353915100212,
"grad_norm": 5.193820476531982,
"learning_rate": 7.828760351050165e-06,
"loss": 11.0625,
"step": 746
},
{
"epoch": 0.7439312834557451,
"grad_norm": 5.115967750549316,
"learning_rate": 7.664791594005433e-06,
"loss": 9.8947,
"step": 747
},
{
"epoch": 0.7449271754014689,
"grad_norm": 5.30134916305542,
"learning_rate": 7.502503851196024e-06,
"loss": 11.6142,
"step": 748
},
{
"epoch": 0.7459230673471928,
"grad_norm": 5.2966108322143555,
"learning_rate": 7.341899447652997e-06,
"loss": 11.1821,
"step": 749
},
{
"epoch": 0.7469189592929167,
"grad_norm": 5.26341438293457,
"learning_rate": 7.182980684290921e-06,
"loss": 12.1355,
"step": 750
},
{
"epoch": 0.7479148512386407,
"grad_norm": 6.888838768005371,
"learning_rate": 7.0257498378748366e-06,
"loss": 13.0227,
"step": 751
},
{
"epoch": 0.7489107431843645,
"grad_norm": 5.804958343505859,
"learning_rate": 6.87020916098774e-06,
"loss": 11.9386,
"step": 752
},
{
"epoch": 0.7499066351300884,
"grad_norm": 4.986551284790039,
"learning_rate": 6.716360881998174e-06,
"loss": 10.9869,
"step": 753
},
{
"epoch": 0.7509025270758123,
"grad_norm": 4.778614044189453,
"learning_rate": 6.564207205028441e-06,
"loss": 11.1464,
"step": 754
},
{
"epoch": 0.7518984190215362,
"grad_norm": 6.451168060302734,
"learning_rate": 6.413750309922958e-06,
"loss": 12.4462,
"step": 755
},
{
"epoch": 0.75289431096726,
"grad_norm": 5.491706371307373,
"learning_rate": 6.264992352216961e-06,
"loss": 12.6588,
"step": 756
},
{
"epoch": 0.7538902029129839,
"grad_norm": 5.205382823944092,
"learning_rate": 6.117935463105809e-06,
"loss": 10.741,
"step": 757
},
{
"epoch": 0.7548860948587078,
"grad_norm": 5.023211479187012,
"learning_rate": 5.972581749414244e-06,
"loss": 11.0733,
"step": 758
},
{
"epoch": 0.7558819868044317,
"grad_norm": 5.587502956390381,
"learning_rate": 5.828933293566333e-06,
"loss": 11.1592,
"step": 759
},
{
"epoch": 0.7568778787501556,
"grad_norm": 5.698174476623535,
"learning_rate": 5.686992153555614e-06,
"loss": 12.2832,
"step": 760
},
{
"epoch": 0.7578737706958795,
"grad_norm": 5.90291166305542,
"learning_rate": 5.546760362915571e-06,
"loss": 12.3435,
"step": 761
},
{
"epoch": 0.7588696626416034,
"grad_norm": 5.5049591064453125,
"learning_rate": 5.408239930690548e-06,
"loss": 11.5343,
"step": 762
},
{
"epoch": 0.7598655545873273,
"grad_norm": 5.788873672485352,
"learning_rate": 5.271432841406937e-06,
"loss": 12.3758,
"step": 763
},
{
"epoch": 0.7608614465330512,
"grad_norm": 5.662258148193359,
"learning_rate": 5.1363410550448045e-06,
"loss": 11.152,
"step": 764
},
{
"epoch": 0.761857338478775,
"grad_norm": 5.340865612030029,
"learning_rate": 5.0029665070096544e-06,
"loss": 11.2238,
"step": 765
},
{
"epoch": 0.7628532304244989,
"grad_norm": 5.056994915008545,
"learning_rate": 4.871311108104917e-06,
"loss": 9.9053,
"step": 766
},
{
"epoch": 0.7638491223702228,
"grad_norm": 5.196466445922852,
"learning_rate": 4.741376744504422e-06,
"loss": 12.2725,
"step": 767
},
{
"epoch": 0.7648450143159468,
"grad_norm": 5.003063201904297,
"learning_rate": 4.613165277725428e-06,
"loss": 10.418,
"step": 768
},
{
"epoch": 0.7658409062616706,
"grad_norm": 5.598369598388672,
"learning_rate": 4.486678544601913e-06,
"loss": 10.3517,
"step": 769
},
{
"epoch": 0.7668367982073945,
"grad_norm": 5.888363361358643,
"learning_rate": 4.36191835725834e-06,
"loss": 11.2754,
"step": 770
},
{
"epoch": 0.7678326901531184,
"grad_norm": 5.854308128356934,
"learning_rate": 4.238886503083628e-06,
"loss": 13.0128,
"step": 771
},
{
"epoch": 0.7688285820988423,
"grad_norm": 6.681763648986816,
"learning_rate": 4.117584744705527e-06,
"loss": 12.1851,
"step": 772
},
{
"epoch": 0.7698244740445661,
"grad_norm": 6.164341449737549,
"learning_rate": 3.998014819965479e-06,
"loss": 11.9651,
"step": 773
},
{
"epoch": 0.77082036599029,
"grad_norm": 5.836161136627197,
"learning_rate": 3.880178441893562e-06,
"loss": 12.3736,
"step": 774
},
{
"epoch": 0.7718162579360139,
"grad_norm": 5.648519039154053,
"learning_rate": 3.764077298684096e-06,
"loss": 11.6445,
"step": 775
},
{
"epoch": 0.7728121498817379,
"grad_norm": 5.881044387817383,
"learning_rate": 3.6497130536713716e-06,
"loss": 13.0027,
"step": 776
},
{
"epoch": 0.7738080418274618,
"grad_norm": 5.143442153930664,
"learning_rate": 3.5370873453057646e-06,
"loss": 11.9685,
"step": 777
},
{
"epoch": 0.7748039337731856,
"grad_norm": 5.579311370849609,
"learning_rate": 3.426201787130476e-06,
"loss": 11.7632,
"step": 778
},
{
"epoch": 0.7757998257189095,
"grad_norm": 5.457061290740967,
"learning_rate": 3.3170579677581636e-06,
"loss": 12.535,
"step": 779
},
{
"epoch": 0.7767957176646334,
"grad_norm": 5.1317644119262695,
"learning_rate": 3.2096574508483357e-06,
"loss": 10.979,
"step": 780
},
{
"epoch": 0.7777916096103573,
"grad_norm": 5.227689266204834,
"learning_rate": 3.1040017750848943e-06,
"loss": 11.2438,
"step": 781
},
{
"epoch": 0.7787875015560811,
"grad_norm": 6.507405757904053,
"learning_rate": 3.0000924541541687e-06,
"loss": 12.94,
"step": 782
},
{
"epoch": 0.779783393501805,
"grad_norm": 5.245728492736816,
"learning_rate": 2.8979309767230844e-06,
"loss": 11.8061,
"step": 783
},
{
"epoch": 0.780779285447529,
"grad_norm": 5.699490547180176,
"learning_rate": 2.7975188064179173e-06,
"loss": 11.486,
"step": 784
},
{
"epoch": 0.7817751773932529,
"grad_norm": 5.269382476806641,
"learning_rate": 2.6988573818034213e-06,
"loss": 11.2483,
"step": 785
},
{
"epoch": 0.7827710693389767,
"grad_norm": 5.229419231414795,
"learning_rate": 2.601948116361996e-06,
"loss": 11.6245,
"step": 786
},
{
"epoch": 0.7837669612847006,
"grad_norm": 5.528886318206787,
"learning_rate": 2.5067923984736647e-06,
"loss": 12.4577,
"step": 787
},
{
"epoch": 0.7847628532304245,
"grad_norm": 5.435757637023926,
"learning_rate": 2.413391591396044e-06,
"loss": 12.7685,
"step": 788
},
{
"epoch": 0.7857587451761484,
"grad_norm": 5.185883522033691,
"learning_rate": 2.3217470332448226e-06,
"loss": 11.2059,
"step": 789
},
{
"epoch": 0.7867546371218723,
"grad_norm": 5.466739177703857,
"learning_rate": 2.2318600369746754e-06,
"loss": 11.3974,
"step": 790
},
{
"epoch": 0.7877505290675961,
"grad_norm": 5.585619926452637,
"learning_rate": 2.143731890360337e-06,
"loss": 11.8463,
"step": 791
},
{
"epoch": 0.78874642101332,
"grad_norm": 6.319226264953613,
"learning_rate": 2.057363855978253e-06,
"loss": 12.4982,
"step": 792
},
{
"epoch": 0.789742312959044,
"grad_norm": 5.821755886077881,
"learning_rate": 1.972757171188444e-06,
"loss": 11.5502,
"step": 793
},
{
"epoch": 0.7907382049047679,
"grad_norm": 6.314916133880615,
"learning_rate": 1.8899130481167815e-06,
"loss": 12.7203,
"step": 794
},
{
"epoch": 0.7917340968504917,
"grad_norm": 5.605375289916992,
"learning_rate": 1.8088326736376004e-06,
"loss": 11.8973,
"step": 795
},
{
"epoch": 0.7927299887962156,
"grad_norm": 5.3746843338012695,
"learning_rate": 1.729517209356782e-06,
"loss": 11.2227,
"step": 796
},
{
"epoch": 0.7937258807419395,
"grad_norm": 4.950900554656982,
"learning_rate": 1.6519677915949743e-06,
"loss": 10.9039,
"step": 797
},
{
"epoch": 0.7947217726876634,
"grad_norm": 5.662406921386719,
"learning_rate": 1.5761855313714684e-06,
"loss": 12.4767,
"step": 798
},
{
"epoch": 0.7957176646333872,
"grad_norm": 6.010452747344971,
"learning_rate": 1.502171514388112e-06,
"loss": 11.6454,
"step": 799
},
{
"epoch": 0.7967135565791111,
"grad_norm": 6.123356342315674,
"learning_rate": 1.4299268010139339e-06,
"loss": 11.4508,
"step": 800
},
{
"epoch": 0.7967135565791111,
"eval_loss": 1.4602320194244385,
"eval_runtime": 289.8374,
"eval_samples_per_second": 4.62,
"eval_steps_per_second": 1.156,
"step": 800
},
{
"epoch": 0.797709448524835,
"grad_norm": 5.147581100463867,
"learning_rate": 1.3594524262698082e-06,
"loss": 11.0485,
"step": 801
},
{
"epoch": 0.798705340470559,
"grad_norm": 4.97263765335083,
"learning_rate": 1.290749399813676e-06,
"loss": 11.5831,
"step": 802
},
{
"epoch": 0.7997012324162829,
"grad_norm": 6.502606391906738,
"learning_rate": 1.2238187059261384e-06,
"loss": 10.9787,
"step": 803
},
{
"epoch": 0.8006971243620067,
"grad_norm": 5.0996527671813965,
"learning_rate": 1.1586613034962333e-06,
"loss": 11.5943,
"step": 804
},
{
"epoch": 0.8016930163077306,
"grad_norm": 5.570478439331055,
"learning_rate": 1.0952781260078342e-06,
"loss": 12.3992,
"step": 805
},
{
"epoch": 0.8026889082534545,
"grad_norm": 5.16178035736084,
"learning_rate": 1.0336700815261613e-06,
"loss": 11.8013,
"step": 806
},
{
"epoch": 0.8036848001991784,
"grad_norm": 5.67569637298584,
"learning_rate": 9.738380526848194e-07,
"loss": 12.3158,
"step": 807
},
{
"epoch": 0.8046806921449022,
"grad_norm": 5.282881259918213,
"learning_rate": 9.157828966731979e-07,
"loss": 12.0702,
"step": 808
},
{
"epoch": 0.8056765840906261,
"grad_norm": 5.958638668060303,
"learning_rate": 8.595054452241041e-07,
"loss": 12.2277,
"step": 809
},
{
"epoch": 0.8066724760363501,
"grad_norm": 5.30989408493042,
"learning_rate": 8.050065046018851e-07,
"loss": 11.5156,
"step": 810
},
{
"epoch": 0.807668367982074,
"grad_norm": 4.585794925689697,
"learning_rate": 7.522868555909223e-07,
"loss": 10.3803,
"step": 811
},
{
"epoch": 0.8086642599277978,
"grad_norm": 5.959741115570068,
"learning_rate": 7.013472534843635e-07,
"loss": 13.3266,
"step": 812
},
{
"epoch": 0.8096601518735217,
"grad_norm": 6.225035190582275,
"learning_rate": 6.521884280733526e-07,
"loss": 12.5264,
"step": 813
},
{
"epoch": 0.8106560438192456,
"grad_norm": 4.888844013214111,
"learning_rate": 6.048110836365666e-07,
"loss": 10.7281,
"step": 814
},
{
"epoch": 0.8116519357649695,
"grad_norm": 5.534932613372803,
"learning_rate": 5.592158989301405e-07,
"loss": 12.486,
"step": 815
},
{
"epoch": 0.8126478277106934,
"grad_norm": 5.48603630065918,
"learning_rate": 5.154035271778684e-07,
"loss": 11.7151,
"step": 816
},
{
"epoch": 0.8136437196564172,
"grad_norm": 5.270609378814697,
"learning_rate": 4.733745960619762e-07,
"loss": 11.9809,
"step": 817
},
{
"epoch": 0.8146396116021412,
"grad_norm": 5.433355331420898,
"learning_rate": 4.331297077140167e-07,
"loss": 12.2935,
"step": 818
},
{
"epoch": 0.8156355035478651,
"grad_norm": 5.731333255767822,
"learning_rate": 3.94669438706266e-07,
"loss": 12.0509,
"step": 819
},
{
"epoch": 0.816631395493589,
"grad_norm": 5.320248126983643,
"learning_rate": 3.579943400434937e-07,
"loss": 11.0143,
"step": 820
},
{
"epoch": 0.8176272874393128,
"grad_norm": 5.686704635620117,
"learning_rate": 3.231049371550804e-07,
"loss": 12.9909,
"step": 821
},
{
"epoch": 0.8186231793850367,
"grad_norm": 5.398179054260254,
"learning_rate": 2.900017298874125e-07,
"loss": 10.8497,
"step": 822
},
{
"epoch": 0.8196190713307606,
"grad_norm": 6.222070693969727,
"learning_rate": 2.5868519249680475e-07,
"loss": 12.435,
"step": 823
},
{
"epoch": 0.8206149632764845,
"grad_norm": 5.970067024230957,
"learning_rate": 2.2915577364267247e-07,
"loss": 13.6407,
"step": 824
},
{
"epoch": 0.8216108552222083,
"grad_norm": 5.729474067687988,
"learning_rate": 2.0141389638109187e-07,
"loss": 12.3141,
"step": 825
},
{
"epoch": 0.8226067471679323,
"grad_norm": 6.03525972366333,
"learning_rate": 1.7545995815876348e-07,
"loss": 11.4593,
"step": 826
},
{
"epoch": 0.8236026391136562,
"grad_norm": 5.3436360359191895,
"learning_rate": 1.5129433080728062e-07,
"loss": 11.6677,
"step": 827
},
{
"epoch": 0.8245985310593801,
"grad_norm": 4.991388320922852,
"learning_rate": 1.2891736053785575e-07,
"loss": 10.8385,
"step": 828
},
{
"epoch": 0.825594423005104,
"grad_norm": 5.2977118492126465,
"learning_rate": 1.0832936793633841e-07,
"loss": 12.2961,
"step": 829
},
{
"epoch": 0.8265903149508278,
"grad_norm": 5.460627555847168,
"learning_rate": 8.953064795856614e-08,
"loss": 11.1108,
"step": 830
},
{
"epoch": 0.8275862068965517,
"grad_norm": 5.315445423126221,
"learning_rate": 7.252146992625664e-08,
"loss": 10.9681,
"step": 831
},
{
"epoch": 0.8285820988422756,
"grad_norm": 5.33111572265625,
"learning_rate": 5.730207752302485e-08,
"loss": 11.7059,
"step": 832
},
{
"epoch": 0.8295779907879995,
"grad_norm": 5.866223335266113,
"learning_rate": 4.387268879098294e-08,
"loss": 13.3463,
"step": 833
},
{
"epoch": 0.8305738827337233,
"grad_norm": 5.038965225219727,
"learning_rate": 3.223349612756221e-08,
"loss": 11.1416,
"step": 834
},
{
"epoch": 0.8315697746794473,
"grad_norm": 5.116003036499023,
"learning_rate": 2.2384666282779244e-08,
"loss": 11.2051,
"step": 835
},
{
"epoch": 0.8325656666251712,
"grad_norm": 5.213040351867676,
"learning_rate": 1.4326340356862754e-08,
"loss": 11.9355,
"step": 836
},
{
"epoch": 0.8335615585708951,
"grad_norm": 5.077611446380615,
"learning_rate": 8.058633798199711e-09,
"loss": 11.5202,
"step": 837
},
{
"epoch": 0.8345574505166189,
"grad_norm": 6.297743797302246,
"learning_rate": 3.58163640169773e-09,
"loss": 11.4272,
"step": 838
},
{
"epoch": 0.8355533424623428,
"grad_norm": 6.544887065887451,
"learning_rate": 8.954123074805809e-10,
"loss": 12.609,
"step": 839
},
{
"epoch": 0.8365492344080667,
"grad_norm": 5.315311908721924,
"learning_rate": 0.0,
"loss": 12.2755,
"step": 840
}
],
"logging_steps": 1,
"max_steps": 840,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.286387212989235e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}