smolvlm-verifier / trainer_state.json
ynshah3's picture
update directory structure
a8cc986
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999795262371272,
"eval_steps": 500,
"global_step": 24421,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010236881436439204,
"grad_norm": 1.6028783321380615,
"learning_rate": 5e-05,
"loss": 1.4176,
"step": 25
},
{
"epoch": 0.0020473762872878407,
"grad_norm": 0.7411264777183533,
"learning_rate": 0.0001,
"loss": 0.5905,
"step": 50
},
{
"epoch": 0.003071064430931761,
"grad_norm": 1.4544100761413574,
"learning_rate": 9.989741906364122e-05,
"loss": 0.2869,
"step": 75
},
{
"epoch": 0.004094752574575681,
"grad_norm": 0.31396615505218506,
"learning_rate": 9.979483812728243e-05,
"loss": 0.23,
"step": 100
},
{
"epoch": 0.005118440718219602,
"grad_norm": 0.7768864035606384,
"learning_rate": 9.969225719092365e-05,
"loss": 0.2192,
"step": 125
},
{
"epoch": 0.006142128861863522,
"grad_norm": 0.3052266836166382,
"learning_rate": 9.958967625456485e-05,
"loss": 0.2055,
"step": 150
},
{
"epoch": 0.007165817005507442,
"grad_norm": 0.2620660960674286,
"learning_rate": 9.948709531820606e-05,
"loss": 0.1947,
"step": 175
},
{
"epoch": 0.008189505149151363,
"grad_norm": 0.31376323103904724,
"learning_rate": 9.938451438184728e-05,
"loss": 0.1904,
"step": 200
},
{
"epoch": 0.009213193292795283,
"grad_norm": 0.30490225553512573,
"learning_rate": 9.92819334454885e-05,
"loss": 0.1953,
"step": 225
},
{
"epoch": 0.010236881436439204,
"grad_norm": 0.33286434412002563,
"learning_rate": 9.917935250912971e-05,
"loss": 0.1928,
"step": 250
},
{
"epoch": 0.011260569580083124,
"grad_norm": 0.36407458782196045,
"learning_rate": 9.907677157277092e-05,
"loss": 0.1771,
"step": 275
},
{
"epoch": 0.012284257723727043,
"grad_norm": 0.3681598901748657,
"learning_rate": 9.897419063641214e-05,
"loss": 0.1858,
"step": 300
},
{
"epoch": 0.013307945867370965,
"grad_norm": 0.26045089960098267,
"learning_rate": 9.887160970005335e-05,
"loss": 0.1759,
"step": 325
},
{
"epoch": 0.014331634011014885,
"grad_norm": 0.6006647348403931,
"learning_rate": 9.876902876369455e-05,
"loss": 0.1698,
"step": 350
},
{
"epoch": 0.015355322154658804,
"grad_norm": 0.5427098870277405,
"learning_rate": 9.866644782733577e-05,
"loss": 0.1769,
"step": 375
},
{
"epoch": 0.016379010298302726,
"grad_norm": 0.37887805700302124,
"learning_rate": 9.856386689097698e-05,
"loss": 0.1772,
"step": 400
},
{
"epoch": 0.017402698441946644,
"grad_norm": 0.31953874230384827,
"learning_rate": 9.84612859546182e-05,
"loss": 0.1776,
"step": 425
},
{
"epoch": 0.018426386585590565,
"grad_norm": 0.2421693652868271,
"learning_rate": 9.835870501825941e-05,
"loss": 0.1731,
"step": 450
},
{
"epoch": 0.019450074729234487,
"grad_norm": 0.445103257894516,
"learning_rate": 9.825612408190063e-05,
"loss": 0.1664,
"step": 475
},
{
"epoch": 0.020473762872878408,
"grad_norm": 0.3354673683643341,
"learning_rate": 9.815354314554184e-05,
"loss": 0.1749,
"step": 500
},
{
"epoch": 0.021497451016522326,
"grad_norm": 0.24303950369358063,
"learning_rate": 9.805096220918306e-05,
"loss": 0.1655,
"step": 525
},
{
"epoch": 0.022521139160166247,
"grad_norm": 0.2817317247390747,
"learning_rate": 9.794838127282426e-05,
"loss": 0.1673,
"step": 550
},
{
"epoch": 0.02354482730381017,
"grad_norm": 0.4167644679546356,
"learning_rate": 9.784580033646547e-05,
"loss": 0.1638,
"step": 575
},
{
"epoch": 0.024568515447454087,
"grad_norm": 0.339884877204895,
"learning_rate": 9.774321940010669e-05,
"loss": 0.1609,
"step": 600
},
{
"epoch": 0.025592203591098008,
"grad_norm": 0.2876884937286377,
"learning_rate": 9.76406384637479e-05,
"loss": 0.1659,
"step": 625
},
{
"epoch": 0.02661589173474193,
"grad_norm": 0.3253774642944336,
"learning_rate": 9.753805752738912e-05,
"loss": 0.1557,
"step": 650
},
{
"epoch": 0.027639579878385848,
"grad_norm": 0.24355168640613556,
"learning_rate": 9.743547659103033e-05,
"loss": 0.167,
"step": 675
},
{
"epoch": 0.02866326802202977,
"grad_norm": 0.3610304892063141,
"learning_rate": 9.733289565467155e-05,
"loss": 0.1605,
"step": 700
},
{
"epoch": 0.02968695616567369,
"grad_norm": 0.3692477345466614,
"learning_rate": 9.723031471831276e-05,
"loss": 0.1546,
"step": 725
},
{
"epoch": 0.03071064430931761,
"grad_norm": 0.63128262758255,
"learning_rate": 9.712773378195396e-05,
"loss": 0.1652,
"step": 750
},
{
"epoch": 0.031734332452961526,
"grad_norm": 0.3006066381931305,
"learning_rate": 9.702515284559518e-05,
"loss": 0.1596,
"step": 775
},
{
"epoch": 0.03275802059660545,
"grad_norm": 0.6283088326454163,
"learning_rate": 9.692257190923639e-05,
"loss": 0.1535,
"step": 800
},
{
"epoch": 0.03378170874024937,
"grad_norm": 0.4018152356147766,
"learning_rate": 9.68199909728776e-05,
"loss": 0.1603,
"step": 825
},
{
"epoch": 0.03480539688389329,
"grad_norm": 0.36612433195114136,
"learning_rate": 9.671741003651882e-05,
"loss": 0.1524,
"step": 850
},
{
"epoch": 0.03582908502753721,
"grad_norm": 0.41699767112731934,
"learning_rate": 9.661482910016003e-05,
"loss": 0.1547,
"step": 875
},
{
"epoch": 0.03685277317118113,
"grad_norm": 0.25239649415016174,
"learning_rate": 9.651224816380125e-05,
"loss": 0.161,
"step": 900
},
{
"epoch": 0.037876461314825055,
"grad_norm": 0.28330907225608826,
"learning_rate": 9.640966722744246e-05,
"loss": 0.1544,
"step": 925
},
{
"epoch": 0.03890014945846897,
"grad_norm": 0.393118292093277,
"learning_rate": 9.630708629108367e-05,
"loss": 0.1529,
"step": 950
},
{
"epoch": 0.03992383760211289,
"grad_norm": 0.2385636419057846,
"learning_rate": 9.620450535472488e-05,
"loss": 0.1573,
"step": 975
},
{
"epoch": 0.040947525745756816,
"grad_norm": 0.7217739820480347,
"learning_rate": 9.61019244183661e-05,
"loss": 0.1625,
"step": 1000
},
{
"epoch": 0.041971213889400734,
"grad_norm": 0.4201323091983795,
"learning_rate": 9.599934348200731e-05,
"loss": 0.1554,
"step": 1025
},
{
"epoch": 0.04299490203304465,
"grad_norm": 0.2981342375278473,
"learning_rate": 9.589676254564852e-05,
"loss": 0.1606,
"step": 1050
},
{
"epoch": 0.04401859017668858,
"grad_norm": 0.32909801602363586,
"learning_rate": 9.579418160928974e-05,
"loss": 0.1611,
"step": 1075
},
{
"epoch": 0.045042278320332495,
"grad_norm": 0.3763565719127655,
"learning_rate": 9.569160067293095e-05,
"loss": 0.1509,
"step": 1100
},
{
"epoch": 0.04606596646397641,
"grad_norm": 0.26296111941337585,
"learning_rate": 9.558901973657215e-05,
"loss": 0.1458,
"step": 1125
},
{
"epoch": 0.04708965460762034,
"grad_norm": 0.2251584380865097,
"learning_rate": 9.548643880021337e-05,
"loss": 0.1483,
"step": 1150
},
{
"epoch": 0.048113342751264256,
"grad_norm": 0.24623946845531464,
"learning_rate": 9.538385786385458e-05,
"loss": 0.1521,
"step": 1175
},
{
"epoch": 0.049137030894908174,
"grad_norm": 0.45473411679267883,
"learning_rate": 9.52812769274958e-05,
"loss": 0.1549,
"step": 1200
},
{
"epoch": 0.0501607190385521,
"grad_norm": 0.23999722301959991,
"learning_rate": 9.517869599113701e-05,
"loss": 0.1442,
"step": 1225
},
{
"epoch": 0.051184407182196016,
"grad_norm": 0.32882001996040344,
"learning_rate": 9.507611505477823e-05,
"loss": 0.153,
"step": 1250
},
{
"epoch": 0.052208095325839934,
"grad_norm": 0.44401663541793823,
"learning_rate": 9.497353411841944e-05,
"loss": 0.1521,
"step": 1275
},
{
"epoch": 0.05323178346948386,
"grad_norm": 0.2603824734687805,
"learning_rate": 9.487095318206066e-05,
"loss": 0.1543,
"step": 1300
},
{
"epoch": 0.05425547161312778,
"grad_norm": 0.28304556012153625,
"learning_rate": 9.476837224570186e-05,
"loss": 0.1491,
"step": 1325
},
{
"epoch": 0.055279159756771695,
"grad_norm": 0.40350213646888733,
"learning_rate": 9.466579130934307e-05,
"loss": 0.1431,
"step": 1350
},
{
"epoch": 0.05630284790041562,
"grad_norm": 0.3348640501499176,
"learning_rate": 9.456321037298429e-05,
"loss": 0.1439,
"step": 1375
},
{
"epoch": 0.05732653604405954,
"grad_norm": 0.3141482472419739,
"learning_rate": 9.44606294366255e-05,
"loss": 0.148,
"step": 1400
},
{
"epoch": 0.058350224187703456,
"grad_norm": 0.2608078420162201,
"learning_rate": 9.435804850026672e-05,
"loss": 0.1461,
"step": 1425
},
{
"epoch": 0.05937391233134738,
"grad_norm": 0.2971978485584259,
"learning_rate": 9.425546756390793e-05,
"loss": 0.1435,
"step": 1450
},
{
"epoch": 0.0603976004749913,
"grad_norm": 0.33824801445007324,
"learning_rate": 9.415288662754915e-05,
"loss": 0.1476,
"step": 1475
},
{
"epoch": 0.06142128861863522,
"grad_norm": 0.22219249606132507,
"learning_rate": 9.405030569119036e-05,
"loss": 0.1443,
"step": 1500
},
{
"epoch": 0.06244497676227914,
"grad_norm": 0.30279237031936646,
"learning_rate": 9.394772475483156e-05,
"loss": 0.1451,
"step": 1525
},
{
"epoch": 0.06346866490592305,
"grad_norm": 0.7361096739768982,
"learning_rate": 9.384514381847278e-05,
"loss": 0.139,
"step": 1550
},
{
"epoch": 0.06449235304956698,
"grad_norm": 0.2694852650165558,
"learning_rate": 9.374256288211399e-05,
"loss": 0.14,
"step": 1575
},
{
"epoch": 0.0655160411932109,
"grad_norm": 0.2227030247449875,
"learning_rate": 9.36399819457552e-05,
"loss": 0.1409,
"step": 1600
},
{
"epoch": 0.06653972933685481,
"grad_norm": 0.3561594486236572,
"learning_rate": 9.353740100939642e-05,
"loss": 0.1386,
"step": 1625
},
{
"epoch": 0.06756341748049874,
"grad_norm": 0.3476031720638275,
"learning_rate": 9.343482007303764e-05,
"loss": 0.1397,
"step": 1650
},
{
"epoch": 0.06858710562414266,
"grad_norm": 0.3784942626953125,
"learning_rate": 9.333223913667885e-05,
"loss": 0.1437,
"step": 1675
},
{
"epoch": 0.06961079376778657,
"grad_norm": 0.38352203369140625,
"learning_rate": 9.322965820032006e-05,
"loss": 0.1345,
"step": 1700
},
{
"epoch": 0.0706344819114305,
"grad_norm": 0.2508692741394043,
"learning_rate": 9.312707726396127e-05,
"loss": 0.1396,
"step": 1725
},
{
"epoch": 0.07165817005507442,
"grad_norm": 0.5086421966552734,
"learning_rate": 9.302449632760248e-05,
"loss": 0.1377,
"step": 1750
},
{
"epoch": 0.07268185819871835,
"grad_norm": 0.40866467356681824,
"learning_rate": 9.29219153912437e-05,
"loss": 0.1347,
"step": 1775
},
{
"epoch": 0.07370554634236226,
"grad_norm": 0.3897942304611206,
"learning_rate": 9.281933445488491e-05,
"loss": 0.1317,
"step": 1800
},
{
"epoch": 0.07472923448600619,
"grad_norm": 0.3895871937274933,
"learning_rate": 9.271675351852612e-05,
"loss": 0.1415,
"step": 1825
},
{
"epoch": 0.07575292262965011,
"grad_norm": 0.3118538558483124,
"learning_rate": 9.261417258216734e-05,
"loss": 0.1276,
"step": 1850
},
{
"epoch": 0.07677661077329402,
"grad_norm": 0.5007463097572327,
"learning_rate": 9.251159164580855e-05,
"loss": 0.1401,
"step": 1875
},
{
"epoch": 0.07780029891693795,
"grad_norm": 0.37419870495796204,
"learning_rate": 9.240901070944977e-05,
"loss": 0.1338,
"step": 1900
},
{
"epoch": 0.07882398706058187,
"grad_norm": 0.4792192876338959,
"learning_rate": 9.230642977309097e-05,
"loss": 0.1343,
"step": 1925
},
{
"epoch": 0.07984767520422578,
"grad_norm": 0.7688687443733215,
"learning_rate": 9.220384883673218e-05,
"loss": 0.133,
"step": 1950
},
{
"epoch": 0.0808713633478697,
"grad_norm": 0.8818038105964661,
"learning_rate": 9.21012679003734e-05,
"loss": 0.1354,
"step": 1975
},
{
"epoch": 0.08189505149151363,
"grad_norm": 0.4251585304737091,
"learning_rate": 9.199868696401461e-05,
"loss": 0.1301,
"step": 2000
},
{
"epoch": 0.08291873963515754,
"grad_norm": 0.33509576320648193,
"learning_rate": 9.189610602765583e-05,
"loss": 0.1336,
"step": 2025
},
{
"epoch": 0.08394242777880147,
"grad_norm": 0.1928907036781311,
"learning_rate": 9.179352509129704e-05,
"loss": 0.1355,
"step": 2050
},
{
"epoch": 0.08496611592244539,
"grad_norm": 0.2787665128707886,
"learning_rate": 9.169094415493826e-05,
"loss": 0.1314,
"step": 2075
},
{
"epoch": 0.0859898040660893,
"grad_norm": 0.4015423357486725,
"learning_rate": 9.158836321857946e-05,
"loss": 0.1303,
"step": 2100
},
{
"epoch": 0.08701349220973323,
"grad_norm": 0.579844057559967,
"learning_rate": 9.148578228222067e-05,
"loss": 0.1277,
"step": 2125
},
{
"epoch": 0.08803718035337715,
"grad_norm": 0.3636709153652191,
"learning_rate": 9.138320134586189e-05,
"loss": 0.128,
"step": 2150
},
{
"epoch": 0.08906086849702106,
"grad_norm": 0.25872743129730225,
"learning_rate": 9.12806204095031e-05,
"loss": 0.1312,
"step": 2175
},
{
"epoch": 0.09008455664066499,
"grad_norm": 0.32024118304252625,
"learning_rate": 9.117803947314432e-05,
"loss": 0.1295,
"step": 2200
},
{
"epoch": 0.09110824478430891,
"grad_norm": 0.23083104193210602,
"learning_rate": 9.107545853678553e-05,
"loss": 0.1292,
"step": 2225
},
{
"epoch": 0.09213193292795283,
"grad_norm": 0.27154719829559326,
"learning_rate": 9.097287760042675e-05,
"loss": 0.1275,
"step": 2250
},
{
"epoch": 0.09315562107159675,
"grad_norm": 0.29432374238967896,
"learning_rate": 9.087029666406796e-05,
"loss": 0.1246,
"step": 2275
},
{
"epoch": 0.09417930921524068,
"grad_norm": 0.5287219882011414,
"learning_rate": 9.076771572770916e-05,
"loss": 0.1293,
"step": 2300
},
{
"epoch": 0.09520299735888459,
"grad_norm": 0.3348105549812317,
"learning_rate": 9.066513479135038e-05,
"loss": 0.1226,
"step": 2325
},
{
"epoch": 0.09622668550252851,
"grad_norm": 0.2081725001335144,
"learning_rate": 9.056255385499159e-05,
"loss": 0.1242,
"step": 2350
},
{
"epoch": 0.09725037364617244,
"grad_norm": 0.27878373861312866,
"learning_rate": 9.04599729186328e-05,
"loss": 0.1343,
"step": 2375
},
{
"epoch": 0.09827406178981635,
"grad_norm": 0.40117210149765015,
"learning_rate": 9.035739198227402e-05,
"loss": 0.1265,
"step": 2400
},
{
"epoch": 0.09929774993346027,
"grad_norm": 0.46459710597991943,
"learning_rate": 9.025481104591524e-05,
"loss": 0.1218,
"step": 2425
},
{
"epoch": 0.1003214380771042,
"grad_norm": 0.19930683076381683,
"learning_rate": 9.015223010955645e-05,
"loss": 0.1258,
"step": 2450
},
{
"epoch": 0.10134512622074811,
"grad_norm": 0.3851957321166992,
"learning_rate": 9.004964917319766e-05,
"loss": 0.1243,
"step": 2475
},
{
"epoch": 0.10236881436439203,
"grad_norm": 0.3303160071372986,
"learning_rate": 8.994706823683887e-05,
"loss": 0.1264,
"step": 2500
},
{
"epoch": 0.10339250250803596,
"grad_norm": 0.3450019359588623,
"learning_rate": 8.984448730048008e-05,
"loss": 0.122,
"step": 2525
},
{
"epoch": 0.10441619065167987,
"grad_norm": 0.36742231249809265,
"learning_rate": 8.97419063641213e-05,
"loss": 0.1216,
"step": 2550
},
{
"epoch": 0.1054398787953238,
"grad_norm": 0.2524435222148895,
"learning_rate": 8.963932542776251e-05,
"loss": 0.1238,
"step": 2575
},
{
"epoch": 0.10646356693896772,
"grad_norm": 0.38917961716651917,
"learning_rate": 8.953674449140372e-05,
"loss": 0.1252,
"step": 2600
},
{
"epoch": 0.10748725508261163,
"grad_norm": 0.3554433584213257,
"learning_rate": 8.943416355504494e-05,
"loss": 0.1213,
"step": 2625
},
{
"epoch": 0.10851094322625555,
"grad_norm": 0.2701007127761841,
"learning_rate": 8.933158261868615e-05,
"loss": 0.1255,
"step": 2650
},
{
"epoch": 0.10953463136989948,
"grad_norm": 0.40730130672454834,
"learning_rate": 8.922900168232737e-05,
"loss": 0.1205,
"step": 2675
},
{
"epoch": 0.11055831951354339,
"grad_norm": 0.36011001467704773,
"learning_rate": 8.912642074596857e-05,
"loss": 0.1208,
"step": 2700
},
{
"epoch": 0.11158200765718732,
"grad_norm": 0.2509096562862396,
"learning_rate": 8.902383980960978e-05,
"loss": 0.1234,
"step": 2725
},
{
"epoch": 0.11260569580083124,
"grad_norm": 0.34861189126968384,
"learning_rate": 8.8921258873251e-05,
"loss": 0.1306,
"step": 2750
},
{
"epoch": 0.11362938394447515,
"grad_norm": 0.20540310442447662,
"learning_rate": 8.881867793689221e-05,
"loss": 0.1174,
"step": 2775
},
{
"epoch": 0.11465307208811908,
"grad_norm": 0.26270365715026855,
"learning_rate": 8.871609700053343e-05,
"loss": 0.1299,
"step": 2800
},
{
"epoch": 0.115676760231763,
"grad_norm": 0.5314069986343384,
"learning_rate": 8.861351606417464e-05,
"loss": 0.1193,
"step": 2825
},
{
"epoch": 0.11670044837540691,
"grad_norm": 0.26417431235313416,
"learning_rate": 8.851093512781586e-05,
"loss": 0.1221,
"step": 2850
},
{
"epoch": 0.11772413651905084,
"grad_norm": 0.2860862612724304,
"learning_rate": 8.840835419145706e-05,
"loss": 0.1273,
"step": 2875
},
{
"epoch": 0.11874782466269476,
"grad_norm": 0.27751094102859497,
"learning_rate": 8.830577325509827e-05,
"loss": 0.1206,
"step": 2900
},
{
"epoch": 0.11977151280633867,
"grad_norm": 0.45580488443374634,
"learning_rate": 8.820319231873949e-05,
"loss": 0.1187,
"step": 2925
},
{
"epoch": 0.1207952009499826,
"grad_norm": 0.2574482560157776,
"learning_rate": 8.81006113823807e-05,
"loss": 0.1229,
"step": 2950
},
{
"epoch": 0.12181888909362652,
"grad_norm": 0.2733965516090393,
"learning_rate": 8.799803044602192e-05,
"loss": 0.1191,
"step": 2975
},
{
"epoch": 0.12284257723727043,
"grad_norm": 0.2117166668176651,
"learning_rate": 8.789544950966313e-05,
"loss": 0.1205,
"step": 3000
},
{
"epoch": 0.12386626538091436,
"grad_norm": 0.5137503147125244,
"learning_rate": 8.779286857330435e-05,
"loss": 0.1264,
"step": 3025
},
{
"epoch": 0.12488995352455828,
"grad_norm": 0.23070771992206573,
"learning_rate": 8.769028763694556e-05,
"loss": 0.118,
"step": 3050
},
{
"epoch": 0.1259136416682022,
"grad_norm": 0.2723982334136963,
"learning_rate": 8.758770670058676e-05,
"loss": 0.1152,
"step": 3075
},
{
"epoch": 0.1269373298118461,
"grad_norm": 0.3011278212070465,
"learning_rate": 8.748512576422798e-05,
"loss": 0.1187,
"step": 3100
},
{
"epoch": 0.12796101795549003,
"grad_norm": 0.22801214456558228,
"learning_rate": 8.738254482786919e-05,
"loss": 0.1182,
"step": 3125
},
{
"epoch": 0.12898470609913396,
"grad_norm": 0.3295694589614868,
"learning_rate": 8.72799638915104e-05,
"loss": 0.1213,
"step": 3150
},
{
"epoch": 0.13000839424277788,
"grad_norm": 0.34608685970306396,
"learning_rate": 8.717738295515162e-05,
"loss": 0.1199,
"step": 3175
},
{
"epoch": 0.1310320823864218,
"grad_norm": 0.5989237427711487,
"learning_rate": 8.707480201879284e-05,
"loss": 0.1173,
"step": 3200
},
{
"epoch": 0.13205577053006573,
"grad_norm": 0.3048112094402313,
"learning_rate": 8.697222108243405e-05,
"loss": 0.1178,
"step": 3225
},
{
"epoch": 0.13307945867370963,
"grad_norm": 0.3791589140892029,
"learning_rate": 8.686964014607527e-05,
"loss": 0.1175,
"step": 3250
},
{
"epoch": 0.13410314681735355,
"grad_norm": 0.1966562420129776,
"learning_rate": 8.676705920971647e-05,
"loss": 0.1192,
"step": 3275
},
{
"epoch": 0.13512683496099748,
"grad_norm": 0.36613497138023376,
"learning_rate": 8.666447827335768e-05,
"loss": 0.1167,
"step": 3300
},
{
"epoch": 0.1361505231046414,
"grad_norm": 0.35663649439811707,
"learning_rate": 8.65618973369989e-05,
"loss": 0.122,
"step": 3325
},
{
"epoch": 0.13717421124828533,
"grad_norm": 0.2863902151584625,
"learning_rate": 8.645931640064011e-05,
"loss": 0.1157,
"step": 3350
},
{
"epoch": 0.13819789939192925,
"grad_norm": 0.3368700444698334,
"learning_rate": 8.635673546428133e-05,
"loss": 0.1174,
"step": 3375
},
{
"epoch": 0.13922158753557315,
"grad_norm": 0.3548611104488373,
"learning_rate": 8.625415452792254e-05,
"loss": 0.118,
"step": 3400
},
{
"epoch": 0.14024527567921707,
"grad_norm": 0.25708600878715515,
"learning_rate": 8.615157359156375e-05,
"loss": 0.1119,
"step": 3425
},
{
"epoch": 0.141268963822861,
"grad_norm": 0.24036449193954468,
"learning_rate": 8.604899265520497e-05,
"loss": 0.115,
"step": 3450
},
{
"epoch": 0.14229265196650492,
"grad_norm": 0.45417720079421997,
"learning_rate": 8.594641171884617e-05,
"loss": 0.1199,
"step": 3475
},
{
"epoch": 0.14331634011014885,
"grad_norm": 0.28222933411598206,
"learning_rate": 8.584383078248738e-05,
"loss": 0.113,
"step": 3500
},
{
"epoch": 0.14434002825379277,
"grad_norm": 0.2157520204782486,
"learning_rate": 8.57412498461286e-05,
"loss": 0.1146,
"step": 3525
},
{
"epoch": 0.1453637163974367,
"grad_norm": 0.3632587790489197,
"learning_rate": 8.563866890976981e-05,
"loss": 0.1174,
"step": 3550
},
{
"epoch": 0.1463874045410806,
"grad_norm": 0.23103779554367065,
"learning_rate": 8.553608797341103e-05,
"loss": 0.1111,
"step": 3575
},
{
"epoch": 0.14741109268472452,
"grad_norm": 0.316450297832489,
"learning_rate": 8.543350703705224e-05,
"loss": 0.1148,
"step": 3600
},
{
"epoch": 0.14843478082836845,
"grad_norm": 0.2546501159667969,
"learning_rate": 8.533092610069346e-05,
"loss": 0.1116,
"step": 3625
},
{
"epoch": 0.14945846897201237,
"grad_norm": 0.5451907515525818,
"learning_rate": 8.522834516433467e-05,
"loss": 0.1154,
"step": 3650
},
{
"epoch": 0.1504821571156563,
"grad_norm": 0.3568204939365387,
"learning_rate": 8.512576422797587e-05,
"loss": 0.1152,
"step": 3675
},
{
"epoch": 0.15150584525930022,
"grad_norm": 0.22811046242713928,
"learning_rate": 8.502318329161709e-05,
"loss": 0.1164,
"step": 3700
},
{
"epoch": 0.15252953340294412,
"grad_norm": 0.2431710660457611,
"learning_rate": 8.49206023552583e-05,
"loss": 0.1127,
"step": 3725
},
{
"epoch": 0.15355322154658804,
"grad_norm": 0.27546626329421997,
"learning_rate": 8.481802141889952e-05,
"loss": 0.1174,
"step": 3750
},
{
"epoch": 0.15457690969023197,
"grad_norm": 0.23295095562934875,
"learning_rate": 8.471544048254073e-05,
"loss": 0.1124,
"step": 3775
},
{
"epoch": 0.1556005978338759,
"grad_norm": 0.2244202196598053,
"learning_rate": 8.461285954618195e-05,
"loss": 0.1104,
"step": 3800
},
{
"epoch": 0.15662428597751982,
"grad_norm": 0.19517052173614502,
"learning_rate": 8.451027860982316e-05,
"loss": 0.1114,
"step": 3825
},
{
"epoch": 0.15764797412116374,
"grad_norm": 0.26743006706237793,
"learning_rate": 8.440769767346436e-05,
"loss": 0.1112,
"step": 3850
},
{
"epoch": 0.15867166226480764,
"grad_norm": 0.26785147190093994,
"learning_rate": 8.430511673710558e-05,
"loss": 0.1126,
"step": 3875
},
{
"epoch": 0.15969535040845156,
"grad_norm": 0.2772103250026703,
"learning_rate": 8.420253580074679e-05,
"loss": 0.1137,
"step": 3900
},
{
"epoch": 0.1607190385520955,
"grad_norm": 0.268732488155365,
"learning_rate": 8.409995486438801e-05,
"loss": 0.1148,
"step": 3925
},
{
"epoch": 0.1617427266957394,
"grad_norm": 0.42661407589912415,
"learning_rate": 8.399737392802922e-05,
"loss": 0.1147,
"step": 3950
},
{
"epoch": 0.16276641483938334,
"grad_norm": 0.2644007205963135,
"learning_rate": 8.389479299167044e-05,
"loss": 0.1179,
"step": 3975
},
{
"epoch": 0.16379010298302726,
"grad_norm": 0.4172644019126892,
"learning_rate": 8.379221205531165e-05,
"loss": 0.1167,
"step": 4000
},
{
"epoch": 0.16481379112667116,
"grad_norm": 0.2200649082660675,
"learning_rate": 8.368963111895287e-05,
"loss": 0.1154,
"step": 4025
},
{
"epoch": 0.16583747927031509,
"grad_norm": 0.3296594023704529,
"learning_rate": 8.358705018259407e-05,
"loss": 0.1136,
"step": 4050
},
{
"epoch": 0.166861167413959,
"grad_norm": 0.2407379001379013,
"learning_rate": 8.348446924623528e-05,
"loss": 0.1127,
"step": 4075
},
{
"epoch": 0.16788485555760294,
"grad_norm": 0.19917023181915283,
"learning_rate": 8.33818883098765e-05,
"loss": 0.1167,
"step": 4100
},
{
"epoch": 0.16890854370124686,
"grad_norm": 0.2644532024860382,
"learning_rate": 8.327930737351771e-05,
"loss": 0.1156,
"step": 4125
},
{
"epoch": 0.16993223184489079,
"grad_norm": 0.22355978190898895,
"learning_rate": 8.317672643715893e-05,
"loss": 0.113,
"step": 4150
},
{
"epoch": 0.17095591998853468,
"grad_norm": 0.3826581835746765,
"learning_rate": 8.307414550080014e-05,
"loss": 0.1172,
"step": 4175
},
{
"epoch": 0.1719796081321786,
"grad_norm": 0.2284521907567978,
"learning_rate": 8.297156456444136e-05,
"loss": 0.1135,
"step": 4200
},
{
"epoch": 0.17300329627582253,
"grad_norm": 0.2520081400871277,
"learning_rate": 8.286898362808257e-05,
"loss": 0.1146,
"step": 4225
},
{
"epoch": 0.17402698441946646,
"grad_norm": 0.385019451379776,
"learning_rate": 8.276640269172377e-05,
"loss": 0.1102,
"step": 4250
},
{
"epoch": 0.17505067256311038,
"grad_norm": 0.24445098638534546,
"learning_rate": 8.266382175536499e-05,
"loss": 0.1124,
"step": 4275
},
{
"epoch": 0.1760743607067543,
"grad_norm": 0.24673700332641602,
"learning_rate": 8.25612408190062e-05,
"loss": 0.112,
"step": 4300
},
{
"epoch": 0.1770980488503982,
"grad_norm": 0.2432449609041214,
"learning_rate": 8.245865988264741e-05,
"loss": 0.1121,
"step": 4325
},
{
"epoch": 0.17812173699404213,
"grad_norm": 0.3263969123363495,
"learning_rate": 8.235607894628863e-05,
"loss": 0.1131,
"step": 4350
},
{
"epoch": 0.17914542513768605,
"grad_norm": 0.25198620557785034,
"learning_rate": 8.225349800992984e-05,
"loss": 0.1106,
"step": 4375
},
{
"epoch": 0.18016911328132998,
"grad_norm": 0.31025946140289307,
"learning_rate": 8.215091707357106e-05,
"loss": 0.1083,
"step": 4400
},
{
"epoch": 0.1811928014249739,
"grad_norm": 0.2822698950767517,
"learning_rate": 8.204833613721227e-05,
"loss": 0.1128,
"step": 4425
},
{
"epoch": 0.18221648956861783,
"grad_norm": 0.35102951526641846,
"learning_rate": 8.194575520085347e-05,
"loss": 0.1119,
"step": 4450
},
{
"epoch": 0.18324017771226173,
"grad_norm": 0.2636832892894745,
"learning_rate": 8.184317426449469e-05,
"loss": 0.1154,
"step": 4475
},
{
"epoch": 0.18426386585590565,
"grad_norm": 0.2501748204231262,
"learning_rate": 8.17405933281359e-05,
"loss": 0.1146,
"step": 4500
},
{
"epoch": 0.18528755399954958,
"grad_norm": 0.24221724271774292,
"learning_rate": 8.163801239177712e-05,
"loss": 0.1111,
"step": 4525
},
{
"epoch": 0.1863112421431935,
"grad_norm": 0.23959171772003174,
"learning_rate": 8.153543145541833e-05,
"loss": 0.1121,
"step": 4550
},
{
"epoch": 0.18733493028683743,
"grad_norm": 0.28256523609161377,
"learning_rate": 8.143285051905955e-05,
"loss": 0.1056,
"step": 4575
},
{
"epoch": 0.18835861843048135,
"grad_norm": 0.1967180222272873,
"learning_rate": 8.133026958270076e-05,
"loss": 0.1105,
"step": 4600
},
{
"epoch": 0.18938230657412525,
"grad_norm": 0.25965237617492676,
"learning_rate": 8.122768864634198e-05,
"loss": 0.1082,
"step": 4625
},
{
"epoch": 0.19040599471776917,
"grad_norm": 0.2722185552120209,
"learning_rate": 8.112510770998318e-05,
"loss": 0.1134,
"step": 4650
},
{
"epoch": 0.1914296828614131,
"grad_norm": 0.24172380566596985,
"learning_rate": 8.102252677362439e-05,
"loss": 0.1138,
"step": 4675
},
{
"epoch": 0.19245337100505702,
"grad_norm": 0.26783162355422974,
"learning_rate": 8.091994583726561e-05,
"loss": 0.1079,
"step": 4700
},
{
"epoch": 0.19347705914870095,
"grad_norm": 0.2563905715942383,
"learning_rate": 8.081736490090682e-05,
"loss": 0.1097,
"step": 4725
},
{
"epoch": 0.19450074729234487,
"grad_norm": 0.31813859939575195,
"learning_rate": 8.071478396454804e-05,
"loss": 0.1107,
"step": 4750
},
{
"epoch": 0.19552443543598877,
"grad_norm": 0.2353924810886383,
"learning_rate": 8.061220302818925e-05,
"loss": 0.1112,
"step": 4775
},
{
"epoch": 0.1965481235796327,
"grad_norm": 0.24150237441062927,
"learning_rate": 8.050962209183047e-05,
"loss": 0.1073,
"step": 4800
},
{
"epoch": 0.19757181172327662,
"grad_norm": 0.31365466117858887,
"learning_rate": 8.040704115547167e-05,
"loss": 0.1091,
"step": 4825
},
{
"epoch": 0.19859549986692054,
"grad_norm": 0.3214346468448639,
"learning_rate": 8.030446021911288e-05,
"loss": 0.1122,
"step": 4850
},
{
"epoch": 0.19961918801056447,
"grad_norm": 0.2675853967666626,
"learning_rate": 8.02018792827541e-05,
"loss": 0.1078,
"step": 4875
},
{
"epoch": 0.2006428761542084,
"grad_norm": 0.2487669289112091,
"learning_rate": 8.009929834639531e-05,
"loss": 0.1087,
"step": 4900
},
{
"epoch": 0.2016665642978523,
"grad_norm": 0.23890641331672668,
"learning_rate": 7.999671741003653e-05,
"loss": 0.1143,
"step": 4925
},
{
"epoch": 0.20269025244149622,
"grad_norm": 0.25644829869270325,
"learning_rate": 7.989413647367774e-05,
"loss": 0.1117,
"step": 4950
},
{
"epoch": 0.20371394058514014,
"grad_norm": 0.24456225335597992,
"learning_rate": 7.979155553731896e-05,
"loss": 0.115,
"step": 4975
},
{
"epoch": 0.20473762872878407,
"grad_norm": 0.17908118665218353,
"learning_rate": 7.968897460096017e-05,
"loss": 0.1124,
"step": 5000
},
{
"epoch": 0.205761316872428,
"grad_norm": 0.35271450877189636,
"learning_rate": 7.958639366460137e-05,
"loss": 0.1101,
"step": 5025
},
{
"epoch": 0.20678500501607192,
"grad_norm": 0.2770853340625763,
"learning_rate": 7.948381272824259e-05,
"loss": 0.1119,
"step": 5050
},
{
"epoch": 0.2078086931597158,
"grad_norm": 0.3154667317867279,
"learning_rate": 7.93812317918838e-05,
"loss": 0.1089,
"step": 5075
},
{
"epoch": 0.20883238130335974,
"grad_norm": 0.27350950241088867,
"learning_rate": 7.927865085552502e-05,
"loss": 0.1113,
"step": 5100
},
{
"epoch": 0.20985606944700366,
"grad_norm": 0.24580037593841553,
"learning_rate": 7.917606991916623e-05,
"loss": 0.1112,
"step": 5125
},
{
"epoch": 0.2108797575906476,
"grad_norm": 0.23447053134441376,
"learning_rate": 7.907348898280744e-05,
"loss": 0.1108,
"step": 5150
},
{
"epoch": 0.2119034457342915,
"grad_norm": 0.2380298674106598,
"learning_rate": 7.897090804644866e-05,
"loss": 0.1082,
"step": 5175
},
{
"epoch": 0.21292713387793544,
"grad_norm": 0.22617502510547638,
"learning_rate": 7.886832711008987e-05,
"loss": 0.108,
"step": 5200
},
{
"epoch": 0.21395082202157933,
"grad_norm": 0.2923017740249634,
"learning_rate": 7.876574617373108e-05,
"loss": 0.1094,
"step": 5225
},
{
"epoch": 0.21497451016522326,
"grad_norm": 0.280912846326828,
"learning_rate": 7.866316523737229e-05,
"loss": 0.1108,
"step": 5250
},
{
"epoch": 0.21599819830886718,
"grad_norm": 0.24020980298519135,
"learning_rate": 7.85605843010135e-05,
"loss": 0.1104,
"step": 5275
},
{
"epoch": 0.2170218864525111,
"grad_norm": 0.2545349597930908,
"learning_rate": 7.845800336465472e-05,
"loss": 0.1105,
"step": 5300
},
{
"epoch": 0.21804557459615503,
"grad_norm": 0.22493721544742584,
"learning_rate": 7.835542242829593e-05,
"loss": 0.1086,
"step": 5325
},
{
"epoch": 0.21906926273979896,
"grad_norm": 0.26803284883499146,
"learning_rate": 7.825284149193715e-05,
"loss": 0.107,
"step": 5350
},
{
"epoch": 0.22009295088344286,
"grad_norm": 0.22854533791542053,
"learning_rate": 7.815026055557836e-05,
"loss": 0.1097,
"step": 5375
},
{
"epoch": 0.22111663902708678,
"grad_norm": 0.19401207566261292,
"learning_rate": 7.804767961921958e-05,
"loss": 0.1082,
"step": 5400
},
{
"epoch": 0.2221403271707307,
"grad_norm": 0.22267797589302063,
"learning_rate": 7.794509868286078e-05,
"loss": 0.1107,
"step": 5425
},
{
"epoch": 0.22316401531437463,
"grad_norm": 0.19586950540542603,
"learning_rate": 7.7842517746502e-05,
"loss": 0.1054,
"step": 5450
},
{
"epoch": 0.22418770345801856,
"grad_norm": 0.23129217326641083,
"learning_rate": 7.773993681014321e-05,
"loss": 0.1093,
"step": 5475
},
{
"epoch": 0.22521139160166248,
"grad_norm": 0.26472529768943787,
"learning_rate": 7.763735587378442e-05,
"loss": 0.1052,
"step": 5500
},
{
"epoch": 0.22623507974530638,
"grad_norm": 0.22230687737464905,
"learning_rate": 7.753477493742564e-05,
"loss": 0.1093,
"step": 5525
},
{
"epoch": 0.2272587678889503,
"grad_norm": 0.3101346492767334,
"learning_rate": 7.743219400106685e-05,
"loss": 0.1036,
"step": 5550
},
{
"epoch": 0.22828245603259423,
"grad_norm": 0.18460065126419067,
"learning_rate": 7.732961306470807e-05,
"loss": 0.1108,
"step": 5575
},
{
"epoch": 0.22930614417623815,
"grad_norm": 0.20973823964595795,
"learning_rate": 7.722703212834928e-05,
"loss": 0.1096,
"step": 5600
},
{
"epoch": 0.23032983231988208,
"grad_norm": 0.277650386095047,
"learning_rate": 7.712445119199048e-05,
"loss": 0.1065,
"step": 5625
},
{
"epoch": 0.231353520463526,
"grad_norm": 0.22262975573539734,
"learning_rate": 7.70218702556317e-05,
"loss": 0.1103,
"step": 5650
},
{
"epoch": 0.2323772086071699,
"grad_norm": 0.24553848803043365,
"learning_rate": 7.691928931927291e-05,
"loss": 0.1111,
"step": 5675
},
{
"epoch": 0.23340089675081382,
"grad_norm": 0.30652496218681335,
"learning_rate": 7.681670838291413e-05,
"loss": 0.1066,
"step": 5700
},
{
"epoch": 0.23442458489445775,
"grad_norm": 0.17171849310398102,
"learning_rate": 7.671412744655534e-05,
"loss": 0.1074,
"step": 5725
},
{
"epoch": 0.23544827303810167,
"grad_norm": 0.27997660636901855,
"learning_rate": 7.661154651019656e-05,
"loss": 0.1057,
"step": 5750
},
{
"epoch": 0.2364719611817456,
"grad_norm": 0.302190899848938,
"learning_rate": 7.650896557383777e-05,
"loss": 0.1078,
"step": 5775
},
{
"epoch": 0.23749564932538952,
"grad_norm": 0.29618439078330994,
"learning_rate": 7.640638463747897e-05,
"loss": 0.1078,
"step": 5800
},
{
"epoch": 0.23851933746903342,
"grad_norm": 0.25362005829811096,
"learning_rate": 7.630380370112019e-05,
"loss": 0.1052,
"step": 5825
},
{
"epoch": 0.23954302561267735,
"grad_norm": 0.22422952950000763,
"learning_rate": 7.62012227647614e-05,
"loss": 0.1069,
"step": 5850
},
{
"epoch": 0.24056671375632127,
"grad_norm": 0.21477550268173218,
"learning_rate": 7.609864182840262e-05,
"loss": 0.1079,
"step": 5875
},
{
"epoch": 0.2415904018999652,
"grad_norm": 0.17787286639213562,
"learning_rate": 7.599606089204383e-05,
"loss": 0.1087,
"step": 5900
},
{
"epoch": 0.24261409004360912,
"grad_norm": 0.25852805376052856,
"learning_rate": 7.589347995568505e-05,
"loss": 0.1055,
"step": 5925
},
{
"epoch": 0.24363777818725305,
"grad_norm": 0.2465522438287735,
"learning_rate": 7.579089901932626e-05,
"loss": 0.1052,
"step": 5950
},
{
"epoch": 0.24466146633089694,
"grad_norm": 0.20638887584209442,
"learning_rate": 7.568831808296747e-05,
"loss": 0.1059,
"step": 5975
},
{
"epoch": 0.24568515447454087,
"grad_norm": 0.24599237740039825,
"learning_rate": 7.558573714660868e-05,
"loss": 0.1052,
"step": 6000
},
{
"epoch": 0.2467088426181848,
"grad_norm": 0.2663975954055786,
"learning_rate": 7.548315621024989e-05,
"loss": 0.1051,
"step": 6025
},
{
"epoch": 0.24773253076182872,
"grad_norm": 0.2528514266014099,
"learning_rate": 7.53805752738911e-05,
"loss": 0.108,
"step": 6050
},
{
"epoch": 0.24875621890547264,
"grad_norm": 0.23383919894695282,
"learning_rate": 7.527799433753232e-05,
"loss": 0.108,
"step": 6075
},
{
"epoch": 0.24977990704911657,
"grad_norm": 0.23460572957992554,
"learning_rate": 7.517541340117353e-05,
"loss": 0.1052,
"step": 6100
},
{
"epoch": 0.25080359519276046,
"grad_norm": 0.23296788334846497,
"learning_rate": 7.507283246481475e-05,
"loss": 0.1045,
"step": 6125
},
{
"epoch": 0.2518272833364044,
"grad_norm": 0.2544507682323456,
"learning_rate": 7.497025152845596e-05,
"loss": 0.1073,
"step": 6150
},
{
"epoch": 0.2528509714800483,
"grad_norm": 0.33089134097099304,
"learning_rate": 7.486767059209718e-05,
"loss": 0.1047,
"step": 6175
},
{
"epoch": 0.2538746596236922,
"grad_norm": 0.2965986132621765,
"learning_rate": 7.476508965573838e-05,
"loss": 0.1094,
"step": 6200
},
{
"epoch": 0.25489834776733616,
"grad_norm": 0.2606011927127838,
"learning_rate": 7.46625087193796e-05,
"loss": 0.1035,
"step": 6225
},
{
"epoch": 0.25592203591098006,
"grad_norm": 0.21870043873786926,
"learning_rate": 7.455992778302081e-05,
"loss": 0.1081,
"step": 6250
},
{
"epoch": 0.256945724054624,
"grad_norm": 0.37876567244529724,
"learning_rate": 7.445734684666202e-05,
"loss": 0.1049,
"step": 6275
},
{
"epoch": 0.2579694121982679,
"grad_norm": 0.26862943172454834,
"learning_rate": 7.435476591030324e-05,
"loss": 0.0993,
"step": 6300
},
{
"epoch": 0.25899310034191186,
"grad_norm": 0.23476149141788483,
"learning_rate": 7.425218497394445e-05,
"loss": 0.1059,
"step": 6325
},
{
"epoch": 0.26001678848555576,
"grad_norm": 0.21397703886032104,
"learning_rate": 7.414960403758567e-05,
"loss": 0.1068,
"step": 6350
},
{
"epoch": 0.26104047662919966,
"grad_norm": 0.18096783757209778,
"learning_rate": 7.404702310122688e-05,
"loss": 0.1072,
"step": 6375
},
{
"epoch": 0.2620641647728436,
"grad_norm": 0.2302347868680954,
"learning_rate": 7.394444216486808e-05,
"loss": 0.1106,
"step": 6400
},
{
"epoch": 0.2630878529164875,
"grad_norm": 0.23029176890850067,
"learning_rate": 7.38418612285093e-05,
"loss": 0.1064,
"step": 6425
},
{
"epoch": 0.26411154106013146,
"grad_norm": 0.22477678954601288,
"learning_rate": 7.373928029215051e-05,
"loss": 0.1066,
"step": 6450
},
{
"epoch": 0.26513522920377536,
"grad_norm": 0.30752694606781006,
"learning_rate": 7.363669935579173e-05,
"loss": 0.1072,
"step": 6475
},
{
"epoch": 0.26615891734741925,
"grad_norm": 0.21718832850456238,
"learning_rate": 7.353411841943294e-05,
"loss": 0.1077,
"step": 6500
},
{
"epoch": 0.2671826054910632,
"grad_norm": 0.24620802700519562,
"learning_rate": 7.343153748307416e-05,
"loss": 0.1053,
"step": 6525
},
{
"epoch": 0.2682062936347071,
"grad_norm": 0.1965140402317047,
"learning_rate": 7.332895654671537e-05,
"loss": 0.1057,
"step": 6550
},
{
"epoch": 0.26922998177835106,
"grad_norm": 0.25057727098464966,
"learning_rate": 7.322637561035657e-05,
"loss": 0.1037,
"step": 6575
},
{
"epoch": 0.27025366992199495,
"grad_norm": 0.2844404876232147,
"learning_rate": 7.312379467399779e-05,
"loss": 0.1026,
"step": 6600
},
{
"epoch": 0.2712773580656389,
"grad_norm": 0.23390497267246246,
"learning_rate": 7.3021213737639e-05,
"loss": 0.1032,
"step": 6625
},
{
"epoch": 0.2723010462092828,
"grad_norm": 0.19829843938350677,
"learning_rate": 7.291863280128022e-05,
"loss": 0.1091,
"step": 6650
},
{
"epoch": 0.2733247343529267,
"grad_norm": 0.24273422360420227,
"learning_rate": 7.281605186492143e-05,
"loss": 0.1075,
"step": 6675
},
{
"epoch": 0.27434842249657065,
"grad_norm": 0.3134569823741913,
"learning_rate": 7.271347092856265e-05,
"loss": 0.103,
"step": 6700
},
{
"epoch": 0.27537211064021455,
"grad_norm": 0.18153002858161926,
"learning_rate": 7.261088999220386e-05,
"loss": 0.1055,
"step": 6725
},
{
"epoch": 0.2763957987838585,
"grad_norm": 0.22859077155590057,
"learning_rate": 7.250830905584507e-05,
"loss": 0.1082,
"step": 6750
},
{
"epoch": 0.2774194869275024,
"grad_norm": 0.2673007845878601,
"learning_rate": 7.240572811948628e-05,
"loss": 0.1045,
"step": 6775
},
{
"epoch": 0.2784431750711463,
"grad_norm": 0.2651185691356659,
"learning_rate": 7.230314718312749e-05,
"loss": 0.1033,
"step": 6800
},
{
"epoch": 0.27946686321479025,
"grad_norm": 0.2199607491493225,
"learning_rate": 7.22005662467687e-05,
"loss": 0.1056,
"step": 6825
},
{
"epoch": 0.28049055135843415,
"grad_norm": 0.2549345791339874,
"learning_rate": 7.209798531040992e-05,
"loss": 0.1053,
"step": 6850
},
{
"epoch": 0.2815142395020781,
"grad_norm": 0.22934679687023163,
"learning_rate": 7.199540437405113e-05,
"loss": 0.1065,
"step": 6875
},
{
"epoch": 0.282537927645722,
"grad_norm": 0.2626487910747528,
"learning_rate": 7.189282343769235e-05,
"loss": 0.1034,
"step": 6900
},
{
"epoch": 0.28356161578936595,
"grad_norm": 0.2974385917186737,
"learning_rate": 7.179024250133356e-05,
"loss": 0.1046,
"step": 6925
},
{
"epoch": 0.28458530393300985,
"grad_norm": 0.2448814958333969,
"learning_rate": 7.168766156497478e-05,
"loss": 0.1067,
"step": 6950
},
{
"epoch": 0.28560899207665374,
"grad_norm": 0.39903128147125244,
"learning_rate": 7.158508062861598e-05,
"loss": 0.1013,
"step": 6975
},
{
"epoch": 0.2866326802202977,
"grad_norm": 0.25461485981941223,
"learning_rate": 7.14824996922572e-05,
"loss": 0.1051,
"step": 7000
},
{
"epoch": 0.2876563683639416,
"grad_norm": 0.22692956030368805,
"learning_rate": 7.137991875589841e-05,
"loss": 0.1051,
"step": 7025
},
{
"epoch": 0.28868005650758555,
"grad_norm": 0.18912681937217712,
"learning_rate": 7.127733781953962e-05,
"loss": 0.1049,
"step": 7050
},
{
"epoch": 0.28970374465122944,
"grad_norm": 0.29922547936439514,
"learning_rate": 7.117475688318084e-05,
"loss": 0.1028,
"step": 7075
},
{
"epoch": 0.2907274327948734,
"grad_norm": 0.39868420362472534,
"learning_rate": 7.107217594682205e-05,
"loss": 0.1046,
"step": 7100
},
{
"epoch": 0.2917511209385173,
"grad_norm": 0.2455105036497116,
"learning_rate": 7.096959501046327e-05,
"loss": 0.108,
"step": 7125
},
{
"epoch": 0.2927748090821612,
"grad_norm": 0.22028543055057526,
"learning_rate": 7.086701407410448e-05,
"loss": 0.1031,
"step": 7150
},
{
"epoch": 0.29379849722580514,
"grad_norm": 0.27611467242240906,
"learning_rate": 7.076443313774568e-05,
"loss": 0.1073,
"step": 7175
},
{
"epoch": 0.29482218536944904,
"grad_norm": 0.31651851534843445,
"learning_rate": 7.06618522013869e-05,
"loss": 0.1005,
"step": 7200
},
{
"epoch": 0.295845873513093,
"grad_norm": 0.2306353896856308,
"learning_rate": 7.055927126502811e-05,
"loss": 0.1035,
"step": 7225
},
{
"epoch": 0.2968695616567369,
"grad_norm": 0.22398217022418976,
"learning_rate": 7.045669032866933e-05,
"loss": 0.1059,
"step": 7250
},
{
"epoch": 0.2978932498003808,
"grad_norm": 0.24632596969604492,
"learning_rate": 7.035410939231054e-05,
"loss": 0.106,
"step": 7275
},
{
"epoch": 0.29891693794402474,
"grad_norm": 0.21331587433815002,
"learning_rate": 7.025152845595176e-05,
"loss": 0.0994,
"step": 7300
},
{
"epoch": 0.29994062608766864,
"grad_norm": 0.37877365946769714,
"learning_rate": 7.014894751959297e-05,
"loss": 0.102,
"step": 7325
},
{
"epoch": 0.3009643142313126,
"grad_norm": 0.28108686208724976,
"learning_rate": 7.004636658323419e-05,
"loss": 0.104,
"step": 7350
},
{
"epoch": 0.3019880023749565,
"grad_norm": 0.25342661142349243,
"learning_rate": 6.994378564687539e-05,
"loss": 0.1044,
"step": 7375
},
{
"epoch": 0.30301169051860044,
"grad_norm": 0.7590738534927368,
"learning_rate": 6.98412047105166e-05,
"loss": 0.106,
"step": 7400
},
{
"epoch": 0.30403537866224434,
"grad_norm": 0.20050746202468872,
"learning_rate": 6.973862377415782e-05,
"loss": 0.1069,
"step": 7425
},
{
"epoch": 0.30505906680588823,
"grad_norm": 0.27144044637680054,
"learning_rate": 6.963604283779903e-05,
"loss": 0.104,
"step": 7450
},
{
"epoch": 0.3060827549495322,
"grad_norm": 0.2616618275642395,
"learning_rate": 6.953346190144025e-05,
"loss": 0.101,
"step": 7475
},
{
"epoch": 0.3071064430931761,
"grad_norm": 0.27171334624290466,
"learning_rate": 6.943088096508146e-05,
"loss": 0.1036,
"step": 7500
},
{
"epoch": 0.30813013123682004,
"grad_norm": 0.19246098399162292,
"learning_rate": 6.932830002872268e-05,
"loss": 0.1035,
"step": 7525
},
{
"epoch": 0.30915381938046393,
"grad_norm": 0.2488516867160797,
"learning_rate": 6.922571909236388e-05,
"loss": 0.1053,
"step": 7550
},
{
"epoch": 0.31017750752410783,
"grad_norm": 0.2559676170349121,
"learning_rate": 6.912313815600509e-05,
"loss": 0.1039,
"step": 7575
},
{
"epoch": 0.3112011956677518,
"grad_norm": 0.19615231454372406,
"learning_rate": 6.90205572196463e-05,
"loss": 0.1016,
"step": 7600
},
{
"epoch": 0.3122248838113957,
"grad_norm": 0.22992445528507233,
"learning_rate": 6.891797628328752e-05,
"loss": 0.103,
"step": 7625
},
{
"epoch": 0.31324857195503963,
"grad_norm": 0.25916945934295654,
"learning_rate": 6.881539534692874e-05,
"loss": 0.1033,
"step": 7650
},
{
"epoch": 0.31427226009868353,
"grad_norm": 0.2485833466053009,
"learning_rate": 6.871281441056995e-05,
"loss": 0.1023,
"step": 7675
},
{
"epoch": 0.3152959482423275,
"grad_norm": 0.3130246102809906,
"learning_rate": 6.861023347421116e-05,
"loss": 0.1013,
"step": 7700
},
{
"epoch": 0.3163196363859714,
"grad_norm": 0.17889827489852905,
"learning_rate": 6.850765253785238e-05,
"loss": 0.1063,
"step": 7725
},
{
"epoch": 0.3173433245296153,
"grad_norm": 0.23844337463378906,
"learning_rate": 6.840507160149358e-05,
"loss": 0.1024,
"step": 7750
},
{
"epoch": 0.31836701267325923,
"grad_norm": 0.2489156275987625,
"learning_rate": 6.83024906651348e-05,
"loss": 0.1018,
"step": 7775
},
{
"epoch": 0.31939070081690313,
"grad_norm": 0.24830876290798187,
"learning_rate": 6.819990972877601e-05,
"loss": 0.1018,
"step": 7800
},
{
"epoch": 0.3204143889605471,
"grad_norm": 0.23647700250148773,
"learning_rate": 6.809732879241722e-05,
"loss": 0.1054,
"step": 7825
},
{
"epoch": 0.321438077104191,
"grad_norm": 0.3480120003223419,
"learning_rate": 6.799474785605844e-05,
"loss": 0.0991,
"step": 7850
},
{
"epoch": 0.3224617652478349,
"grad_norm": 0.2117711305618286,
"learning_rate": 6.789216691969965e-05,
"loss": 0.1019,
"step": 7875
},
{
"epoch": 0.3234854533914788,
"grad_norm": 0.21510981023311615,
"learning_rate": 6.778958598334087e-05,
"loss": 0.1023,
"step": 7900
},
{
"epoch": 0.3245091415351227,
"grad_norm": 0.21288833022117615,
"learning_rate": 6.768700504698208e-05,
"loss": 0.1018,
"step": 7925
},
{
"epoch": 0.3255328296787667,
"grad_norm": 0.2654208242893219,
"learning_rate": 6.758442411062328e-05,
"loss": 0.1,
"step": 7950
},
{
"epoch": 0.3265565178224106,
"grad_norm": 0.23810634016990662,
"learning_rate": 6.74818431742645e-05,
"loss": 0.1007,
"step": 7975
},
{
"epoch": 0.3275802059660545,
"grad_norm": 0.26225727796554565,
"learning_rate": 6.737926223790571e-05,
"loss": 0.1037,
"step": 8000
},
{
"epoch": 0.3286038941096984,
"grad_norm": 0.28832173347473145,
"learning_rate": 6.727668130154693e-05,
"loss": 0.1024,
"step": 8025
},
{
"epoch": 0.3296275822533423,
"grad_norm": 0.25963491201400757,
"learning_rate": 6.717410036518814e-05,
"loss": 0.106,
"step": 8050
},
{
"epoch": 0.3306512703969863,
"grad_norm": 0.3249678611755371,
"learning_rate": 6.707151942882936e-05,
"loss": 0.0958,
"step": 8075
},
{
"epoch": 0.33167495854063017,
"grad_norm": 0.25855204463005066,
"learning_rate": 6.696893849247057e-05,
"loss": 0.1004,
"step": 8100
},
{
"epoch": 0.3326986466842741,
"grad_norm": 0.2253751903772354,
"learning_rate": 6.686635755611179e-05,
"loss": 0.1013,
"step": 8125
},
{
"epoch": 0.333722334827918,
"grad_norm": 0.25214654207229614,
"learning_rate": 6.676377661975299e-05,
"loss": 0.1016,
"step": 8150
},
{
"epoch": 0.3347460229715619,
"grad_norm": 0.2561601996421814,
"learning_rate": 6.66611956833942e-05,
"loss": 0.1011,
"step": 8175
},
{
"epoch": 0.33576971111520587,
"grad_norm": 0.2241383194923401,
"learning_rate": 6.655861474703542e-05,
"loss": 0.1013,
"step": 8200
},
{
"epoch": 0.33679339925884977,
"grad_norm": 0.23701010644435883,
"learning_rate": 6.645603381067663e-05,
"loss": 0.0994,
"step": 8225
},
{
"epoch": 0.3378170874024937,
"grad_norm": 0.2312152236700058,
"learning_rate": 6.635345287431785e-05,
"loss": 0.0969,
"step": 8250
},
{
"epoch": 0.3388407755461376,
"grad_norm": 0.5713122487068176,
"learning_rate": 6.625087193795906e-05,
"loss": 0.1032,
"step": 8275
},
{
"epoch": 0.33986446368978157,
"grad_norm": 0.2621745467185974,
"learning_rate": 6.614829100160028e-05,
"loss": 0.1009,
"step": 8300
},
{
"epoch": 0.34088815183342547,
"grad_norm": 0.24803993105888367,
"learning_rate": 6.604571006524149e-05,
"loss": 0.0992,
"step": 8325
},
{
"epoch": 0.34191183997706937,
"grad_norm": 0.20469900965690613,
"learning_rate": 6.594312912888269e-05,
"loss": 0.1021,
"step": 8350
},
{
"epoch": 0.3429355281207133,
"grad_norm": 0.26485446095466614,
"learning_rate": 6.58405481925239e-05,
"loss": 0.1023,
"step": 8375
},
{
"epoch": 0.3439592162643572,
"grad_norm": 0.30211177468299866,
"learning_rate": 6.573796725616512e-05,
"loss": 0.1,
"step": 8400
},
{
"epoch": 0.34498290440800117,
"grad_norm": 0.19773200154304504,
"learning_rate": 6.563538631980634e-05,
"loss": 0.0998,
"step": 8425
},
{
"epoch": 0.34600659255164506,
"grad_norm": 0.37499427795410156,
"learning_rate": 6.553280538344755e-05,
"loss": 0.0969,
"step": 8450
},
{
"epoch": 0.34703028069528896,
"grad_norm": 0.23352007567882538,
"learning_rate": 6.543022444708877e-05,
"loss": 0.1013,
"step": 8475
},
{
"epoch": 0.3480539688389329,
"grad_norm": 0.22725583612918854,
"learning_rate": 6.532764351072998e-05,
"loss": 0.1005,
"step": 8500
},
{
"epoch": 0.3490776569825768,
"grad_norm": 0.2472585290670395,
"learning_rate": 6.522506257437118e-05,
"loss": 0.0981,
"step": 8525
},
{
"epoch": 0.35010134512622076,
"grad_norm": 0.24253399670124054,
"learning_rate": 6.51224816380124e-05,
"loss": 0.1029,
"step": 8550
},
{
"epoch": 0.35112503326986466,
"grad_norm": 0.22759589552879333,
"learning_rate": 6.501990070165361e-05,
"loss": 0.1026,
"step": 8575
},
{
"epoch": 0.3521487214135086,
"grad_norm": 0.3092879056930542,
"learning_rate": 6.491731976529482e-05,
"loss": 0.105,
"step": 8600
},
{
"epoch": 0.3531724095571525,
"grad_norm": 0.21212832629680634,
"learning_rate": 6.481473882893604e-05,
"loss": 0.1039,
"step": 8625
},
{
"epoch": 0.3541960977007964,
"grad_norm": 0.22957822680473328,
"learning_rate": 6.471215789257725e-05,
"loss": 0.1039,
"step": 8650
},
{
"epoch": 0.35521978584444036,
"grad_norm": 0.2514593005180359,
"learning_rate": 6.460957695621847e-05,
"loss": 0.105,
"step": 8675
},
{
"epoch": 0.35624347398808426,
"grad_norm": 0.32485923171043396,
"learning_rate": 6.450699601985968e-05,
"loss": 0.1043,
"step": 8700
},
{
"epoch": 0.3572671621317282,
"grad_norm": 0.25438931584358215,
"learning_rate": 6.440441508350088e-05,
"loss": 0.1033,
"step": 8725
},
{
"epoch": 0.3582908502753721,
"grad_norm": 0.26107901334762573,
"learning_rate": 6.43018341471421e-05,
"loss": 0.106,
"step": 8750
},
{
"epoch": 0.359314538419016,
"grad_norm": 0.20148183405399323,
"learning_rate": 6.41992532107833e-05,
"loss": 0.102,
"step": 8775
},
{
"epoch": 0.36033822656265996,
"grad_norm": 0.3115244209766388,
"learning_rate": 6.409667227442452e-05,
"loss": 0.1002,
"step": 8800
},
{
"epoch": 0.36136191470630386,
"grad_norm": 0.2722707688808441,
"learning_rate": 6.399409133806573e-05,
"loss": 0.0993,
"step": 8825
},
{
"epoch": 0.3623856028499478,
"grad_norm": 0.3244341015815735,
"learning_rate": 6.389151040170694e-05,
"loss": 0.0973,
"step": 8850
},
{
"epoch": 0.3634092909935917,
"grad_norm": 0.24697239696979523,
"learning_rate": 6.378892946534816e-05,
"loss": 0.0958,
"step": 8875
},
{
"epoch": 0.36443297913723566,
"grad_norm": 0.23170702159404755,
"learning_rate": 6.368634852898937e-05,
"loss": 0.1039,
"step": 8900
},
{
"epoch": 0.36545666728087955,
"grad_norm": 0.25722336769104004,
"learning_rate": 6.358376759263059e-05,
"loss": 0.1027,
"step": 8925
},
{
"epoch": 0.36648035542452345,
"grad_norm": 0.2329777032136917,
"learning_rate": 6.348118665627179e-05,
"loss": 0.1003,
"step": 8950
},
{
"epoch": 0.3675040435681674,
"grad_norm": 0.3008142411708832,
"learning_rate": 6.3378605719913e-05,
"loss": 0.0975,
"step": 8975
},
{
"epoch": 0.3685277317118113,
"grad_norm": 0.19098886847496033,
"learning_rate": 6.327602478355422e-05,
"loss": 0.097,
"step": 9000
},
{
"epoch": 0.36955141985545525,
"grad_norm": 0.2393869310617447,
"learning_rate": 6.317344384719543e-05,
"loss": 0.0992,
"step": 9025
},
{
"epoch": 0.37057510799909915,
"grad_norm": 0.24962279200553894,
"learning_rate": 6.307086291083665e-05,
"loss": 0.1006,
"step": 9050
},
{
"epoch": 0.37159879614274305,
"grad_norm": 0.20281440019607544,
"learning_rate": 6.296828197447786e-05,
"loss": 0.097,
"step": 9075
},
{
"epoch": 0.372622484286387,
"grad_norm": 0.21669328212738037,
"learning_rate": 6.286570103811908e-05,
"loss": 0.1008,
"step": 9100
},
{
"epoch": 0.3736461724300309,
"grad_norm": 0.21775703132152557,
"learning_rate": 6.276312010176029e-05,
"loss": 0.1046,
"step": 9125
},
{
"epoch": 0.37466986057367485,
"grad_norm": 0.24492838978767395,
"learning_rate": 6.26605391654015e-05,
"loss": 0.0989,
"step": 9150
},
{
"epoch": 0.37569354871731875,
"grad_norm": 0.2119276523590088,
"learning_rate": 6.255795822904271e-05,
"loss": 0.1038,
"step": 9175
},
{
"epoch": 0.3767172368609627,
"grad_norm": 0.2842216193675995,
"learning_rate": 6.245537729268392e-05,
"loss": 0.1004,
"step": 9200
},
{
"epoch": 0.3777409250046066,
"grad_norm": 0.2775871455669403,
"learning_rate": 6.235279635632514e-05,
"loss": 0.1008,
"step": 9225
},
{
"epoch": 0.3787646131482505,
"grad_norm": 0.26387348771095276,
"learning_rate": 6.225021541996635e-05,
"loss": 0.0972,
"step": 9250
},
{
"epoch": 0.37978830129189445,
"grad_norm": 0.2945527136325836,
"learning_rate": 6.214763448360757e-05,
"loss": 0.1044,
"step": 9275
},
{
"epoch": 0.38081198943553835,
"grad_norm": 0.34967219829559326,
"learning_rate": 6.204505354724878e-05,
"loss": 0.1018,
"step": 9300
},
{
"epoch": 0.3818356775791823,
"grad_norm": 0.2373281568288803,
"learning_rate": 6.194247261089e-05,
"loss": 0.1028,
"step": 9325
},
{
"epoch": 0.3828593657228262,
"grad_norm": 0.27347394824028015,
"learning_rate": 6.18398916745312e-05,
"loss": 0.0995,
"step": 9350
},
{
"epoch": 0.3838830538664701,
"grad_norm": 0.2860616147518158,
"learning_rate": 6.173731073817241e-05,
"loss": 0.0983,
"step": 9375
},
{
"epoch": 0.38490674201011404,
"grad_norm": 0.3643983006477356,
"learning_rate": 6.163472980181363e-05,
"loss": 0.0957,
"step": 9400
},
{
"epoch": 0.38593043015375794,
"grad_norm": 0.3181641399860382,
"learning_rate": 6.153214886545484e-05,
"loss": 0.0989,
"step": 9425
},
{
"epoch": 0.3869541182974019,
"grad_norm": 0.24089764058589935,
"learning_rate": 6.142956792909606e-05,
"loss": 0.0982,
"step": 9450
},
{
"epoch": 0.3879778064410458,
"grad_norm": 0.2490035593509674,
"learning_rate": 6.132698699273727e-05,
"loss": 0.1039,
"step": 9475
},
{
"epoch": 0.38900149458468974,
"grad_norm": 0.2765063941478729,
"learning_rate": 6.122440605637849e-05,
"loss": 0.0937,
"step": 9500
},
{
"epoch": 0.39002518272833364,
"grad_norm": 0.45849937200546265,
"learning_rate": 6.11218251200197e-05,
"loss": 0.1002,
"step": 9525
},
{
"epoch": 0.39104887087197754,
"grad_norm": 0.23391731083393097,
"learning_rate": 6.101924418366091e-05,
"loss": 0.0995,
"step": 9550
},
{
"epoch": 0.3920725590156215,
"grad_norm": 0.258109986782074,
"learning_rate": 6.091666324730212e-05,
"loss": 0.1032,
"step": 9575
},
{
"epoch": 0.3930962471592654,
"grad_norm": 0.2020760029554367,
"learning_rate": 6.081408231094333e-05,
"loss": 0.1012,
"step": 9600
},
{
"epoch": 0.39411993530290934,
"grad_norm": 0.20322605967521667,
"learning_rate": 6.0711501374584545e-05,
"loss": 0.1009,
"step": 9625
},
{
"epoch": 0.39514362344655324,
"grad_norm": 0.3139131963253021,
"learning_rate": 6.060892043822576e-05,
"loss": 0.1009,
"step": 9650
},
{
"epoch": 0.39616731159019714,
"grad_norm": 0.2019822746515274,
"learning_rate": 6.0506339501866974e-05,
"loss": 0.1021,
"step": 9675
},
{
"epoch": 0.3971909997338411,
"grad_norm": 0.21363505721092224,
"learning_rate": 6.040375856550818e-05,
"loss": 0.0996,
"step": 9700
},
{
"epoch": 0.398214687877485,
"grad_norm": 0.25607529282569885,
"learning_rate": 6.03011776291494e-05,
"loss": 0.0968,
"step": 9725
},
{
"epoch": 0.39923837602112894,
"grad_norm": 0.28837454319000244,
"learning_rate": 6.019859669279061e-05,
"loss": 0.1003,
"step": 9750
},
{
"epoch": 0.40026206416477284,
"grad_norm": 0.22750523686408997,
"learning_rate": 6.009601575643182e-05,
"loss": 0.0976,
"step": 9775
},
{
"epoch": 0.4012857523084168,
"grad_norm": 0.2659379541873932,
"learning_rate": 5.9993434820073034e-05,
"loss": 0.0991,
"step": 9800
},
{
"epoch": 0.4023094404520607,
"grad_norm": 0.3965132534503937,
"learning_rate": 5.989085388371425e-05,
"loss": 0.1039,
"step": 9825
},
{
"epoch": 0.4033331285957046,
"grad_norm": 0.2643307149410248,
"learning_rate": 5.978827294735546e-05,
"loss": 0.1018,
"step": 9850
},
{
"epoch": 0.40435681673934853,
"grad_norm": 0.2745136618614197,
"learning_rate": 5.968569201099667e-05,
"loss": 0.1037,
"step": 9875
},
{
"epoch": 0.40538050488299243,
"grad_norm": 0.235930934548378,
"learning_rate": 5.9583111074637886e-05,
"loss": 0.0995,
"step": 9900
},
{
"epoch": 0.4064041930266364,
"grad_norm": 0.23560784757137299,
"learning_rate": 5.94805301382791e-05,
"loss": 0.1001,
"step": 9925
},
{
"epoch": 0.4074278811702803,
"grad_norm": 0.3324751555919647,
"learning_rate": 5.9377949201920315e-05,
"loss": 0.1007,
"step": 9950
},
{
"epoch": 0.4084515693139242,
"grad_norm": 0.22333605587482452,
"learning_rate": 5.927536826556152e-05,
"loss": 0.1035,
"step": 9975
},
{
"epoch": 0.40947525745756813,
"grad_norm": 0.23905926942825317,
"learning_rate": 5.917278732920274e-05,
"loss": 0.1019,
"step": 10000
},
{
"epoch": 0.41049894560121203,
"grad_norm": 0.24543020129203796,
"learning_rate": 5.907020639284395e-05,
"loss": 0.1005,
"step": 10025
},
{
"epoch": 0.411522633744856,
"grad_norm": 0.2597710192203522,
"learning_rate": 5.896762545648517e-05,
"loss": 0.0937,
"step": 10050
},
{
"epoch": 0.4125463218884999,
"grad_norm": 0.2141934633255005,
"learning_rate": 5.8865044520126375e-05,
"loss": 0.1047,
"step": 10075
},
{
"epoch": 0.41357001003214383,
"grad_norm": 0.18962982296943665,
"learning_rate": 5.876246358376759e-05,
"loss": 0.1019,
"step": 10100
},
{
"epoch": 0.41459369817578773,
"grad_norm": 0.16786764562129974,
"learning_rate": 5.8659882647408804e-05,
"loss": 0.098,
"step": 10125
},
{
"epoch": 0.4156173863194316,
"grad_norm": 0.2587350904941559,
"learning_rate": 5.855730171105002e-05,
"loss": 0.1018,
"step": 10150
},
{
"epoch": 0.4166410744630756,
"grad_norm": 0.23551388084888458,
"learning_rate": 5.845472077469123e-05,
"loss": 0.1017,
"step": 10175
},
{
"epoch": 0.4176647626067195,
"grad_norm": 0.40040743350982666,
"learning_rate": 5.835213983833244e-05,
"loss": 0.099,
"step": 10200
},
{
"epoch": 0.41868845075036343,
"grad_norm": 0.274138480424881,
"learning_rate": 5.8249558901973656e-05,
"loss": 0.0988,
"step": 10225
},
{
"epoch": 0.4197121388940073,
"grad_norm": 0.21808317303657532,
"learning_rate": 5.814697796561487e-05,
"loss": 0.0974,
"step": 10250
},
{
"epoch": 0.4207358270376513,
"grad_norm": 0.2756749093532562,
"learning_rate": 5.804439702925608e-05,
"loss": 0.1007,
"step": 10275
},
{
"epoch": 0.4217595151812952,
"grad_norm": 0.28059181571006775,
"learning_rate": 5.7941816092897293e-05,
"loss": 0.0956,
"step": 10300
},
{
"epoch": 0.42278320332493907,
"grad_norm": 0.2666233479976654,
"learning_rate": 5.783923515653851e-05,
"loss": 0.1014,
"step": 10325
},
{
"epoch": 0.423806891468583,
"grad_norm": 0.17817972600460052,
"learning_rate": 5.773665422017972e-05,
"loss": 0.098,
"step": 10350
},
{
"epoch": 0.4248305796122269,
"grad_norm": 0.2498740404844284,
"learning_rate": 5.763407328382093e-05,
"loss": 0.1,
"step": 10375
},
{
"epoch": 0.4258542677558709,
"grad_norm": 0.2427319437265396,
"learning_rate": 5.7531492347462145e-05,
"loss": 0.0985,
"step": 10400
},
{
"epoch": 0.42687795589951477,
"grad_norm": 0.1904958337545395,
"learning_rate": 5.742891141110336e-05,
"loss": 0.1024,
"step": 10425
},
{
"epoch": 0.42790164404315867,
"grad_norm": 0.246423602104187,
"learning_rate": 5.7326330474744575e-05,
"loss": 0.0996,
"step": 10450
},
{
"epoch": 0.4289253321868026,
"grad_norm": 0.3124719262123108,
"learning_rate": 5.722374953838578e-05,
"loss": 0.0985,
"step": 10475
},
{
"epoch": 0.4299490203304465,
"grad_norm": 0.2046365588903427,
"learning_rate": 5.7121168602027e-05,
"loss": 0.1031,
"step": 10500
},
{
"epoch": 0.43097270847409047,
"grad_norm": 0.22781619429588318,
"learning_rate": 5.701858766566821e-05,
"loss": 0.0963,
"step": 10525
},
{
"epoch": 0.43199639661773437,
"grad_norm": 0.28827908635139465,
"learning_rate": 5.6916006729309427e-05,
"loss": 0.1015,
"step": 10550
},
{
"epoch": 0.4330200847613783,
"grad_norm": 0.27641138434410095,
"learning_rate": 5.6813425792950634e-05,
"loss": 0.0973,
"step": 10575
},
{
"epoch": 0.4340437729050222,
"grad_norm": 0.3545154929161072,
"learning_rate": 5.671084485659185e-05,
"loss": 0.0954,
"step": 10600
},
{
"epoch": 0.4350674610486661,
"grad_norm": 0.28629690408706665,
"learning_rate": 5.6608263920233064e-05,
"loss": 0.0937,
"step": 10625
},
{
"epoch": 0.43609114919231007,
"grad_norm": 0.2316763550043106,
"learning_rate": 5.650568298387428e-05,
"loss": 0.0975,
"step": 10650
},
{
"epoch": 0.43711483733595397,
"grad_norm": 0.28071022033691406,
"learning_rate": 5.6403102047515486e-05,
"loss": 0.1011,
"step": 10675
},
{
"epoch": 0.4381385254795979,
"grad_norm": 0.2439073920249939,
"learning_rate": 5.63005211111567e-05,
"loss": 0.0994,
"step": 10700
},
{
"epoch": 0.4391622136232418,
"grad_norm": 0.2952822744846344,
"learning_rate": 5.6197940174797916e-05,
"loss": 0.0983,
"step": 10725
},
{
"epoch": 0.4401859017668857,
"grad_norm": 0.28434520959854126,
"learning_rate": 5.6095359238439124e-05,
"loss": 0.0962,
"step": 10750
},
{
"epoch": 0.44120958991052966,
"grad_norm": 0.24351637065410614,
"learning_rate": 5.599277830208034e-05,
"loss": 0.0944,
"step": 10775
},
{
"epoch": 0.44223327805417356,
"grad_norm": 0.27679064869880676,
"learning_rate": 5.589019736572155e-05,
"loss": 0.0959,
"step": 10800
},
{
"epoch": 0.4432569661978175,
"grad_norm": 0.3108427822589874,
"learning_rate": 5.578761642936277e-05,
"loss": 0.0941,
"step": 10825
},
{
"epoch": 0.4442806543414614,
"grad_norm": 0.19497576355934143,
"learning_rate": 5.5685035493003976e-05,
"loss": 0.0981,
"step": 10850
},
{
"epoch": 0.44530434248510536,
"grad_norm": 0.22953549027442932,
"learning_rate": 5.558245455664519e-05,
"loss": 0.0976,
"step": 10875
},
{
"epoch": 0.44632803062874926,
"grad_norm": 0.5247841477394104,
"learning_rate": 5.5479873620286405e-05,
"loss": 0.1014,
"step": 10900
},
{
"epoch": 0.44735171877239316,
"grad_norm": 0.20830635726451874,
"learning_rate": 5.537729268392762e-05,
"loss": 0.0946,
"step": 10925
},
{
"epoch": 0.4483754069160371,
"grad_norm": 0.24027594923973083,
"learning_rate": 5.527471174756883e-05,
"loss": 0.0953,
"step": 10950
},
{
"epoch": 0.449399095059681,
"grad_norm": 0.2040860950946808,
"learning_rate": 5.517213081121004e-05,
"loss": 0.0961,
"step": 10975
},
{
"epoch": 0.45042278320332496,
"grad_norm": 0.18586771190166473,
"learning_rate": 5.506954987485126e-05,
"loss": 0.0982,
"step": 11000
},
{
"epoch": 0.45144647134696886,
"grad_norm": 0.32001617550849915,
"learning_rate": 5.496696893849247e-05,
"loss": 0.0981,
"step": 11025
},
{
"epoch": 0.45247015949061276,
"grad_norm": 0.28242385387420654,
"learning_rate": 5.486438800213368e-05,
"loss": 0.0981,
"step": 11050
},
{
"epoch": 0.4534938476342567,
"grad_norm": 0.3456820547580719,
"learning_rate": 5.4761807065774894e-05,
"loss": 0.0941,
"step": 11075
},
{
"epoch": 0.4545175357779006,
"grad_norm": 0.22706662118434906,
"learning_rate": 5.465922612941611e-05,
"loss": 0.0985,
"step": 11100
},
{
"epoch": 0.45554122392154456,
"grad_norm": 0.2593896985054016,
"learning_rate": 5.455664519305732e-05,
"loss": 0.0983,
"step": 11125
},
{
"epoch": 0.45656491206518846,
"grad_norm": 0.22201375663280487,
"learning_rate": 5.445406425669853e-05,
"loss": 0.1026,
"step": 11150
},
{
"epoch": 0.4575886002088324,
"grad_norm": 0.23291830718517303,
"learning_rate": 5.4351483320339746e-05,
"loss": 0.0995,
"step": 11175
},
{
"epoch": 0.4586122883524763,
"grad_norm": 0.24490538239479065,
"learning_rate": 5.424890238398096e-05,
"loss": 0.0953,
"step": 11200
},
{
"epoch": 0.4596359764961202,
"grad_norm": 0.3179132640361786,
"learning_rate": 5.4146321447622175e-05,
"loss": 0.0983,
"step": 11225
},
{
"epoch": 0.46065966463976415,
"grad_norm": 0.23889416456222534,
"learning_rate": 5.404374051126338e-05,
"loss": 0.0971,
"step": 11250
},
{
"epoch": 0.46168335278340805,
"grad_norm": 0.5037365555763245,
"learning_rate": 5.39411595749046e-05,
"loss": 0.0936,
"step": 11275
},
{
"epoch": 0.462707040927052,
"grad_norm": 0.2585156559944153,
"learning_rate": 5.383857863854581e-05,
"loss": 0.0918,
"step": 11300
},
{
"epoch": 0.4637307290706959,
"grad_norm": 0.2691129148006439,
"learning_rate": 5.373599770218703e-05,
"loss": 0.0959,
"step": 11325
},
{
"epoch": 0.4647544172143398,
"grad_norm": 0.24569182097911835,
"learning_rate": 5.3633416765828235e-05,
"loss": 0.0966,
"step": 11350
},
{
"epoch": 0.46577810535798375,
"grad_norm": 0.3655073344707489,
"learning_rate": 5.353083582946945e-05,
"loss": 0.0951,
"step": 11375
},
{
"epoch": 0.46680179350162765,
"grad_norm": 0.24223706126213074,
"learning_rate": 5.3428254893110664e-05,
"loss": 0.1008,
"step": 11400
},
{
"epoch": 0.4678254816452716,
"grad_norm": 0.2586074769496918,
"learning_rate": 5.332567395675188e-05,
"loss": 0.1011,
"step": 11425
},
{
"epoch": 0.4688491697889155,
"grad_norm": 0.2603899836540222,
"learning_rate": 5.322309302039309e-05,
"loss": 0.0984,
"step": 11450
},
{
"epoch": 0.46987285793255945,
"grad_norm": 0.25967130064964294,
"learning_rate": 5.31205120840343e-05,
"loss": 0.0965,
"step": 11475
},
{
"epoch": 0.47089654607620335,
"grad_norm": 0.2673439085483551,
"learning_rate": 5.3017931147675516e-05,
"loss": 0.1025,
"step": 11500
},
{
"epoch": 0.47192023421984725,
"grad_norm": 0.24883116781711578,
"learning_rate": 5.291535021131673e-05,
"loss": 0.0943,
"step": 11525
},
{
"epoch": 0.4729439223634912,
"grad_norm": 0.29023605585098267,
"learning_rate": 5.281276927495794e-05,
"loss": 0.0993,
"step": 11550
},
{
"epoch": 0.4739676105071351,
"grad_norm": 0.21741856634616852,
"learning_rate": 5.2710188338599153e-05,
"loss": 0.0937,
"step": 11575
},
{
"epoch": 0.47499129865077905,
"grad_norm": 0.24658936262130737,
"learning_rate": 5.260760740224037e-05,
"loss": 0.0971,
"step": 11600
},
{
"epoch": 0.47601498679442295,
"grad_norm": 0.3309827148914337,
"learning_rate": 5.2505026465881576e-05,
"loss": 0.1003,
"step": 11625
},
{
"epoch": 0.47703867493806684,
"grad_norm": 0.22925116121768951,
"learning_rate": 5.240244552952279e-05,
"loss": 0.0961,
"step": 11650
},
{
"epoch": 0.4780623630817108,
"grad_norm": 0.33367425203323364,
"learning_rate": 5.2299864593164005e-05,
"loss": 0.0959,
"step": 11675
},
{
"epoch": 0.4790860512253547,
"grad_norm": 0.2225172519683838,
"learning_rate": 5.219728365680522e-05,
"loss": 0.0983,
"step": 11700
},
{
"epoch": 0.48010973936899864,
"grad_norm": 0.26799845695495605,
"learning_rate": 5.209470272044643e-05,
"loss": 0.0984,
"step": 11725
},
{
"epoch": 0.48113342751264254,
"grad_norm": 0.28932616114616394,
"learning_rate": 5.199212178408764e-05,
"loss": 0.0973,
"step": 11750
},
{
"epoch": 0.4821571156562865,
"grad_norm": 0.3406207859516144,
"learning_rate": 5.188954084772886e-05,
"loss": 0.091,
"step": 11775
},
{
"epoch": 0.4831808037999304,
"grad_norm": 0.2970975935459137,
"learning_rate": 5.178695991137007e-05,
"loss": 0.0939,
"step": 11800
},
{
"epoch": 0.4842044919435743,
"grad_norm": 0.2747635245323181,
"learning_rate": 5.168437897501128e-05,
"loss": 0.0927,
"step": 11825
},
{
"epoch": 0.48522818008721824,
"grad_norm": 0.2211136370897293,
"learning_rate": 5.1581798038652494e-05,
"loss": 0.096,
"step": 11850
},
{
"epoch": 0.48625186823086214,
"grad_norm": 0.2881365418434143,
"learning_rate": 5.147921710229371e-05,
"loss": 0.0933,
"step": 11875
},
{
"epoch": 0.4872755563745061,
"grad_norm": 0.2213411182165146,
"learning_rate": 5.1376636165934924e-05,
"loss": 0.0982,
"step": 11900
},
{
"epoch": 0.48829924451815,
"grad_norm": 0.23638983070850372,
"learning_rate": 5.127405522957613e-05,
"loss": 0.1,
"step": 11925
},
{
"epoch": 0.4893229326617939,
"grad_norm": 0.2544683814048767,
"learning_rate": 5.1171474293217346e-05,
"loss": 0.1005,
"step": 11950
},
{
"epoch": 0.49034662080543784,
"grad_norm": 0.3138396441936493,
"learning_rate": 5.106889335685856e-05,
"loss": 0.0952,
"step": 11975
},
{
"epoch": 0.49137030894908174,
"grad_norm": 0.352205365896225,
"learning_rate": 5.0966312420499776e-05,
"loss": 0.0954,
"step": 12000
},
{
"epoch": 0.4923939970927257,
"grad_norm": 0.2083396166563034,
"learning_rate": 5.0863731484140984e-05,
"loss": 0.0947,
"step": 12025
},
{
"epoch": 0.4934176852363696,
"grad_norm": 0.2839849591255188,
"learning_rate": 5.07611505477822e-05,
"loss": 0.0986,
"step": 12050
},
{
"epoch": 0.49444137338001354,
"grad_norm": 0.26629742980003357,
"learning_rate": 5.065856961142341e-05,
"loss": 0.1007,
"step": 12075
},
{
"epoch": 0.49546506152365744,
"grad_norm": 0.2845945656299591,
"learning_rate": 5.055598867506463e-05,
"loss": 0.0995,
"step": 12100
},
{
"epoch": 0.49648874966730133,
"grad_norm": 0.22998517751693726,
"learning_rate": 5.0453407738705835e-05,
"loss": 0.0971,
"step": 12125
},
{
"epoch": 0.4975124378109453,
"grad_norm": 0.21335995197296143,
"learning_rate": 5.035082680234705e-05,
"loss": 0.0956,
"step": 12150
},
{
"epoch": 0.4985361259545892,
"grad_norm": 0.2018250823020935,
"learning_rate": 5.0248245865988265e-05,
"loss": 0.0982,
"step": 12175
},
{
"epoch": 0.49955981409823313,
"grad_norm": 0.2268654853105545,
"learning_rate": 5.014566492962948e-05,
"loss": 0.0951,
"step": 12200
},
{
"epoch": 0.5005835022418771,
"grad_norm": 0.22491568326950073,
"learning_rate": 5.004308399327069e-05,
"loss": 0.0991,
"step": 12225
},
{
"epoch": 0.5016071903855209,
"grad_norm": 0.2399354726076126,
"learning_rate": 4.994050305691191e-05,
"loss": 0.0978,
"step": 12250
},
{
"epoch": 0.5026308785291649,
"grad_norm": 0.2331288754940033,
"learning_rate": 4.9837922120553123e-05,
"loss": 0.0959,
"step": 12275
},
{
"epoch": 0.5036545666728088,
"grad_norm": 0.23224005103111267,
"learning_rate": 4.973534118419433e-05,
"loss": 0.0957,
"step": 12300
},
{
"epoch": 0.5046782548164527,
"grad_norm": 0.28165706992149353,
"learning_rate": 4.9632760247835546e-05,
"loss": 0.0985,
"step": 12325
},
{
"epoch": 0.5057019429600966,
"grad_norm": 0.22725163400173187,
"learning_rate": 4.953017931147676e-05,
"loss": 0.0934,
"step": 12350
},
{
"epoch": 0.5067256311037406,
"grad_norm": 0.30300387740135193,
"learning_rate": 4.9427598375117975e-05,
"loss": 0.0947,
"step": 12375
},
{
"epoch": 0.5077493192473844,
"grad_norm": 0.22563788294792175,
"learning_rate": 4.932501743875918e-05,
"loss": 0.0964,
"step": 12400
},
{
"epoch": 0.5087730073910284,
"grad_norm": 0.2117646187543869,
"learning_rate": 4.92224365024004e-05,
"loss": 0.0952,
"step": 12425
},
{
"epoch": 0.5097966955346723,
"grad_norm": 0.22831501066684723,
"learning_rate": 4.911985556604161e-05,
"loss": 0.0965,
"step": 12450
},
{
"epoch": 0.5108203836783163,
"grad_norm": 0.2967502474784851,
"learning_rate": 4.901727462968283e-05,
"loss": 0.0947,
"step": 12475
},
{
"epoch": 0.5118440718219601,
"grad_norm": 0.22398816049098969,
"learning_rate": 4.8914693693324035e-05,
"loss": 0.0935,
"step": 12500
},
{
"epoch": 0.5128677599656041,
"grad_norm": 0.26190030574798584,
"learning_rate": 4.881211275696525e-05,
"loss": 0.0965,
"step": 12525
},
{
"epoch": 0.513891448109248,
"grad_norm": 0.2718106508255005,
"learning_rate": 4.8709531820606464e-05,
"loss": 0.0955,
"step": 12550
},
{
"epoch": 0.5149151362528919,
"grad_norm": 0.27051568031311035,
"learning_rate": 4.860695088424768e-05,
"loss": 0.094,
"step": 12575
},
{
"epoch": 0.5159388243965358,
"grad_norm": 0.2031087726354599,
"learning_rate": 4.850436994788889e-05,
"loss": 0.0988,
"step": 12600
},
{
"epoch": 0.5169625125401798,
"grad_norm": 0.23844382166862488,
"learning_rate": 4.84017890115301e-05,
"loss": 0.0915,
"step": 12625
},
{
"epoch": 0.5179862006838237,
"grad_norm": 0.31874755024909973,
"learning_rate": 4.8299208075171316e-05,
"loss": 0.0908,
"step": 12650
},
{
"epoch": 0.5190098888274676,
"grad_norm": 0.23138810694217682,
"learning_rate": 4.819662713881253e-05,
"loss": 0.0962,
"step": 12675
},
{
"epoch": 0.5200335769711115,
"grad_norm": 0.27298617362976074,
"learning_rate": 4.809404620245374e-05,
"loss": 0.0975,
"step": 12700
},
{
"epoch": 0.5210572651147555,
"grad_norm": 0.25157856941223145,
"learning_rate": 4.7991465266094954e-05,
"loss": 0.0967,
"step": 12725
},
{
"epoch": 0.5220809532583993,
"grad_norm": 0.20571890473365784,
"learning_rate": 4.788888432973617e-05,
"loss": 0.0939,
"step": 12750
},
{
"epoch": 0.5231046414020433,
"grad_norm": 0.24462303519248962,
"learning_rate": 4.778630339337738e-05,
"loss": 0.0944,
"step": 12775
},
{
"epoch": 0.5241283295456872,
"grad_norm": 0.2392750382423401,
"learning_rate": 4.768372245701859e-05,
"loss": 0.0959,
"step": 12800
},
{
"epoch": 0.5251520176893312,
"grad_norm": 0.2759506106376648,
"learning_rate": 4.7581141520659805e-05,
"loss": 0.0957,
"step": 12825
},
{
"epoch": 0.526175705832975,
"grad_norm": 0.24135975539684296,
"learning_rate": 4.747856058430102e-05,
"loss": 0.0917,
"step": 12850
},
{
"epoch": 0.527199393976619,
"grad_norm": 0.2595539689064026,
"learning_rate": 4.737597964794223e-05,
"loss": 0.0984,
"step": 12875
},
{
"epoch": 0.5282230821202629,
"grad_norm": 0.2650289535522461,
"learning_rate": 4.727339871158344e-05,
"loss": 0.0938,
"step": 12900
},
{
"epoch": 0.5292467702639068,
"grad_norm": 0.24425174295902252,
"learning_rate": 4.717081777522466e-05,
"loss": 0.0991,
"step": 12925
},
{
"epoch": 0.5302704584075507,
"grad_norm": 0.24873086810112,
"learning_rate": 4.706823683886587e-05,
"loss": 0.0914,
"step": 12950
},
{
"epoch": 0.5312941465511947,
"grad_norm": 0.280268132686615,
"learning_rate": 4.696565590250708e-05,
"loss": 0.0966,
"step": 12975
},
{
"epoch": 0.5323178346948385,
"grad_norm": 0.24281346797943115,
"learning_rate": 4.6863074966148295e-05,
"loss": 0.0932,
"step": 13000
},
{
"epoch": 0.5333415228384825,
"grad_norm": 0.24113261699676514,
"learning_rate": 4.676049402978951e-05,
"loss": 0.0944,
"step": 13025
},
{
"epoch": 0.5343652109821264,
"grad_norm": 0.2524602711200714,
"learning_rate": 4.6657913093430724e-05,
"loss": 0.0922,
"step": 13050
},
{
"epoch": 0.5353888991257704,
"grad_norm": 0.24346871674060822,
"learning_rate": 4.655533215707193e-05,
"loss": 0.0932,
"step": 13075
},
{
"epoch": 0.5364125872694142,
"grad_norm": 0.29335957765579224,
"learning_rate": 4.6452751220713147e-05,
"loss": 0.0942,
"step": 13100
},
{
"epoch": 0.5374362754130582,
"grad_norm": 0.31220850348472595,
"learning_rate": 4.6350170284354354e-05,
"loss": 0.0931,
"step": 13125
},
{
"epoch": 0.5384599635567021,
"grad_norm": 0.2569523751735687,
"learning_rate": 4.624758934799557e-05,
"loss": 0.1005,
"step": 13150
},
{
"epoch": 0.539483651700346,
"grad_norm": 0.26125669479370117,
"learning_rate": 4.6145008411636784e-05,
"loss": 0.0924,
"step": 13175
},
{
"epoch": 0.5405073398439899,
"grad_norm": 0.31377628445625305,
"learning_rate": 4.604242747527799e-05,
"loss": 0.0966,
"step": 13200
},
{
"epoch": 0.5415310279876339,
"grad_norm": 0.25545644760131836,
"learning_rate": 4.5939846538919206e-05,
"loss": 0.0997,
"step": 13225
},
{
"epoch": 0.5425547161312778,
"grad_norm": 0.2510424554347992,
"learning_rate": 4.583726560256042e-05,
"loss": 0.0939,
"step": 13250
},
{
"epoch": 0.5435784042749217,
"grad_norm": 0.2709631323814392,
"learning_rate": 4.5734684666201636e-05,
"loss": 0.0918,
"step": 13275
},
{
"epoch": 0.5446020924185656,
"grad_norm": 0.2531428337097168,
"learning_rate": 4.5632103729842844e-05,
"loss": 0.0968,
"step": 13300
},
{
"epoch": 0.5456257805622096,
"grad_norm": 0.3153735101222992,
"learning_rate": 4.552952279348406e-05,
"loss": 0.0957,
"step": 13325
},
{
"epoch": 0.5466494687058534,
"grad_norm": 0.2258891612291336,
"learning_rate": 4.542694185712527e-05,
"loss": 0.0947,
"step": 13350
},
{
"epoch": 0.5476731568494974,
"grad_norm": 0.2671023905277252,
"learning_rate": 4.532436092076649e-05,
"loss": 0.0957,
"step": 13375
},
{
"epoch": 0.5486968449931413,
"grad_norm": 0.333008348941803,
"learning_rate": 4.5221779984407695e-05,
"loss": 0.0953,
"step": 13400
},
{
"epoch": 0.5497205331367853,
"grad_norm": 0.2922687828540802,
"learning_rate": 4.511919904804891e-05,
"loss": 0.0963,
"step": 13425
},
{
"epoch": 0.5507442212804291,
"grad_norm": 0.27738088369369507,
"learning_rate": 4.5016618111690125e-05,
"loss": 0.0926,
"step": 13450
},
{
"epoch": 0.551767909424073,
"grad_norm": 0.28781887888908386,
"learning_rate": 4.491403717533134e-05,
"loss": 0.0986,
"step": 13475
},
{
"epoch": 0.552791597567717,
"grad_norm": 0.2727603018283844,
"learning_rate": 4.481145623897255e-05,
"loss": 0.0995,
"step": 13500
},
{
"epoch": 0.5538152857113608,
"grad_norm": 0.2735615670681,
"learning_rate": 4.470887530261376e-05,
"loss": 0.0959,
"step": 13525
},
{
"epoch": 0.5548389738550048,
"grad_norm": 0.2211311310529709,
"learning_rate": 4.460629436625498e-05,
"loss": 0.0872,
"step": 13550
},
{
"epoch": 0.5558626619986488,
"grad_norm": 0.2359626144170761,
"learning_rate": 4.4503713429896185e-05,
"loss": 0.0967,
"step": 13575
},
{
"epoch": 0.5568863501422926,
"grad_norm": 0.27807098627090454,
"learning_rate": 4.44011324935374e-05,
"loss": 0.0953,
"step": 13600
},
{
"epoch": 0.5579100382859365,
"grad_norm": 0.2691691219806671,
"learning_rate": 4.4298551557178614e-05,
"loss": 0.0942,
"step": 13625
},
{
"epoch": 0.5589337264295805,
"grad_norm": 0.22528734803199768,
"learning_rate": 4.419597062081983e-05,
"loss": 0.0975,
"step": 13650
},
{
"epoch": 0.5599574145732245,
"grad_norm": 0.22979159653186798,
"learning_rate": 4.4093389684461036e-05,
"loss": 0.0946,
"step": 13675
},
{
"epoch": 0.5609811027168683,
"grad_norm": 0.35849061608314514,
"learning_rate": 4.399080874810225e-05,
"loss": 0.0886,
"step": 13700
},
{
"epoch": 0.5620047908605122,
"grad_norm": 0.2247435599565506,
"learning_rate": 4.3888227811743466e-05,
"loss": 0.0942,
"step": 13725
},
{
"epoch": 0.5630284790041562,
"grad_norm": 0.2186431735754013,
"learning_rate": 4.378564687538468e-05,
"loss": 0.0958,
"step": 13750
},
{
"epoch": 0.5640521671478,
"grad_norm": 0.26496851444244385,
"learning_rate": 4.368306593902589e-05,
"loss": 0.0932,
"step": 13775
},
{
"epoch": 0.565075855291444,
"grad_norm": 0.20004922151565552,
"learning_rate": 4.35804850026671e-05,
"loss": 0.1003,
"step": 13800
},
{
"epoch": 0.566099543435088,
"grad_norm": 0.25645968317985535,
"learning_rate": 4.347790406630832e-05,
"loss": 0.0932,
"step": 13825
},
{
"epoch": 0.5671232315787319,
"grad_norm": 0.24646583199501038,
"learning_rate": 4.337532312994953e-05,
"loss": 0.0954,
"step": 13850
},
{
"epoch": 0.5681469197223757,
"grad_norm": 0.25467848777770996,
"learning_rate": 4.327274219359074e-05,
"loss": 0.0982,
"step": 13875
},
{
"epoch": 0.5691706078660197,
"grad_norm": 0.24455401301383972,
"learning_rate": 4.3170161257231955e-05,
"loss": 0.0913,
"step": 13900
},
{
"epoch": 0.5701942960096636,
"grad_norm": 0.26495805382728577,
"learning_rate": 4.306758032087317e-05,
"loss": 0.0894,
"step": 13925
},
{
"epoch": 0.5712179841533075,
"grad_norm": 0.23517432808876038,
"learning_rate": 4.2964999384514384e-05,
"loss": 0.0941,
"step": 13950
},
{
"epoch": 0.5722416722969514,
"grad_norm": 0.25355222821235657,
"learning_rate": 4.286241844815559e-05,
"loss": 0.09,
"step": 13975
},
{
"epoch": 0.5732653604405954,
"grad_norm": 0.2494240403175354,
"learning_rate": 4.275983751179681e-05,
"loss": 0.1003,
"step": 14000
},
{
"epoch": 0.5742890485842393,
"grad_norm": 0.22723737359046936,
"learning_rate": 4.265725657543802e-05,
"loss": 0.1001,
"step": 14025
},
{
"epoch": 0.5753127367278832,
"grad_norm": 0.19633585214614868,
"learning_rate": 4.2554675639079236e-05,
"loss": 0.0934,
"step": 14050
},
{
"epoch": 0.5763364248715271,
"grad_norm": 0.24108199775218964,
"learning_rate": 4.2452094702720444e-05,
"loss": 0.0932,
"step": 14075
},
{
"epoch": 0.5773601130151711,
"grad_norm": 0.28201839327812195,
"learning_rate": 4.234951376636166e-05,
"loss": 0.0931,
"step": 14100
},
{
"epoch": 0.5783838011588149,
"grad_norm": 0.29982468485832214,
"learning_rate": 4.224693283000287e-05,
"loss": 0.0941,
"step": 14125
},
{
"epoch": 0.5794074893024589,
"grad_norm": 0.2526194453239441,
"learning_rate": 4.214435189364409e-05,
"loss": 0.0937,
"step": 14150
},
{
"epoch": 0.5804311774461028,
"grad_norm": 0.2288905531167984,
"learning_rate": 4.2041770957285296e-05,
"loss": 0.0949,
"step": 14175
},
{
"epoch": 0.5814548655897468,
"grad_norm": 0.19117498397827148,
"learning_rate": 4.193919002092651e-05,
"loss": 0.0938,
"step": 14200
},
{
"epoch": 0.5824785537333906,
"grad_norm": 0.2719483971595764,
"learning_rate": 4.1836609084567725e-05,
"loss": 0.0911,
"step": 14225
},
{
"epoch": 0.5835022418770346,
"grad_norm": 0.2975625991821289,
"learning_rate": 4.173402814820894e-05,
"loss": 0.0991,
"step": 14250
},
{
"epoch": 0.5845259300206785,
"grad_norm": 0.2232026904821396,
"learning_rate": 4.163144721185015e-05,
"loss": 0.0971,
"step": 14275
},
{
"epoch": 0.5855496181643224,
"grad_norm": 0.26348811388015747,
"learning_rate": 4.152886627549136e-05,
"loss": 0.0946,
"step": 14300
},
{
"epoch": 0.5865733063079663,
"grad_norm": 0.320698082447052,
"learning_rate": 4.142628533913258e-05,
"loss": 0.0909,
"step": 14325
},
{
"epoch": 0.5875969944516103,
"grad_norm": 0.27873241901397705,
"learning_rate": 4.132370440277379e-05,
"loss": 0.0986,
"step": 14350
},
{
"epoch": 0.5886206825952541,
"grad_norm": 0.22352805733680725,
"learning_rate": 4.1221123466415e-05,
"loss": 0.0959,
"step": 14375
},
{
"epoch": 0.5896443707388981,
"grad_norm": 0.2206275910139084,
"learning_rate": 4.1118542530056214e-05,
"loss": 0.0973,
"step": 14400
},
{
"epoch": 0.590668058882542,
"grad_norm": 0.20755049586296082,
"learning_rate": 4.101596159369743e-05,
"loss": 0.0987,
"step": 14425
},
{
"epoch": 0.591691747026186,
"grad_norm": 0.25802165269851685,
"learning_rate": 4.091338065733864e-05,
"loss": 0.0898,
"step": 14450
},
{
"epoch": 0.5927154351698298,
"grad_norm": 0.21148554980754852,
"learning_rate": 4.081079972097985e-05,
"loss": 0.0906,
"step": 14475
},
{
"epoch": 0.5937391233134738,
"grad_norm": 0.28330081701278687,
"learning_rate": 4.0708218784621066e-05,
"loss": 0.0952,
"step": 14500
},
{
"epoch": 0.5947628114571177,
"grad_norm": 0.26006045937538147,
"learning_rate": 4.060563784826228e-05,
"loss": 0.0879,
"step": 14525
},
{
"epoch": 0.5957864996007616,
"grad_norm": 0.2529297173023224,
"learning_rate": 4.050305691190349e-05,
"loss": 0.0899,
"step": 14550
},
{
"epoch": 0.5968101877444055,
"grad_norm": 0.25934335589408875,
"learning_rate": 4.0400475975544703e-05,
"loss": 0.0974,
"step": 14575
},
{
"epoch": 0.5978338758880495,
"grad_norm": 0.34801098704338074,
"learning_rate": 4.029789503918592e-05,
"loss": 0.0966,
"step": 14600
},
{
"epoch": 0.5988575640316934,
"grad_norm": 0.3519505262374878,
"learning_rate": 4.019531410282713e-05,
"loss": 0.0976,
"step": 14625
},
{
"epoch": 0.5998812521753373,
"grad_norm": 0.21879540383815765,
"learning_rate": 4.009273316646834e-05,
"loss": 0.0929,
"step": 14650
},
{
"epoch": 0.6009049403189812,
"grad_norm": 0.3088552951812744,
"learning_rate": 3.9990152230109555e-05,
"loss": 0.0895,
"step": 14675
},
{
"epoch": 0.6019286284626252,
"grad_norm": 0.1896054446697235,
"learning_rate": 3.988757129375077e-05,
"loss": 0.0954,
"step": 14700
},
{
"epoch": 0.602952316606269,
"grad_norm": 0.24023133516311646,
"learning_rate": 3.9784990357391985e-05,
"loss": 0.0928,
"step": 14725
},
{
"epoch": 0.603976004749913,
"grad_norm": 0.2335812747478485,
"learning_rate": 3.968240942103319e-05,
"loss": 0.0915,
"step": 14750
},
{
"epoch": 0.6049996928935569,
"grad_norm": 0.2618425488471985,
"learning_rate": 3.957982848467441e-05,
"loss": 0.093,
"step": 14775
},
{
"epoch": 0.6060233810372009,
"grad_norm": 0.28540539741516113,
"learning_rate": 3.947724754831562e-05,
"loss": 0.0945,
"step": 14800
},
{
"epoch": 0.6070470691808447,
"grad_norm": 0.34257885813713074,
"learning_rate": 3.9374666611956837e-05,
"loss": 0.0937,
"step": 14825
},
{
"epoch": 0.6080707573244887,
"grad_norm": 0.242543026804924,
"learning_rate": 3.9272085675598045e-05,
"loss": 0.1006,
"step": 14850
},
{
"epoch": 0.6090944454681326,
"grad_norm": 0.3368709683418274,
"learning_rate": 3.916950473923926e-05,
"loss": 0.098,
"step": 14875
},
{
"epoch": 0.6101181336117765,
"grad_norm": 0.2498284876346588,
"learning_rate": 3.9066923802880474e-05,
"loss": 0.0945,
"step": 14900
},
{
"epoch": 0.6111418217554204,
"grad_norm": 0.36862441897392273,
"learning_rate": 3.896434286652169e-05,
"loss": 0.0947,
"step": 14925
},
{
"epoch": 0.6121655098990644,
"grad_norm": 0.2516944110393524,
"learning_rate": 3.8861761930162896e-05,
"loss": 0.0904,
"step": 14950
},
{
"epoch": 0.6131891980427082,
"grad_norm": 0.23044705390930176,
"learning_rate": 3.875918099380411e-05,
"loss": 0.0922,
"step": 14975
},
{
"epoch": 0.6142128861863522,
"grad_norm": 0.322510689496994,
"learning_rate": 3.8656600057445326e-05,
"loss": 0.0939,
"step": 15000
},
{
"epoch": 0.6152365743299961,
"grad_norm": 0.28902101516723633,
"learning_rate": 3.855401912108654e-05,
"loss": 0.092,
"step": 15025
},
{
"epoch": 0.6162602624736401,
"grad_norm": 0.33545222878456116,
"learning_rate": 3.845143818472775e-05,
"loss": 0.0931,
"step": 15050
},
{
"epoch": 0.6172839506172839,
"grad_norm": 0.24440859258174896,
"learning_rate": 3.834885724836896e-05,
"loss": 0.0957,
"step": 15075
},
{
"epoch": 0.6183076387609279,
"grad_norm": 0.25635841488838196,
"learning_rate": 3.824627631201018e-05,
"loss": 0.0945,
"step": 15100
},
{
"epoch": 0.6193313269045718,
"grad_norm": 0.26487112045288086,
"learning_rate": 3.814369537565139e-05,
"loss": 0.094,
"step": 15125
},
{
"epoch": 0.6203550150482157,
"grad_norm": 0.2371329963207245,
"learning_rate": 3.80411144392926e-05,
"loss": 0.0939,
"step": 15150
},
{
"epoch": 0.6213787031918596,
"grad_norm": 0.23745235800743103,
"learning_rate": 3.7938533502933815e-05,
"loss": 0.0894,
"step": 15175
},
{
"epoch": 0.6224023913355036,
"grad_norm": 0.32679396867752075,
"learning_rate": 3.783595256657503e-05,
"loss": 0.0916,
"step": 15200
},
{
"epoch": 0.6234260794791475,
"grad_norm": 0.236038938164711,
"learning_rate": 3.7733371630216244e-05,
"loss": 0.0896,
"step": 15225
},
{
"epoch": 0.6244497676227914,
"grad_norm": 0.17879773676395416,
"learning_rate": 3.763079069385745e-05,
"loss": 0.0971,
"step": 15250
},
{
"epoch": 0.6254734557664353,
"grad_norm": 0.30429938435554504,
"learning_rate": 3.752820975749867e-05,
"loss": 0.0884,
"step": 15275
},
{
"epoch": 0.6264971439100793,
"grad_norm": 0.332989364862442,
"learning_rate": 3.742562882113988e-05,
"loss": 0.0998,
"step": 15300
},
{
"epoch": 0.6275208320537231,
"grad_norm": 0.2244502305984497,
"learning_rate": 3.7323047884781096e-05,
"loss": 0.0894,
"step": 15325
},
{
"epoch": 0.6285445201973671,
"grad_norm": 0.22671306133270264,
"learning_rate": 3.7220466948422304e-05,
"loss": 0.0957,
"step": 15350
},
{
"epoch": 0.629568208341011,
"grad_norm": 0.22526578605175018,
"learning_rate": 3.711788601206352e-05,
"loss": 0.0892,
"step": 15375
},
{
"epoch": 0.630591896484655,
"grad_norm": 0.2514040768146515,
"learning_rate": 3.701530507570473e-05,
"loss": 0.0916,
"step": 15400
},
{
"epoch": 0.6316155846282988,
"grad_norm": 0.22782598435878754,
"learning_rate": 3.691272413934594e-05,
"loss": 0.0977,
"step": 15425
},
{
"epoch": 0.6326392727719428,
"grad_norm": 0.24572695791721344,
"learning_rate": 3.6810143202987156e-05,
"loss": 0.0919,
"step": 15450
},
{
"epoch": 0.6336629609155867,
"grad_norm": 0.231769859790802,
"learning_rate": 3.670756226662837e-05,
"loss": 0.0904,
"step": 15475
},
{
"epoch": 0.6346866490592306,
"grad_norm": 0.28821659088134766,
"learning_rate": 3.6604981330269585e-05,
"loss": 0.0905,
"step": 15500
},
{
"epoch": 0.6357103372028745,
"grad_norm": 0.19901390373706818,
"learning_rate": 3.650240039391079e-05,
"loss": 0.0885,
"step": 15525
},
{
"epoch": 0.6367340253465185,
"grad_norm": 0.24236318469047546,
"learning_rate": 3.639981945755201e-05,
"loss": 0.0929,
"step": 15550
},
{
"epoch": 0.6377577134901623,
"grad_norm": 0.27218177914619446,
"learning_rate": 3.629723852119322e-05,
"loss": 0.0925,
"step": 15575
},
{
"epoch": 0.6387814016338063,
"grad_norm": 0.29827386140823364,
"learning_rate": 3.619465758483444e-05,
"loss": 0.0913,
"step": 15600
},
{
"epoch": 0.6398050897774502,
"grad_norm": 0.2742908000946045,
"learning_rate": 3.6092076648475645e-05,
"loss": 0.0941,
"step": 15625
},
{
"epoch": 0.6408287779210942,
"grad_norm": 0.28651776909828186,
"learning_rate": 3.598949571211686e-05,
"loss": 0.099,
"step": 15650
},
{
"epoch": 0.641852466064738,
"grad_norm": 0.2705094814300537,
"learning_rate": 3.5886914775758074e-05,
"loss": 0.0972,
"step": 15675
},
{
"epoch": 0.642876154208382,
"grad_norm": 0.2905079424381256,
"learning_rate": 3.578433383939929e-05,
"loss": 0.0952,
"step": 15700
},
{
"epoch": 0.6438998423520259,
"grad_norm": 0.28126639127731323,
"learning_rate": 3.56817529030405e-05,
"loss": 0.0928,
"step": 15725
},
{
"epoch": 0.6449235304956698,
"grad_norm": 0.23970367014408112,
"learning_rate": 3.557917196668171e-05,
"loss": 0.0945,
"step": 15750
},
{
"epoch": 0.6459472186393137,
"grad_norm": 0.23676908016204834,
"learning_rate": 3.5476591030322926e-05,
"loss": 0.0936,
"step": 15775
},
{
"epoch": 0.6469709067829577,
"grad_norm": 0.2415960431098938,
"learning_rate": 3.537401009396414e-05,
"loss": 0.0923,
"step": 15800
},
{
"epoch": 0.6479945949266016,
"grad_norm": 0.25593453645706177,
"learning_rate": 3.527142915760535e-05,
"loss": 0.0924,
"step": 15825
},
{
"epoch": 0.6490182830702454,
"grad_norm": 0.24800516664981842,
"learning_rate": 3.5168848221246563e-05,
"loss": 0.0945,
"step": 15850
},
{
"epoch": 0.6500419712138894,
"grad_norm": 0.24197053909301758,
"learning_rate": 3.506626728488778e-05,
"loss": 0.0932,
"step": 15875
},
{
"epoch": 0.6510656593575334,
"grad_norm": 0.25776922702789307,
"learning_rate": 3.496368634852899e-05,
"loss": 0.0909,
"step": 15900
},
{
"epoch": 0.6520893475011772,
"grad_norm": 0.1953815519809723,
"learning_rate": 3.48611054121702e-05,
"loss": 0.0927,
"step": 15925
},
{
"epoch": 0.6531130356448211,
"grad_norm": 0.267980694770813,
"learning_rate": 3.4758524475811415e-05,
"loss": 0.0917,
"step": 15950
},
{
"epoch": 0.6541367237884651,
"grad_norm": 0.2663339674472809,
"learning_rate": 3.465594353945263e-05,
"loss": 0.0921,
"step": 15975
},
{
"epoch": 0.655160411932109,
"grad_norm": 0.32129451632499695,
"learning_rate": 3.4553362603093845e-05,
"loss": 0.0906,
"step": 16000
},
{
"epoch": 0.6561841000757529,
"grad_norm": 0.23216140270233154,
"learning_rate": 3.445078166673505e-05,
"loss": 0.0916,
"step": 16025
},
{
"epoch": 0.6572077882193968,
"grad_norm": 0.26740553975105286,
"learning_rate": 3.434820073037627e-05,
"loss": 0.0939,
"step": 16050
},
{
"epoch": 0.6582314763630408,
"grad_norm": 0.26812317967414856,
"learning_rate": 3.424561979401748e-05,
"loss": 0.0903,
"step": 16075
},
{
"epoch": 0.6592551645066846,
"grad_norm": 0.2955368459224701,
"learning_rate": 3.4143038857658697e-05,
"loss": 0.0907,
"step": 16100
},
{
"epoch": 0.6602788526503286,
"grad_norm": 0.23007504642009735,
"learning_rate": 3.4040457921299904e-05,
"loss": 0.0894,
"step": 16125
},
{
"epoch": 0.6613025407939725,
"grad_norm": 0.2416328340768814,
"learning_rate": 3.393787698494112e-05,
"loss": 0.0872,
"step": 16150
},
{
"epoch": 0.6623262289376164,
"grad_norm": 0.2104121297597885,
"learning_rate": 3.3835296048582334e-05,
"loss": 0.0877,
"step": 16175
},
{
"epoch": 0.6633499170812603,
"grad_norm": 0.23629434406757355,
"learning_rate": 3.373271511222355e-05,
"loss": 0.0943,
"step": 16200
},
{
"epoch": 0.6643736052249043,
"grad_norm": 0.2717180550098419,
"learning_rate": 3.3630134175864756e-05,
"loss": 0.0879,
"step": 16225
},
{
"epoch": 0.6653972933685482,
"grad_norm": 0.27863848209381104,
"learning_rate": 3.352755323950597e-05,
"loss": 0.0881,
"step": 16250
},
{
"epoch": 0.6664209815121921,
"grad_norm": 0.2909884452819824,
"learning_rate": 3.3424972303147186e-05,
"loss": 0.0938,
"step": 16275
},
{
"epoch": 0.667444669655836,
"grad_norm": 0.18690423667430878,
"learning_rate": 3.3322391366788394e-05,
"loss": 0.0945,
"step": 16300
},
{
"epoch": 0.66846835779948,
"grad_norm": 0.2364642322063446,
"learning_rate": 3.321981043042961e-05,
"loss": 0.0981,
"step": 16325
},
{
"epoch": 0.6694920459431238,
"grad_norm": 0.23339948058128357,
"learning_rate": 3.311722949407082e-05,
"loss": 0.0959,
"step": 16350
},
{
"epoch": 0.6705157340867678,
"grad_norm": 0.3215301036834717,
"learning_rate": 3.301464855771204e-05,
"loss": 0.0915,
"step": 16375
},
{
"epoch": 0.6715394222304117,
"grad_norm": 0.21121945977210999,
"learning_rate": 3.2912067621353245e-05,
"loss": 0.0965,
"step": 16400
},
{
"epoch": 0.6725631103740557,
"grad_norm": 0.2474169135093689,
"learning_rate": 3.280948668499446e-05,
"loss": 0.0901,
"step": 16425
},
{
"epoch": 0.6735867985176995,
"grad_norm": 0.27990350127220154,
"learning_rate": 3.2706905748635675e-05,
"loss": 0.0914,
"step": 16450
},
{
"epoch": 0.6746104866613435,
"grad_norm": 0.23860132694244385,
"learning_rate": 3.260432481227689e-05,
"loss": 0.0892,
"step": 16475
},
{
"epoch": 0.6756341748049874,
"grad_norm": 0.29351699352264404,
"learning_rate": 3.25017438759181e-05,
"loss": 0.0956,
"step": 16500
},
{
"epoch": 0.6766578629486313,
"grad_norm": 0.2769309878349304,
"learning_rate": 3.239916293955931e-05,
"loss": 0.0938,
"step": 16525
},
{
"epoch": 0.6776815510922752,
"grad_norm": 0.1899634450674057,
"learning_rate": 3.229658200320053e-05,
"loss": 0.0927,
"step": 16550
},
{
"epoch": 0.6787052392359192,
"grad_norm": 0.23339390754699707,
"learning_rate": 3.219400106684174e-05,
"loss": 0.0944,
"step": 16575
},
{
"epoch": 0.6797289273795631,
"grad_norm": 0.30219605565071106,
"learning_rate": 3.209142013048295e-05,
"loss": 0.0908,
"step": 16600
},
{
"epoch": 0.680752615523207,
"grad_norm": 0.24272675812244415,
"learning_rate": 3.1988839194124164e-05,
"loss": 0.0905,
"step": 16625
},
{
"epoch": 0.6817763036668509,
"grad_norm": 0.28862476348876953,
"learning_rate": 3.188625825776538e-05,
"loss": 0.0958,
"step": 16650
},
{
"epoch": 0.6827999918104949,
"grad_norm": 0.230793759226799,
"learning_rate": 3.178367732140659e-05,
"loss": 0.0942,
"step": 16675
},
{
"epoch": 0.6838236799541387,
"grad_norm": 0.256304532289505,
"learning_rate": 3.16810963850478e-05,
"loss": 0.0908,
"step": 16700
},
{
"epoch": 0.6848473680977827,
"grad_norm": 0.24292372167110443,
"learning_rate": 3.1578515448689016e-05,
"loss": 0.0919,
"step": 16725
},
{
"epoch": 0.6858710562414266,
"grad_norm": 0.3442842662334442,
"learning_rate": 3.147593451233023e-05,
"loss": 0.0906,
"step": 16750
},
{
"epoch": 0.6868947443850705,
"grad_norm": 0.28444263339042664,
"learning_rate": 3.1373353575971445e-05,
"loss": 0.0904,
"step": 16775
},
{
"epoch": 0.6879184325287144,
"grad_norm": 0.2305566966533661,
"learning_rate": 3.127077263961265e-05,
"loss": 0.0948,
"step": 16800
},
{
"epoch": 0.6889421206723584,
"grad_norm": 0.3065620958805084,
"learning_rate": 3.116819170325387e-05,
"loss": 0.0916,
"step": 16825
},
{
"epoch": 0.6899658088160023,
"grad_norm": 0.34748420119285583,
"learning_rate": 3.106561076689508e-05,
"loss": 0.0898,
"step": 16850
},
{
"epoch": 0.6909894969596462,
"grad_norm": 0.28425559401512146,
"learning_rate": 3.09630298305363e-05,
"loss": 0.0941,
"step": 16875
},
{
"epoch": 0.6920131851032901,
"grad_norm": 0.31354910135269165,
"learning_rate": 3.0860448894177505e-05,
"loss": 0.0945,
"step": 16900
},
{
"epoch": 0.6930368732469341,
"grad_norm": 0.2128172069787979,
"learning_rate": 3.075786795781872e-05,
"loss": 0.0909,
"step": 16925
},
{
"epoch": 0.6940605613905779,
"grad_norm": 0.2469140887260437,
"learning_rate": 3.0655287021459934e-05,
"loss": 0.0939,
"step": 16950
},
{
"epoch": 0.6950842495342219,
"grad_norm": 0.35298585891723633,
"learning_rate": 3.055270608510115e-05,
"loss": 0.089,
"step": 16975
},
{
"epoch": 0.6961079376778658,
"grad_norm": 0.26399216055870056,
"learning_rate": 3.045012514874236e-05,
"loss": 0.0968,
"step": 17000
},
{
"epoch": 0.6971316258215098,
"grad_norm": 0.2543809413909912,
"learning_rate": 3.034754421238357e-05,
"loss": 0.0908,
"step": 17025
},
{
"epoch": 0.6981553139651536,
"grad_norm": 0.24737343192100525,
"learning_rate": 3.0244963276024786e-05,
"loss": 0.094,
"step": 17050
},
{
"epoch": 0.6991790021087976,
"grad_norm": 0.2577686607837677,
"learning_rate": 3.0142382339665997e-05,
"loss": 0.0933,
"step": 17075
},
{
"epoch": 0.7002026902524415,
"grad_norm": 0.28968894481658936,
"learning_rate": 3.0039801403307212e-05,
"loss": 0.0965,
"step": 17100
},
{
"epoch": 0.7012263783960854,
"grad_norm": 0.2456517517566681,
"learning_rate": 2.9937220466948423e-05,
"loss": 0.0953,
"step": 17125
},
{
"epoch": 0.7022500665397293,
"grad_norm": 0.25714367628097534,
"learning_rate": 2.9834639530589638e-05,
"loss": 0.0888,
"step": 17150
},
{
"epoch": 0.7032737546833733,
"grad_norm": 0.2177487164735794,
"learning_rate": 2.973205859423085e-05,
"loss": 0.0917,
"step": 17175
},
{
"epoch": 0.7042974428270172,
"grad_norm": 0.20064932107925415,
"learning_rate": 2.962947765787206e-05,
"loss": 0.0899,
"step": 17200
},
{
"epoch": 0.7053211309706611,
"grad_norm": 0.2717735469341278,
"learning_rate": 2.9526896721513275e-05,
"loss": 0.0939,
"step": 17225
},
{
"epoch": 0.706344819114305,
"grad_norm": 0.20536677539348602,
"learning_rate": 2.9424315785154487e-05,
"loss": 0.0941,
"step": 17250
},
{
"epoch": 0.707368507257949,
"grad_norm": 0.28099992871284485,
"learning_rate": 2.93217348487957e-05,
"loss": 0.0881,
"step": 17275
},
{
"epoch": 0.7083921954015928,
"grad_norm": 0.21004413068294525,
"learning_rate": 2.9219153912436913e-05,
"loss": 0.0945,
"step": 17300
},
{
"epoch": 0.7094158835452368,
"grad_norm": 0.24377816915512085,
"learning_rate": 2.9116572976078127e-05,
"loss": 0.0943,
"step": 17325
},
{
"epoch": 0.7104395716888807,
"grad_norm": 0.2159167379140854,
"learning_rate": 2.901399203971934e-05,
"loss": 0.0916,
"step": 17350
},
{
"epoch": 0.7114632598325247,
"grad_norm": 0.3277469277381897,
"learning_rate": 2.8911411103360553e-05,
"loss": 0.0894,
"step": 17375
},
{
"epoch": 0.7124869479761685,
"grad_norm": 0.3423548638820648,
"learning_rate": 2.8808830167001764e-05,
"loss": 0.0922,
"step": 17400
},
{
"epoch": 0.7135106361198125,
"grad_norm": 0.20151039958000183,
"learning_rate": 2.870624923064298e-05,
"loss": 0.0906,
"step": 17425
},
{
"epoch": 0.7145343242634564,
"grad_norm": 0.29227256774902344,
"learning_rate": 2.860366829428419e-05,
"loss": 0.0914,
"step": 17450
},
{
"epoch": 0.7155580124071003,
"grad_norm": 0.31062838435173035,
"learning_rate": 2.8501087357925405e-05,
"loss": 0.0932,
"step": 17475
},
{
"epoch": 0.7165817005507442,
"grad_norm": 0.24426613748073578,
"learning_rate": 2.8398506421566616e-05,
"loss": 0.0938,
"step": 17500
},
{
"epoch": 0.7176053886943882,
"grad_norm": 0.2505645155906677,
"learning_rate": 2.829592548520783e-05,
"loss": 0.0924,
"step": 17525
},
{
"epoch": 0.718629076838032,
"grad_norm": 0.21960324048995972,
"learning_rate": 2.8193344548849042e-05,
"loss": 0.0912,
"step": 17550
},
{
"epoch": 0.719652764981676,
"grad_norm": 0.25820910930633545,
"learning_rate": 2.8090763612490257e-05,
"loss": 0.0913,
"step": 17575
},
{
"epoch": 0.7206764531253199,
"grad_norm": 0.23069611191749573,
"learning_rate": 2.7988182676131468e-05,
"loss": 0.0903,
"step": 17600
},
{
"epoch": 0.7217001412689639,
"grad_norm": 0.2641305923461914,
"learning_rate": 2.7885601739772683e-05,
"loss": 0.0899,
"step": 17625
},
{
"epoch": 0.7227238294126077,
"grad_norm": 0.28528881072998047,
"learning_rate": 2.7783020803413894e-05,
"loss": 0.0922,
"step": 17650
},
{
"epoch": 0.7237475175562517,
"grad_norm": 0.297124445438385,
"learning_rate": 2.768043986705511e-05,
"loss": 0.0899,
"step": 17675
},
{
"epoch": 0.7247712056998956,
"grad_norm": 0.2650444805622101,
"learning_rate": 2.757785893069632e-05,
"loss": 0.0903,
"step": 17700
},
{
"epoch": 0.7257948938435395,
"grad_norm": 0.2515466809272766,
"learning_rate": 2.7475277994337535e-05,
"loss": 0.0943,
"step": 17725
},
{
"epoch": 0.7268185819871834,
"grad_norm": 0.29468923807144165,
"learning_rate": 2.7372697057978746e-05,
"loss": 0.0935,
"step": 17750
},
{
"epoch": 0.7278422701308274,
"grad_norm": 0.28869664669036865,
"learning_rate": 2.727011612161996e-05,
"loss": 0.0877,
"step": 17775
},
{
"epoch": 0.7288659582744713,
"grad_norm": 0.2862752377986908,
"learning_rate": 2.7167535185261172e-05,
"loss": 0.0894,
"step": 17800
},
{
"epoch": 0.7298896464181152,
"grad_norm": 0.4324943721294403,
"learning_rate": 2.7064954248902387e-05,
"loss": 0.0964,
"step": 17825
},
{
"epoch": 0.7309133345617591,
"grad_norm": 0.2106688767671585,
"learning_rate": 2.6962373312543598e-05,
"loss": 0.0941,
"step": 17850
},
{
"epoch": 0.7319370227054031,
"grad_norm": 0.2924487292766571,
"learning_rate": 2.6859792376184813e-05,
"loss": 0.0895,
"step": 17875
},
{
"epoch": 0.7329607108490469,
"grad_norm": 0.21302323043346405,
"learning_rate": 2.6757211439826024e-05,
"loss": 0.0951,
"step": 17900
},
{
"epoch": 0.7339843989926909,
"grad_norm": 0.2614041268825531,
"learning_rate": 2.665463050346724e-05,
"loss": 0.0885,
"step": 17925
},
{
"epoch": 0.7350080871363348,
"grad_norm": 0.2530576288700104,
"learning_rate": 2.655204956710845e-05,
"loss": 0.0906,
"step": 17950
},
{
"epoch": 0.7360317752799788,
"grad_norm": 0.21055959165096283,
"learning_rate": 2.6449468630749665e-05,
"loss": 0.0905,
"step": 17975
},
{
"epoch": 0.7370554634236226,
"grad_norm": 0.23487575352191925,
"learning_rate": 2.6346887694390876e-05,
"loss": 0.0886,
"step": 18000
},
{
"epoch": 0.7380791515672666,
"grad_norm": 0.2657538950443268,
"learning_rate": 2.624430675803209e-05,
"loss": 0.0902,
"step": 18025
},
{
"epoch": 0.7391028397109105,
"grad_norm": 0.2803148627281189,
"learning_rate": 2.6141725821673302e-05,
"loss": 0.0914,
"step": 18050
},
{
"epoch": 0.7401265278545544,
"grad_norm": 0.29323095083236694,
"learning_rate": 2.6039144885314516e-05,
"loss": 0.0863,
"step": 18075
},
{
"epoch": 0.7411502159981983,
"grad_norm": 0.2417263686656952,
"learning_rate": 2.5936563948955728e-05,
"loss": 0.091,
"step": 18100
},
{
"epoch": 0.7421739041418423,
"grad_norm": 0.30392271280288696,
"learning_rate": 2.583398301259694e-05,
"loss": 0.094,
"step": 18125
},
{
"epoch": 0.7431975922854861,
"grad_norm": 0.24675561487674713,
"learning_rate": 2.5731402076238154e-05,
"loss": 0.09,
"step": 18150
},
{
"epoch": 0.74422128042913,
"grad_norm": 0.28635236620903015,
"learning_rate": 2.5628821139879365e-05,
"loss": 0.0944,
"step": 18175
},
{
"epoch": 0.745244968572774,
"grad_norm": 0.3268403112888336,
"learning_rate": 2.552624020352058e-05,
"loss": 0.0914,
"step": 18200
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.32864445447921753,
"learning_rate": 2.542365926716179e-05,
"loss": 0.0956,
"step": 18225
},
{
"epoch": 0.7472923448600618,
"grad_norm": 0.2175736427307129,
"learning_rate": 2.5321078330803006e-05,
"loss": 0.0922,
"step": 18250
},
{
"epoch": 0.7483160330037057,
"grad_norm": 0.26862508058547974,
"learning_rate": 2.5218497394444217e-05,
"loss": 0.0881,
"step": 18275
},
{
"epoch": 0.7493397211473497,
"grad_norm": 0.2962358593940735,
"learning_rate": 2.511591645808543e-05,
"loss": 0.0886,
"step": 18300
},
{
"epoch": 0.7503634092909935,
"grad_norm": 0.21592926979064941,
"learning_rate": 2.5013335521726643e-05,
"loss": 0.0852,
"step": 18325
},
{
"epoch": 0.7513870974346375,
"grad_norm": 0.4917377531528473,
"learning_rate": 2.4910754585367857e-05,
"loss": 0.088,
"step": 18350
},
{
"epoch": 0.7524107855782814,
"grad_norm": 0.2455429881811142,
"learning_rate": 2.480817364900907e-05,
"loss": 0.0937,
"step": 18375
},
{
"epoch": 0.7534344737219254,
"grad_norm": 0.22315055131912231,
"learning_rate": 2.4705592712650283e-05,
"loss": 0.0928,
"step": 18400
},
{
"epoch": 0.7544581618655692,
"grad_norm": 0.2998165190219879,
"learning_rate": 2.4603011776291495e-05,
"loss": 0.0973,
"step": 18425
},
{
"epoch": 0.7554818500092132,
"grad_norm": 0.29680758714675903,
"learning_rate": 2.450043083993271e-05,
"loss": 0.0883,
"step": 18450
},
{
"epoch": 0.7565055381528571,
"grad_norm": 0.34500744938850403,
"learning_rate": 2.439784990357392e-05,
"loss": 0.0936,
"step": 18475
},
{
"epoch": 0.757529226296501,
"grad_norm": 0.2546531856060028,
"learning_rate": 2.4295268967215135e-05,
"loss": 0.0929,
"step": 18500
},
{
"epoch": 0.7585529144401449,
"grad_norm": 0.2985497713088989,
"learning_rate": 2.4192688030856347e-05,
"loss": 0.0931,
"step": 18525
},
{
"epoch": 0.7595766025837889,
"grad_norm": 0.21997804939746857,
"learning_rate": 2.409010709449756e-05,
"loss": 0.0922,
"step": 18550
},
{
"epoch": 0.7606002907274328,
"grad_norm": 0.33792802691459656,
"learning_rate": 2.3987526158138772e-05,
"loss": 0.0856,
"step": 18575
},
{
"epoch": 0.7616239788710767,
"grad_norm": 0.21099922060966492,
"learning_rate": 2.3884945221779987e-05,
"loss": 0.096,
"step": 18600
},
{
"epoch": 0.7626476670147206,
"grad_norm": 0.29002106189727783,
"learning_rate": 2.37823642854212e-05,
"loss": 0.0938,
"step": 18625
},
{
"epoch": 0.7636713551583646,
"grad_norm": 0.23993101716041565,
"learning_rate": 2.3679783349062413e-05,
"loss": 0.0875,
"step": 18650
},
{
"epoch": 0.7646950433020084,
"grad_norm": 0.2299950271844864,
"learning_rate": 2.3577202412703624e-05,
"loss": 0.0903,
"step": 18675
},
{
"epoch": 0.7657187314456524,
"grad_norm": 0.2547556757926941,
"learning_rate": 2.347462147634484e-05,
"loss": 0.0966,
"step": 18700
},
{
"epoch": 0.7667424195892963,
"grad_norm": 0.24056895077228546,
"learning_rate": 2.337204053998605e-05,
"loss": 0.0901,
"step": 18725
},
{
"epoch": 0.7677661077329402,
"grad_norm": 0.2962265610694885,
"learning_rate": 2.3269459603627265e-05,
"loss": 0.0941,
"step": 18750
},
{
"epoch": 0.7687897958765841,
"grad_norm": 0.3107589781284332,
"learning_rate": 2.3166878667268476e-05,
"loss": 0.0922,
"step": 18775
},
{
"epoch": 0.7698134840202281,
"grad_norm": 0.2781747877597809,
"learning_rate": 2.306429773090969e-05,
"loss": 0.0909,
"step": 18800
},
{
"epoch": 0.770837172163872,
"grad_norm": 0.3311710059642792,
"learning_rate": 2.2961716794550902e-05,
"loss": 0.0877,
"step": 18825
},
{
"epoch": 0.7718608603075159,
"grad_norm": 0.2895514965057373,
"learning_rate": 2.2859135858192117e-05,
"loss": 0.0973,
"step": 18850
},
{
"epoch": 0.7728845484511598,
"grad_norm": 0.24788254499435425,
"learning_rate": 2.2756554921833328e-05,
"loss": 0.0922,
"step": 18875
},
{
"epoch": 0.7739082365948038,
"grad_norm": 0.3390001952648163,
"learning_rate": 2.2653973985474543e-05,
"loss": 0.0951,
"step": 18900
},
{
"epoch": 0.7749319247384476,
"grad_norm": 0.3275790214538574,
"learning_rate": 2.2551393049115754e-05,
"loss": 0.0902,
"step": 18925
},
{
"epoch": 0.7759556128820916,
"grad_norm": 0.2598778009414673,
"learning_rate": 2.244881211275697e-05,
"loss": 0.0936,
"step": 18950
},
{
"epoch": 0.7769793010257355,
"grad_norm": 0.32007846236228943,
"learning_rate": 2.234623117639818e-05,
"loss": 0.093,
"step": 18975
},
{
"epoch": 0.7780029891693795,
"grad_norm": 0.25675615668296814,
"learning_rate": 2.2243650240039395e-05,
"loss": 0.097,
"step": 19000
},
{
"epoch": 0.7790266773130233,
"grad_norm": 0.20342758297920227,
"learning_rate": 2.2141069303680606e-05,
"loss": 0.0941,
"step": 19025
},
{
"epoch": 0.7800503654566673,
"grad_norm": 0.2361544668674469,
"learning_rate": 2.203848836732182e-05,
"loss": 0.0903,
"step": 19050
},
{
"epoch": 0.7810740536003112,
"grad_norm": 0.2677974998950958,
"learning_rate": 2.1935907430963032e-05,
"loss": 0.0938,
"step": 19075
},
{
"epoch": 0.7820977417439551,
"grad_norm": 0.3720152676105499,
"learning_rate": 2.1833326494604243e-05,
"loss": 0.0899,
"step": 19100
},
{
"epoch": 0.783121429887599,
"grad_norm": 0.30042845010757446,
"learning_rate": 2.1730745558245458e-05,
"loss": 0.0906,
"step": 19125
},
{
"epoch": 0.784145118031243,
"grad_norm": 0.25269991159439087,
"learning_rate": 2.162816462188667e-05,
"loss": 0.0899,
"step": 19150
},
{
"epoch": 0.7851688061748869,
"grad_norm": 0.21545687317848206,
"learning_rate": 2.1525583685527884e-05,
"loss": 0.0888,
"step": 19175
},
{
"epoch": 0.7861924943185308,
"grad_norm": 0.24490401148796082,
"learning_rate": 2.1423002749169095e-05,
"loss": 0.0899,
"step": 19200
},
{
"epoch": 0.7872161824621747,
"grad_norm": 0.3394610583782196,
"learning_rate": 2.132042181281031e-05,
"loss": 0.0981,
"step": 19225
},
{
"epoch": 0.7882398706058187,
"grad_norm": 0.27232640981674194,
"learning_rate": 2.121784087645152e-05,
"loss": 0.0888,
"step": 19250
},
{
"epoch": 0.7892635587494625,
"grad_norm": 0.26301074028015137,
"learning_rate": 2.1115259940092736e-05,
"loss": 0.0897,
"step": 19275
},
{
"epoch": 0.7902872468931065,
"grad_norm": 0.2940311133861542,
"learning_rate": 2.1012679003733947e-05,
"loss": 0.0912,
"step": 19300
},
{
"epoch": 0.7913109350367504,
"grad_norm": 0.24101464450359344,
"learning_rate": 2.091009806737516e-05,
"loss": 0.0933,
"step": 19325
},
{
"epoch": 0.7923346231803943,
"grad_norm": 0.3280772268772125,
"learning_rate": 2.0807517131016373e-05,
"loss": 0.0905,
"step": 19350
},
{
"epoch": 0.7933583113240382,
"grad_norm": 0.3161431550979614,
"learning_rate": 2.0704936194657588e-05,
"loss": 0.0901,
"step": 19375
},
{
"epoch": 0.7943819994676822,
"grad_norm": 0.28092876076698303,
"learning_rate": 2.06023552582988e-05,
"loss": 0.0902,
"step": 19400
},
{
"epoch": 0.7954056876113261,
"grad_norm": 0.21107934415340424,
"learning_rate": 2.0499774321940014e-05,
"loss": 0.0888,
"step": 19425
},
{
"epoch": 0.79642937575497,
"grad_norm": 0.24856053292751312,
"learning_rate": 2.0397193385581225e-05,
"loss": 0.0905,
"step": 19450
},
{
"epoch": 0.7974530638986139,
"grad_norm": 0.2561679482460022,
"learning_rate": 2.029461244922244e-05,
"loss": 0.0922,
"step": 19475
},
{
"epoch": 0.7984767520422579,
"grad_norm": 0.25557827949523926,
"learning_rate": 2.019203151286365e-05,
"loss": 0.0879,
"step": 19500
},
{
"epoch": 0.7995004401859017,
"grad_norm": 0.2589765787124634,
"learning_rate": 2.0089450576504865e-05,
"loss": 0.0928,
"step": 19525
},
{
"epoch": 0.8005241283295457,
"grad_norm": 0.21249115467071533,
"learning_rate": 1.9986869640146077e-05,
"loss": 0.0869,
"step": 19550
},
{
"epoch": 0.8015478164731896,
"grad_norm": 0.23621489107608795,
"learning_rate": 1.988428870378729e-05,
"loss": 0.092,
"step": 19575
},
{
"epoch": 0.8025715046168336,
"grad_norm": 0.2507089376449585,
"learning_rate": 1.9781707767428503e-05,
"loss": 0.0875,
"step": 19600
},
{
"epoch": 0.8035951927604774,
"grad_norm": 0.28460606932640076,
"learning_rate": 1.9679126831069717e-05,
"loss": 0.0838,
"step": 19625
},
{
"epoch": 0.8046188809041214,
"grad_norm": 0.3332251310348511,
"learning_rate": 1.9576545894710925e-05,
"loss": 0.0909,
"step": 19650
},
{
"epoch": 0.8056425690477653,
"grad_norm": 0.26824021339416504,
"learning_rate": 1.947396495835214e-05,
"loss": 0.0922,
"step": 19675
},
{
"epoch": 0.8066662571914092,
"grad_norm": 0.2643376886844635,
"learning_rate": 1.937138402199335e-05,
"loss": 0.0915,
"step": 19700
},
{
"epoch": 0.8076899453350531,
"grad_norm": 0.29947948455810547,
"learning_rate": 1.9268803085634566e-05,
"loss": 0.0919,
"step": 19725
},
{
"epoch": 0.8087136334786971,
"grad_norm": 0.37118449807167053,
"learning_rate": 1.9166222149275777e-05,
"loss": 0.0887,
"step": 19750
},
{
"epoch": 0.809737321622341,
"grad_norm": 0.32123562693595886,
"learning_rate": 1.9063641212916992e-05,
"loss": 0.0913,
"step": 19775
},
{
"epoch": 0.8107610097659849,
"grad_norm": 0.2964722514152527,
"learning_rate": 1.8961060276558203e-05,
"loss": 0.0915,
"step": 19800
},
{
"epoch": 0.8117846979096288,
"grad_norm": 0.25374674797058105,
"learning_rate": 1.8858479340199418e-05,
"loss": 0.0918,
"step": 19825
},
{
"epoch": 0.8128083860532728,
"grad_norm": 0.30407896637916565,
"learning_rate": 1.875589840384063e-05,
"loss": 0.0934,
"step": 19850
},
{
"epoch": 0.8138320741969166,
"grad_norm": 0.284839928150177,
"learning_rate": 1.8653317467481844e-05,
"loss": 0.0868,
"step": 19875
},
{
"epoch": 0.8148557623405606,
"grad_norm": 0.27440112829208374,
"learning_rate": 1.8550736531123055e-05,
"loss": 0.0981,
"step": 19900
},
{
"epoch": 0.8158794504842045,
"grad_norm": 0.293817400932312,
"learning_rate": 1.844815559476427e-05,
"loss": 0.0937,
"step": 19925
},
{
"epoch": 0.8169031386278484,
"grad_norm": 0.25099506974220276,
"learning_rate": 1.834557465840548e-05,
"loss": 0.0907,
"step": 19950
},
{
"epoch": 0.8179268267714923,
"grad_norm": 0.2696509063243866,
"learning_rate": 1.8242993722046696e-05,
"loss": 0.0898,
"step": 19975
},
{
"epoch": 0.8189505149151363,
"grad_norm": 0.23524117469787598,
"learning_rate": 1.8140412785687907e-05,
"loss": 0.0834,
"step": 20000
},
{
"epoch": 0.8199742030587802,
"grad_norm": 0.28562095761299133,
"learning_rate": 1.803783184932912e-05,
"loss": 0.0949,
"step": 20025
},
{
"epoch": 0.8209978912024241,
"grad_norm": 0.3326290249824524,
"learning_rate": 1.7935250912970333e-05,
"loss": 0.086,
"step": 20050
},
{
"epoch": 0.822021579346068,
"grad_norm": 0.335920125246048,
"learning_rate": 1.7832669976611548e-05,
"loss": 0.0898,
"step": 20075
},
{
"epoch": 0.823045267489712,
"grad_norm": 0.23107844591140747,
"learning_rate": 1.773008904025276e-05,
"loss": 0.0911,
"step": 20100
},
{
"epoch": 0.8240689556333558,
"grad_norm": 0.2805933356285095,
"learning_rate": 1.7627508103893973e-05,
"loss": 0.0903,
"step": 20125
},
{
"epoch": 0.8250926437769998,
"grad_norm": 0.2637193500995636,
"learning_rate": 1.7524927167535185e-05,
"loss": 0.0934,
"step": 20150
},
{
"epoch": 0.8261163319206437,
"grad_norm": 0.25126680731773376,
"learning_rate": 1.74223462311764e-05,
"loss": 0.0967,
"step": 20175
},
{
"epoch": 0.8271400200642877,
"grad_norm": 0.21200938522815704,
"learning_rate": 1.731976529481761e-05,
"loss": 0.0879,
"step": 20200
},
{
"epoch": 0.8281637082079315,
"grad_norm": 0.2675575017929077,
"learning_rate": 1.7217184358458825e-05,
"loss": 0.0933,
"step": 20225
},
{
"epoch": 0.8291873963515755,
"grad_norm": 0.24949528276920319,
"learning_rate": 1.7114603422100037e-05,
"loss": 0.0834,
"step": 20250
},
{
"epoch": 0.8302110844952194,
"grad_norm": 0.31639212369918823,
"learning_rate": 1.701202248574125e-05,
"loss": 0.0862,
"step": 20275
},
{
"epoch": 0.8312347726388633,
"grad_norm": 0.31430932879447937,
"learning_rate": 1.6909441549382463e-05,
"loss": 0.0895,
"step": 20300
},
{
"epoch": 0.8322584607825072,
"grad_norm": 0.2188422530889511,
"learning_rate": 1.6806860613023674e-05,
"loss": 0.0866,
"step": 20325
},
{
"epoch": 0.8332821489261512,
"grad_norm": 0.26949557662010193,
"learning_rate": 1.670427967666489e-05,
"loss": 0.0874,
"step": 20350
},
{
"epoch": 0.8343058370697951,
"grad_norm": 0.2512851655483246,
"learning_rate": 1.66016987403061e-05,
"loss": 0.0886,
"step": 20375
},
{
"epoch": 0.835329525213439,
"grad_norm": 0.21398603916168213,
"learning_rate": 1.6499117803947314e-05,
"loss": 0.0901,
"step": 20400
},
{
"epoch": 0.8363532133570829,
"grad_norm": 0.3579723834991455,
"learning_rate": 1.6396536867588526e-05,
"loss": 0.089,
"step": 20425
},
{
"epoch": 0.8373769015007269,
"grad_norm": 0.25546953082084656,
"learning_rate": 1.629395593122974e-05,
"loss": 0.09,
"step": 20450
},
{
"epoch": 0.8384005896443707,
"grad_norm": 0.30521437525749207,
"learning_rate": 1.6191374994870952e-05,
"loss": 0.0878,
"step": 20475
},
{
"epoch": 0.8394242777880146,
"grad_norm": 0.25270193815231323,
"learning_rate": 1.6088794058512166e-05,
"loss": 0.0871,
"step": 20500
},
{
"epoch": 0.8404479659316586,
"grad_norm": 0.31624093651771545,
"learning_rate": 1.5986213122153378e-05,
"loss": 0.0872,
"step": 20525
},
{
"epoch": 0.8414716540753026,
"grad_norm": 0.3739725947380066,
"learning_rate": 1.5883632185794592e-05,
"loss": 0.0864,
"step": 20550
},
{
"epoch": 0.8424953422189464,
"grad_norm": 0.25170573592185974,
"learning_rate": 1.5781051249435804e-05,
"loss": 0.0927,
"step": 20575
},
{
"epoch": 0.8435190303625903,
"grad_norm": 0.24413146078586578,
"learning_rate": 1.5678470313077018e-05,
"loss": 0.0878,
"step": 20600
},
{
"epoch": 0.8445427185062343,
"grad_norm": 0.26711735129356384,
"learning_rate": 1.557588937671823e-05,
"loss": 0.0898,
"step": 20625
},
{
"epoch": 0.8455664066498781,
"grad_norm": 0.2967755198478699,
"learning_rate": 1.5473308440359444e-05,
"loss": 0.093,
"step": 20650
},
{
"epoch": 0.8465900947935221,
"grad_norm": 0.25452178716659546,
"learning_rate": 1.5370727504000655e-05,
"loss": 0.088,
"step": 20675
},
{
"epoch": 0.847613782937166,
"grad_norm": 0.22610174119472504,
"learning_rate": 1.526814656764187e-05,
"loss": 0.0844,
"step": 20700
},
{
"epoch": 0.8486374710808099,
"grad_norm": 0.2170991748571396,
"learning_rate": 1.5165565631283083e-05,
"loss": 0.0884,
"step": 20725
},
{
"epoch": 0.8496611592244538,
"grad_norm": 0.2881997227668762,
"learning_rate": 1.5062984694924296e-05,
"loss": 0.0935,
"step": 20750
},
{
"epoch": 0.8506848473680978,
"grad_norm": 0.2766591012477875,
"learning_rate": 1.4960403758565509e-05,
"loss": 0.0874,
"step": 20775
},
{
"epoch": 0.8517085355117417,
"grad_norm": 0.2786926329135895,
"learning_rate": 1.485782282220672e-05,
"loss": 0.0892,
"step": 20800
},
{
"epoch": 0.8527322236553856,
"grad_norm": 0.22950054705142975,
"learning_rate": 1.4755241885847933e-05,
"loss": 0.089,
"step": 20825
},
{
"epoch": 0.8537559117990295,
"grad_norm": 0.43880143761634827,
"learning_rate": 1.4652660949489146e-05,
"loss": 0.0888,
"step": 20850
},
{
"epoch": 0.8547795999426735,
"grad_norm": 0.24918793141841888,
"learning_rate": 1.455008001313036e-05,
"loss": 0.0924,
"step": 20875
},
{
"epoch": 0.8558032880863173,
"grad_norm": 0.26215484738349915,
"learning_rate": 1.4447499076771572e-05,
"loss": 0.0903,
"step": 20900
},
{
"epoch": 0.8568269762299613,
"grad_norm": 0.2752866744995117,
"learning_rate": 1.4344918140412785e-05,
"loss": 0.0916,
"step": 20925
},
{
"epoch": 0.8578506643736052,
"grad_norm": 0.2551786005496979,
"learning_rate": 1.4242337204053998e-05,
"loss": 0.0887,
"step": 20950
},
{
"epoch": 0.8588743525172492,
"grad_norm": 0.2203332632780075,
"learning_rate": 1.4139756267695211e-05,
"loss": 0.086,
"step": 20975
},
{
"epoch": 0.859898040660893,
"grad_norm": 0.25602227449417114,
"learning_rate": 1.4037175331336424e-05,
"loss": 0.0927,
"step": 21000
},
{
"epoch": 0.860921728804537,
"grad_norm": 0.27257677912712097,
"learning_rate": 1.3934594394977637e-05,
"loss": 0.095,
"step": 21025
},
{
"epoch": 0.8619454169481809,
"grad_norm": 0.24853083491325378,
"learning_rate": 1.383201345861885e-05,
"loss": 0.0896,
"step": 21050
},
{
"epoch": 0.8629691050918248,
"grad_norm": 0.22490383684635162,
"learning_rate": 1.3729432522260063e-05,
"loss": 0.089,
"step": 21075
},
{
"epoch": 0.8639927932354687,
"grad_norm": 0.25305449962615967,
"learning_rate": 1.3626851585901276e-05,
"loss": 0.0879,
"step": 21100
},
{
"epoch": 0.8650164813791127,
"grad_norm": 0.31005653738975525,
"learning_rate": 1.3524270649542489e-05,
"loss": 0.0927,
"step": 21125
},
{
"epoch": 0.8660401695227566,
"grad_norm": 0.24999596178531647,
"learning_rate": 1.3421689713183702e-05,
"loss": 0.089,
"step": 21150
},
{
"epoch": 0.8670638576664005,
"grad_norm": 0.23844856023788452,
"learning_rate": 1.3319108776824915e-05,
"loss": 0.0846,
"step": 21175
},
{
"epoch": 0.8680875458100444,
"grad_norm": 0.2782473564147949,
"learning_rate": 1.3216527840466128e-05,
"loss": 0.0931,
"step": 21200
},
{
"epoch": 0.8691112339536884,
"grad_norm": 0.22946637868881226,
"learning_rate": 1.3113946904107341e-05,
"loss": 0.0881,
"step": 21225
},
{
"epoch": 0.8701349220973322,
"grad_norm": 0.28429850935935974,
"learning_rate": 1.3011365967748554e-05,
"loss": 0.0881,
"step": 21250
},
{
"epoch": 0.8711586102409762,
"grad_norm": 0.39349105954170227,
"learning_rate": 1.2908785031389767e-05,
"loss": 0.0914,
"step": 21275
},
{
"epoch": 0.8721822983846201,
"grad_norm": 0.3252253234386444,
"learning_rate": 1.280620409503098e-05,
"loss": 0.0914,
"step": 21300
},
{
"epoch": 0.873205986528264,
"grad_norm": 0.2974836528301239,
"learning_rate": 1.2703623158672193e-05,
"loss": 0.0924,
"step": 21325
},
{
"epoch": 0.8742296746719079,
"grad_norm": 0.27263307571411133,
"learning_rate": 1.2601042222313406e-05,
"loss": 0.092,
"step": 21350
},
{
"epoch": 0.8752533628155519,
"grad_norm": 0.34150230884552,
"learning_rate": 1.2498461285954619e-05,
"loss": 0.0909,
"step": 21375
},
{
"epoch": 0.8762770509591958,
"grad_norm": 0.27397677302360535,
"learning_rate": 1.2395880349595832e-05,
"loss": 0.091,
"step": 21400
},
{
"epoch": 0.8773007391028397,
"grad_norm": 0.28834134340286255,
"learning_rate": 1.2293299413237045e-05,
"loss": 0.0951,
"step": 21425
},
{
"epoch": 0.8783244272464836,
"grad_norm": 0.2486167699098587,
"learning_rate": 1.2190718476878258e-05,
"loss": 0.0838,
"step": 21450
},
{
"epoch": 0.8793481153901276,
"grad_norm": 0.3068005442619324,
"learning_rate": 1.208813754051947e-05,
"loss": 0.0897,
"step": 21475
},
{
"epoch": 0.8803718035337714,
"grad_norm": 0.2985325753688812,
"learning_rate": 1.1985556604160684e-05,
"loss": 0.0897,
"step": 21500
},
{
"epoch": 0.8813954916774154,
"grad_norm": 0.2797314524650574,
"learning_rate": 1.1882975667801897e-05,
"loss": 0.0907,
"step": 21525
},
{
"epoch": 0.8824191798210593,
"grad_norm": 0.22625084221363068,
"learning_rate": 1.178039473144311e-05,
"loss": 0.0898,
"step": 21550
},
{
"epoch": 0.8834428679647033,
"grad_norm": 0.23003660142421722,
"learning_rate": 1.1677813795084323e-05,
"loss": 0.0896,
"step": 21575
},
{
"epoch": 0.8844665561083471,
"grad_norm": 0.2965420186519623,
"learning_rate": 1.1575232858725536e-05,
"loss": 0.0889,
"step": 21600
},
{
"epoch": 0.8854902442519911,
"grad_norm": 0.332224577665329,
"learning_rate": 1.1472651922366748e-05,
"loss": 0.0886,
"step": 21625
},
{
"epoch": 0.886513932395635,
"grad_norm": 0.27045127749443054,
"learning_rate": 1.1370070986007961e-05,
"loss": 0.0899,
"step": 21650
},
{
"epoch": 0.8875376205392789,
"grad_norm": 0.26024821400642395,
"learning_rate": 1.1267490049649174e-05,
"loss": 0.0939,
"step": 21675
},
{
"epoch": 0.8885613086829228,
"grad_norm": 0.2873280942440033,
"learning_rate": 1.1164909113290387e-05,
"loss": 0.0912,
"step": 21700
},
{
"epoch": 0.8895849968265668,
"grad_norm": 0.2818579077720642,
"learning_rate": 1.1062328176931599e-05,
"loss": 0.0909,
"step": 21725
},
{
"epoch": 0.8906086849702107,
"grad_norm": 0.33922845125198364,
"learning_rate": 1.0959747240572812e-05,
"loss": 0.0859,
"step": 21750
},
{
"epoch": 0.8916323731138546,
"grad_norm": 0.3189659118652344,
"learning_rate": 1.0857166304214025e-05,
"loss": 0.0874,
"step": 21775
},
{
"epoch": 0.8926560612574985,
"grad_norm": 0.2925044000148773,
"learning_rate": 1.0754585367855238e-05,
"loss": 0.0874,
"step": 21800
},
{
"epoch": 0.8936797494011425,
"grad_norm": 0.36518415808677673,
"learning_rate": 1.065200443149645e-05,
"loss": 0.0905,
"step": 21825
},
{
"epoch": 0.8947034375447863,
"grad_norm": 0.29783540964126587,
"learning_rate": 1.0549423495137664e-05,
"loss": 0.0851,
"step": 21850
},
{
"epoch": 0.8957271256884303,
"grad_norm": 0.23640768229961395,
"learning_rate": 1.0446842558778877e-05,
"loss": 0.0901,
"step": 21875
},
{
"epoch": 0.8967508138320742,
"grad_norm": 0.26059839129447937,
"learning_rate": 1.034426162242009e-05,
"loss": 0.0903,
"step": 21900
},
{
"epoch": 0.8977745019757181,
"grad_norm": 0.3090721368789673,
"learning_rate": 1.0241680686061302e-05,
"loss": 0.0921,
"step": 21925
},
{
"epoch": 0.898798190119362,
"grad_norm": 0.3036380112171173,
"learning_rate": 1.0139099749702515e-05,
"loss": 0.0902,
"step": 21950
},
{
"epoch": 0.899821878263006,
"grad_norm": 0.27495357394218445,
"learning_rate": 1.0036518813343728e-05,
"loss": 0.0855,
"step": 21975
},
{
"epoch": 0.9008455664066499,
"grad_norm": 0.27286654710769653,
"learning_rate": 9.933937876984941e-06,
"loss": 0.0928,
"step": 22000
},
{
"epoch": 0.9018692545502938,
"grad_norm": 0.27504658699035645,
"learning_rate": 9.831356940626154e-06,
"loss": 0.0905,
"step": 22025
},
{
"epoch": 0.9028929426939377,
"grad_norm": 0.25373876094818115,
"learning_rate": 9.728776004267367e-06,
"loss": 0.0911,
"step": 22050
},
{
"epoch": 0.9039166308375817,
"grad_norm": 0.2752918601036072,
"learning_rate": 9.62619506790858e-06,
"loss": 0.0897,
"step": 22075
},
{
"epoch": 0.9049403189812255,
"grad_norm": 0.28456592559814453,
"learning_rate": 9.523614131549793e-06,
"loss": 0.085,
"step": 22100
},
{
"epoch": 0.9059640071248695,
"grad_norm": 0.2836301028728485,
"learning_rate": 9.421033195191006e-06,
"loss": 0.0879,
"step": 22125
},
{
"epoch": 0.9069876952685134,
"grad_norm": 0.2792745530605316,
"learning_rate": 9.31845225883222e-06,
"loss": 0.086,
"step": 22150
},
{
"epoch": 0.9080113834121574,
"grad_norm": 0.2640101909637451,
"learning_rate": 9.215871322473432e-06,
"loss": 0.0942,
"step": 22175
},
{
"epoch": 0.9090350715558012,
"grad_norm": 0.28286224603652954,
"learning_rate": 9.113290386114645e-06,
"loss": 0.0868,
"step": 22200
},
{
"epoch": 0.9100587596994452,
"grad_norm": 0.3581150770187378,
"learning_rate": 9.010709449755858e-06,
"loss": 0.092,
"step": 22225
},
{
"epoch": 0.9110824478430891,
"grad_norm": 0.2819570302963257,
"learning_rate": 8.908128513397071e-06,
"loss": 0.092,
"step": 22250
},
{
"epoch": 0.912106135986733,
"grad_norm": 0.2538643777370453,
"learning_rate": 8.805547577038284e-06,
"loss": 0.0922,
"step": 22275
},
{
"epoch": 0.9131298241303769,
"grad_norm": 0.2901509404182434,
"learning_rate": 8.702966640679497e-06,
"loss": 0.0862,
"step": 22300
},
{
"epoch": 0.9141535122740209,
"grad_norm": 0.28954175114631653,
"learning_rate": 8.60038570432071e-06,
"loss": 0.0879,
"step": 22325
},
{
"epoch": 0.9151772004176648,
"grad_norm": 0.26981502771377563,
"learning_rate": 8.497804767961923e-06,
"loss": 0.0889,
"step": 22350
},
{
"epoch": 0.9162008885613087,
"grad_norm": 0.3008342683315277,
"learning_rate": 8.395223831603136e-06,
"loss": 0.088,
"step": 22375
},
{
"epoch": 0.9172245767049526,
"grad_norm": 0.23977133631706238,
"learning_rate": 8.292642895244349e-06,
"loss": 0.0896,
"step": 22400
},
{
"epoch": 0.9182482648485966,
"grad_norm": 0.21286515891551971,
"learning_rate": 8.190061958885562e-06,
"loss": 0.0933,
"step": 22425
},
{
"epoch": 0.9192719529922404,
"grad_norm": 0.3176520764827728,
"learning_rate": 8.087481022526775e-06,
"loss": 0.0898,
"step": 22450
},
{
"epoch": 0.9202956411358844,
"grad_norm": 0.2136741727590561,
"learning_rate": 7.984900086167988e-06,
"loss": 0.0911,
"step": 22475
},
{
"epoch": 0.9213193292795283,
"grad_norm": 0.32107657194137573,
"learning_rate": 7.882319149809201e-06,
"loss": 0.0874,
"step": 22500
},
{
"epoch": 0.9223430174231722,
"grad_norm": 0.2349776327610016,
"learning_rate": 7.779738213450414e-06,
"loss": 0.0867,
"step": 22525
},
{
"epoch": 0.9233667055668161,
"grad_norm": 0.2386864870786667,
"learning_rate": 7.677157277091627e-06,
"loss": 0.0878,
"step": 22550
},
{
"epoch": 0.9243903937104601,
"grad_norm": 0.270991712808609,
"learning_rate": 7.574576340732839e-06,
"loss": 0.0837,
"step": 22575
},
{
"epoch": 0.925414081854104,
"grad_norm": 0.33399784564971924,
"learning_rate": 7.471995404374052e-06,
"loss": 0.0901,
"step": 22600
},
{
"epoch": 0.9264377699977479,
"grad_norm": 0.2850496470928192,
"learning_rate": 7.369414468015265e-06,
"loss": 0.0898,
"step": 22625
},
{
"epoch": 0.9274614581413918,
"grad_norm": 0.32937246561050415,
"learning_rate": 7.266833531656478e-06,
"loss": 0.0903,
"step": 22650
},
{
"epoch": 0.9284851462850358,
"grad_norm": 0.22164273262023926,
"learning_rate": 7.164252595297691e-06,
"loss": 0.0928,
"step": 22675
},
{
"epoch": 0.9295088344286796,
"grad_norm": 0.2599170506000519,
"learning_rate": 7.061671658938904e-06,
"loss": 0.0874,
"step": 22700
},
{
"epoch": 0.9305325225723236,
"grad_norm": 0.3116656243801117,
"learning_rate": 6.959090722580117e-06,
"loss": 0.0845,
"step": 22725
},
{
"epoch": 0.9315562107159675,
"grad_norm": 0.27648812532424927,
"learning_rate": 6.85650978622133e-06,
"loss": 0.0866,
"step": 22750
},
{
"epoch": 0.9325798988596115,
"grad_norm": 0.26359742879867554,
"learning_rate": 6.753928849862543e-06,
"loss": 0.0884,
"step": 22775
},
{
"epoch": 0.9336035870032553,
"grad_norm": 0.26720476150512695,
"learning_rate": 6.651347913503756e-06,
"loss": 0.0867,
"step": 22800
},
{
"epoch": 0.9346272751468993,
"grad_norm": 0.2515944540500641,
"learning_rate": 6.548766977144969e-06,
"loss": 0.0883,
"step": 22825
},
{
"epoch": 0.9356509632905432,
"grad_norm": 0.23396004736423492,
"learning_rate": 6.446186040786182e-06,
"loss": 0.0883,
"step": 22850
},
{
"epoch": 0.936674651434187,
"grad_norm": 0.2513067424297333,
"learning_rate": 6.343605104427394e-06,
"loss": 0.0868,
"step": 22875
},
{
"epoch": 0.937698339577831,
"grad_norm": 0.29367002844810486,
"learning_rate": 6.241024168068607e-06,
"loss": 0.0932,
"step": 22900
},
{
"epoch": 0.938722027721475,
"grad_norm": 0.2306540161371231,
"learning_rate": 6.13844323170982e-06,
"loss": 0.0918,
"step": 22925
},
{
"epoch": 0.9397457158651189,
"grad_norm": 0.27428171038627625,
"learning_rate": 6.035862295351033e-06,
"loss": 0.0881,
"step": 22950
},
{
"epoch": 0.9407694040087627,
"grad_norm": 0.3886117935180664,
"learning_rate": 5.933281358992245e-06,
"loss": 0.087,
"step": 22975
},
{
"epoch": 0.9417930921524067,
"grad_norm": 0.25603532791137695,
"learning_rate": 5.830700422633458e-06,
"loss": 0.0856,
"step": 23000
},
{
"epoch": 0.9428167802960506,
"grad_norm": 0.30329135060310364,
"learning_rate": 5.728119486274671e-06,
"loss": 0.093,
"step": 23025
},
{
"epoch": 0.9438404684396945,
"grad_norm": 0.26778334379196167,
"learning_rate": 5.625538549915884e-06,
"loss": 0.0896,
"step": 23050
},
{
"epoch": 0.9448641565833384,
"grad_norm": 0.28244808316230774,
"learning_rate": 5.522957613557097e-06,
"loss": 0.0895,
"step": 23075
},
{
"epoch": 0.9458878447269824,
"grad_norm": 0.353553831577301,
"learning_rate": 5.42037667719831e-06,
"loss": 0.0918,
"step": 23100
},
{
"epoch": 0.9469115328706262,
"grad_norm": 0.3107817769050598,
"learning_rate": 5.317795740839523e-06,
"loss": 0.0929,
"step": 23125
},
{
"epoch": 0.9479352210142702,
"grad_norm": 0.2637424170970917,
"learning_rate": 5.215214804480736e-06,
"loss": 0.0907,
"step": 23150
},
{
"epoch": 0.9489589091579141,
"grad_norm": 0.2971089780330658,
"learning_rate": 5.112633868121949e-06,
"loss": 0.09,
"step": 23175
},
{
"epoch": 0.9499825973015581,
"grad_norm": 0.22394390404224396,
"learning_rate": 5.010052931763162e-06,
"loss": 0.0903,
"step": 23200
},
{
"epoch": 0.9510062854452019,
"grad_norm": 0.2777024805545807,
"learning_rate": 4.9074719954043746e-06,
"loss": 0.0873,
"step": 23225
},
{
"epoch": 0.9520299735888459,
"grad_norm": 0.24165412783622742,
"learning_rate": 4.8048910590455875e-06,
"loss": 0.0911,
"step": 23250
},
{
"epoch": 0.9530536617324898,
"grad_norm": 0.2876558005809784,
"learning_rate": 4.7023101226868005e-06,
"loss": 0.0904,
"step": 23275
},
{
"epoch": 0.9540773498761337,
"grad_norm": 0.2749604284763336,
"learning_rate": 4.5997291863280135e-06,
"loss": 0.0894,
"step": 23300
},
{
"epoch": 0.9551010380197776,
"grad_norm": 0.2758445143699646,
"learning_rate": 4.4971482499692265e-06,
"loss": 0.0899,
"step": 23325
},
{
"epoch": 0.9561247261634216,
"grad_norm": 0.20477938652038574,
"learning_rate": 4.3945673136104394e-06,
"loss": 0.0878,
"step": 23350
},
{
"epoch": 0.9571484143070655,
"grad_norm": 0.3615458607673645,
"learning_rate": 4.291986377251652e-06,
"loss": 0.0858,
"step": 23375
},
{
"epoch": 0.9581721024507094,
"grad_norm": 0.28123295307159424,
"learning_rate": 4.189405440892865e-06,
"loss": 0.087,
"step": 23400
},
{
"epoch": 0.9591957905943533,
"grad_norm": 0.30753856897354126,
"learning_rate": 4.0868245045340775e-06,
"loss": 0.0896,
"step": 23425
},
{
"epoch": 0.9602194787379973,
"grad_norm": 0.31176239252090454,
"learning_rate": 3.9842435681752905e-06,
"loss": 0.0884,
"step": 23450
},
{
"epoch": 0.9612431668816411,
"grad_norm": 0.29048678278923035,
"learning_rate": 3.8816626318165035e-06,
"loss": 0.0818,
"step": 23475
},
{
"epoch": 0.9622668550252851,
"grad_norm": 0.2853899896144867,
"learning_rate": 3.779081695457716e-06,
"loss": 0.088,
"step": 23500
},
{
"epoch": 0.963290543168929,
"grad_norm": 0.2238619327545166,
"learning_rate": 3.676500759098929e-06,
"loss": 0.0901,
"step": 23525
},
{
"epoch": 0.964314231312573,
"grad_norm": 0.44698524475097656,
"learning_rate": 3.573919822740142e-06,
"loss": 0.0856,
"step": 23550
},
{
"epoch": 0.9653379194562168,
"grad_norm": 0.3152156174182892,
"learning_rate": 3.471338886381355e-06,
"loss": 0.0931,
"step": 23575
},
{
"epoch": 0.9663616075998608,
"grad_norm": 0.27057376503944397,
"learning_rate": 3.368757950022568e-06,
"loss": 0.0901,
"step": 23600
},
{
"epoch": 0.9673852957435047,
"grad_norm": 0.29068031907081604,
"learning_rate": 3.2661770136637804e-06,
"loss": 0.0893,
"step": 23625
},
{
"epoch": 0.9684089838871486,
"grad_norm": 0.2840330898761749,
"learning_rate": 3.1635960773049934e-06,
"loss": 0.0888,
"step": 23650
},
{
"epoch": 0.9694326720307925,
"grad_norm": 0.31523531675338745,
"learning_rate": 3.061015140946207e-06,
"loss": 0.0875,
"step": 23675
},
{
"epoch": 0.9704563601744365,
"grad_norm": 0.2516843378543854,
"learning_rate": 2.95843420458742e-06,
"loss": 0.0915,
"step": 23700
},
{
"epoch": 0.9714800483180803,
"grad_norm": 0.26545655727386475,
"learning_rate": 2.8558532682286323e-06,
"loss": 0.0848,
"step": 23725
},
{
"epoch": 0.9725037364617243,
"grad_norm": 0.300320565700531,
"learning_rate": 2.7532723318698453e-06,
"loss": 0.0886,
"step": 23750
},
{
"epoch": 0.9735274246053682,
"grad_norm": 0.3137941360473633,
"learning_rate": 2.6506913955110583e-06,
"loss": 0.0872,
"step": 23775
},
{
"epoch": 0.9745511127490122,
"grad_norm": 0.31483328342437744,
"learning_rate": 2.5481104591522713e-06,
"loss": 0.0895,
"step": 23800
},
{
"epoch": 0.975574800892656,
"grad_norm": 0.28136733174324036,
"learning_rate": 2.4455295227934842e-06,
"loss": 0.0894,
"step": 23825
},
{
"epoch": 0.9765984890363,
"grad_norm": 0.24842825531959534,
"learning_rate": 2.342948586434697e-06,
"loss": 0.0891,
"step": 23850
},
{
"epoch": 0.9776221771799439,
"grad_norm": 0.29128360748291016,
"learning_rate": 2.2403676500759098e-06,
"loss": 0.0939,
"step": 23875
},
{
"epoch": 0.9786458653235878,
"grad_norm": 0.27355626225471497,
"learning_rate": 2.1377867137171227e-06,
"loss": 0.0861,
"step": 23900
},
{
"epoch": 0.9796695534672317,
"grad_norm": 0.424562931060791,
"learning_rate": 2.0352057773583357e-06,
"loss": 0.0888,
"step": 23925
},
{
"epoch": 0.9806932416108757,
"grad_norm": 0.3024253845214844,
"learning_rate": 1.9326248409995487e-06,
"loss": 0.0889,
"step": 23950
},
{
"epoch": 0.9817169297545196,
"grad_norm": 0.3305220603942871,
"learning_rate": 1.8300439046407617e-06,
"loss": 0.091,
"step": 23975
},
{
"epoch": 0.9827406178981635,
"grad_norm": 0.29790905117988586,
"learning_rate": 1.7274629682819746e-06,
"loss": 0.0882,
"step": 24000
},
{
"epoch": 0.9837643060418074,
"grad_norm": 0.3197941184043884,
"learning_rate": 1.6248820319231876e-06,
"loss": 0.0878,
"step": 24025
},
{
"epoch": 0.9847879941854514,
"grad_norm": 0.2794630229473114,
"learning_rate": 1.5223010955644004e-06,
"loss": 0.0878,
"step": 24050
},
{
"epoch": 0.9858116823290952,
"grad_norm": 0.2822708487510681,
"learning_rate": 1.4197201592056133e-06,
"loss": 0.0853,
"step": 24075
},
{
"epoch": 0.9868353704727392,
"grad_norm": 0.2595159709453583,
"learning_rate": 1.3171392228468263e-06,
"loss": 0.0901,
"step": 24100
},
{
"epoch": 0.9878590586163831,
"grad_norm": 0.27910885214805603,
"learning_rate": 1.214558286488039e-06,
"loss": 0.0837,
"step": 24125
},
{
"epoch": 0.9888827467600271,
"grad_norm": 0.2924407720565796,
"learning_rate": 1.111977350129252e-06,
"loss": 0.086,
"step": 24150
},
{
"epoch": 0.9899064349036709,
"grad_norm": 0.2329237014055252,
"learning_rate": 1.0093964137704648e-06,
"loss": 0.094,
"step": 24175
},
{
"epoch": 0.9909301230473149,
"grad_norm": 0.2659105062484741,
"learning_rate": 9.068154774116778e-07,
"loss": 0.086,
"step": 24200
},
{
"epoch": 0.9919538111909588,
"grad_norm": 0.2556705176830292,
"learning_rate": 8.042345410528908e-07,
"loss": 0.0906,
"step": 24225
},
{
"epoch": 0.9929774993346027,
"grad_norm": 0.2836422324180603,
"learning_rate": 7.016536046941037e-07,
"loss": 0.0934,
"step": 24250
},
{
"epoch": 0.9940011874782466,
"grad_norm": 0.26003921031951904,
"learning_rate": 5.990726683353166e-07,
"loss": 0.089,
"step": 24275
},
{
"epoch": 0.9950248756218906,
"grad_norm": 0.2415328323841095,
"learning_rate": 4.964917319765296e-07,
"loss": 0.0882,
"step": 24300
},
{
"epoch": 0.9960485637655345,
"grad_norm": 0.48855510354042053,
"learning_rate": 3.9391079561774244e-07,
"loss": 0.0895,
"step": 24325
},
{
"epoch": 0.9970722519091784,
"grad_norm": 0.2958788573741913,
"learning_rate": 2.9132985925895536e-07,
"loss": 0.084,
"step": 24350
},
{
"epoch": 0.9980959400528223,
"grad_norm": 0.24037177860736847,
"learning_rate": 1.8874892290016825e-07,
"loss": 0.0894,
"step": 24375
},
{
"epoch": 0.9991196281964663,
"grad_norm": 0.32911139726638794,
"learning_rate": 8.616798654138116e-08,
"loss": 0.0882,
"step": 24400
}
],
"logging_steps": 25,
"max_steps": 24421,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.659903259460792e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}