{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999795262371272, "eval_steps": 500, "global_step": 24421, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010236881436439204, "grad_norm": 1.6028783321380615, "learning_rate": 5e-05, "loss": 1.4176, "step": 25 }, { "epoch": 0.0020473762872878407, "grad_norm": 0.7411264777183533, "learning_rate": 0.0001, "loss": 0.5905, "step": 50 }, { "epoch": 0.003071064430931761, "grad_norm": 1.4544100761413574, "learning_rate": 9.989741906364122e-05, "loss": 0.2869, "step": 75 }, { "epoch": 0.004094752574575681, "grad_norm": 0.31396615505218506, "learning_rate": 9.979483812728243e-05, "loss": 0.23, "step": 100 }, { "epoch": 0.005118440718219602, "grad_norm": 0.7768864035606384, "learning_rate": 9.969225719092365e-05, "loss": 0.2192, "step": 125 }, { "epoch": 0.006142128861863522, "grad_norm": 0.3052266836166382, "learning_rate": 9.958967625456485e-05, "loss": 0.2055, "step": 150 }, { "epoch": 0.007165817005507442, "grad_norm": 0.2620660960674286, "learning_rate": 9.948709531820606e-05, "loss": 0.1947, "step": 175 }, { "epoch": 0.008189505149151363, "grad_norm": 0.31376323103904724, "learning_rate": 9.938451438184728e-05, "loss": 0.1904, "step": 200 }, { "epoch": 0.009213193292795283, "grad_norm": 0.30490225553512573, "learning_rate": 9.92819334454885e-05, "loss": 0.1953, "step": 225 }, { "epoch": 0.010236881436439204, "grad_norm": 0.33286434412002563, "learning_rate": 9.917935250912971e-05, "loss": 0.1928, "step": 250 }, { "epoch": 0.011260569580083124, "grad_norm": 0.36407458782196045, "learning_rate": 9.907677157277092e-05, "loss": 0.1771, "step": 275 }, { "epoch": 0.012284257723727043, "grad_norm": 0.3681598901748657, "learning_rate": 9.897419063641214e-05, "loss": 0.1858, "step": 300 }, { "epoch": 0.013307945867370965, "grad_norm": 0.26045089960098267, "learning_rate": 9.887160970005335e-05, "loss": 0.1759, "step": 325 }, { "epoch": 0.014331634011014885, "grad_norm": 0.6006647348403931, "learning_rate": 9.876902876369455e-05, "loss": 0.1698, "step": 350 }, { "epoch": 0.015355322154658804, "grad_norm": 0.5427098870277405, "learning_rate": 9.866644782733577e-05, "loss": 0.1769, "step": 375 }, { "epoch": 0.016379010298302726, "grad_norm": 0.37887805700302124, "learning_rate": 9.856386689097698e-05, "loss": 0.1772, "step": 400 }, { "epoch": 0.017402698441946644, "grad_norm": 0.31953874230384827, "learning_rate": 9.84612859546182e-05, "loss": 0.1776, "step": 425 }, { "epoch": 0.018426386585590565, "grad_norm": 0.2421693652868271, "learning_rate": 9.835870501825941e-05, "loss": 0.1731, "step": 450 }, { "epoch": 0.019450074729234487, "grad_norm": 0.445103257894516, "learning_rate": 9.825612408190063e-05, "loss": 0.1664, "step": 475 }, { "epoch": 0.020473762872878408, "grad_norm": 0.3354673683643341, "learning_rate": 9.815354314554184e-05, "loss": 0.1749, "step": 500 }, { "epoch": 0.021497451016522326, "grad_norm": 0.24303950369358063, "learning_rate": 9.805096220918306e-05, "loss": 0.1655, "step": 525 }, { "epoch": 0.022521139160166247, "grad_norm": 0.2817317247390747, "learning_rate": 9.794838127282426e-05, "loss": 0.1673, "step": 550 }, { "epoch": 0.02354482730381017, "grad_norm": 0.4167644679546356, "learning_rate": 9.784580033646547e-05, "loss": 0.1638, "step": 575 }, { "epoch": 0.024568515447454087, "grad_norm": 0.339884877204895, "learning_rate": 9.774321940010669e-05, "loss": 0.1609, "step": 600 }, { "epoch": 0.025592203591098008, "grad_norm": 0.2876884937286377, "learning_rate": 9.76406384637479e-05, "loss": 0.1659, "step": 625 }, { "epoch": 0.02661589173474193, "grad_norm": 0.3253774642944336, "learning_rate": 9.753805752738912e-05, "loss": 0.1557, "step": 650 }, { "epoch": 0.027639579878385848, "grad_norm": 0.24355168640613556, "learning_rate": 9.743547659103033e-05, "loss": 0.167, "step": 675 }, { "epoch": 0.02866326802202977, "grad_norm": 0.3610304892063141, "learning_rate": 9.733289565467155e-05, "loss": 0.1605, "step": 700 }, { "epoch": 0.02968695616567369, "grad_norm": 0.3692477345466614, "learning_rate": 9.723031471831276e-05, "loss": 0.1546, "step": 725 }, { "epoch": 0.03071064430931761, "grad_norm": 0.63128262758255, "learning_rate": 9.712773378195396e-05, "loss": 0.1652, "step": 750 }, { "epoch": 0.031734332452961526, "grad_norm": 0.3006066381931305, "learning_rate": 9.702515284559518e-05, "loss": 0.1596, "step": 775 }, { "epoch": 0.03275802059660545, "grad_norm": 0.6283088326454163, "learning_rate": 9.692257190923639e-05, "loss": 0.1535, "step": 800 }, { "epoch": 0.03378170874024937, "grad_norm": 0.4018152356147766, "learning_rate": 9.68199909728776e-05, "loss": 0.1603, "step": 825 }, { "epoch": 0.03480539688389329, "grad_norm": 0.36612433195114136, "learning_rate": 9.671741003651882e-05, "loss": 0.1524, "step": 850 }, { "epoch": 0.03582908502753721, "grad_norm": 0.41699767112731934, "learning_rate": 9.661482910016003e-05, "loss": 0.1547, "step": 875 }, { "epoch": 0.03685277317118113, "grad_norm": 0.25239649415016174, "learning_rate": 9.651224816380125e-05, "loss": 0.161, "step": 900 }, { "epoch": 0.037876461314825055, "grad_norm": 0.28330907225608826, "learning_rate": 9.640966722744246e-05, "loss": 0.1544, "step": 925 }, { "epoch": 0.03890014945846897, "grad_norm": 0.393118292093277, "learning_rate": 9.630708629108367e-05, "loss": 0.1529, "step": 950 }, { "epoch": 0.03992383760211289, "grad_norm": 0.2385636419057846, "learning_rate": 9.620450535472488e-05, "loss": 0.1573, "step": 975 }, { "epoch": 0.040947525745756816, "grad_norm": 0.7217739820480347, "learning_rate": 9.61019244183661e-05, "loss": 0.1625, "step": 1000 }, { "epoch": 0.041971213889400734, "grad_norm": 0.4201323091983795, "learning_rate": 9.599934348200731e-05, "loss": 0.1554, "step": 1025 }, { "epoch": 0.04299490203304465, "grad_norm": 0.2981342375278473, "learning_rate": 9.589676254564852e-05, "loss": 0.1606, "step": 1050 }, { "epoch": 0.04401859017668858, "grad_norm": 0.32909801602363586, "learning_rate": 9.579418160928974e-05, "loss": 0.1611, "step": 1075 }, { "epoch": 0.045042278320332495, "grad_norm": 0.3763565719127655, "learning_rate": 9.569160067293095e-05, "loss": 0.1509, "step": 1100 }, { "epoch": 0.04606596646397641, "grad_norm": 0.26296111941337585, "learning_rate": 9.558901973657215e-05, "loss": 0.1458, "step": 1125 }, { "epoch": 0.04708965460762034, "grad_norm": 0.2251584380865097, "learning_rate": 9.548643880021337e-05, "loss": 0.1483, "step": 1150 }, { "epoch": 0.048113342751264256, "grad_norm": 0.24623946845531464, "learning_rate": 9.538385786385458e-05, "loss": 0.1521, "step": 1175 }, { "epoch": 0.049137030894908174, "grad_norm": 0.45473411679267883, "learning_rate": 9.52812769274958e-05, "loss": 0.1549, "step": 1200 }, { "epoch": 0.0501607190385521, "grad_norm": 0.23999722301959991, "learning_rate": 9.517869599113701e-05, "loss": 0.1442, "step": 1225 }, { "epoch": 0.051184407182196016, "grad_norm": 0.32882001996040344, "learning_rate": 9.507611505477823e-05, "loss": 0.153, "step": 1250 }, { "epoch": 0.052208095325839934, "grad_norm": 0.44401663541793823, "learning_rate": 9.497353411841944e-05, "loss": 0.1521, "step": 1275 }, { "epoch": 0.05323178346948386, "grad_norm": 0.2603824734687805, "learning_rate": 9.487095318206066e-05, "loss": 0.1543, "step": 1300 }, { "epoch": 0.05425547161312778, "grad_norm": 0.28304556012153625, "learning_rate": 9.476837224570186e-05, "loss": 0.1491, "step": 1325 }, { "epoch": 0.055279159756771695, "grad_norm": 0.40350213646888733, "learning_rate": 9.466579130934307e-05, "loss": 0.1431, "step": 1350 }, { "epoch": 0.05630284790041562, "grad_norm": 0.3348640501499176, "learning_rate": 9.456321037298429e-05, "loss": 0.1439, "step": 1375 }, { "epoch": 0.05732653604405954, "grad_norm": 0.3141482472419739, "learning_rate": 9.44606294366255e-05, "loss": 0.148, "step": 1400 }, { "epoch": 0.058350224187703456, "grad_norm": 0.2608078420162201, "learning_rate": 9.435804850026672e-05, "loss": 0.1461, "step": 1425 }, { "epoch": 0.05937391233134738, "grad_norm": 0.2971978485584259, "learning_rate": 9.425546756390793e-05, "loss": 0.1435, "step": 1450 }, { "epoch": 0.0603976004749913, "grad_norm": 0.33824801445007324, "learning_rate": 9.415288662754915e-05, "loss": 0.1476, "step": 1475 }, { "epoch": 0.06142128861863522, "grad_norm": 0.22219249606132507, "learning_rate": 9.405030569119036e-05, "loss": 0.1443, "step": 1500 }, { "epoch": 0.06244497676227914, "grad_norm": 0.30279237031936646, "learning_rate": 9.394772475483156e-05, "loss": 0.1451, "step": 1525 }, { "epoch": 0.06346866490592305, "grad_norm": 0.7361096739768982, "learning_rate": 9.384514381847278e-05, "loss": 0.139, "step": 1550 }, { "epoch": 0.06449235304956698, "grad_norm": 0.2694852650165558, "learning_rate": 9.374256288211399e-05, "loss": 0.14, "step": 1575 }, { "epoch": 0.0655160411932109, "grad_norm": 0.2227030247449875, "learning_rate": 9.36399819457552e-05, "loss": 0.1409, "step": 1600 }, { "epoch": 0.06653972933685481, "grad_norm": 0.3561594486236572, "learning_rate": 9.353740100939642e-05, "loss": 0.1386, "step": 1625 }, { "epoch": 0.06756341748049874, "grad_norm": 0.3476031720638275, "learning_rate": 9.343482007303764e-05, "loss": 0.1397, "step": 1650 }, { "epoch": 0.06858710562414266, "grad_norm": 0.3784942626953125, "learning_rate": 9.333223913667885e-05, "loss": 0.1437, "step": 1675 }, { "epoch": 0.06961079376778657, "grad_norm": 0.38352203369140625, "learning_rate": 9.322965820032006e-05, "loss": 0.1345, "step": 1700 }, { "epoch": 0.0706344819114305, "grad_norm": 0.2508692741394043, "learning_rate": 9.312707726396127e-05, "loss": 0.1396, "step": 1725 }, { "epoch": 0.07165817005507442, "grad_norm": 0.5086421966552734, "learning_rate": 9.302449632760248e-05, "loss": 0.1377, "step": 1750 }, { "epoch": 0.07268185819871835, "grad_norm": 0.40866467356681824, "learning_rate": 9.29219153912437e-05, "loss": 0.1347, "step": 1775 }, { "epoch": 0.07370554634236226, "grad_norm": 0.3897942304611206, "learning_rate": 9.281933445488491e-05, "loss": 0.1317, "step": 1800 }, { "epoch": 0.07472923448600619, "grad_norm": 0.3895871937274933, "learning_rate": 9.271675351852612e-05, "loss": 0.1415, "step": 1825 }, { "epoch": 0.07575292262965011, "grad_norm": 0.3118538558483124, "learning_rate": 9.261417258216734e-05, "loss": 0.1276, "step": 1850 }, { "epoch": 0.07677661077329402, "grad_norm": 0.5007463097572327, "learning_rate": 9.251159164580855e-05, "loss": 0.1401, "step": 1875 }, { "epoch": 0.07780029891693795, "grad_norm": 0.37419870495796204, "learning_rate": 9.240901070944977e-05, "loss": 0.1338, "step": 1900 }, { "epoch": 0.07882398706058187, "grad_norm": 0.4792192876338959, "learning_rate": 9.230642977309097e-05, "loss": 0.1343, "step": 1925 }, { "epoch": 0.07984767520422578, "grad_norm": 0.7688687443733215, "learning_rate": 9.220384883673218e-05, "loss": 0.133, "step": 1950 }, { "epoch": 0.0808713633478697, "grad_norm": 0.8818038105964661, "learning_rate": 9.21012679003734e-05, "loss": 0.1354, "step": 1975 }, { "epoch": 0.08189505149151363, "grad_norm": 0.4251585304737091, "learning_rate": 9.199868696401461e-05, "loss": 0.1301, "step": 2000 }, { "epoch": 0.08291873963515754, "grad_norm": 0.33509576320648193, "learning_rate": 9.189610602765583e-05, "loss": 0.1336, "step": 2025 }, { "epoch": 0.08394242777880147, "grad_norm": 0.1928907036781311, "learning_rate": 9.179352509129704e-05, "loss": 0.1355, "step": 2050 }, { "epoch": 0.08496611592244539, "grad_norm": 0.2787665128707886, "learning_rate": 9.169094415493826e-05, "loss": 0.1314, "step": 2075 }, { "epoch": 0.0859898040660893, "grad_norm": 0.4015423357486725, "learning_rate": 9.158836321857946e-05, "loss": 0.1303, "step": 2100 }, { "epoch": 0.08701349220973323, "grad_norm": 0.579844057559967, "learning_rate": 9.148578228222067e-05, "loss": 0.1277, "step": 2125 }, { "epoch": 0.08803718035337715, "grad_norm": 0.3636709153652191, "learning_rate": 9.138320134586189e-05, "loss": 0.128, "step": 2150 }, { "epoch": 0.08906086849702106, "grad_norm": 0.25872743129730225, "learning_rate": 9.12806204095031e-05, "loss": 0.1312, "step": 2175 }, { "epoch": 0.09008455664066499, "grad_norm": 0.32024118304252625, "learning_rate": 9.117803947314432e-05, "loss": 0.1295, "step": 2200 }, { "epoch": 0.09110824478430891, "grad_norm": 0.23083104193210602, "learning_rate": 9.107545853678553e-05, "loss": 0.1292, "step": 2225 }, { "epoch": 0.09213193292795283, "grad_norm": 0.27154719829559326, "learning_rate": 9.097287760042675e-05, "loss": 0.1275, "step": 2250 }, { "epoch": 0.09315562107159675, "grad_norm": 0.29432374238967896, "learning_rate": 9.087029666406796e-05, "loss": 0.1246, "step": 2275 }, { "epoch": 0.09417930921524068, "grad_norm": 0.5287219882011414, "learning_rate": 9.076771572770916e-05, "loss": 0.1293, "step": 2300 }, { "epoch": 0.09520299735888459, "grad_norm": 0.3348105549812317, "learning_rate": 9.066513479135038e-05, "loss": 0.1226, "step": 2325 }, { "epoch": 0.09622668550252851, "grad_norm": 0.2081725001335144, "learning_rate": 9.056255385499159e-05, "loss": 0.1242, "step": 2350 }, { "epoch": 0.09725037364617244, "grad_norm": 0.27878373861312866, "learning_rate": 9.04599729186328e-05, "loss": 0.1343, "step": 2375 }, { "epoch": 0.09827406178981635, "grad_norm": 0.40117210149765015, "learning_rate": 9.035739198227402e-05, "loss": 0.1265, "step": 2400 }, { "epoch": 0.09929774993346027, "grad_norm": 0.46459710597991943, "learning_rate": 9.025481104591524e-05, "loss": 0.1218, "step": 2425 }, { "epoch": 0.1003214380771042, "grad_norm": 0.19930683076381683, "learning_rate": 9.015223010955645e-05, "loss": 0.1258, "step": 2450 }, { "epoch": 0.10134512622074811, "grad_norm": 0.3851957321166992, "learning_rate": 9.004964917319766e-05, "loss": 0.1243, "step": 2475 }, { "epoch": 0.10236881436439203, "grad_norm": 0.3303160071372986, "learning_rate": 8.994706823683887e-05, "loss": 0.1264, "step": 2500 }, { "epoch": 0.10339250250803596, "grad_norm": 0.3450019359588623, "learning_rate": 8.984448730048008e-05, "loss": 0.122, "step": 2525 }, { "epoch": 0.10441619065167987, "grad_norm": 0.36742231249809265, "learning_rate": 8.97419063641213e-05, "loss": 0.1216, "step": 2550 }, { "epoch": 0.1054398787953238, "grad_norm": 0.2524435222148895, "learning_rate": 8.963932542776251e-05, "loss": 0.1238, "step": 2575 }, { "epoch": 0.10646356693896772, "grad_norm": 0.38917961716651917, "learning_rate": 8.953674449140372e-05, "loss": 0.1252, "step": 2600 }, { "epoch": 0.10748725508261163, "grad_norm": 0.3554433584213257, "learning_rate": 8.943416355504494e-05, "loss": 0.1213, "step": 2625 }, { "epoch": 0.10851094322625555, "grad_norm": 0.2701007127761841, "learning_rate": 8.933158261868615e-05, "loss": 0.1255, "step": 2650 }, { "epoch": 0.10953463136989948, "grad_norm": 0.40730130672454834, "learning_rate": 8.922900168232737e-05, "loss": 0.1205, "step": 2675 }, { "epoch": 0.11055831951354339, "grad_norm": 0.36011001467704773, "learning_rate": 8.912642074596857e-05, "loss": 0.1208, "step": 2700 }, { "epoch": 0.11158200765718732, "grad_norm": 0.2509096562862396, "learning_rate": 8.902383980960978e-05, "loss": 0.1234, "step": 2725 }, { "epoch": 0.11260569580083124, "grad_norm": 0.34861189126968384, "learning_rate": 8.8921258873251e-05, "loss": 0.1306, "step": 2750 }, { "epoch": 0.11362938394447515, "grad_norm": 0.20540310442447662, "learning_rate": 8.881867793689221e-05, "loss": 0.1174, "step": 2775 }, { "epoch": 0.11465307208811908, "grad_norm": 0.26270365715026855, "learning_rate": 8.871609700053343e-05, "loss": 0.1299, "step": 2800 }, { "epoch": 0.115676760231763, "grad_norm": 0.5314069986343384, "learning_rate": 8.861351606417464e-05, "loss": 0.1193, "step": 2825 }, { "epoch": 0.11670044837540691, "grad_norm": 0.26417431235313416, "learning_rate": 8.851093512781586e-05, "loss": 0.1221, "step": 2850 }, { "epoch": 0.11772413651905084, "grad_norm": 0.2860862612724304, "learning_rate": 8.840835419145706e-05, "loss": 0.1273, "step": 2875 }, { "epoch": 0.11874782466269476, "grad_norm": 0.27751094102859497, "learning_rate": 8.830577325509827e-05, "loss": 0.1206, "step": 2900 }, { "epoch": 0.11977151280633867, "grad_norm": 0.45580488443374634, "learning_rate": 8.820319231873949e-05, "loss": 0.1187, "step": 2925 }, { "epoch": 0.1207952009499826, "grad_norm": 0.2574482560157776, "learning_rate": 8.81006113823807e-05, "loss": 0.1229, "step": 2950 }, { "epoch": 0.12181888909362652, "grad_norm": 0.2733965516090393, "learning_rate": 8.799803044602192e-05, "loss": 0.1191, "step": 2975 }, { "epoch": 0.12284257723727043, "grad_norm": 0.2117166668176651, "learning_rate": 8.789544950966313e-05, "loss": 0.1205, "step": 3000 }, { "epoch": 0.12386626538091436, "grad_norm": 0.5137503147125244, "learning_rate": 8.779286857330435e-05, "loss": 0.1264, "step": 3025 }, { "epoch": 0.12488995352455828, "grad_norm": 0.23070771992206573, "learning_rate": 8.769028763694556e-05, "loss": 0.118, "step": 3050 }, { "epoch": 0.1259136416682022, "grad_norm": 0.2723982334136963, "learning_rate": 8.758770670058676e-05, "loss": 0.1152, "step": 3075 }, { "epoch": 0.1269373298118461, "grad_norm": 0.3011278212070465, "learning_rate": 8.748512576422798e-05, "loss": 0.1187, "step": 3100 }, { "epoch": 0.12796101795549003, "grad_norm": 0.22801214456558228, "learning_rate": 8.738254482786919e-05, "loss": 0.1182, "step": 3125 }, { "epoch": 0.12898470609913396, "grad_norm": 0.3295694589614868, "learning_rate": 8.72799638915104e-05, "loss": 0.1213, "step": 3150 }, { "epoch": 0.13000839424277788, "grad_norm": 0.34608685970306396, "learning_rate": 8.717738295515162e-05, "loss": 0.1199, "step": 3175 }, { "epoch": 0.1310320823864218, "grad_norm": 0.5989237427711487, "learning_rate": 8.707480201879284e-05, "loss": 0.1173, "step": 3200 }, { "epoch": 0.13205577053006573, "grad_norm": 0.3048112094402313, "learning_rate": 8.697222108243405e-05, "loss": 0.1178, "step": 3225 }, { "epoch": 0.13307945867370963, "grad_norm": 0.3791589140892029, "learning_rate": 8.686964014607527e-05, "loss": 0.1175, "step": 3250 }, { "epoch": 0.13410314681735355, "grad_norm": 0.1966562420129776, "learning_rate": 8.676705920971647e-05, "loss": 0.1192, "step": 3275 }, { "epoch": 0.13512683496099748, "grad_norm": 0.36613497138023376, "learning_rate": 8.666447827335768e-05, "loss": 0.1167, "step": 3300 }, { "epoch": 0.1361505231046414, "grad_norm": 0.35663649439811707, "learning_rate": 8.65618973369989e-05, "loss": 0.122, "step": 3325 }, { "epoch": 0.13717421124828533, "grad_norm": 0.2863902151584625, "learning_rate": 8.645931640064011e-05, "loss": 0.1157, "step": 3350 }, { "epoch": 0.13819789939192925, "grad_norm": 0.3368700444698334, "learning_rate": 8.635673546428133e-05, "loss": 0.1174, "step": 3375 }, { "epoch": 0.13922158753557315, "grad_norm": 0.3548611104488373, "learning_rate": 8.625415452792254e-05, "loss": 0.118, "step": 3400 }, { "epoch": 0.14024527567921707, "grad_norm": 0.25708600878715515, "learning_rate": 8.615157359156375e-05, "loss": 0.1119, "step": 3425 }, { "epoch": 0.141268963822861, "grad_norm": 0.24036449193954468, "learning_rate": 8.604899265520497e-05, "loss": 0.115, "step": 3450 }, { "epoch": 0.14229265196650492, "grad_norm": 0.45417720079421997, "learning_rate": 8.594641171884617e-05, "loss": 0.1199, "step": 3475 }, { "epoch": 0.14331634011014885, "grad_norm": 0.28222933411598206, "learning_rate": 8.584383078248738e-05, "loss": 0.113, "step": 3500 }, { "epoch": 0.14434002825379277, "grad_norm": 0.2157520204782486, "learning_rate": 8.57412498461286e-05, "loss": 0.1146, "step": 3525 }, { "epoch": 0.1453637163974367, "grad_norm": 0.3632587790489197, "learning_rate": 8.563866890976981e-05, "loss": 0.1174, "step": 3550 }, { "epoch": 0.1463874045410806, "grad_norm": 0.23103779554367065, "learning_rate": 8.553608797341103e-05, "loss": 0.1111, "step": 3575 }, { "epoch": 0.14741109268472452, "grad_norm": 0.316450297832489, "learning_rate": 8.543350703705224e-05, "loss": 0.1148, "step": 3600 }, { "epoch": 0.14843478082836845, "grad_norm": 0.2546501159667969, "learning_rate": 8.533092610069346e-05, "loss": 0.1116, "step": 3625 }, { "epoch": 0.14945846897201237, "grad_norm": 0.5451907515525818, "learning_rate": 8.522834516433467e-05, "loss": 0.1154, "step": 3650 }, { "epoch": 0.1504821571156563, "grad_norm": 0.3568204939365387, "learning_rate": 8.512576422797587e-05, "loss": 0.1152, "step": 3675 }, { "epoch": 0.15150584525930022, "grad_norm": 0.22811046242713928, "learning_rate": 8.502318329161709e-05, "loss": 0.1164, "step": 3700 }, { "epoch": 0.15252953340294412, "grad_norm": 0.2431710660457611, "learning_rate": 8.49206023552583e-05, "loss": 0.1127, "step": 3725 }, { "epoch": 0.15355322154658804, "grad_norm": 0.27546626329421997, "learning_rate": 8.481802141889952e-05, "loss": 0.1174, "step": 3750 }, { "epoch": 0.15457690969023197, "grad_norm": 0.23295095562934875, "learning_rate": 8.471544048254073e-05, "loss": 0.1124, "step": 3775 }, { "epoch": 0.1556005978338759, "grad_norm": 0.2244202196598053, "learning_rate": 8.461285954618195e-05, "loss": 0.1104, "step": 3800 }, { "epoch": 0.15662428597751982, "grad_norm": 0.19517052173614502, "learning_rate": 8.451027860982316e-05, "loss": 0.1114, "step": 3825 }, { "epoch": 0.15764797412116374, "grad_norm": 0.26743006706237793, "learning_rate": 8.440769767346436e-05, "loss": 0.1112, "step": 3850 }, { "epoch": 0.15867166226480764, "grad_norm": 0.26785147190093994, "learning_rate": 8.430511673710558e-05, "loss": 0.1126, "step": 3875 }, { "epoch": 0.15969535040845156, "grad_norm": 0.2772103250026703, "learning_rate": 8.420253580074679e-05, "loss": 0.1137, "step": 3900 }, { "epoch": 0.1607190385520955, "grad_norm": 0.268732488155365, "learning_rate": 8.409995486438801e-05, "loss": 0.1148, "step": 3925 }, { "epoch": 0.1617427266957394, "grad_norm": 0.42661407589912415, "learning_rate": 8.399737392802922e-05, "loss": 0.1147, "step": 3950 }, { "epoch": 0.16276641483938334, "grad_norm": 0.2644007205963135, "learning_rate": 8.389479299167044e-05, "loss": 0.1179, "step": 3975 }, { "epoch": 0.16379010298302726, "grad_norm": 0.4172644019126892, "learning_rate": 8.379221205531165e-05, "loss": 0.1167, "step": 4000 }, { "epoch": 0.16481379112667116, "grad_norm": 0.2200649082660675, "learning_rate": 8.368963111895287e-05, "loss": 0.1154, "step": 4025 }, { "epoch": 0.16583747927031509, "grad_norm": 0.3296594023704529, "learning_rate": 8.358705018259407e-05, "loss": 0.1136, "step": 4050 }, { "epoch": 0.166861167413959, "grad_norm": 0.2407379001379013, "learning_rate": 8.348446924623528e-05, "loss": 0.1127, "step": 4075 }, { "epoch": 0.16788485555760294, "grad_norm": 0.19917023181915283, "learning_rate": 8.33818883098765e-05, "loss": 0.1167, "step": 4100 }, { "epoch": 0.16890854370124686, "grad_norm": 0.2644532024860382, "learning_rate": 8.327930737351771e-05, "loss": 0.1156, "step": 4125 }, { "epoch": 0.16993223184489079, "grad_norm": 0.22355978190898895, "learning_rate": 8.317672643715893e-05, "loss": 0.113, "step": 4150 }, { "epoch": 0.17095591998853468, "grad_norm": 0.3826581835746765, "learning_rate": 8.307414550080014e-05, "loss": 0.1172, "step": 4175 }, { "epoch": 0.1719796081321786, "grad_norm": 0.2284521907567978, "learning_rate": 8.297156456444136e-05, "loss": 0.1135, "step": 4200 }, { "epoch": 0.17300329627582253, "grad_norm": 0.2520081400871277, "learning_rate": 8.286898362808257e-05, "loss": 0.1146, "step": 4225 }, { "epoch": 0.17402698441946646, "grad_norm": 0.385019451379776, "learning_rate": 8.276640269172377e-05, "loss": 0.1102, "step": 4250 }, { "epoch": 0.17505067256311038, "grad_norm": 0.24445098638534546, "learning_rate": 8.266382175536499e-05, "loss": 0.1124, "step": 4275 }, { "epoch": 0.1760743607067543, "grad_norm": 0.24673700332641602, "learning_rate": 8.25612408190062e-05, "loss": 0.112, "step": 4300 }, { "epoch": 0.1770980488503982, "grad_norm": 0.2432449609041214, "learning_rate": 8.245865988264741e-05, "loss": 0.1121, "step": 4325 }, { "epoch": 0.17812173699404213, "grad_norm": 0.3263969123363495, "learning_rate": 8.235607894628863e-05, "loss": 0.1131, "step": 4350 }, { "epoch": 0.17914542513768605, "grad_norm": 0.25198620557785034, "learning_rate": 8.225349800992984e-05, "loss": 0.1106, "step": 4375 }, { "epoch": 0.18016911328132998, "grad_norm": 0.31025946140289307, "learning_rate": 8.215091707357106e-05, "loss": 0.1083, "step": 4400 }, { "epoch": 0.1811928014249739, "grad_norm": 0.2822698950767517, "learning_rate": 8.204833613721227e-05, "loss": 0.1128, "step": 4425 }, { "epoch": 0.18221648956861783, "grad_norm": 0.35102951526641846, "learning_rate": 8.194575520085347e-05, "loss": 0.1119, "step": 4450 }, { "epoch": 0.18324017771226173, "grad_norm": 0.2636832892894745, "learning_rate": 8.184317426449469e-05, "loss": 0.1154, "step": 4475 }, { "epoch": 0.18426386585590565, "grad_norm": 0.2501748204231262, "learning_rate": 8.17405933281359e-05, "loss": 0.1146, "step": 4500 }, { "epoch": 0.18528755399954958, "grad_norm": 0.24221724271774292, "learning_rate": 8.163801239177712e-05, "loss": 0.1111, "step": 4525 }, { "epoch": 0.1863112421431935, "grad_norm": 0.23959171772003174, "learning_rate": 8.153543145541833e-05, "loss": 0.1121, "step": 4550 }, { "epoch": 0.18733493028683743, "grad_norm": 0.28256523609161377, "learning_rate": 8.143285051905955e-05, "loss": 0.1056, "step": 4575 }, { "epoch": 0.18835861843048135, "grad_norm": 0.1967180222272873, "learning_rate": 8.133026958270076e-05, "loss": 0.1105, "step": 4600 }, { "epoch": 0.18938230657412525, "grad_norm": 0.25965237617492676, "learning_rate": 8.122768864634198e-05, "loss": 0.1082, "step": 4625 }, { "epoch": 0.19040599471776917, "grad_norm": 0.2722185552120209, "learning_rate": 8.112510770998318e-05, "loss": 0.1134, "step": 4650 }, { "epoch": 0.1914296828614131, "grad_norm": 0.24172380566596985, "learning_rate": 8.102252677362439e-05, "loss": 0.1138, "step": 4675 }, { "epoch": 0.19245337100505702, "grad_norm": 0.26783162355422974, "learning_rate": 8.091994583726561e-05, "loss": 0.1079, "step": 4700 }, { "epoch": 0.19347705914870095, "grad_norm": 0.2563905715942383, "learning_rate": 8.081736490090682e-05, "loss": 0.1097, "step": 4725 }, { "epoch": 0.19450074729234487, "grad_norm": 0.31813859939575195, "learning_rate": 8.071478396454804e-05, "loss": 0.1107, "step": 4750 }, { "epoch": 0.19552443543598877, "grad_norm": 0.2353924810886383, "learning_rate": 8.061220302818925e-05, "loss": 0.1112, "step": 4775 }, { "epoch": 0.1965481235796327, "grad_norm": 0.24150237441062927, "learning_rate": 8.050962209183047e-05, "loss": 0.1073, "step": 4800 }, { "epoch": 0.19757181172327662, "grad_norm": 0.31365466117858887, "learning_rate": 8.040704115547167e-05, "loss": 0.1091, "step": 4825 }, { "epoch": 0.19859549986692054, "grad_norm": 0.3214346468448639, "learning_rate": 8.030446021911288e-05, "loss": 0.1122, "step": 4850 }, { "epoch": 0.19961918801056447, "grad_norm": 0.2675853967666626, "learning_rate": 8.02018792827541e-05, "loss": 0.1078, "step": 4875 }, { "epoch": 0.2006428761542084, "grad_norm": 0.2487669289112091, "learning_rate": 8.009929834639531e-05, "loss": 0.1087, "step": 4900 }, { "epoch": 0.2016665642978523, "grad_norm": 0.23890641331672668, "learning_rate": 7.999671741003653e-05, "loss": 0.1143, "step": 4925 }, { "epoch": 0.20269025244149622, "grad_norm": 0.25644829869270325, "learning_rate": 7.989413647367774e-05, "loss": 0.1117, "step": 4950 }, { "epoch": 0.20371394058514014, "grad_norm": 0.24456225335597992, "learning_rate": 7.979155553731896e-05, "loss": 0.115, "step": 4975 }, { "epoch": 0.20473762872878407, "grad_norm": 0.17908118665218353, "learning_rate": 7.968897460096017e-05, "loss": 0.1124, "step": 5000 }, { "epoch": 0.205761316872428, "grad_norm": 0.35271450877189636, "learning_rate": 7.958639366460137e-05, "loss": 0.1101, "step": 5025 }, { "epoch": 0.20678500501607192, "grad_norm": 0.2770853340625763, "learning_rate": 7.948381272824259e-05, "loss": 0.1119, "step": 5050 }, { "epoch": 0.2078086931597158, "grad_norm": 0.3154667317867279, "learning_rate": 7.93812317918838e-05, "loss": 0.1089, "step": 5075 }, { "epoch": 0.20883238130335974, "grad_norm": 0.27350950241088867, "learning_rate": 7.927865085552502e-05, "loss": 0.1113, "step": 5100 }, { "epoch": 0.20985606944700366, "grad_norm": 0.24580037593841553, "learning_rate": 7.917606991916623e-05, "loss": 0.1112, "step": 5125 }, { "epoch": 0.2108797575906476, "grad_norm": 0.23447053134441376, "learning_rate": 7.907348898280744e-05, "loss": 0.1108, "step": 5150 }, { "epoch": 0.2119034457342915, "grad_norm": 0.2380298674106598, "learning_rate": 7.897090804644866e-05, "loss": 0.1082, "step": 5175 }, { "epoch": 0.21292713387793544, "grad_norm": 0.22617502510547638, "learning_rate": 7.886832711008987e-05, "loss": 0.108, "step": 5200 }, { "epoch": 0.21395082202157933, "grad_norm": 0.2923017740249634, "learning_rate": 7.876574617373108e-05, "loss": 0.1094, "step": 5225 }, { "epoch": 0.21497451016522326, "grad_norm": 0.280912846326828, "learning_rate": 7.866316523737229e-05, "loss": 0.1108, "step": 5250 }, { "epoch": 0.21599819830886718, "grad_norm": 0.24020980298519135, "learning_rate": 7.85605843010135e-05, "loss": 0.1104, "step": 5275 }, { "epoch": 0.2170218864525111, "grad_norm": 0.2545349597930908, "learning_rate": 7.845800336465472e-05, "loss": 0.1105, "step": 5300 }, { "epoch": 0.21804557459615503, "grad_norm": 0.22493721544742584, "learning_rate": 7.835542242829593e-05, "loss": 0.1086, "step": 5325 }, { "epoch": 0.21906926273979896, "grad_norm": 0.26803284883499146, "learning_rate": 7.825284149193715e-05, "loss": 0.107, "step": 5350 }, { "epoch": 0.22009295088344286, "grad_norm": 0.22854533791542053, "learning_rate": 7.815026055557836e-05, "loss": 0.1097, "step": 5375 }, { "epoch": 0.22111663902708678, "grad_norm": 0.19401207566261292, "learning_rate": 7.804767961921958e-05, "loss": 0.1082, "step": 5400 }, { "epoch": 0.2221403271707307, "grad_norm": 0.22267797589302063, "learning_rate": 7.794509868286078e-05, "loss": 0.1107, "step": 5425 }, { "epoch": 0.22316401531437463, "grad_norm": 0.19586950540542603, "learning_rate": 7.7842517746502e-05, "loss": 0.1054, "step": 5450 }, { "epoch": 0.22418770345801856, "grad_norm": 0.23129217326641083, "learning_rate": 7.773993681014321e-05, "loss": 0.1093, "step": 5475 }, { "epoch": 0.22521139160166248, "grad_norm": 0.26472529768943787, "learning_rate": 7.763735587378442e-05, "loss": 0.1052, "step": 5500 }, { "epoch": 0.22623507974530638, "grad_norm": 0.22230687737464905, "learning_rate": 7.753477493742564e-05, "loss": 0.1093, "step": 5525 }, { "epoch": 0.2272587678889503, "grad_norm": 0.3101346492767334, "learning_rate": 7.743219400106685e-05, "loss": 0.1036, "step": 5550 }, { "epoch": 0.22828245603259423, "grad_norm": 0.18460065126419067, "learning_rate": 7.732961306470807e-05, "loss": 0.1108, "step": 5575 }, { "epoch": 0.22930614417623815, "grad_norm": 0.20973823964595795, "learning_rate": 7.722703212834928e-05, "loss": 0.1096, "step": 5600 }, { "epoch": 0.23032983231988208, "grad_norm": 0.277650386095047, "learning_rate": 7.712445119199048e-05, "loss": 0.1065, "step": 5625 }, { "epoch": 0.231353520463526, "grad_norm": 0.22262975573539734, "learning_rate": 7.70218702556317e-05, "loss": 0.1103, "step": 5650 }, { "epoch": 0.2323772086071699, "grad_norm": 0.24553848803043365, "learning_rate": 7.691928931927291e-05, "loss": 0.1111, "step": 5675 }, { "epoch": 0.23340089675081382, "grad_norm": 0.30652496218681335, "learning_rate": 7.681670838291413e-05, "loss": 0.1066, "step": 5700 }, { "epoch": 0.23442458489445775, "grad_norm": 0.17171849310398102, "learning_rate": 7.671412744655534e-05, "loss": 0.1074, "step": 5725 }, { "epoch": 0.23544827303810167, "grad_norm": 0.27997660636901855, "learning_rate": 7.661154651019656e-05, "loss": 0.1057, "step": 5750 }, { "epoch": 0.2364719611817456, "grad_norm": 0.302190899848938, "learning_rate": 7.650896557383777e-05, "loss": 0.1078, "step": 5775 }, { "epoch": 0.23749564932538952, "grad_norm": 0.29618439078330994, "learning_rate": 7.640638463747897e-05, "loss": 0.1078, "step": 5800 }, { "epoch": 0.23851933746903342, "grad_norm": 0.25362005829811096, "learning_rate": 7.630380370112019e-05, "loss": 0.1052, "step": 5825 }, { "epoch": 0.23954302561267735, "grad_norm": 0.22422952950000763, "learning_rate": 7.62012227647614e-05, "loss": 0.1069, "step": 5850 }, { "epoch": 0.24056671375632127, "grad_norm": 0.21477550268173218, "learning_rate": 7.609864182840262e-05, "loss": 0.1079, "step": 5875 }, { "epoch": 0.2415904018999652, "grad_norm": 0.17787286639213562, "learning_rate": 7.599606089204383e-05, "loss": 0.1087, "step": 5900 }, { "epoch": 0.24261409004360912, "grad_norm": 0.25852805376052856, "learning_rate": 7.589347995568505e-05, "loss": 0.1055, "step": 5925 }, { "epoch": 0.24363777818725305, "grad_norm": 0.2465522438287735, "learning_rate": 7.579089901932626e-05, "loss": 0.1052, "step": 5950 }, { "epoch": 0.24466146633089694, "grad_norm": 0.20638887584209442, "learning_rate": 7.568831808296747e-05, "loss": 0.1059, "step": 5975 }, { "epoch": 0.24568515447454087, "grad_norm": 0.24599237740039825, "learning_rate": 7.558573714660868e-05, "loss": 0.1052, "step": 6000 }, { "epoch": 0.2467088426181848, "grad_norm": 0.2663975954055786, "learning_rate": 7.548315621024989e-05, "loss": 0.1051, "step": 6025 }, { "epoch": 0.24773253076182872, "grad_norm": 0.2528514266014099, "learning_rate": 7.53805752738911e-05, "loss": 0.108, "step": 6050 }, { "epoch": 0.24875621890547264, "grad_norm": 0.23383919894695282, "learning_rate": 7.527799433753232e-05, "loss": 0.108, "step": 6075 }, { "epoch": 0.24977990704911657, "grad_norm": 0.23460572957992554, "learning_rate": 7.517541340117353e-05, "loss": 0.1052, "step": 6100 }, { "epoch": 0.25080359519276046, "grad_norm": 0.23296788334846497, "learning_rate": 7.507283246481475e-05, "loss": 0.1045, "step": 6125 }, { "epoch": 0.2518272833364044, "grad_norm": 0.2544507682323456, "learning_rate": 7.497025152845596e-05, "loss": 0.1073, "step": 6150 }, { "epoch": 0.2528509714800483, "grad_norm": 0.33089134097099304, "learning_rate": 7.486767059209718e-05, "loss": 0.1047, "step": 6175 }, { "epoch": 0.2538746596236922, "grad_norm": 0.2965986132621765, "learning_rate": 7.476508965573838e-05, "loss": 0.1094, "step": 6200 }, { "epoch": 0.25489834776733616, "grad_norm": 0.2606011927127838, "learning_rate": 7.46625087193796e-05, "loss": 0.1035, "step": 6225 }, { "epoch": 0.25592203591098006, "grad_norm": 0.21870043873786926, "learning_rate": 7.455992778302081e-05, "loss": 0.1081, "step": 6250 }, { "epoch": 0.256945724054624, "grad_norm": 0.37876567244529724, "learning_rate": 7.445734684666202e-05, "loss": 0.1049, "step": 6275 }, { "epoch": 0.2579694121982679, "grad_norm": 0.26862943172454834, "learning_rate": 7.435476591030324e-05, "loss": 0.0993, "step": 6300 }, { "epoch": 0.25899310034191186, "grad_norm": 0.23476149141788483, "learning_rate": 7.425218497394445e-05, "loss": 0.1059, "step": 6325 }, { "epoch": 0.26001678848555576, "grad_norm": 0.21397703886032104, "learning_rate": 7.414960403758567e-05, "loss": 0.1068, "step": 6350 }, { "epoch": 0.26104047662919966, "grad_norm": 0.18096783757209778, "learning_rate": 7.404702310122688e-05, "loss": 0.1072, "step": 6375 }, { "epoch": 0.2620641647728436, "grad_norm": 0.2302347868680954, "learning_rate": 7.394444216486808e-05, "loss": 0.1106, "step": 6400 }, { "epoch": 0.2630878529164875, "grad_norm": 0.23029176890850067, "learning_rate": 7.38418612285093e-05, "loss": 0.1064, "step": 6425 }, { "epoch": 0.26411154106013146, "grad_norm": 0.22477678954601288, "learning_rate": 7.373928029215051e-05, "loss": 0.1066, "step": 6450 }, { "epoch": 0.26513522920377536, "grad_norm": 0.30752694606781006, "learning_rate": 7.363669935579173e-05, "loss": 0.1072, "step": 6475 }, { "epoch": 0.26615891734741925, "grad_norm": 0.21718832850456238, "learning_rate": 7.353411841943294e-05, "loss": 0.1077, "step": 6500 }, { "epoch": 0.2671826054910632, "grad_norm": 0.24620802700519562, "learning_rate": 7.343153748307416e-05, "loss": 0.1053, "step": 6525 }, { "epoch": 0.2682062936347071, "grad_norm": 0.1965140402317047, "learning_rate": 7.332895654671537e-05, "loss": 0.1057, "step": 6550 }, { "epoch": 0.26922998177835106, "grad_norm": 0.25057727098464966, "learning_rate": 7.322637561035657e-05, "loss": 0.1037, "step": 6575 }, { "epoch": 0.27025366992199495, "grad_norm": 0.2844404876232147, "learning_rate": 7.312379467399779e-05, "loss": 0.1026, "step": 6600 }, { "epoch": 0.2712773580656389, "grad_norm": 0.23390497267246246, "learning_rate": 7.3021213737639e-05, "loss": 0.1032, "step": 6625 }, { "epoch": 0.2723010462092828, "grad_norm": 0.19829843938350677, "learning_rate": 7.291863280128022e-05, "loss": 0.1091, "step": 6650 }, { "epoch": 0.2733247343529267, "grad_norm": 0.24273422360420227, "learning_rate": 7.281605186492143e-05, "loss": 0.1075, "step": 6675 }, { "epoch": 0.27434842249657065, "grad_norm": 0.3134569823741913, "learning_rate": 7.271347092856265e-05, "loss": 0.103, "step": 6700 }, { "epoch": 0.27537211064021455, "grad_norm": 0.18153002858161926, "learning_rate": 7.261088999220386e-05, "loss": 0.1055, "step": 6725 }, { "epoch": 0.2763957987838585, "grad_norm": 0.22859077155590057, "learning_rate": 7.250830905584507e-05, "loss": 0.1082, "step": 6750 }, { "epoch": 0.2774194869275024, "grad_norm": 0.2673007845878601, "learning_rate": 7.240572811948628e-05, "loss": 0.1045, "step": 6775 }, { "epoch": 0.2784431750711463, "grad_norm": 0.2651185691356659, "learning_rate": 7.230314718312749e-05, "loss": 0.1033, "step": 6800 }, { "epoch": 0.27946686321479025, "grad_norm": 0.2199607491493225, "learning_rate": 7.22005662467687e-05, "loss": 0.1056, "step": 6825 }, { "epoch": 0.28049055135843415, "grad_norm": 0.2549345791339874, "learning_rate": 7.209798531040992e-05, "loss": 0.1053, "step": 6850 }, { "epoch": 0.2815142395020781, "grad_norm": 0.22934679687023163, "learning_rate": 7.199540437405113e-05, "loss": 0.1065, "step": 6875 }, { "epoch": 0.282537927645722, "grad_norm": 0.2626487910747528, "learning_rate": 7.189282343769235e-05, "loss": 0.1034, "step": 6900 }, { "epoch": 0.28356161578936595, "grad_norm": 0.2974385917186737, "learning_rate": 7.179024250133356e-05, "loss": 0.1046, "step": 6925 }, { "epoch": 0.28458530393300985, "grad_norm": 0.2448814958333969, "learning_rate": 7.168766156497478e-05, "loss": 0.1067, "step": 6950 }, { "epoch": 0.28560899207665374, "grad_norm": 0.39903128147125244, "learning_rate": 7.158508062861598e-05, "loss": 0.1013, "step": 6975 }, { "epoch": 0.2866326802202977, "grad_norm": 0.25461485981941223, "learning_rate": 7.14824996922572e-05, "loss": 0.1051, "step": 7000 }, { "epoch": 0.2876563683639416, "grad_norm": 0.22692956030368805, "learning_rate": 7.137991875589841e-05, "loss": 0.1051, "step": 7025 }, { "epoch": 0.28868005650758555, "grad_norm": 0.18912681937217712, "learning_rate": 7.127733781953962e-05, "loss": 0.1049, "step": 7050 }, { "epoch": 0.28970374465122944, "grad_norm": 0.29922547936439514, "learning_rate": 7.117475688318084e-05, "loss": 0.1028, "step": 7075 }, { "epoch": 0.2907274327948734, "grad_norm": 0.39868420362472534, "learning_rate": 7.107217594682205e-05, "loss": 0.1046, "step": 7100 }, { "epoch": 0.2917511209385173, "grad_norm": 0.2455105036497116, "learning_rate": 7.096959501046327e-05, "loss": 0.108, "step": 7125 }, { "epoch": 0.2927748090821612, "grad_norm": 0.22028543055057526, "learning_rate": 7.086701407410448e-05, "loss": 0.1031, "step": 7150 }, { "epoch": 0.29379849722580514, "grad_norm": 0.27611467242240906, "learning_rate": 7.076443313774568e-05, "loss": 0.1073, "step": 7175 }, { "epoch": 0.29482218536944904, "grad_norm": 0.31651851534843445, "learning_rate": 7.06618522013869e-05, "loss": 0.1005, "step": 7200 }, { "epoch": 0.295845873513093, "grad_norm": 0.2306353896856308, "learning_rate": 7.055927126502811e-05, "loss": 0.1035, "step": 7225 }, { "epoch": 0.2968695616567369, "grad_norm": 0.22398217022418976, "learning_rate": 7.045669032866933e-05, "loss": 0.1059, "step": 7250 }, { "epoch": 0.2978932498003808, "grad_norm": 0.24632596969604492, "learning_rate": 7.035410939231054e-05, "loss": 0.106, "step": 7275 }, { "epoch": 0.29891693794402474, "grad_norm": 0.21331587433815002, "learning_rate": 7.025152845595176e-05, "loss": 0.0994, "step": 7300 }, { "epoch": 0.29994062608766864, "grad_norm": 0.37877365946769714, "learning_rate": 7.014894751959297e-05, "loss": 0.102, "step": 7325 }, { "epoch": 0.3009643142313126, "grad_norm": 0.28108686208724976, "learning_rate": 7.004636658323419e-05, "loss": 0.104, "step": 7350 }, { "epoch": 0.3019880023749565, "grad_norm": 0.25342661142349243, "learning_rate": 6.994378564687539e-05, "loss": 0.1044, "step": 7375 }, { "epoch": 0.30301169051860044, "grad_norm": 0.7590738534927368, "learning_rate": 6.98412047105166e-05, "loss": 0.106, "step": 7400 }, { "epoch": 0.30403537866224434, "grad_norm": 0.20050746202468872, "learning_rate": 6.973862377415782e-05, "loss": 0.1069, "step": 7425 }, { "epoch": 0.30505906680588823, "grad_norm": 0.27144044637680054, "learning_rate": 6.963604283779903e-05, "loss": 0.104, "step": 7450 }, { "epoch": 0.3060827549495322, "grad_norm": 0.2616618275642395, "learning_rate": 6.953346190144025e-05, "loss": 0.101, "step": 7475 }, { "epoch": 0.3071064430931761, "grad_norm": 0.27171334624290466, "learning_rate": 6.943088096508146e-05, "loss": 0.1036, "step": 7500 }, { "epoch": 0.30813013123682004, "grad_norm": 0.19246098399162292, "learning_rate": 6.932830002872268e-05, "loss": 0.1035, "step": 7525 }, { "epoch": 0.30915381938046393, "grad_norm": 0.2488516867160797, "learning_rate": 6.922571909236388e-05, "loss": 0.1053, "step": 7550 }, { "epoch": 0.31017750752410783, "grad_norm": 0.2559676170349121, "learning_rate": 6.912313815600509e-05, "loss": 0.1039, "step": 7575 }, { "epoch": 0.3112011956677518, "grad_norm": 0.19615231454372406, "learning_rate": 6.90205572196463e-05, "loss": 0.1016, "step": 7600 }, { "epoch": 0.3122248838113957, "grad_norm": 0.22992445528507233, "learning_rate": 6.891797628328752e-05, "loss": 0.103, "step": 7625 }, { "epoch": 0.31324857195503963, "grad_norm": 0.25916945934295654, "learning_rate": 6.881539534692874e-05, "loss": 0.1033, "step": 7650 }, { "epoch": 0.31427226009868353, "grad_norm": 0.2485833466053009, "learning_rate": 6.871281441056995e-05, "loss": 0.1023, "step": 7675 }, { "epoch": 0.3152959482423275, "grad_norm": 0.3130246102809906, "learning_rate": 6.861023347421116e-05, "loss": 0.1013, "step": 7700 }, { "epoch": 0.3163196363859714, "grad_norm": 0.17889827489852905, "learning_rate": 6.850765253785238e-05, "loss": 0.1063, "step": 7725 }, { "epoch": 0.3173433245296153, "grad_norm": 0.23844337463378906, "learning_rate": 6.840507160149358e-05, "loss": 0.1024, "step": 7750 }, { "epoch": 0.31836701267325923, "grad_norm": 0.2489156275987625, "learning_rate": 6.83024906651348e-05, "loss": 0.1018, "step": 7775 }, { "epoch": 0.31939070081690313, "grad_norm": 0.24830876290798187, "learning_rate": 6.819990972877601e-05, "loss": 0.1018, "step": 7800 }, { "epoch": 0.3204143889605471, "grad_norm": 0.23647700250148773, "learning_rate": 6.809732879241722e-05, "loss": 0.1054, "step": 7825 }, { "epoch": 0.321438077104191, "grad_norm": 0.3480120003223419, "learning_rate": 6.799474785605844e-05, "loss": 0.0991, "step": 7850 }, { "epoch": 0.3224617652478349, "grad_norm": 0.2117711305618286, "learning_rate": 6.789216691969965e-05, "loss": 0.1019, "step": 7875 }, { "epoch": 0.3234854533914788, "grad_norm": 0.21510981023311615, "learning_rate": 6.778958598334087e-05, "loss": 0.1023, "step": 7900 }, { "epoch": 0.3245091415351227, "grad_norm": 0.21288833022117615, "learning_rate": 6.768700504698208e-05, "loss": 0.1018, "step": 7925 }, { "epoch": 0.3255328296787667, "grad_norm": 0.2654208242893219, "learning_rate": 6.758442411062328e-05, "loss": 0.1, "step": 7950 }, { "epoch": 0.3265565178224106, "grad_norm": 0.23810634016990662, "learning_rate": 6.74818431742645e-05, "loss": 0.1007, "step": 7975 }, { "epoch": 0.3275802059660545, "grad_norm": 0.26225727796554565, "learning_rate": 6.737926223790571e-05, "loss": 0.1037, "step": 8000 }, { "epoch": 0.3286038941096984, "grad_norm": 0.28832173347473145, "learning_rate": 6.727668130154693e-05, "loss": 0.1024, "step": 8025 }, { "epoch": 0.3296275822533423, "grad_norm": 0.25963491201400757, "learning_rate": 6.717410036518814e-05, "loss": 0.106, "step": 8050 }, { "epoch": 0.3306512703969863, "grad_norm": 0.3249678611755371, "learning_rate": 6.707151942882936e-05, "loss": 0.0958, "step": 8075 }, { "epoch": 0.33167495854063017, "grad_norm": 0.25855204463005066, "learning_rate": 6.696893849247057e-05, "loss": 0.1004, "step": 8100 }, { "epoch": 0.3326986466842741, "grad_norm": 0.2253751903772354, "learning_rate": 6.686635755611179e-05, "loss": 0.1013, "step": 8125 }, { "epoch": 0.333722334827918, "grad_norm": 0.25214654207229614, "learning_rate": 6.676377661975299e-05, "loss": 0.1016, "step": 8150 }, { "epoch": 0.3347460229715619, "grad_norm": 0.2561601996421814, "learning_rate": 6.66611956833942e-05, "loss": 0.1011, "step": 8175 }, { "epoch": 0.33576971111520587, "grad_norm": 0.2241383194923401, "learning_rate": 6.655861474703542e-05, "loss": 0.1013, "step": 8200 }, { "epoch": 0.33679339925884977, "grad_norm": 0.23701010644435883, "learning_rate": 6.645603381067663e-05, "loss": 0.0994, "step": 8225 }, { "epoch": 0.3378170874024937, "grad_norm": 0.2312152236700058, "learning_rate": 6.635345287431785e-05, "loss": 0.0969, "step": 8250 }, { "epoch": 0.3388407755461376, "grad_norm": 0.5713122487068176, "learning_rate": 6.625087193795906e-05, "loss": 0.1032, "step": 8275 }, { "epoch": 0.33986446368978157, "grad_norm": 0.2621745467185974, "learning_rate": 6.614829100160028e-05, "loss": 0.1009, "step": 8300 }, { "epoch": 0.34088815183342547, "grad_norm": 0.24803993105888367, "learning_rate": 6.604571006524149e-05, "loss": 0.0992, "step": 8325 }, { "epoch": 0.34191183997706937, "grad_norm": 0.20469900965690613, "learning_rate": 6.594312912888269e-05, "loss": 0.1021, "step": 8350 }, { "epoch": 0.3429355281207133, "grad_norm": 0.26485446095466614, "learning_rate": 6.58405481925239e-05, "loss": 0.1023, "step": 8375 }, { "epoch": 0.3439592162643572, "grad_norm": 0.30211177468299866, "learning_rate": 6.573796725616512e-05, "loss": 0.1, "step": 8400 }, { "epoch": 0.34498290440800117, "grad_norm": 0.19773200154304504, "learning_rate": 6.563538631980634e-05, "loss": 0.0998, "step": 8425 }, { "epoch": 0.34600659255164506, "grad_norm": 0.37499427795410156, "learning_rate": 6.553280538344755e-05, "loss": 0.0969, "step": 8450 }, { "epoch": 0.34703028069528896, "grad_norm": 0.23352007567882538, "learning_rate": 6.543022444708877e-05, "loss": 0.1013, "step": 8475 }, { "epoch": 0.3480539688389329, "grad_norm": 0.22725583612918854, "learning_rate": 6.532764351072998e-05, "loss": 0.1005, "step": 8500 }, { "epoch": 0.3490776569825768, "grad_norm": 0.2472585290670395, "learning_rate": 6.522506257437118e-05, "loss": 0.0981, "step": 8525 }, { "epoch": 0.35010134512622076, "grad_norm": 0.24253399670124054, "learning_rate": 6.51224816380124e-05, "loss": 0.1029, "step": 8550 }, { "epoch": 0.35112503326986466, "grad_norm": 0.22759589552879333, "learning_rate": 6.501990070165361e-05, "loss": 0.1026, "step": 8575 }, { "epoch": 0.3521487214135086, "grad_norm": 0.3092879056930542, "learning_rate": 6.491731976529482e-05, "loss": 0.105, "step": 8600 }, { "epoch": 0.3531724095571525, "grad_norm": 0.21212832629680634, "learning_rate": 6.481473882893604e-05, "loss": 0.1039, "step": 8625 }, { "epoch": 0.3541960977007964, "grad_norm": 0.22957822680473328, "learning_rate": 6.471215789257725e-05, "loss": 0.1039, "step": 8650 }, { "epoch": 0.35521978584444036, "grad_norm": 0.2514593005180359, "learning_rate": 6.460957695621847e-05, "loss": 0.105, "step": 8675 }, { "epoch": 0.35624347398808426, "grad_norm": 0.32485923171043396, "learning_rate": 6.450699601985968e-05, "loss": 0.1043, "step": 8700 }, { "epoch": 0.3572671621317282, "grad_norm": 0.25438931584358215, "learning_rate": 6.440441508350088e-05, "loss": 0.1033, "step": 8725 }, { "epoch": 0.3582908502753721, "grad_norm": 0.26107901334762573, "learning_rate": 6.43018341471421e-05, "loss": 0.106, "step": 8750 }, { "epoch": 0.359314538419016, "grad_norm": 0.20148183405399323, "learning_rate": 6.41992532107833e-05, "loss": 0.102, "step": 8775 }, { "epoch": 0.36033822656265996, "grad_norm": 0.3115244209766388, "learning_rate": 6.409667227442452e-05, "loss": 0.1002, "step": 8800 }, { "epoch": 0.36136191470630386, "grad_norm": 0.2722707688808441, "learning_rate": 6.399409133806573e-05, "loss": 0.0993, "step": 8825 }, { "epoch": 0.3623856028499478, "grad_norm": 0.3244341015815735, "learning_rate": 6.389151040170694e-05, "loss": 0.0973, "step": 8850 }, { "epoch": 0.3634092909935917, "grad_norm": 0.24697239696979523, "learning_rate": 6.378892946534816e-05, "loss": 0.0958, "step": 8875 }, { "epoch": 0.36443297913723566, "grad_norm": 0.23170702159404755, "learning_rate": 6.368634852898937e-05, "loss": 0.1039, "step": 8900 }, { "epoch": 0.36545666728087955, "grad_norm": 0.25722336769104004, "learning_rate": 6.358376759263059e-05, "loss": 0.1027, "step": 8925 }, { "epoch": 0.36648035542452345, "grad_norm": 0.2329777032136917, "learning_rate": 6.348118665627179e-05, "loss": 0.1003, "step": 8950 }, { "epoch": 0.3675040435681674, "grad_norm": 0.3008142411708832, "learning_rate": 6.3378605719913e-05, "loss": 0.0975, "step": 8975 }, { "epoch": 0.3685277317118113, "grad_norm": 0.19098886847496033, "learning_rate": 6.327602478355422e-05, "loss": 0.097, "step": 9000 }, { "epoch": 0.36955141985545525, "grad_norm": 0.2393869310617447, "learning_rate": 6.317344384719543e-05, "loss": 0.0992, "step": 9025 }, { "epoch": 0.37057510799909915, "grad_norm": 0.24962279200553894, "learning_rate": 6.307086291083665e-05, "loss": 0.1006, "step": 9050 }, { "epoch": 0.37159879614274305, "grad_norm": 0.20281440019607544, "learning_rate": 6.296828197447786e-05, "loss": 0.097, "step": 9075 }, { "epoch": 0.372622484286387, "grad_norm": 0.21669328212738037, "learning_rate": 6.286570103811908e-05, "loss": 0.1008, "step": 9100 }, { "epoch": 0.3736461724300309, "grad_norm": 0.21775703132152557, "learning_rate": 6.276312010176029e-05, "loss": 0.1046, "step": 9125 }, { "epoch": 0.37466986057367485, "grad_norm": 0.24492838978767395, "learning_rate": 6.26605391654015e-05, "loss": 0.0989, "step": 9150 }, { "epoch": 0.37569354871731875, "grad_norm": 0.2119276523590088, "learning_rate": 6.255795822904271e-05, "loss": 0.1038, "step": 9175 }, { "epoch": 0.3767172368609627, "grad_norm": 0.2842216193675995, "learning_rate": 6.245537729268392e-05, "loss": 0.1004, "step": 9200 }, { "epoch": 0.3777409250046066, "grad_norm": 0.2775871455669403, "learning_rate": 6.235279635632514e-05, "loss": 0.1008, "step": 9225 }, { "epoch": 0.3787646131482505, "grad_norm": 0.26387348771095276, "learning_rate": 6.225021541996635e-05, "loss": 0.0972, "step": 9250 }, { "epoch": 0.37978830129189445, "grad_norm": 0.2945527136325836, "learning_rate": 6.214763448360757e-05, "loss": 0.1044, "step": 9275 }, { "epoch": 0.38081198943553835, "grad_norm": 0.34967219829559326, "learning_rate": 6.204505354724878e-05, "loss": 0.1018, "step": 9300 }, { "epoch": 0.3818356775791823, "grad_norm": 0.2373281568288803, "learning_rate": 6.194247261089e-05, "loss": 0.1028, "step": 9325 }, { "epoch": 0.3828593657228262, "grad_norm": 0.27347394824028015, "learning_rate": 6.18398916745312e-05, "loss": 0.0995, "step": 9350 }, { "epoch": 0.3838830538664701, "grad_norm": 0.2860616147518158, "learning_rate": 6.173731073817241e-05, "loss": 0.0983, "step": 9375 }, { "epoch": 0.38490674201011404, "grad_norm": 0.3643983006477356, "learning_rate": 6.163472980181363e-05, "loss": 0.0957, "step": 9400 }, { "epoch": 0.38593043015375794, "grad_norm": 0.3181641399860382, "learning_rate": 6.153214886545484e-05, "loss": 0.0989, "step": 9425 }, { "epoch": 0.3869541182974019, "grad_norm": 0.24089764058589935, "learning_rate": 6.142956792909606e-05, "loss": 0.0982, "step": 9450 }, { "epoch": 0.3879778064410458, "grad_norm": 0.2490035593509674, "learning_rate": 6.132698699273727e-05, "loss": 0.1039, "step": 9475 }, { "epoch": 0.38900149458468974, "grad_norm": 0.2765063941478729, "learning_rate": 6.122440605637849e-05, "loss": 0.0937, "step": 9500 }, { "epoch": 0.39002518272833364, "grad_norm": 0.45849937200546265, "learning_rate": 6.11218251200197e-05, "loss": 0.1002, "step": 9525 }, { "epoch": 0.39104887087197754, "grad_norm": 0.23391731083393097, "learning_rate": 6.101924418366091e-05, "loss": 0.0995, "step": 9550 }, { "epoch": 0.3920725590156215, "grad_norm": 0.258109986782074, "learning_rate": 6.091666324730212e-05, "loss": 0.1032, "step": 9575 }, { "epoch": 0.3930962471592654, "grad_norm": 0.2020760029554367, "learning_rate": 6.081408231094333e-05, "loss": 0.1012, "step": 9600 }, { "epoch": 0.39411993530290934, "grad_norm": 0.20322605967521667, "learning_rate": 6.0711501374584545e-05, "loss": 0.1009, "step": 9625 }, { "epoch": 0.39514362344655324, "grad_norm": 0.3139131963253021, "learning_rate": 6.060892043822576e-05, "loss": 0.1009, "step": 9650 }, { "epoch": 0.39616731159019714, "grad_norm": 0.2019822746515274, "learning_rate": 6.0506339501866974e-05, "loss": 0.1021, "step": 9675 }, { "epoch": 0.3971909997338411, "grad_norm": 0.21363505721092224, "learning_rate": 6.040375856550818e-05, "loss": 0.0996, "step": 9700 }, { "epoch": 0.398214687877485, "grad_norm": 0.25607529282569885, "learning_rate": 6.03011776291494e-05, "loss": 0.0968, "step": 9725 }, { "epoch": 0.39923837602112894, "grad_norm": 0.28837454319000244, "learning_rate": 6.019859669279061e-05, "loss": 0.1003, "step": 9750 }, { "epoch": 0.40026206416477284, "grad_norm": 0.22750523686408997, "learning_rate": 6.009601575643182e-05, "loss": 0.0976, "step": 9775 }, { "epoch": 0.4012857523084168, "grad_norm": 0.2659379541873932, "learning_rate": 5.9993434820073034e-05, "loss": 0.0991, "step": 9800 }, { "epoch": 0.4023094404520607, "grad_norm": 0.3965132534503937, "learning_rate": 5.989085388371425e-05, "loss": 0.1039, "step": 9825 }, { "epoch": 0.4033331285957046, "grad_norm": 0.2643307149410248, "learning_rate": 5.978827294735546e-05, "loss": 0.1018, "step": 9850 }, { "epoch": 0.40435681673934853, "grad_norm": 0.2745136618614197, "learning_rate": 5.968569201099667e-05, "loss": 0.1037, "step": 9875 }, { "epoch": 0.40538050488299243, "grad_norm": 0.235930934548378, "learning_rate": 5.9583111074637886e-05, "loss": 0.0995, "step": 9900 }, { "epoch": 0.4064041930266364, "grad_norm": 0.23560784757137299, "learning_rate": 5.94805301382791e-05, "loss": 0.1001, "step": 9925 }, { "epoch": 0.4074278811702803, "grad_norm": 0.3324751555919647, "learning_rate": 5.9377949201920315e-05, "loss": 0.1007, "step": 9950 }, { "epoch": 0.4084515693139242, "grad_norm": 0.22333605587482452, "learning_rate": 5.927536826556152e-05, "loss": 0.1035, "step": 9975 }, { "epoch": 0.40947525745756813, "grad_norm": 0.23905926942825317, "learning_rate": 5.917278732920274e-05, "loss": 0.1019, "step": 10000 }, { "epoch": 0.41049894560121203, "grad_norm": 0.24543020129203796, "learning_rate": 5.907020639284395e-05, "loss": 0.1005, "step": 10025 }, { "epoch": 0.411522633744856, "grad_norm": 0.2597710192203522, "learning_rate": 5.896762545648517e-05, "loss": 0.0937, "step": 10050 }, { "epoch": 0.4125463218884999, "grad_norm": 0.2141934633255005, "learning_rate": 5.8865044520126375e-05, "loss": 0.1047, "step": 10075 }, { "epoch": 0.41357001003214383, "grad_norm": 0.18962982296943665, "learning_rate": 5.876246358376759e-05, "loss": 0.1019, "step": 10100 }, { "epoch": 0.41459369817578773, "grad_norm": 0.16786764562129974, "learning_rate": 5.8659882647408804e-05, "loss": 0.098, "step": 10125 }, { "epoch": 0.4156173863194316, "grad_norm": 0.2587350904941559, "learning_rate": 5.855730171105002e-05, "loss": 0.1018, "step": 10150 }, { "epoch": 0.4166410744630756, "grad_norm": 0.23551388084888458, "learning_rate": 5.845472077469123e-05, "loss": 0.1017, "step": 10175 }, { "epoch": 0.4176647626067195, "grad_norm": 0.40040743350982666, "learning_rate": 5.835213983833244e-05, "loss": 0.099, "step": 10200 }, { "epoch": 0.41868845075036343, "grad_norm": 0.274138480424881, "learning_rate": 5.8249558901973656e-05, "loss": 0.0988, "step": 10225 }, { "epoch": 0.4197121388940073, "grad_norm": 0.21808317303657532, "learning_rate": 5.814697796561487e-05, "loss": 0.0974, "step": 10250 }, { "epoch": 0.4207358270376513, "grad_norm": 0.2756749093532562, "learning_rate": 5.804439702925608e-05, "loss": 0.1007, "step": 10275 }, { "epoch": 0.4217595151812952, "grad_norm": 0.28059181571006775, "learning_rate": 5.7941816092897293e-05, "loss": 0.0956, "step": 10300 }, { "epoch": 0.42278320332493907, "grad_norm": 0.2666233479976654, "learning_rate": 5.783923515653851e-05, "loss": 0.1014, "step": 10325 }, { "epoch": 0.423806891468583, "grad_norm": 0.17817972600460052, "learning_rate": 5.773665422017972e-05, "loss": 0.098, "step": 10350 }, { "epoch": 0.4248305796122269, "grad_norm": 0.2498740404844284, "learning_rate": 5.763407328382093e-05, "loss": 0.1, "step": 10375 }, { "epoch": 0.4258542677558709, "grad_norm": 0.2427319437265396, "learning_rate": 5.7531492347462145e-05, "loss": 0.0985, "step": 10400 }, { "epoch": 0.42687795589951477, "grad_norm": 0.1904958337545395, "learning_rate": 5.742891141110336e-05, "loss": 0.1024, "step": 10425 }, { "epoch": 0.42790164404315867, "grad_norm": 0.246423602104187, "learning_rate": 5.7326330474744575e-05, "loss": 0.0996, "step": 10450 }, { "epoch": 0.4289253321868026, "grad_norm": 0.3124719262123108, "learning_rate": 5.722374953838578e-05, "loss": 0.0985, "step": 10475 }, { "epoch": 0.4299490203304465, "grad_norm": 0.2046365588903427, "learning_rate": 5.7121168602027e-05, "loss": 0.1031, "step": 10500 }, { "epoch": 0.43097270847409047, "grad_norm": 0.22781619429588318, "learning_rate": 5.701858766566821e-05, "loss": 0.0963, "step": 10525 }, { "epoch": 0.43199639661773437, "grad_norm": 0.28827908635139465, "learning_rate": 5.6916006729309427e-05, "loss": 0.1015, "step": 10550 }, { "epoch": 0.4330200847613783, "grad_norm": 0.27641138434410095, "learning_rate": 5.6813425792950634e-05, "loss": 0.0973, "step": 10575 }, { "epoch": 0.4340437729050222, "grad_norm": 0.3545154929161072, "learning_rate": 5.671084485659185e-05, "loss": 0.0954, "step": 10600 }, { "epoch": 0.4350674610486661, "grad_norm": 0.28629690408706665, "learning_rate": 5.6608263920233064e-05, "loss": 0.0937, "step": 10625 }, { "epoch": 0.43609114919231007, "grad_norm": 0.2316763550043106, "learning_rate": 5.650568298387428e-05, "loss": 0.0975, "step": 10650 }, { "epoch": 0.43711483733595397, "grad_norm": 0.28071022033691406, "learning_rate": 5.6403102047515486e-05, "loss": 0.1011, "step": 10675 }, { "epoch": 0.4381385254795979, "grad_norm": 0.2439073920249939, "learning_rate": 5.63005211111567e-05, "loss": 0.0994, "step": 10700 }, { "epoch": 0.4391622136232418, "grad_norm": 0.2952822744846344, "learning_rate": 5.6197940174797916e-05, "loss": 0.0983, "step": 10725 }, { "epoch": 0.4401859017668857, "grad_norm": 0.28434520959854126, "learning_rate": 5.6095359238439124e-05, "loss": 0.0962, "step": 10750 }, { "epoch": 0.44120958991052966, "grad_norm": 0.24351637065410614, "learning_rate": 5.599277830208034e-05, "loss": 0.0944, "step": 10775 }, { "epoch": 0.44223327805417356, "grad_norm": 0.27679064869880676, "learning_rate": 5.589019736572155e-05, "loss": 0.0959, "step": 10800 }, { "epoch": 0.4432569661978175, "grad_norm": 0.3108427822589874, "learning_rate": 5.578761642936277e-05, "loss": 0.0941, "step": 10825 }, { "epoch": 0.4442806543414614, "grad_norm": 0.19497576355934143, "learning_rate": 5.5685035493003976e-05, "loss": 0.0981, "step": 10850 }, { "epoch": 0.44530434248510536, "grad_norm": 0.22953549027442932, "learning_rate": 5.558245455664519e-05, "loss": 0.0976, "step": 10875 }, { "epoch": 0.44632803062874926, "grad_norm": 0.5247841477394104, "learning_rate": 5.5479873620286405e-05, "loss": 0.1014, "step": 10900 }, { "epoch": 0.44735171877239316, "grad_norm": 0.20830635726451874, "learning_rate": 5.537729268392762e-05, "loss": 0.0946, "step": 10925 }, { "epoch": 0.4483754069160371, "grad_norm": 0.24027594923973083, "learning_rate": 5.527471174756883e-05, "loss": 0.0953, "step": 10950 }, { "epoch": 0.449399095059681, "grad_norm": 0.2040860950946808, "learning_rate": 5.517213081121004e-05, "loss": 0.0961, "step": 10975 }, { "epoch": 0.45042278320332496, "grad_norm": 0.18586771190166473, "learning_rate": 5.506954987485126e-05, "loss": 0.0982, "step": 11000 }, { "epoch": 0.45144647134696886, "grad_norm": 0.32001617550849915, "learning_rate": 5.496696893849247e-05, "loss": 0.0981, "step": 11025 }, { "epoch": 0.45247015949061276, "grad_norm": 0.28242385387420654, "learning_rate": 5.486438800213368e-05, "loss": 0.0981, "step": 11050 }, { "epoch": 0.4534938476342567, "grad_norm": 0.3456820547580719, "learning_rate": 5.4761807065774894e-05, "loss": 0.0941, "step": 11075 }, { "epoch": 0.4545175357779006, "grad_norm": 0.22706662118434906, "learning_rate": 5.465922612941611e-05, "loss": 0.0985, "step": 11100 }, { "epoch": 0.45554122392154456, "grad_norm": 0.2593896985054016, "learning_rate": 5.455664519305732e-05, "loss": 0.0983, "step": 11125 }, { "epoch": 0.45656491206518846, "grad_norm": 0.22201375663280487, "learning_rate": 5.445406425669853e-05, "loss": 0.1026, "step": 11150 }, { "epoch": 0.4575886002088324, "grad_norm": 0.23291830718517303, "learning_rate": 5.4351483320339746e-05, "loss": 0.0995, "step": 11175 }, { "epoch": 0.4586122883524763, "grad_norm": 0.24490538239479065, "learning_rate": 5.424890238398096e-05, "loss": 0.0953, "step": 11200 }, { "epoch": 0.4596359764961202, "grad_norm": 0.3179132640361786, "learning_rate": 5.4146321447622175e-05, "loss": 0.0983, "step": 11225 }, { "epoch": 0.46065966463976415, "grad_norm": 0.23889416456222534, "learning_rate": 5.404374051126338e-05, "loss": 0.0971, "step": 11250 }, { "epoch": 0.46168335278340805, "grad_norm": 0.5037365555763245, "learning_rate": 5.39411595749046e-05, "loss": 0.0936, "step": 11275 }, { "epoch": 0.462707040927052, "grad_norm": 0.2585156559944153, "learning_rate": 5.383857863854581e-05, "loss": 0.0918, "step": 11300 }, { "epoch": 0.4637307290706959, "grad_norm": 0.2691129148006439, "learning_rate": 5.373599770218703e-05, "loss": 0.0959, "step": 11325 }, { "epoch": 0.4647544172143398, "grad_norm": 0.24569182097911835, "learning_rate": 5.3633416765828235e-05, "loss": 0.0966, "step": 11350 }, { "epoch": 0.46577810535798375, "grad_norm": 0.3655073344707489, "learning_rate": 5.353083582946945e-05, "loss": 0.0951, "step": 11375 }, { "epoch": 0.46680179350162765, "grad_norm": 0.24223706126213074, "learning_rate": 5.3428254893110664e-05, "loss": 0.1008, "step": 11400 }, { "epoch": 0.4678254816452716, "grad_norm": 0.2586074769496918, "learning_rate": 5.332567395675188e-05, "loss": 0.1011, "step": 11425 }, { "epoch": 0.4688491697889155, "grad_norm": 0.2603899836540222, "learning_rate": 5.322309302039309e-05, "loss": 0.0984, "step": 11450 }, { "epoch": 0.46987285793255945, "grad_norm": 0.25967130064964294, "learning_rate": 5.31205120840343e-05, "loss": 0.0965, "step": 11475 }, { "epoch": 0.47089654607620335, "grad_norm": 0.2673439085483551, "learning_rate": 5.3017931147675516e-05, "loss": 0.1025, "step": 11500 }, { "epoch": 0.47192023421984725, "grad_norm": 0.24883116781711578, "learning_rate": 5.291535021131673e-05, "loss": 0.0943, "step": 11525 }, { "epoch": 0.4729439223634912, "grad_norm": 0.29023605585098267, "learning_rate": 5.281276927495794e-05, "loss": 0.0993, "step": 11550 }, { "epoch": 0.4739676105071351, "grad_norm": 0.21741856634616852, "learning_rate": 5.2710188338599153e-05, "loss": 0.0937, "step": 11575 }, { "epoch": 0.47499129865077905, "grad_norm": 0.24658936262130737, "learning_rate": 5.260760740224037e-05, "loss": 0.0971, "step": 11600 }, { "epoch": 0.47601498679442295, "grad_norm": 0.3309827148914337, "learning_rate": 5.2505026465881576e-05, "loss": 0.1003, "step": 11625 }, { "epoch": 0.47703867493806684, "grad_norm": 0.22925116121768951, "learning_rate": 5.240244552952279e-05, "loss": 0.0961, "step": 11650 }, { "epoch": 0.4780623630817108, "grad_norm": 0.33367425203323364, "learning_rate": 5.2299864593164005e-05, "loss": 0.0959, "step": 11675 }, { "epoch": 0.4790860512253547, "grad_norm": 0.2225172519683838, "learning_rate": 5.219728365680522e-05, "loss": 0.0983, "step": 11700 }, { "epoch": 0.48010973936899864, "grad_norm": 0.26799845695495605, "learning_rate": 5.209470272044643e-05, "loss": 0.0984, "step": 11725 }, { "epoch": 0.48113342751264254, "grad_norm": 0.28932616114616394, "learning_rate": 5.199212178408764e-05, "loss": 0.0973, "step": 11750 }, { "epoch": 0.4821571156562865, "grad_norm": 0.3406207859516144, "learning_rate": 5.188954084772886e-05, "loss": 0.091, "step": 11775 }, { "epoch": 0.4831808037999304, "grad_norm": 0.2970975935459137, "learning_rate": 5.178695991137007e-05, "loss": 0.0939, "step": 11800 }, { "epoch": 0.4842044919435743, "grad_norm": 0.2747635245323181, "learning_rate": 5.168437897501128e-05, "loss": 0.0927, "step": 11825 }, { "epoch": 0.48522818008721824, "grad_norm": 0.2211136370897293, "learning_rate": 5.1581798038652494e-05, "loss": 0.096, "step": 11850 }, { "epoch": 0.48625186823086214, "grad_norm": 0.2881365418434143, "learning_rate": 5.147921710229371e-05, "loss": 0.0933, "step": 11875 }, { "epoch": 0.4872755563745061, "grad_norm": 0.2213411182165146, "learning_rate": 5.1376636165934924e-05, "loss": 0.0982, "step": 11900 }, { "epoch": 0.48829924451815, "grad_norm": 0.23638983070850372, "learning_rate": 5.127405522957613e-05, "loss": 0.1, "step": 11925 }, { "epoch": 0.4893229326617939, "grad_norm": 0.2544683814048767, "learning_rate": 5.1171474293217346e-05, "loss": 0.1005, "step": 11950 }, { "epoch": 0.49034662080543784, "grad_norm": 0.3138396441936493, "learning_rate": 5.106889335685856e-05, "loss": 0.0952, "step": 11975 }, { "epoch": 0.49137030894908174, "grad_norm": 0.352205365896225, "learning_rate": 5.0966312420499776e-05, "loss": 0.0954, "step": 12000 }, { "epoch": 0.4923939970927257, "grad_norm": 0.2083396166563034, "learning_rate": 5.0863731484140984e-05, "loss": 0.0947, "step": 12025 }, { "epoch": 0.4934176852363696, "grad_norm": 0.2839849591255188, "learning_rate": 5.07611505477822e-05, "loss": 0.0986, "step": 12050 }, { "epoch": 0.49444137338001354, "grad_norm": 0.26629742980003357, "learning_rate": 5.065856961142341e-05, "loss": 0.1007, "step": 12075 }, { "epoch": 0.49546506152365744, "grad_norm": 0.2845945656299591, "learning_rate": 5.055598867506463e-05, "loss": 0.0995, "step": 12100 }, { "epoch": 0.49648874966730133, "grad_norm": 0.22998517751693726, "learning_rate": 5.0453407738705835e-05, "loss": 0.0971, "step": 12125 }, { "epoch": 0.4975124378109453, "grad_norm": 0.21335995197296143, "learning_rate": 5.035082680234705e-05, "loss": 0.0956, "step": 12150 }, { "epoch": 0.4985361259545892, "grad_norm": 0.2018250823020935, "learning_rate": 5.0248245865988265e-05, "loss": 0.0982, "step": 12175 }, { "epoch": 0.49955981409823313, "grad_norm": 0.2268654853105545, "learning_rate": 5.014566492962948e-05, "loss": 0.0951, "step": 12200 }, { "epoch": 0.5005835022418771, "grad_norm": 0.22491568326950073, "learning_rate": 5.004308399327069e-05, "loss": 0.0991, "step": 12225 }, { "epoch": 0.5016071903855209, "grad_norm": 0.2399354726076126, "learning_rate": 4.994050305691191e-05, "loss": 0.0978, "step": 12250 }, { "epoch": 0.5026308785291649, "grad_norm": 0.2331288754940033, "learning_rate": 4.9837922120553123e-05, "loss": 0.0959, "step": 12275 }, { "epoch": 0.5036545666728088, "grad_norm": 0.23224005103111267, "learning_rate": 4.973534118419433e-05, "loss": 0.0957, "step": 12300 }, { "epoch": 0.5046782548164527, "grad_norm": 0.28165706992149353, "learning_rate": 4.9632760247835546e-05, "loss": 0.0985, "step": 12325 }, { "epoch": 0.5057019429600966, "grad_norm": 0.22725163400173187, "learning_rate": 4.953017931147676e-05, "loss": 0.0934, "step": 12350 }, { "epoch": 0.5067256311037406, "grad_norm": 0.30300387740135193, "learning_rate": 4.9427598375117975e-05, "loss": 0.0947, "step": 12375 }, { "epoch": 0.5077493192473844, "grad_norm": 0.22563788294792175, "learning_rate": 4.932501743875918e-05, "loss": 0.0964, "step": 12400 }, { "epoch": 0.5087730073910284, "grad_norm": 0.2117646187543869, "learning_rate": 4.92224365024004e-05, "loss": 0.0952, "step": 12425 }, { "epoch": 0.5097966955346723, "grad_norm": 0.22831501066684723, "learning_rate": 4.911985556604161e-05, "loss": 0.0965, "step": 12450 }, { "epoch": 0.5108203836783163, "grad_norm": 0.2967502474784851, "learning_rate": 4.901727462968283e-05, "loss": 0.0947, "step": 12475 }, { "epoch": 0.5118440718219601, "grad_norm": 0.22398816049098969, "learning_rate": 4.8914693693324035e-05, "loss": 0.0935, "step": 12500 }, { "epoch": 0.5128677599656041, "grad_norm": 0.26190030574798584, "learning_rate": 4.881211275696525e-05, "loss": 0.0965, "step": 12525 }, { "epoch": 0.513891448109248, "grad_norm": 0.2718106508255005, "learning_rate": 4.8709531820606464e-05, "loss": 0.0955, "step": 12550 }, { "epoch": 0.5149151362528919, "grad_norm": 0.27051568031311035, "learning_rate": 4.860695088424768e-05, "loss": 0.094, "step": 12575 }, { "epoch": 0.5159388243965358, "grad_norm": 0.2031087726354599, "learning_rate": 4.850436994788889e-05, "loss": 0.0988, "step": 12600 }, { "epoch": 0.5169625125401798, "grad_norm": 0.23844382166862488, "learning_rate": 4.84017890115301e-05, "loss": 0.0915, "step": 12625 }, { "epoch": 0.5179862006838237, "grad_norm": 0.31874755024909973, "learning_rate": 4.8299208075171316e-05, "loss": 0.0908, "step": 12650 }, { "epoch": 0.5190098888274676, "grad_norm": 0.23138810694217682, "learning_rate": 4.819662713881253e-05, "loss": 0.0962, "step": 12675 }, { "epoch": 0.5200335769711115, "grad_norm": 0.27298617362976074, "learning_rate": 4.809404620245374e-05, "loss": 0.0975, "step": 12700 }, { "epoch": 0.5210572651147555, "grad_norm": 0.25157856941223145, "learning_rate": 4.7991465266094954e-05, "loss": 0.0967, "step": 12725 }, { "epoch": 0.5220809532583993, "grad_norm": 0.20571890473365784, "learning_rate": 4.788888432973617e-05, "loss": 0.0939, "step": 12750 }, { "epoch": 0.5231046414020433, "grad_norm": 0.24462303519248962, "learning_rate": 4.778630339337738e-05, "loss": 0.0944, "step": 12775 }, { "epoch": 0.5241283295456872, "grad_norm": 0.2392750382423401, "learning_rate": 4.768372245701859e-05, "loss": 0.0959, "step": 12800 }, { "epoch": 0.5251520176893312, "grad_norm": 0.2759506106376648, "learning_rate": 4.7581141520659805e-05, "loss": 0.0957, "step": 12825 }, { "epoch": 0.526175705832975, "grad_norm": 0.24135975539684296, "learning_rate": 4.747856058430102e-05, "loss": 0.0917, "step": 12850 }, { "epoch": 0.527199393976619, "grad_norm": 0.2595539689064026, "learning_rate": 4.737597964794223e-05, "loss": 0.0984, "step": 12875 }, { "epoch": 0.5282230821202629, "grad_norm": 0.2650289535522461, "learning_rate": 4.727339871158344e-05, "loss": 0.0938, "step": 12900 }, { "epoch": 0.5292467702639068, "grad_norm": 0.24425174295902252, "learning_rate": 4.717081777522466e-05, "loss": 0.0991, "step": 12925 }, { "epoch": 0.5302704584075507, "grad_norm": 0.24873086810112, "learning_rate": 4.706823683886587e-05, "loss": 0.0914, "step": 12950 }, { "epoch": 0.5312941465511947, "grad_norm": 0.280268132686615, "learning_rate": 4.696565590250708e-05, "loss": 0.0966, "step": 12975 }, { "epoch": 0.5323178346948385, "grad_norm": 0.24281346797943115, "learning_rate": 4.6863074966148295e-05, "loss": 0.0932, "step": 13000 }, { "epoch": 0.5333415228384825, "grad_norm": 0.24113261699676514, "learning_rate": 4.676049402978951e-05, "loss": 0.0944, "step": 13025 }, { "epoch": 0.5343652109821264, "grad_norm": 0.2524602711200714, "learning_rate": 4.6657913093430724e-05, "loss": 0.0922, "step": 13050 }, { "epoch": 0.5353888991257704, "grad_norm": 0.24346871674060822, "learning_rate": 4.655533215707193e-05, "loss": 0.0932, "step": 13075 }, { "epoch": 0.5364125872694142, "grad_norm": 0.29335957765579224, "learning_rate": 4.6452751220713147e-05, "loss": 0.0942, "step": 13100 }, { "epoch": 0.5374362754130582, "grad_norm": 0.31220850348472595, "learning_rate": 4.6350170284354354e-05, "loss": 0.0931, "step": 13125 }, { "epoch": 0.5384599635567021, "grad_norm": 0.2569523751735687, "learning_rate": 4.624758934799557e-05, "loss": 0.1005, "step": 13150 }, { "epoch": 0.539483651700346, "grad_norm": 0.26125669479370117, "learning_rate": 4.6145008411636784e-05, "loss": 0.0924, "step": 13175 }, { "epoch": 0.5405073398439899, "grad_norm": 0.31377628445625305, "learning_rate": 4.604242747527799e-05, "loss": 0.0966, "step": 13200 }, { "epoch": 0.5415310279876339, "grad_norm": 0.25545644760131836, "learning_rate": 4.5939846538919206e-05, "loss": 0.0997, "step": 13225 }, { "epoch": 0.5425547161312778, "grad_norm": 0.2510424554347992, "learning_rate": 4.583726560256042e-05, "loss": 0.0939, "step": 13250 }, { "epoch": 0.5435784042749217, "grad_norm": 0.2709631323814392, "learning_rate": 4.5734684666201636e-05, "loss": 0.0918, "step": 13275 }, { "epoch": 0.5446020924185656, "grad_norm": 0.2531428337097168, "learning_rate": 4.5632103729842844e-05, "loss": 0.0968, "step": 13300 }, { "epoch": 0.5456257805622096, "grad_norm": 0.3153735101222992, "learning_rate": 4.552952279348406e-05, "loss": 0.0957, "step": 13325 }, { "epoch": 0.5466494687058534, "grad_norm": 0.2258891612291336, "learning_rate": 4.542694185712527e-05, "loss": 0.0947, "step": 13350 }, { "epoch": 0.5476731568494974, "grad_norm": 0.2671023905277252, "learning_rate": 4.532436092076649e-05, "loss": 0.0957, "step": 13375 }, { "epoch": 0.5486968449931413, "grad_norm": 0.333008348941803, "learning_rate": 4.5221779984407695e-05, "loss": 0.0953, "step": 13400 }, { "epoch": 0.5497205331367853, "grad_norm": 0.2922687828540802, "learning_rate": 4.511919904804891e-05, "loss": 0.0963, "step": 13425 }, { "epoch": 0.5507442212804291, "grad_norm": 0.27738088369369507, "learning_rate": 4.5016618111690125e-05, "loss": 0.0926, "step": 13450 }, { "epoch": 0.551767909424073, "grad_norm": 0.28781887888908386, "learning_rate": 4.491403717533134e-05, "loss": 0.0986, "step": 13475 }, { "epoch": 0.552791597567717, "grad_norm": 0.2727603018283844, "learning_rate": 4.481145623897255e-05, "loss": 0.0995, "step": 13500 }, { "epoch": 0.5538152857113608, "grad_norm": 0.2735615670681, "learning_rate": 4.470887530261376e-05, "loss": 0.0959, "step": 13525 }, { "epoch": 0.5548389738550048, "grad_norm": 0.2211311310529709, "learning_rate": 4.460629436625498e-05, "loss": 0.0872, "step": 13550 }, { "epoch": 0.5558626619986488, "grad_norm": 0.2359626144170761, "learning_rate": 4.4503713429896185e-05, "loss": 0.0967, "step": 13575 }, { "epoch": 0.5568863501422926, "grad_norm": 0.27807098627090454, "learning_rate": 4.44011324935374e-05, "loss": 0.0953, "step": 13600 }, { "epoch": 0.5579100382859365, "grad_norm": 0.2691691219806671, "learning_rate": 4.4298551557178614e-05, "loss": 0.0942, "step": 13625 }, { "epoch": 0.5589337264295805, "grad_norm": 0.22528734803199768, "learning_rate": 4.419597062081983e-05, "loss": 0.0975, "step": 13650 }, { "epoch": 0.5599574145732245, "grad_norm": 0.22979159653186798, "learning_rate": 4.4093389684461036e-05, "loss": 0.0946, "step": 13675 }, { "epoch": 0.5609811027168683, "grad_norm": 0.35849061608314514, "learning_rate": 4.399080874810225e-05, "loss": 0.0886, "step": 13700 }, { "epoch": 0.5620047908605122, "grad_norm": 0.2247435599565506, "learning_rate": 4.3888227811743466e-05, "loss": 0.0942, "step": 13725 }, { "epoch": 0.5630284790041562, "grad_norm": 0.2186431735754013, "learning_rate": 4.378564687538468e-05, "loss": 0.0958, "step": 13750 }, { "epoch": 0.5640521671478, "grad_norm": 0.26496851444244385, "learning_rate": 4.368306593902589e-05, "loss": 0.0932, "step": 13775 }, { "epoch": 0.565075855291444, "grad_norm": 0.20004922151565552, "learning_rate": 4.35804850026671e-05, "loss": 0.1003, "step": 13800 }, { "epoch": 0.566099543435088, "grad_norm": 0.25645968317985535, "learning_rate": 4.347790406630832e-05, "loss": 0.0932, "step": 13825 }, { "epoch": 0.5671232315787319, "grad_norm": 0.24646583199501038, "learning_rate": 4.337532312994953e-05, "loss": 0.0954, "step": 13850 }, { "epoch": 0.5681469197223757, "grad_norm": 0.25467848777770996, "learning_rate": 4.327274219359074e-05, "loss": 0.0982, "step": 13875 }, { "epoch": 0.5691706078660197, "grad_norm": 0.24455401301383972, "learning_rate": 4.3170161257231955e-05, "loss": 0.0913, "step": 13900 }, { "epoch": 0.5701942960096636, "grad_norm": 0.26495805382728577, "learning_rate": 4.306758032087317e-05, "loss": 0.0894, "step": 13925 }, { "epoch": 0.5712179841533075, "grad_norm": 0.23517432808876038, "learning_rate": 4.2964999384514384e-05, "loss": 0.0941, "step": 13950 }, { "epoch": 0.5722416722969514, "grad_norm": 0.25355222821235657, "learning_rate": 4.286241844815559e-05, "loss": 0.09, "step": 13975 }, { "epoch": 0.5732653604405954, "grad_norm": 0.2494240403175354, "learning_rate": 4.275983751179681e-05, "loss": 0.1003, "step": 14000 }, { "epoch": 0.5742890485842393, "grad_norm": 0.22723737359046936, "learning_rate": 4.265725657543802e-05, "loss": 0.1001, "step": 14025 }, { "epoch": 0.5753127367278832, "grad_norm": 0.19633585214614868, "learning_rate": 4.2554675639079236e-05, "loss": 0.0934, "step": 14050 }, { "epoch": 0.5763364248715271, "grad_norm": 0.24108199775218964, "learning_rate": 4.2452094702720444e-05, "loss": 0.0932, "step": 14075 }, { "epoch": 0.5773601130151711, "grad_norm": 0.28201839327812195, "learning_rate": 4.234951376636166e-05, "loss": 0.0931, "step": 14100 }, { "epoch": 0.5783838011588149, "grad_norm": 0.29982468485832214, "learning_rate": 4.224693283000287e-05, "loss": 0.0941, "step": 14125 }, { "epoch": 0.5794074893024589, "grad_norm": 0.2526194453239441, "learning_rate": 4.214435189364409e-05, "loss": 0.0937, "step": 14150 }, { "epoch": 0.5804311774461028, "grad_norm": 0.2288905531167984, "learning_rate": 4.2041770957285296e-05, "loss": 0.0949, "step": 14175 }, { "epoch": 0.5814548655897468, "grad_norm": 0.19117498397827148, "learning_rate": 4.193919002092651e-05, "loss": 0.0938, "step": 14200 }, { "epoch": 0.5824785537333906, "grad_norm": 0.2719483971595764, "learning_rate": 4.1836609084567725e-05, "loss": 0.0911, "step": 14225 }, { "epoch": 0.5835022418770346, "grad_norm": 0.2975625991821289, "learning_rate": 4.173402814820894e-05, "loss": 0.0991, "step": 14250 }, { "epoch": 0.5845259300206785, "grad_norm": 0.2232026904821396, "learning_rate": 4.163144721185015e-05, "loss": 0.0971, "step": 14275 }, { "epoch": 0.5855496181643224, "grad_norm": 0.26348811388015747, "learning_rate": 4.152886627549136e-05, "loss": 0.0946, "step": 14300 }, { "epoch": 0.5865733063079663, "grad_norm": 0.320698082447052, "learning_rate": 4.142628533913258e-05, "loss": 0.0909, "step": 14325 }, { "epoch": 0.5875969944516103, "grad_norm": 0.27873241901397705, "learning_rate": 4.132370440277379e-05, "loss": 0.0986, "step": 14350 }, { "epoch": 0.5886206825952541, "grad_norm": 0.22352805733680725, "learning_rate": 4.1221123466415e-05, "loss": 0.0959, "step": 14375 }, { "epoch": 0.5896443707388981, "grad_norm": 0.2206275910139084, "learning_rate": 4.1118542530056214e-05, "loss": 0.0973, "step": 14400 }, { "epoch": 0.590668058882542, "grad_norm": 0.20755049586296082, "learning_rate": 4.101596159369743e-05, "loss": 0.0987, "step": 14425 }, { "epoch": 0.591691747026186, "grad_norm": 0.25802165269851685, "learning_rate": 4.091338065733864e-05, "loss": 0.0898, "step": 14450 }, { "epoch": 0.5927154351698298, "grad_norm": 0.21148554980754852, "learning_rate": 4.081079972097985e-05, "loss": 0.0906, "step": 14475 }, { "epoch": 0.5937391233134738, "grad_norm": 0.28330081701278687, "learning_rate": 4.0708218784621066e-05, "loss": 0.0952, "step": 14500 }, { "epoch": 0.5947628114571177, "grad_norm": 0.26006045937538147, "learning_rate": 4.060563784826228e-05, "loss": 0.0879, "step": 14525 }, { "epoch": 0.5957864996007616, "grad_norm": 0.2529297173023224, "learning_rate": 4.050305691190349e-05, "loss": 0.0899, "step": 14550 }, { "epoch": 0.5968101877444055, "grad_norm": 0.25934335589408875, "learning_rate": 4.0400475975544703e-05, "loss": 0.0974, "step": 14575 }, { "epoch": 0.5978338758880495, "grad_norm": 0.34801098704338074, "learning_rate": 4.029789503918592e-05, "loss": 0.0966, "step": 14600 }, { "epoch": 0.5988575640316934, "grad_norm": 0.3519505262374878, "learning_rate": 4.019531410282713e-05, "loss": 0.0976, "step": 14625 }, { "epoch": 0.5998812521753373, "grad_norm": 0.21879540383815765, "learning_rate": 4.009273316646834e-05, "loss": 0.0929, "step": 14650 }, { "epoch": 0.6009049403189812, "grad_norm": 0.3088552951812744, "learning_rate": 3.9990152230109555e-05, "loss": 0.0895, "step": 14675 }, { "epoch": 0.6019286284626252, "grad_norm": 0.1896054446697235, "learning_rate": 3.988757129375077e-05, "loss": 0.0954, "step": 14700 }, { "epoch": 0.602952316606269, "grad_norm": 0.24023133516311646, "learning_rate": 3.9784990357391985e-05, "loss": 0.0928, "step": 14725 }, { "epoch": 0.603976004749913, "grad_norm": 0.2335812747478485, "learning_rate": 3.968240942103319e-05, "loss": 0.0915, "step": 14750 }, { "epoch": 0.6049996928935569, "grad_norm": 0.2618425488471985, "learning_rate": 3.957982848467441e-05, "loss": 0.093, "step": 14775 }, { "epoch": 0.6060233810372009, "grad_norm": 0.28540539741516113, "learning_rate": 3.947724754831562e-05, "loss": 0.0945, "step": 14800 }, { "epoch": 0.6070470691808447, "grad_norm": 0.34257885813713074, "learning_rate": 3.9374666611956837e-05, "loss": 0.0937, "step": 14825 }, { "epoch": 0.6080707573244887, "grad_norm": 0.242543026804924, "learning_rate": 3.9272085675598045e-05, "loss": 0.1006, "step": 14850 }, { "epoch": 0.6090944454681326, "grad_norm": 0.3368709683418274, "learning_rate": 3.916950473923926e-05, "loss": 0.098, "step": 14875 }, { "epoch": 0.6101181336117765, "grad_norm": 0.2498284876346588, "learning_rate": 3.9066923802880474e-05, "loss": 0.0945, "step": 14900 }, { "epoch": 0.6111418217554204, "grad_norm": 0.36862441897392273, "learning_rate": 3.896434286652169e-05, "loss": 0.0947, "step": 14925 }, { "epoch": 0.6121655098990644, "grad_norm": 0.2516944110393524, "learning_rate": 3.8861761930162896e-05, "loss": 0.0904, "step": 14950 }, { "epoch": 0.6131891980427082, "grad_norm": 0.23044705390930176, "learning_rate": 3.875918099380411e-05, "loss": 0.0922, "step": 14975 }, { "epoch": 0.6142128861863522, "grad_norm": 0.322510689496994, "learning_rate": 3.8656600057445326e-05, "loss": 0.0939, "step": 15000 }, { "epoch": 0.6152365743299961, "grad_norm": 0.28902101516723633, "learning_rate": 3.855401912108654e-05, "loss": 0.092, "step": 15025 }, { "epoch": 0.6162602624736401, "grad_norm": 0.33545222878456116, "learning_rate": 3.845143818472775e-05, "loss": 0.0931, "step": 15050 }, { "epoch": 0.6172839506172839, "grad_norm": 0.24440859258174896, "learning_rate": 3.834885724836896e-05, "loss": 0.0957, "step": 15075 }, { "epoch": 0.6183076387609279, "grad_norm": 0.25635841488838196, "learning_rate": 3.824627631201018e-05, "loss": 0.0945, "step": 15100 }, { "epoch": 0.6193313269045718, "grad_norm": 0.26487112045288086, "learning_rate": 3.814369537565139e-05, "loss": 0.094, "step": 15125 }, { "epoch": 0.6203550150482157, "grad_norm": 0.2371329963207245, "learning_rate": 3.80411144392926e-05, "loss": 0.0939, "step": 15150 }, { "epoch": 0.6213787031918596, "grad_norm": 0.23745235800743103, "learning_rate": 3.7938533502933815e-05, "loss": 0.0894, "step": 15175 }, { "epoch": 0.6224023913355036, "grad_norm": 0.32679396867752075, "learning_rate": 3.783595256657503e-05, "loss": 0.0916, "step": 15200 }, { "epoch": 0.6234260794791475, "grad_norm": 0.236038938164711, "learning_rate": 3.7733371630216244e-05, "loss": 0.0896, "step": 15225 }, { "epoch": 0.6244497676227914, "grad_norm": 0.17879773676395416, "learning_rate": 3.763079069385745e-05, "loss": 0.0971, "step": 15250 }, { "epoch": 0.6254734557664353, "grad_norm": 0.30429938435554504, "learning_rate": 3.752820975749867e-05, "loss": 0.0884, "step": 15275 }, { "epoch": 0.6264971439100793, "grad_norm": 0.332989364862442, "learning_rate": 3.742562882113988e-05, "loss": 0.0998, "step": 15300 }, { "epoch": 0.6275208320537231, "grad_norm": 0.2244502305984497, "learning_rate": 3.7323047884781096e-05, "loss": 0.0894, "step": 15325 }, { "epoch": 0.6285445201973671, "grad_norm": 0.22671306133270264, "learning_rate": 3.7220466948422304e-05, "loss": 0.0957, "step": 15350 }, { "epoch": 0.629568208341011, "grad_norm": 0.22526578605175018, "learning_rate": 3.711788601206352e-05, "loss": 0.0892, "step": 15375 }, { "epoch": 0.630591896484655, "grad_norm": 0.2514040768146515, "learning_rate": 3.701530507570473e-05, "loss": 0.0916, "step": 15400 }, { "epoch": 0.6316155846282988, "grad_norm": 0.22782598435878754, "learning_rate": 3.691272413934594e-05, "loss": 0.0977, "step": 15425 }, { "epoch": 0.6326392727719428, "grad_norm": 0.24572695791721344, "learning_rate": 3.6810143202987156e-05, "loss": 0.0919, "step": 15450 }, { "epoch": 0.6336629609155867, "grad_norm": 0.231769859790802, "learning_rate": 3.670756226662837e-05, "loss": 0.0904, "step": 15475 }, { "epoch": 0.6346866490592306, "grad_norm": 0.28821659088134766, "learning_rate": 3.6604981330269585e-05, "loss": 0.0905, "step": 15500 }, { "epoch": 0.6357103372028745, "grad_norm": 0.19901390373706818, "learning_rate": 3.650240039391079e-05, "loss": 0.0885, "step": 15525 }, { "epoch": 0.6367340253465185, "grad_norm": 0.24236318469047546, "learning_rate": 3.639981945755201e-05, "loss": 0.0929, "step": 15550 }, { "epoch": 0.6377577134901623, "grad_norm": 0.27218177914619446, "learning_rate": 3.629723852119322e-05, "loss": 0.0925, "step": 15575 }, { "epoch": 0.6387814016338063, "grad_norm": 0.29827386140823364, "learning_rate": 3.619465758483444e-05, "loss": 0.0913, "step": 15600 }, { "epoch": 0.6398050897774502, "grad_norm": 0.2742908000946045, "learning_rate": 3.6092076648475645e-05, "loss": 0.0941, "step": 15625 }, { "epoch": 0.6408287779210942, "grad_norm": 0.28651776909828186, "learning_rate": 3.598949571211686e-05, "loss": 0.099, "step": 15650 }, { "epoch": 0.641852466064738, "grad_norm": 0.2705094814300537, "learning_rate": 3.5886914775758074e-05, "loss": 0.0972, "step": 15675 }, { "epoch": 0.642876154208382, "grad_norm": 0.2905079424381256, "learning_rate": 3.578433383939929e-05, "loss": 0.0952, "step": 15700 }, { "epoch": 0.6438998423520259, "grad_norm": 0.28126639127731323, "learning_rate": 3.56817529030405e-05, "loss": 0.0928, "step": 15725 }, { "epoch": 0.6449235304956698, "grad_norm": 0.23970367014408112, "learning_rate": 3.557917196668171e-05, "loss": 0.0945, "step": 15750 }, { "epoch": 0.6459472186393137, "grad_norm": 0.23676908016204834, "learning_rate": 3.5476591030322926e-05, "loss": 0.0936, "step": 15775 }, { "epoch": 0.6469709067829577, "grad_norm": 0.2415960431098938, "learning_rate": 3.537401009396414e-05, "loss": 0.0923, "step": 15800 }, { "epoch": 0.6479945949266016, "grad_norm": 0.25593453645706177, "learning_rate": 3.527142915760535e-05, "loss": 0.0924, "step": 15825 }, { "epoch": 0.6490182830702454, "grad_norm": 0.24800516664981842, "learning_rate": 3.5168848221246563e-05, "loss": 0.0945, "step": 15850 }, { "epoch": 0.6500419712138894, "grad_norm": 0.24197053909301758, "learning_rate": 3.506626728488778e-05, "loss": 0.0932, "step": 15875 }, { "epoch": 0.6510656593575334, "grad_norm": 0.25776922702789307, "learning_rate": 3.496368634852899e-05, "loss": 0.0909, "step": 15900 }, { "epoch": 0.6520893475011772, "grad_norm": 0.1953815519809723, "learning_rate": 3.48611054121702e-05, "loss": 0.0927, "step": 15925 }, { "epoch": 0.6531130356448211, "grad_norm": 0.267980694770813, "learning_rate": 3.4758524475811415e-05, "loss": 0.0917, "step": 15950 }, { "epoch": 0.6541367237884651, "grad_norm": 0.2663339674472809, "learning_rate": 3.465594353945263e-05, "loss": 0.0921, "step": 15975 }, { "epoch": 0.655160411932109, "grad_norm": 0.32129451632499695, "learning_rate": 3.4553362603093845e-05, "loss": 0.0906, "step": 16000 }, { "epoch": 0.6561841000757529, "grad_norm": 0.23216140270233154, "learning_rate": 3.445078166673505e-05, "loss": 0.0916, "step": 16025 }, { "epoch": 0.6572077882193968, "grad_norm": 0.26740553975105286, "learning_rate": 3.434820073037627e-05, "loss": 0.0939, "step": 16050 }, { "epoch": 0.6582314763630408, "grad_norm": 0.26812317967414856, "learning_rate": 3.424561979401748e-05, "loss": 0.0903, "step": 16075 }, { "epoch": 0.6592551645066846, "grad_norm": 0.2955368459224701, "learning_rate": 3.4143038857658697e-05, "loss": 0.0907, "step": 16100 }, { "epoch": 0.6602788526503286, "grad_norm": 0.23007504642009735, "learning_rate": 3.4040457921299904e-05, "loss": 0.0894, "step": 16125 }, { "epoch": 0.6613025407939725, "grad_norm": 0.2416328340768814, "learning_rate": 3.393787698494112e-05, "loss": 0.0872, "step": 16150 }, { "epoch": 0.6623262289376164, "grad_norm": 0.2104121297597885, "learning_rate": 3.3835296048582334e-05, "loss": 0.0877, "step": 16175 }, { "epoch": 0.6633499170812603, "grad_norm": 0.23629434406757355, "learning_rate": 3.373271511222355e-05, "loss": 0.0943, "step": 16200 }, { "epoch": 0.6643736052249043, "grad_norm": 0.2717180550098419, "learning_rate": 3.3630134175864756e-05, "loss": 0.0879, "step": 16225 }, { "epoch": 0.6653972933685482, "grad_norm": 0.27863848209381104, "learning_rate": 3.352755323950597e-05, "loss": 0.0881, "step": 16250 }, { "epoch": 0.6664209815121921, "grad_norm": 0.2909884452819824, "learning_rate": 3.3424972303147186e-05, "loss": 0.0938, "step": 16275 }, { "epoch": 0.667444669655836, "grad_norm": 0.18690423667430878, "learning_rate": 3.3322391366788394e-05, "loss": 0.0945, "step": 16300 }, { "epoch": 0.66846835779948, "grad_norm": 0.2364642322063446, "learning_rate": 3.321981043042961e-05, "loss": 0.0981, "step": 16325 }, { "epoch": 0.6694920459431238, "grad_norm": 0.23339948058128357, "learning_rate": 3.311722949407082e-05, "loss": 0.0959, "step": 16350 }, { "epoch": 0.6705157340867678, "grad_norm": 0.3215301036834717, "learning_rate": 3.301464855771204e-05, "loss": 0.0915, "step": 16375 }, { "epoch": 0.6715394222304117, "grad_norm": 0.21121945977210999, "learning_rate": 3.2912067621353245e-05, "loss": 0.0965, "step": 16400 }, { "epoch": 0.6725631103740557, "grad_norm": 0.2474169135093689, "learning_rate": 3.280948668499446e-05, "loss": 0.0901, "step": 16425 }, { "epoch": 0.6735867985176995, "grad_norm": 0.27990350127220154, "learning_rate": 3.2706905748635675e-05, "loss": 0.0914, "step": 16450 }, { "epoch": 0.6746104866613435, "grad_norm": 0.23860132694244385, "learning_rate": 3.260432481227689e-05, "loss": 0.0892, "step": 16475 }, { "epoch": 0.6756341748049874, "grad_norm": 0.29351699352264404, "learning_rate": 3.25017438759181e-05, "loss": 0.0956, "step": 16500 }, { "epoch": 0.6766578629486313, "grad_norm": 0.2769309878349304, "learning_rate": 3.239916293955931e-05, "loss": 0.0938, "step": 16525 }, { "epoch": 0.6776815510922752, "grad_norm": 0.1899634450674057, "learning_rate": 3.229658200320053e-05, "loss": 0.0927, "step": 16550 }, { "epoch": 0.6787052392359192, "grad_norm": 0.23339390754699707, "learning_rate": 3.219400106684174e-05, "loss": 0.0944, "step": 16575 }, { "epoch": 0.6797289273795631, "grad_norm": 0.30219605565071106, "learning_rate": 3.209142013048295e-05, "loss": 0.0908, "step": 16600 }, { "epoch": 0.680752615523207, "grad_norm": 0.24272675812244415, "learning_rate": 3.1988839194124164e-05, "loss": 0.0905, "step": 16625 }, { "epoch": 0.6817763036668509, "grad_norm": 0.28862476348876953, "learning_rate": 3.188625825776538e-05, "loss": 0.0958, "step": 16650 }, { "epoch": 0.6827999918104949, "grad_norm": 0.230793759226799, "learning_rate": 3.178367732140659e-05, "loss": 0.0942, "step": 16675 }, { "epoch": 0.6838236799541387, "grad_norm": 0.256304532289505, "learning_rate": 3.16810963850478e-05, "loss": 0.0908, "step": 16700 }, { "epoch": 0.6848473680977827, "grad_norm": 0.24292372167110443, "learning_rate": 3.1578515448689016e-05, "loss": 0.0919, "step": 16725 }, { "epoch": 0.6858710562414266, "grad_norm": 0.3442842662334442, "learning_rate": 3.147593451233023e-05, "loss": 0.0906, "step": 16750 }, { "epoch": 0.6868947443850705, "grad_norm": 0.28444263339042664, "learning_rate": 3.1373353575971445e-05, "loss": 0.0904, "step": 16775 }, { "epoch": 0.6879184325287144, "grad_norm": 0.2305566966533661, "learning_rate": 3.127077263961265e-05, "loss": 0.0948, "step": 16800 }, { "epoch": 0.6889421206723584, "grad_norm": 0.3065620958805084, "learning_rate": 3.116819170325387e-05, "loss": 0.0916, "step": 16825 }, { "epoch": 0.6899658088160023, "grad_norm": 0.34748420119285583, "learning_rate": 3.106561076689508e-05, "loss": 0.0898, "step": 16850 }, { "epoch": 0.6909894969596462, "grad_norm": 0.28425559401512146, "learning_rate": 3.09630298305363e-05, "loss": 0.0941, "step": 16875 }, { "epoch": 0.6920131851032901, "grad_norm": 0.31354910135269165, "learning_rate": 3.0860448894177505e-05, "loss": 0.0945, "step": 16900 }, { "epoch": 0.6930368732469341, "grad_norm": 0.2128172069787979, "learning_rate": 3.075786795781872e-05, "loss": 0.0909, "step": 16925 }, { "epoch": 0.6940605613905779, "grad_norm": 0.2469140887260437, "learning_rate": 3.0655287021459934e-05, "loss": 0.0939, "step": 16950 }, { "epoch": 0.6950842495342219, "grad_norm": 0.35298585891723633, "learning_rate": 3.055270608510115e-05, "loss": 0.089, "step": 16975 }, { "epoch": 0.6961079376778658, "grad_norm": 0.26399216055870056, "learning_rate": 3.045012514874236e-05, "loss": 0.0968, "step": 17000 }, { "epoch": 0.6971316258215098, "grad_norm": 0.2543809413909912, "learning_rate": 3.034754421238357e-05, "loss": 0.0908, "step": 17025 }, { "epoch": 0.6981553139651536, "grad_norm": 0.24737343192100525, "learning_rate": 3.0244963276024786e-05, "loss": 0.094, "step": 17050 }, { "epoch": 0.6991790021087976, "grad_norm": 0.2577686607837677, "learning_rate": 3.0142382339665997e-05, "loss": 0.0933, "step": 17075 }, { "epoch": 0.7002026902524415, "grad_norm": 0.28968894481658936, "learning_rate": 3.0039801403307212e-05, "loss": 0.0965, "step": 17100 }, { "epoch": 0.7012263783960854, "grad_norm": 0.2456517517566681, "learning_rate": 2.9937220466948423e-05, "loss": 0.0953, "step": 17125 }, { "epoch": 0.7022500665397293, "grad_norm": 0.25714367628097534, "learning_rate": 2.9834639530589638e-05, "loss": 0.0888, "step": 17150 }, { "epoch": 0.7032737546833733, "grad_norm": 0.2177487164735794, "learning_rate": 2.973205859423085e-05, "loss": 0.0917, "step": 17175 }, { "epoch": 0.7042974428270172, "grad_norm": 0.20064932107925415, "learning_rate": 2.962947765787206e-05, "loss": 0.0899, "step": 17200 }, { "epoch": 0.7053211309706611, "grad_norm": 0.2717735469341278, "learning_rate": 2.9526896721513275e-05, "loss": 0.0939, "step": 17225 }, { "epoch": 0.706344819114305, "grad_norm": 0.20536677539348602, "learning_rate": 2.9424315785154487e-05, "loss": 0.0941, "step": 17250 }, { "epoch": 0.707368507257949, "grad_norm": 0.28099992871284485, "learning_rate": 2.93217348487957e-05, "loss": 0.0881, "step": 17275 }, { "epoch": 0.7083921954015928, "grad_norm": 0.21004413068294525, "learning_rate": 2.9219153912436913e-05, "loss": 0.0945, "step": 17300 }, { "epoch": 0.7094158835452368, "grad_norm": 0.24377816915512085, "learning_rate": 2.9116572976078127e-05, "loss": 0.0943, "step": 17325 }, { "epoch": 0.7104395716888807, "grad_norm": 0.2159167379140854, "learning_rate": 2.901399203971934e-05, "loss": 0.0916, "step": 17350 }, { "epoch": 0.7114632598325247, "grad_norm": 0.3277469277381897, "learning_rate": 2.8911411103360553e-05, "loss": 0.0894, "step": 17375 }, { "epoch": 0.7124869479761685, "grad_norm": 0.3423548638820648, "learning_rate": 2.8808830167001764e-05, "loss": 0.0922, "step": 17400 }, { "epoch": 0.7135106361198125, "grad_norm": 0.20151039958000183, "learning_rate": 2.870624923064298e-05, "loss": 0.0906, "step": 17425 }, { "epoch": 0.7145343242634564, "grad_norm": 0.29227256774902344, "learning_rate": 2.860366829428419e-05, "loss": 0.0914, "step": 17450 }, { "epoch": 0.7155580124071003, "grad_norm": 0.31062838435173035, "learning_rate": 2.8501087357925405e-05, "loss": 0.0932, "step": 17475 }, { "epoch": 0.7165817005507442, "grad_norm": 0.24426613748073578, "learning_rate": 2.8398506421566616e-05, "loss": 0.0938, "step": 17500 }, { "epoch": 0.7176053886943882, "grad_norm": 0.2505645155906677, "learning_rate": 2.829592548520783e-05, "loss": 0.0924, "step": 17525 }, { "epoch": 0.718629076838032, "grad_norm": 0.21960324048995972, "learning_rate": 2.8193344548849042e-05, "loss": 0.0912, "step": 17550 }, { "epoch": 0.719652764981676, "grad_norm": 0.25820910930633545, "learning_rate": 2.8090763612490257e-05, "loss": 0.0913, "step": 17575 }, { "epoch": 0.7206764531253199, "grad_norm": 0.23069611191749573, "learning_rate": 2.7988182676131468e-05, "loss": 0.0903, "step": 17600 }, { "epoch": 0.7217001412689639, "grad_norm": 0.2641305923461914, "learning_rate": 2.7885601739772683e-05, "loss": 0.0899, "step": 17625 }, { "epoch": 0.7227238294126077, "grad_norm": 0.28528881072998047, "learning_rate": 2.7783020803413894e-05, "loss": 0.0922, "step": 17650 }, { "epoch": 0.7237475175562517, "grad_norm": 0.297124445438385, "learning_rate": 2.768043986705511e-05, "loss": 0.0899, "step": 17675 }, { "epoch": 0.7247712056998956, "grad_norm": 0.2650444805622101, "learning_rate": 2.757785893069632e-05, "loss": 0.0903, "step": 17700 }, { "epoch": 0.7257948938435395, "grad_norm": 0.2515466809272766, "learning_rate": 2.7475277994337535e-05, "loss": 0.0943, "step": 17725 }, { "epoch": 0.7268185819871834, "grad_norm": 0.29468923807144165, "learning_rate": 2.7372697057978746e-05, "loss": 0.0935, "step": 17750 }, { "epoch": 0.7278422701308274, "grad_norm": 0.28869664669036865, "learning_rate": 2.727011612161996e-05, "loss": 0.0877, "step": 17775 }, { "epoch": 0.7288659582744713, "grad_norm": 0.2862752377986908, "learning_rate": 2.7167535185261172e-05, "loss": 0.0894, "step": 17800 }, { "epoch": 0.7298896464181152, "grad_norm": 0.4324943721294403, "learning_rate": 2.7064954248902387e-05, "loss": 0.0964, "step": 17825 }, { "epoch": 0.7309133345617591, "grad_norm": 0.2106688767671585, "learning_rate": 2.6962373312543598e-05, "loss": 0.0941, "step": 17850 }, { "epoch": 0.7319370227054031, "grad_norm": 0.2924487292766571, "learning_rate": 2.6859792376184813e-05, "loss": 0.0895, "step": 17875 }, { "epoch": 0.7329607108490469, "grad_norm": 0.21302323043346405, "learning_rate": 2.6757211439826024e-05, "loss": 0.0951, "step": 17900 }, { "epoch": 0.7339843989926909, "grad_norm": 0.2614041268825531, "learning_rate": 2.665463050346724e-05, "loss": 0.0885, "step": 17925 }, { "epoch": 0.7350080871363348, "grad_norm": 0.2530576288700104, "learning_rate": 2.655204956710845e-05, "loss": 0.0906, "step": 17950 }, { "epoch": 0.7360317752799788, "grad_norm": 0.21055959165096283, "learning_rate": 2.6449468630749665e-05, "loss": 0.0905, "step": 17975 }, { "epoch": 0.7370554634236226, "grad_norm": 0.23487575352191925, "learning_rate": 2.6346887694390876e-05, "loss": 0.0886, "step": 18000 }, { "epoch": 0.7380791515672666, "grad_norm": 0.2657538950443268, "learning_rate": 2.624430675803209e-05, "loss": 0.0902, "step": 18025 }, { "epoch": 0.7391028397109105, "grad_norm": 0.2803148627281189, "learning_rate": 2.6141725821673302e-05, "loss": 0.0914, "step": 18050 }, { "epoch": 0.7401265278545544, "grad_norm": 0.29323095083236694, "learning_rate": 2.6039144885314516e-05, "loss": 0.0863, "step": 18075 }, { "epoch": 0.7411502159981983, "grad_norm": 0.2417263686656952, "learning_rate": 2.5936563948955728e-05, "loss": 0.091, "step": 18100 }, { "epoch": 0.7421739041418423, "grad_norm": 0.30392271280288696, "learning_rate": 2.583398301259694e-05, "loss": 0.094, "step": 18125 }, { "epoch": 0.7431975922854861, "grad_norm": 0.24675561487674713, "learning_rate": 2.5731402076238154e-05, "loss": 0.09, "step": 18150 }, { "epoch": 0.74422128042913, "grad_norm": 0.28635236620903015, "learning_rate": 2.5628821139879365e-05, "loss": 0.0944, "step": 18175 }, { "epoch": 0.745244968572774, "grad_norm": 0.3268403112888336, "learning_rate": 2.552624020352058e-05, "loss": 0.0914, "step": 18200 }, { "epoch": 0.746268656716418, "grad_norm": 0.32864445447921753, "learning_rate": 2.542365926716179e-05, "loss": 0.0956, "step": 18225 }, { "epoch": 0.7472923448600618, "grad_norm": 0.2175736427307129, "learning_rate": 2.5321078330803006e-05, "loss": 0.0922, "step": 18250 }, { "epoch": 0.7483160330037057, "grad_norm": 0.26862508058547974, "learning_rate": 2.5218497394444217e-05, "loss": 0.0881, "step": 18275 }, { "epoch": 0.7493397211473497, "grad_norm": 0.2962358593940735, "learning_rate": 2.511591645808543e-05, "loss": 0.0886, "step": 18300 }, { "epoch": 0.7503634092909935, "grad_norm": 0.21592926979064941, "learning_rate": 2.5013335521726643e-05, "loss": 0.0852, "step": 18325 }, { "epoch": 0.7513870974346375, "grad_norm": 0.4917377531528473, "learning_rate": 2.4910754585367857e-05, "loss": 0.088, "step": 18350 }, { "epoch": 0.7524107855782814, "grad_norm": 0.2455429881811142, "learning_rate": 2.480817364900907e-05, "loss": 0.0937, "step": 18375 }, { "epoch": 0.7534344737219254, "grad_norm": 0.22315055131912231, "learning_rate": 2.4705592712650283e-05, "loss": 0.0928, "step": 18400 }, { "epoch": 0.7544581618655692, "grad_norm": 0.2998165190219879, "learning_rate": 2.4603011776291495e-05, "loss": 0.0973, "step": 18425 }, { "epoch": 0.7554818500092132, "grad_norm": 0.29680758714675903, "learning_rate": 2.450043083993271e-05, "loss": 0.0883, "step": 18450 }, { "epoch": 0.7565055381528571, "grad_norm": 0.34500744938850403, "learning_rate": 2.439784990357392e-05, "loss": 0.0936, "step": 18475 }, { "epoch": 0.757529226296501, "grad_norm": 0.2546531856060028, "learning_rate": 2.4295268967215135e-05, "loss": 0.0929, "step": 18500 }, { "epoch": 0.7585529144401449, "grad_norm": 0.2985497713088989, "learning_rate": 2.4192688030856347e-05, "loss": 0.0931, "step": 18525 }, { "epoch": 0.7595766025837889, "grad_norm": 0.21997804939746857, "learning_rate": 2.409010709449756e-05, "loss": 0.0922, "step": 18550 }, { "epoch": 0.7606002907274328, "grad_norm": 0.33792802691459656, "learning_rate": 2.3987526158138772e-05, "loss": 0.0856, "step": 18575 }, { "epoch": 0.7616239788710767, "grad_norm": 0.21099922060966492, "learning_rate": 2.3884945221779987e-05, "loss": 0.096, "step": 18600 }, { "epoch": 0.7626476670147206, "grad_norm": 0.29002106189727783, "learning_rate": 2.37823642854212e-05, "loss": 0.0938, "step": 18625 }, { "epoch": 0.7636713551583646, "grad_norm": 0.23993101716041565, "learning_rate": 2.3679783349062413e-05, "loss": 0.0875, "step": 18650 }, { "epoch": 0.7646950433020084, "grad_norm": 0.2299950271844864, "learning_rate": 2.3577202412703624e-05, "loss": 0.0903, "step": 18675 }, { "epoch": 0.7657187314456524, "grad_norm": 0.2547556757926941, "learning_rate": 2.347462147634484e-05, "loss": 0.0966, "step": 18700 }, { "epoch": 0.7667424195892963, "grad_norm": 0.24056895077228546, "learning_rate": 2.337204053998605e-05, "loss": 0.0901, "step": 18725 }, { "epoch": 0.7677661077329402, "grad_norm": 0.2962265610694885, "learning_rate": 2.3269459603627265e-05, "loss": 0.0941, "step": 18750 }, { "epoch": 0.7687897958765841, "grad_norm": 0.3107589781284332, "learning_rate": 2.3166878667268476e-05, "loss": 0.0922, "step": 18775 }, { "epoch": 0.7698134840202281, "grad_norm": 0.2781747877597809, "learning_rate": 2.306429773090969e-05, "loss": 0.0909, "step": 18800 }, { "epoch": 0.770837172163872, "grad_norm": 0.3311710059642792, "learning_rate": 2.2961716794550902e-05, "loss": 0.0877, "step": 18825 }, { "epoch": 0.7718608603075159, "grad_norm": 0.2895514965057373, "learning_rate": 2.2859135858192117e-05, "loss": 0.0973, "step": 18850 }, { "epoch": 0.7728845484511598, "grad_norm": 0.24788254499435425, "learning_rate": 2.2756554921833328e-05, "loss": 0.0922, "step": 18875 }, { "epoch": 0.7739082365948038, "grad_norm": 0.3390001952648163, "learning_rate": 2.2653973985474543e-05, "loss": 0.0951, "step": 18900 }, { "epoch": 0.7749319247384476, "grad_norm": 0.3275790214538574, "learning_rate": 2.2551393049115754e-05, "loss": 0.0902, "step": 18925 }, { "epoch": 0.7759556128820916, "grad_norm": 0.2598778009414673, "learning_rate": 2.244881211275697e-05, "loss": 0.0936, "step": 18950 }, { "epoch": 0.7769793010257355, "grad_norm": 0.32007846236228943, "learning_rate": 2.234623117639818e-05, "loss": 0.093, "step": 18975 }, { "epoch": 0.7780029891693795, "grad_norm": 0.25675615668296814, "learning_rate": 2.2243650240039395e-05, "loss": 0.097, "step": 19000 }, { "epoch": 0.7790266773130233, "grad_norm": 0.20342758297920227, "learning_rate": 2.2141069303680606e-05, "loss": 0.0941, "step": 19025 }, { "epoch": 0.7800503654566673, "grad_norm": 0.2361544668674469, "learning_rate": 2.203848836732182e-05, "loss": 0.0903, "step": 19050 }, { "epoch": 0.7810740536003112, "grad_norm": 0.2677974998950958, "learning_rate": 2.1935907430963032e-05, "loss": 0.0938, "step": 19075 }, { "epoch": 0.7820977417439551, "grad_norm": 0.3720152676105499, "learning_rate": 2.1833326494604243e-05, "loss": 0.0899, "step": 19100 }, { "epoch": 0.783121429887599, "grad_norm": 0.30042845010757446, "learning_rate": 2.1730745558245458e-05, "loss": 0.0906, "step": 19125 }, { "epoch": 0.784145118031243, "grad_norm": 0.25269991159439087, "learning_rate": 2.162816462188667e-05, "loss": 0.0899, "step": 19150 }, { "epoch": 0.7851688061748869, "grad_norm": 0.21545687317848206, "learning_rate": 2.1525583685527884e-05, "loss": 0.0888, "step": 19175 }, { "epoch": 0.7861924943185308, "grad_norm": 0.24490401148796082, "learning_rate": 2.1423002749169095e-05, "loss": 0.0899, "step": 19200 }, { "epoch": 0.7872161824621747, "grad_norm": 0.3394610583782196, "learning_rate": 2.132042181281031e-05, "loss": 0.0981, "step": 19225 }, { "epoch": 0.7882398706058187, "grad_norm": 0.27232640981674194, "learning_rate": 2.121784087645152e-05, "loss": 0.0888, "step": 19250 }, { "epoch": 0.7892635587494625, "grad_norm": 0.26301074028015137, "learning_rate": 2.1115259940092736e-05, "loss": 0.0897, "step": 19275 }, { "epoch": 0.7902872468931065, "grad_norm": 0.2940311133861542, "learning_rate": 2.1012679003733947e-05, "loss": 0.0912, "step": 19300 }, { "epoch": 0.7913109350367504, "grad_norm": 0.24101464450359344, "learning_rate": 2.091009806737516e-05, "loss": 0.0933, "step": 19325 }, { "epoch": 0.7923346231803943, "grad_norm": 0.3280772268772125, "learning_rate": 2.0807517131016373e-05, "loss": 0.0905, "step": 19350 }, { "epoch": 0.7933583113240382, "grad_norm": 0.3161431550979614, "learning_rate": 2.0704936194657588e-05, "loss": 0.0901, "step": 19375 }, { "epoch": 0.7943819994676822, "grad_norm": 0.28092876076698303, "learning_rate": 2.06023552582988e-05, "loss": 0.0902, "step": 19400 }, { "epoch": 0.7954056876113261, "grad_norm": 0.21107934415340424, "learning_rate": 2.0499774321940014e-05, "loss": 0.0888, "step": 19425 }, { "epoch": 0.79642937575497, "grad_norm": 0.24856053292751312, "learning_rate": 2.0397193385581225e-05, "loss": 0.0905, "step": 19450 }, { "epoch": 0.7974530638986139, "grad_norm": 0.2561679482460022, "learning_rate": 2.029461244922244e-05, "loss": 0.0922, "step": 19475 }, { "epoch": 0.7984767520422579, "grad_norm": 0.25557827949523926, "learning_rate": 2.019203151286365e-05, "loss": 0.0879, "step": 19500 }, { "epoch": 0.7995004401859017, "grad_norm": 0.2589765787124634, "learning_rate": 2.0089450576504865e-05, "loss": 0.0928, "step": 19525 }, { "epoch": 0.8005241283295457, "grad_norm": 0.21249115467071533, "learning_rate": 1.9986869640146077e-05, "loss": 0.0869, "step": 19550 }, { "epoch": 0.8015478164731896, "grad_norm": 0.23621489107608795, "learning_rate": 1.988428870378729e-05, "loss": 0.092, "step": 19575 }, { "epoch": 0.8025715046168336, "grad_norm": 0.2507089376449585, "learning_rate": 1.9781707767428503e-05, "loss": 0.0875, "step": 19600 }, { "epoch": 0.8035951927604774, "grad_norm": 0.28460606932640076, "learning_rate": 1.9679126831069717e-05, "loss": 0.0838, "step": 19625 }, { "epoch": 0.8046188809041214, "grad_norm": 0.3332251310348511, "learning_rate": 1.9576545894710925e-05, "loss": 0.0909, "step": 19650 }, { "epoch": 0.8056425690477653, "grad_norm": 0.26824021339416504, "learning_rate": 1.947396495835214e-05, "loss": 0.0922, "step": 19675 }, { "epoch": 0.8066662571914092, "grad_norm": 0.2643376886844635, "learning_rate": 1.937138402199335e-05, "loss": 0.0915, "step": 19700 }, { "epoch": 0.8076899453350531, "grad_norm": 0.29947948455810547, "learning_rate": 1.9268803085634566e-05, "loss": 0.0919, "step": 19725 }, { "epoch": 0.8087136334786971, "grad_norm": 0.37118449807167053, "learning_rate": 1.9166222149275777e-05, "loss": 0.0887, "step": 19750 }, { "epoch": 0.809737321622341, "grad_norm": 0.32123562693595886, "learning_rate": 1.9063641212916992e-05, "loss": 0.0913, "step": 19775 }, { "epoch": 0.8107610097659849, "grad_norm": 0.2964722514152527, "learning_rate": 1.8961060276558203e-05, "loss": 0.0915, "step": 19800 }, { "epoch": 0.8117846979096288, "grad_norm": 0.25374674797058105, "learning_rate": 1.8858479340199418e-05, "loss": 0.0918, "step": 19825 }, { "epoch": 0.8128083860532728, "grad_norm": 0.30407896637916565, "learning_rate": 1.875589840384063e-05, "loss": 0.0934, "step": 19850 }, { "epoch": 0.8138320741969166, "grad_norm": 0.284839928150177, "learning_rate": 1.8653317467481844e-05, "loss": 0.0868, "step": 19875 }, { "epoch": 0.8148557623405606, "grad_norm": 0.27440112829208374, "learning_rate": 1.8550736531123055e-05, "loss": 0.0981, "step": 19900 }, { "epoch": 0.8158794504842045, "grad_norm": 0.293817400932312, "learning_rate": 1.844815559476427e-05, "loss": 0.0937, "step": 19925 }, { "epoch": 0.8169031386278484, "grad_norm": 0.25099506974220276, "learning_rate": 1.834557465840548e-05, "loss": 0.0907, "step": 19950 }, { "epoch": 0.8179268267714923, "grad_norm": 0.2696509063243866, "learning_rate": 1.8242993722046696e-05, "loss": 0.0898, "step": 19975 }, { "epoch": 0.8189505149151363, "grad_norm": 0.23524117469787598, "learning_rate": 1.8140412785687907e-05, "loss": 0.0834, "step": 20000 }, { "epoch": 0.8199742030587802, "grad_norm": 0.28562095761299133, "learning_rate": 1.803783184932912e-05, "loss": 0.0949, "step": 20025 }, { "epoch": 0.8209978912024241, "grad_norm": 0.3326290249824524, "learning_rate": 1.7935250912970333e-05, "loss": 0.086, "step": 20050 }, { "epoch": 0.822021579346068, "grad_norm": 0.335920125246048, "learning_rate": 1.7832669976611548e-05, "loss": 0.0898, "step": 20075 }, { "epoch": 0.823045267489712, "grad_norm": 0.23107844591140747, "learning_rate": 1.773008904025276e-05, "loss": 0.0911, "step": 20100 }, { "epoch": 0.8240689556333558, "grad_norm": 0.2805933356285095, "learning_rate": 1.7627508103893973e-05, "loss": 0.0903, "step": 20125 }, { "epoch": 0.8250926437769998, "grad_norm": 0.2637193500995636, "learning_rate": 1.7524927167535185e-05, "loss": 0.0934, "step": 20150 }, { "epoch": 0.8261163319206437, "grad_norm": 0.25126680731773376, "learning_rate": 1.74223462311764e-05, "loss": 0.0967, "step": 20175 }, { "epoch": 0.8271400200642877, "grad_norm": 0.21200938522815704, "learning_rate": 1.731976529481761e-05, "loss": 0.0879, "step": 20200 }, { "epoch": 0.8281637082079315, "grad_norm": 0.2675575017929077, "learning_rate": 1.7217184358458825e-05, "loss": 0.0933, "step": 20225 }, { "epoch": 0.8291873963515755, "grad_norm": 0.24949528276920319, "learning_rate": 1.7114603422100037e-05, "loss": 0.0834, "step": 20250 }, { "epoch": 0.8302110844952194, "grad_norm": 0.31639212369918823, "learning_rate": 1.701202248574125e-05, "loss": 0.0862, "step": 20275 }, { "epoch": 0.8312347726388633, "grad_norm": 0.31430932879447937, "learning_rate": 1.6909441549382463e-05, "loss": 0.0895, "step": 20300 }, { "epoch": 0.8322584607825072, "grad_norm": 0.2188422530889511, "learning_rate": 1.6806860613023674e-05, "loss": 0.0866, "step": 20325 }, { "epoch": 0.8332821489261512, "grad_norm": 0.26949557662010193, "learning_rate": 1.670427967666489e-05, "loss": 0.0874, "step": 20350 }, { "epoch": 0.8343058370697951, "grad_norm": 0.2512851655483246, "learning_rate": 1.66016987403061e-05, "loss": 0.0886, "step": 20375 }, { "epoch": 0.835329525213439, "grad_norm": 0.21398603916168213, "learning_rate": 1.6499117803947314e-05, "loss": 0.0901, "step": 20400 }, { "epoch": 0.8363532133570829, "grad_norm": 0.3579723834991455, "learning_rate": 1.6396536867588526e-05, "loss": 0.089, "step": 20425 }, { "epoch": 0.8373769015007269, "grad_norm": 0.25546953082084656, "learning_rate": 1.629395593122974e-05, "loss": 0.09, "step": 20450 }, { "epoch": 0.8384005896443707, "grad_norm": 0.30521437525749207, "learning_rate": 1.6191374994870952e-05, "loss": 0.0878, "step": 20475 }, { "epoch": 0.8394242777880146, "grad_norm": 0.25270193815231323, "learning_rate": 1.6088794058512166e-05, "loss": 0.0871, "step": 20500 }, { "epoch": 0.8404479659316586, "grad_norm": 0.31624093651771545, "learning_rate": 1.5986213122153378e-05, "loss": 0.0872, "step": 20525 }, { "epoch": 0.8414716540753026, "grad_norm": 0.3739725947380066, "learning_rate": 1.5883632185794592e-05, "loss": 0.0864, "step": 20550 }, { "epoch": 0.8424953422189464, "grad_norm": 0.25170573592185974, "learning_rate": 1.5781051249435804e-05, "loss": 0.0927, "step": 20575 }, { "epoch": 0.8435190303625903, "grad_norm": 0.24413146078586578, "learning_rate": 1.5678470313077018e-05, "loss": 0.0878, "step": 20600 }, { "epoch": 0.8445427185062343, "grad_norm": 0.26711735129356384, "learning_rate": 1.557588937671823e-05, "loss": 0.0898, "step": 20625 }, { "epoch": 0.8455664066498781, "grad_norm": 0.2967755198478699, "learning_rate": 1.5473308440359444e-05, "loss": 0.093, "step": 20650 }, { "epoch": 0.8465900947935221, "grad_norm": 0.25452178716659546, "learning_rate": 1.5370727504000655e-05, "loss": 0.088, "step": 20675 }, { "epoch": 0.847613782937166, "grad_norm": 0.22610174119472504, "learning_rate": 1.526814656764187e-05, "loss": 0.0844, "step": 20700 }, { "epoch": 0.8486374710808099, "grad_norm": 0.2170991748571396, "learning_rate": 1.5165565631283083e-05, "loss": 0.0884, "step": 20725 }, { "epoch": 0.8496611592244538, "grad_norm": 0.2881997227668762, "learning_rate": 1.5062984694924296e-05, "loss": 0.0935, "step": 20750 }, { "epoch": 0.8506848473680978, "grad_norm": 0.2766591012477875, "learning_rate": 1.4960403758565509e-05, "loss": 0.0874, "step": 20775 }, { "epoch": 0.8517085355117417, "grad_norm": 0.2786926329135895, "learning_rate": 1.485782282220672e-05, "loss": 0.0892, "step": 20800 }, { "epoch": 0.8527322236553856, "grad_norm": 0.22950054705142975, "learning_rate": 1.4755241885847933e-05, "loss": 0.089, "step": 20825 }, { "epoch": 0.8537559117990295, "grad_norm": 0.43880143761634827, "learning_rate": 1.4652660949489146e-05, "loss": 0.0888, "step": 20850 }, { "epoch": 0.8547795999426735, "grad_norm": 0.24918793141841888, "learning_rate": 1.455008001313036e-05, "loss": 0.0924, "step": 20875 }, { "epoch": 0.8558032880863173, "grad_norm": 0.26215484738349915, "learning_rate": 1.4447499076771572e-05, "loss": 0.0903, "step": 20900 }, { "epoch": 0.8568269762299613, "grad_norm": 0.2752866744995117, "learning_rate": 1.4344918140412785e-05, "loss": 0.0916, "step": 20925 }, { "epoch": 0.8578506643736052, "grad_norm": 0.2551786005496979, "learning_rate": 1.4242337204053998e-05, "loss": 0.0887, "step": 20950 }, { "epoch": 0.8588743525172492, "grad_norm": 0.2203332632780075, "learning_rate": 1.4139756267695211e-05, "loss": 0.086, "step": 20975 }, { "epoch": 0.859898040660893, "grad_norm": 0.25602227449417114, "learning_rate": 1.4037175331336424e-05, "loss": 0.0927, "step": 21000 }, { "epoch": 0.860921728804537, "grad_norm": 0.27257677912712097, "learning_rate": 1.3934594394977637e-05, "loss": 0.095, "step": 21025 }, { "epoch": 0.8619454169481809, "grad_norm": 0.24853083491325378, "learning_rate": 1.383201345861885e-05, "loss": 0.0896, "step": 21050 }, { "epoch": 0.8629691050918248, "grad_norm": 0.22490383684635162, "learning_rate": 1.3729432522260063e-05, "loss": 0.089, "step": 21075 }, { "epoch": 0.8639927932354687, "grad_norm": 0.25305449962615967, "learning_rate": 1.3626851585901276e-05, "loss": 0.0879, "step": 21100 }, { "epoch": 0.8650164813791127, "grad_norm": 0.31005653738975525, "learning_rate": 1.3524270649542489e-05, "loss": 0.0927, "step": 21125 }, { "epoch": 0.8660401695227566, "grad_norm": 0.24999596178531647, "learning_rate": 1.3421689713183702e-05, "loss": 0.089, "step": 21150 }, { "epoch": 0.8670638576664005, "grad_norm": 0.23844856023788452, "learning_rate": 1.3319108776824915e-05, "loss": 0.0846, "step": 21175 }, { "epoch": 0.8680875458100444, "grad_norm": 0.2782473564147949, "learning_rate": 1.3216527840466128e-05, "loss": 0.0931, "step": 21200 }, { "epoch": 0.8691112339536884, "grad_norm": 0.22946637868881226, "learning_rate": 1.3113946904107341e-05, "loss": 0.0881, "step": 21225 }, { "epoch": 0.8701349220973322, "grad_norm": 0.28429850935935974, "learning_rate": 1.3011365967748554e-05, "loss": 0.0881, "step": 21250 }, { "epoch": 0.8711586102409762, "grad_norm": 0.39349105954170227, "learning_rate": 1.2908785031389767e-05, "loss": 0.0914, "step": 21275 }, { "epoch": 0.8721822983846201, "grad_norm": 0.3252253234386444, "learning_rate": 1.280620409503098e-05, "loss": 0.0914, "step": 21300 }, { "epoch": 0.873205986528264, "grad_norm": 0.2974836528301239, "learning_rate": 1.2703623158672193e-05, "loss": 0.0924, "step": 21325 }, { "epoch": 0.8742296746719079, "grad_norm": 0.27263307571411133, "learning_rate": 1.2601042222313406e-05, "loss": 0.092, "step": 21350 }, { "epoch": 0.8752533628155519, "grad_norm": 0.34150230884552, "learning_rate": 1.2498461285954619e-05, "loss": 0.0909, "step": 21375 }, { "epoch": 0.8762770509591958, "grad_norm": 0.27397677302360535, "learning_rate": 1.2395880349595832e-05, "loss": 0.091, "step": 21400 }, { "epoch": 0.8773007391028397, "grad_norm": 0.28834134340286255, "learning_rate": 1.2293299413237045e-05, "loss": 0.0951, "step": 21425 }, { "epoch": 0.8783244272464836, "grad_norm": 0.2486167699098587, "learning_rate": 1.2190718476878258e-05, "loss": 0.0838, "step": 21450 }, { "epoch": 0.8793481153901276, "grad_norm": 0.3068005442619324, "learning_rate": 1.208813754051947e-05, "loss": 0.0897, "step": 21475 }, { "epoch": 0.8803718035337714, "grad_norm": 0.2985325753688812, "learning_rate": 1.1985556604160684e-05, "loss": 0.0897, "step": 21500 }, { "epoch": 0.8813954916774154, "grad_norm": 0.2797314524650574, "learning_rate": 1.1882975667801897e-05, "loss": 0.0907, "step": 21525 }, { "epoch": 0.8824191798210593, "grad_norm": 0.22625084221363068, "learning_rate": 1.178039473144311e-05, "loss": 0.0898, "step": 21550 }, { "epoch": 0.8834428679647033, "grad_norm": 0.23003660142421722, "learning_rate": 1.1677813795084323e-05, "loss": 0.0896, "step": 21575 }, { "epoch": 0.8844665561083471, "grad_norm": 0.2965420186519623, "learning_rate": 1.1575232858725536e-05, "loss": 0.0889, "step": 21600 }, { "epoch": 0.8854902442519911, "grad_norm": 0.332224577665329, "learning_rate": 1.1472651922366748e-05, "loss": 0.0886, "step": 21625 }, { "epoch": 0.886513932395635, "grad_norm": 0.27045127749443054, "learning_rate": 1.1370070986007961e-05, "loss": 0.0899, "step": 21650 }, { "epoch": 0.8875376205392789, "grad_norm": 0.26024821400642395, "learning_rate": 1.1267490049649174e-05, "loss": 0.0939, "step": 21675 }, { "epoch": 0.8885613086829228, "grad_norm": 0.2873280942440033, "learning_rate": 1.1164909113290387e-05, "loss": 0.0912, "step": 21700 }, { "epoch": 0.8895849968265668, "grad_norm": 0.2818579077720642, "learning_rate": 1.1062328176931599e-05, "loss": 0.0909, "step": 21725 }, { "epoch": 0.8906086849702107, "grad_norm": 0.33922845125198364, "learning_rate": 1.0959747240572812e-05, "loss": 0.0859, "step": 21750 }, { "epoch": 0.8916323731138546, "grad_norm": 0.3189659118652344, "learning_rate": 1.0857166304214025e-05, "loss": 0.0874, "step": 21775 }, { "epoch": 0.8926560612574985, "grad_norm": 0.2925044000148773, "learning_rate": 1.0754585367855238e-05, "loss": 0.0874, "step": 21800 }, { "epoch": 0.8936797494011425, "grad_norm": 0.36518415808677673, "learning_rate": 1.065200443149645e-05, "loss": 0.0905, "step": 21825 }, { "epoch": 0.8947034375447863, "grad_norm": 0.29783540964126587, "learning_rate": 1.0549423495137664e-05, "loss": 0.0851, "step": 21850 }, { "epoch": 0.8957271256884303, "grad_norm": 0.23640768229961395, "learning_rate": 1.0446842558778877e-05, "loss": 0.0901, "step": 21875 }, { "epoch": 0.8967508138320742, "grad_norm": 0.26059839129447937, "learning_rate": 1.034426162242009e-05, "loss": 0.0903, "step": 21900 }, { "epoch": 0.8977745019757181, "grad_norm": 0.3090721368789673, "learning_rate": 1.0241680686061302e-05, "loss": 0.0921, "step": 21925 }, { "epoch": 0.898798190119362, "grad_norm": 0.3036380112171173, "learning_rate": 1.0139099749702515e-05, "loss": 0.0902, "step": 21950 }, { "epoch": 0.899821878263006, "grad_norm": 0.27495357394218445, "learning_rate": 1.0036518813343728e-05, "loss": 0.0855, "step": 21975 }, { "epoch": 0.9008455664066499, "grad_norm": 0.27286654710769653, "learning_rate": 9.933937876984941e-06, "loss": 0.0928, "step": 22000 }, { "epoch": 0.9018692545502938, "grad_norm": 0.27504658699035645, "learning_rate": 9.831356940626154e-06, "loss": 0.0905, "step": 22025 }, { "epoch": 0.9028929426939377, "grad_norm": 0.25373876094818115, "learning_rate": 9.728776004267367e-06, "loss": 0.0911, "step": 22050 }, { "epoch": 0.9039166308375817, "grad_norm": 0.2752918601036072, "learning_rate": 9.62619506790858e-06, "loss": 0.0897, "step": 22075 }, { "epoch": 0.9049403189812255, "grad_norm": 0.28456592559814453, "learning_rate": 9.523614131549793e-06, "loss": 0.085, "step": 22100 }, { "epoch": 0.9059640071248695, "grad_norm": 0.2836301028728485, "learning_rate": 9.421033195191006e-06, "loss": 0.0879, "step": 22125 }, { "epoch": 0.9069876952685134, "grad_norm": 0.2792745530605316, "learning_rate": 9.31845225883222e-06, "loss": 0.086, "step": 22150 }, { "epoch": 0.9080113834121574, "grad_norm": 0.2640101909637451, "learning_rate": 9.215871322473432e-06, "loss": 0.0942, "step": 22175 }, { "epoch": 0.9090350715558012, "grad_norm": 0.28286224603652954, "learning_rate": 9.113290386114645e-06, "loss": 0.0868, "step": 22200 }, { "epoch": 0.9100587596994452, "grad_norm": 0.3581150770187378, "learning_rate": 9.010709449755858e-06, "loss": 0.092, "step": 22225 }, { "epoch": 0.9110824478430891, "grad_norm": 0.2819570302963257, "learning_rate": 8.908128513397071e-06, "loss": 0.092, "step": 22250 }, { "epoch": 0.912106135986733, "grad_norm": 0.2538643777370453, "learning_rate": 8.805547577038284e-06, "loss": 0.0922, "step": 22275 }, { "epoch": 0.9131298241303769, "grad_norm": 0.2901509404182434, "learning_rate": 8.702966640679497e-06, "loss": 0.0862, "step": 22300 }, { "epoch": 0.9141535122740209, "grad_norm": 0.28954175114631653, "learning_rate": 8.60038570432071e-06, "loss": 0.0879, "step": 22325 }, { "epoch": 0.9151772004176648, "grad_norm": 0.26981502771377563, "learning_rate": 8.497804767961923e-06, "loss": 0.0889, "step": 22350 }, { "epoch": 0.9162008885613087, "grad_norm": 0.3008342683315277, "learning_rate": 8.395223831603136e-06, "loss": 0.088, "step": 22375 }, { "epoch": 0.9172245767049526, "grad_norm": 0.23977133631706238, "learning_rate": 8.292642895244349e-06, "loss": 0.0896, "step": 22400 }, { "epoch": 0.9182482648485966, "grad_norm": 0.21286515891551971, "learning_rate": 8.190061958885562e-06, "loss": 0.0933, "step": 22425 }, { "epoch": 0.9192719529922404, "grad_norm": 0.3176520764827728, "learning_rate": 8.087481022526775e-06, "loss": 0.0898, "step": 22450 }, { "epoch": 0.9202956411358844, "grad_norm": 0.2136741727590561, "learning_rate": 7.984900086167988e-06, "loss": 0.0911, "step": 22475 }, { "epoch": 0.9213193292795283, "grad_norm": 0.32107657194137573, "learning_rate": 7.882319149809201e-06, "loss": 0.0874, "step": 22500 }, { "epoch": 0.9223430174231722, "grad_norm": 0.2349776327610016, "learning_rate": 7.779738213450414e-06, "loss": 0.0867, "step": 22525 }, { "epoch": 0.9233667055668161, "grad_norm": 0.2386864870786667, "learning_rate": 7.677157277091627e-06, "loss": 0.0878, "step": 22550 }, { "epoch": 0.9243903937104601, "grad_norm": 0.270991712808609, "learning_rate": 7.574576340732839e-06, "loss": 0.0837, "step": 22575 }, { "epoch": 0.925414081854104, "grad_norm": 0.33399784564971924, "learning_rate": 7.471995404374052e-06, "loss": 0.0901, "step": 22600 }, { "epoch": 0.9264377699977479, "grad_norm": 0.2850496470928192, "learning_rate": 7.369414468015265e-06, "loss": 0.0898, "step": 22625 }, { "epoch": 0.9274614581413918, "grad_norm": 0.32937246561050415, "learning_rate": 7.266833531656478e-06, "loss": 0.0903, "step": 22650 }, { "epoch": 0.9284851462850358, "grad_norm": 0.22164273262023926, "learning_rate": 7.164252595297691e-06, "loss": 0.0928, "step": 22675 }, { "epoch": 0.9295088344286796, "grad_norm": 0.2599170506000519, "learning_rate": 7.061671658938904e-06, "loss": 0.0874, "step": 22700 }, { "epoch": 0.9305325225723236, "grad_norm": 0.3116656243801117, "learning_rate": 6.959090722580117e-06, "loss": 0.0845, "step": 22725 }, { "epoch": 0.9315562107159675, "grad_norm": 0.27648812532424927, "learning_rate": 6.85650978622133e-06, "loss": 0.0866, "step": 22750 }, { "epoch": 0.9325798988596115, "grad_norm": 0.26359742879867554, "learning_rate": 6.753928849862543e-06, "loss": 0.0884, "step": 22775 }, { "epoch": 0.9336035870032553, "grad_norm": 0.26720476150512695, "learning_rate": 6.651347913503756e-06, "loss": 0.0867, "step": 22800 }, { "epoch": 0.9346272751468993, "grad_norm": 0.2515944540500641, "learning_rate": 6.548766977144969e-06, "loss": 0.0883, "step": 22825 }, { "epoch": 0.9356509632905432, "grad_norm": 0.23396004736423492, "learning_rate": 6.446186040786182e-06, "loss": 0.0883, "step": 22850 }, { "epoch": 0.936674651434187, "grad_norm": 0.2513067424297333, "learning_rate": 6.343605104427394e-06, "loss": 0.0868, "step": 22875 }, { "epoch": 0.937698339577831, "grad_norm": 0.29367002844810486, "learning_rate": 6.241024168068607e-06, "loss": 0.0932, "step": 22900 }, { "epoch": 0.938722027721475, "grad_norm": 0.2306540161371231, "learning_rate": 6.13844323170982e-06, "loss": 0.0918, "step": 22925 }, { "epoch": 0.9397457158651189, "grad_norm": 0.27428171038627625, "learning_rate": 6.035862295351033e-06, "loss": 0.0881, "step": 22950 }, { "epoch": 0.9407694040087627, "grad_norm": 0.3886117935180664, "learning_rate": 5.933281358992245e-06, "loss": 0.087, "step": 22975 }, { "epoch": 0.9417930921524067, "grad_norm": 0.25603532791137695, "learning_rate": 5.830700422633458e-06, "loss": 0.0856, "step": 23000 }, { "epoch": 0.9428167802960506, "grad_norm": 0.30329135060310364, "learning_rate": 5.728119486274671e-06, "loss": 0.093, "step": 23025 }, { "epoch": 0.9438404684396945, "grad_norm": 0.26778334379196167, "learning_rate": 5.625538549915884e-06, "loss": 0.0896, "step": 23050 }, { "epoch": 0.9448641565833384, "grad_norm": 0.28244808316230774, "learning_rate": 5.522957613557097e-06, "loss": 0.0895, "step": 23075 }, { "epoch": 0.9458878447269824, "grad_norm": 0.353553831577301, "learning_rate": 5.42037667719831e-06, "loss": 0.0918, "step": 23100 }, { "epoch": 0.9469115328706262, "grad_norm": 0.3107817769050598, "learning_rate": 5.317795740839523e-06, "loss": 0.0929, "step": 23125 }, { "epoch": 0.9479352210142702, "grad_norm": 0.2637424170970917, "learning_rate": 5.215214804480736e-06, "loss": 0.0907, "step": 23150 }, { "epoch": 0.9489589091579141, "grad_norm": 0.2971089780330658, "learning_rate": 5.112633868121949e-06, "loss": 0.09, "step": 23175 }, { "epoch": 0.9499825973015581, "grad_norm": 0.22394390404224396, "learning_rate": 5.010052931763162e-06, "loss": 0.0903, "step": 23200 }, { "epoch": 0.9510062854452019, "grad_norm": 0.2777024805545807, "learning_rate": 4.9074719954043746e-06, "loss": 0.0873, "step": 23225 }, { "epoch": 0.9520299735888459, "grad_norm": 0.24165412783622742, "learning_rate": 4.8048910590455875e-06, "loss": 0.0911, "step": 23250 }, { "epoch": 0.9530536617324898, "grad_norm": 0.2876558005809784, "learning_rate": 4.7023101226868005e-06, "loss": 0.0904, "step": 23275 }, { "epoch": 0.9540773498761337, "grad_norm": 0.2749604284763336, "learning_rate": 4.5997291863280135e-06, "loss": 0.0894, "step": 23300 }, { "epoch": 0.9551010380197776, "grad_norm": 0.2758445143699646, "learning_rate": 4.4971482499692265e-06, "loss": 0.0899, "step": 23325 }, { "epoch": 0.9561247261634216, "grad_norm": 0.20477938652038574, "learning_rate": 4.3945673136104394e-06, "loss": 0.0878, "step": 23350 }, { "epoch": 0.9571484143070655, "grad_norm": 0.3615458607673645, "learning_rate": 4.291986377251652e-06, "loss": 0.0858, "step": 23375 }, { "epoch": 0.9581721024507094, "grad_norm": 0.28123295307159424, "learning_rate": 4.189405440892865e-06, "loss": 0.087, "step": 23400 }, { "epoch": 0.9591957905943533, "grad_norm": 0.30753856897354126, "learning_rate": 4.0868245045340775e-06, "loss": 0.0896, "step": 23425 }, { "epoch": 0.9602194787379973, "grad_norm": 0.31176239252090454, "learning_rate": 3.9842435681752905e-06, "loss": 0.0884, "step": 23450 }, { "epoch": 0.9612431668816411, "grad_norm": 0.29048678278923035, "learning_rate": 3.8816626318165035e-06, "loss": 0.0818, "step": 23475 }, { "epoch": 0.9622668550252851, "grad_norm": 0.2853899896144867, "learning_rate": 3.779081695457716e-06, "loss": 0.088, "step": 23500 }, { "epoch": 0.963290543168929, "grad_norm": 0.2238619327545166, "learning_rate": 3.676500759098929e-06, "loss": 0.0901, "step": 23525 }, { "epoch": 0.964314231312573, "grad_norm": 0.44698524475097656, "learning_rate": 3.573919822740142e-06, "loss": 0.0856, "step": 23550 }, { "epoch": 0.9653379194562168, "grad_norm": 0.3152156174182892, "learning_rate": 3.471338886381355e-06, "loss": 0.0931, "step": 23575 }, { "epoch": 0.9663616075998608, "grad_norm": 0.27057376503944397, "learning_rate": 3.368757950022568e-06, "loss": 0.0901, "step": 23600 }, { "epoch": 0.9673852957435047, "grad_norm": 0.29068031907081604, "learning_rate": 3.2661770136637804e-06, "loss": 0.0893, "step": 23625 }, { "epoch": 0.9684089838871486, "grad_norm": 0.2840330898761749, "learning_rate": 3.1635960773049934e-06, "loss": 0.0888, "step": 23650 }, { "epoch": 0.9694326720307925, "grad_norm": 0.31523531675338745, "learning_rate": 3.061015140946207e-06, "loss": 0.0875, "step": 23675 }, { "epoch": 0.9704563601744365, "grad_norm": 0.2516843378543854, "learning_rate": 2.95843420458742e-06, "loss": 0.0915, "step": 23700 }, { "epoch": 0.9714800483180803, "grad_norm": 0.26545655727386475, "learning_rate": 2.8558532682286323e-06, "loss": 0.0848, "step": 23725 }, { "epoch": 0.9725037364617243, "grad_norm": 0.300320565700531, "learning_rate": 2.7532723318698453e-06, "loss": 0.0886, "step": 23750 }, { "epoch": 0.9735274246053682, "grad_norm": 0.3137941360473633, "learning_rate": 2.6506913955110583e-06, "loss": 0.0872, "step": 23775 }, { "epoch": 0.9745511127490122, "grad_norm": 0.31483328342437744, "learning_rate": 2.5481104591522713e-06, "loss": 0.0895, "step": 23800 }, { "epoch": 0.975574800892656, "grad_norm": 0.28136733174324036, "learning_rate": 2.4455295227934842e-06, "loss": 0.0894, "step": 23825 }, { "epoch": 0.9765984890363, "grad_norm": 0.24842825531959534, "learning_rate": 2.342948586434697e-06, "loss": 0.0891, "step": 23850 }, { "epoch": 0.9776221771799439, "grad_norm": 0.29128360748291016, "learning_rate": 2.2403676500759098e-06, "loss": 0.0939, "step": 23875 }, { "epoch": 0.9786458653235878, "grad_norm": 0.27355626225471497, "learning_rate": 2.1377867137171227e-06, "loss": 0.0861, "step": 23900 }, { "epoch": 0.9796695534672317, "grad_norm": 0.424562931060791, "learning_rate": 2.0352057773583357e-06, "loss": 0.0888, "step": 23925 }, { "epoch": 0.9806932416108757, "grad_norm": 0.3024253845214844, "learning_rate": 1.9326248409995487e-06, "loss": 0.0889, "step": 23950 }, { "epoch": 0.9817169297545196, "grad_norm": 0.3305220603942871, "learning_rate": 1.8300439046407617e-06, "loss": 0.091, "step": 23975 }, { "epoch": 0.9827406178981635, "grad_norm": 0.29790905117988586, "learning_rate": 1.7274629682819746e-06, "loss": 0.0882, "step": 24000 }, { "epoch": 0.9837643060418074, "grad_norm": 0.3197941184043884, "learning_rate": 1.6248820319231876e-06, "loss": 0.0878, "step": 24025 }, { "epoch": 0.9847879941854514, "grad_norm": 0.2794630229473114, "learning_rate": 1.5223010955644004e-06, "loss": 0.0878, "step": 24050 }, { "epoch": 0.9858116823290952, "grad_norm": 0.2822708487510681, "learning_rate": 1.4197201592056133e-06, "loss": 0.0853, "step": 24075 }, { "epoch": 0.9868353704727392, "grad_norm": 0.2595159709453583, "learning_rate": 1.3171392228468263e-06, "loss": 0.0901, "step": 24100 }, { "epoch": 0.9878590586163831, "grad_norm": 0.27910885214805603, "learning_rate": 1.214558286488039e-06, "loss": 0.0837, "step": 24125 }, { "epoch": 0.9888827467600271, "grad_norm": 0.2924407720565796, "learning_rate": 1.111977350129252e-06, "loss": 0.086, "step": 24150 }, { "epoch": 0.9899064349036709, "grad_norm": 0.2329237014055252, "learning_rate": 1.0093964137704648e-06, "loss": 0.094, "step": 24175 }, { "epoch": 0.9909301230473149, "grad_norm": 0.2659105062484741, "learning_rate": 9.068154774116778e-07, "loss": 0.086, "step": 24200 }, { "epoch": 0.9919538111909588, "grad_norm": 0.2556705176830292, "learning_rate": 8.042345410528908e-07, "loss": 0.0906, "step": 24225 }, { "epoch": 0.9929774993346027, "grad_norm": 0.2836422324180603, "learning_rate": 7.016536046941037e-07, "loss": 0.0934, "step": 24250 }, { "epoch": 0.9940011874782466, "grad_norm": 0.26003921031951904, "learning_rate": 5.990726683353166e-07, "loss": 0.089, "step": 24275 }, { "epoch": 0.9950248756218906, "grad_norm": 0.2415328323841095, "learning_rate": 4.964917319765296e-07, "loss": 0.0882, "step": 24300 }, { "epoch": 0.9960485637655345, "grad_norm": 0.48855510354042053, "learning_rate": 3.9391079561774244e-07, "loss": 0.0895, "step": 24325 }, { "epoch": 0.9970722519091784, "grad_norm": 0.2958788573741913, "learning_rate": 2.9132985925895536e-07, "loss": 0.084, "step": 24350 }, { "epoch": 0.9980959400528223, "grad_norm": 0.24037177860736847, "learning_rate": 1.8874892290016825e-07, "loss": 0.0894, "step": 24375 }, { "epoch": 0.9991196281964663, "grad_norm": 0.32911139726638794, "learning_rate": 8.616798654138116e-08, "loss": 0.0882, "step": 24400 } ], "logging_steps": 25, "max_steps": 24421, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.659903259460792e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }