tvkain's picture
Add files using upload-large-folder tool
3bd65fb verified
raw
history blame
184 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5682586930050061,
"eval_steps": 500,
"global_step": 2100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00027059937762143147,
"grad_norm": 4.086390018463135,
"learning_rate": 0.0,
"loss": 3.2754,
"step": 1
},
{
"epoch": 0.0005411987552428629,
"grad_norm": 3.758815288543701,
"learning_rate": 9.017132551848513e-08,
"loss": 3.2863,
"step": 2
},
{
"epoch": 0.0010823975104857259,
"grad_norm": 3.8250608444213867,
"learning_rate": 2.705139765554554e-07,
"loss": 3.3425,
"step": 4
},
{
"epoch": 0.0016235962657285888,
"grad_norm": 3.8092095851898193,
"learning_rate": 4.5085662759242564e-07,
"loss": 3.3165,
"step": 6
},
{
"epoch": 0.0021647950209714517,
"grad_norm": 3.7621052265167236,
"learning_rate": 6.311992786293959e-07,
"loss": 3.3295,
"step": 8
},
{
"epoch": 0.002705993776214315,
"grad_norm": 3.4136276245117188,
"learning_rate": 8.115419296663661e-07,
"loss": 3.3073,
"step": 10
},
{
"epoch": 0.0032471925314571776,
"grad_norm": 2.855100393295288,
"learning_rate": 9.918845807033363e-07,
"loss": 3.3031,
"step": 12
},
{
"epoch": 0.0037883912867000408,
"grad_norm": 2.491767406463623,
"learning_rate": 1.1722272317403068e-06,
"loss": 3.2943,
"step": 14
},
{
"epoch": 0.0043295900419429035,
"grad_norm": 2.359778642654419,
"learning_rate": 1.3525698827772768e-06,
"loss": 3.2622,
"step": 16
},
{
"epoch": 0.004870788797185766,
"grad_norm": 2.037504196166992,
"learning_rate": 1.5329125338142473e-06,
"loss": 3.239,
"step": 18
},
{
"epoch": 0.00541198755242863,
"grad_norm": 2.8542497158050537,
"learning_rate": 1.7132551848512173e-06,
"loss": 3.2031,
"step": 20
},
{
"epoch": 0.0059531863076714925,
"grad_norm": 2.297046661376953,
"learning_rate": 1.8935978358881876e-06,
"loss": 3.1721,
"step": 22
},
{
"epoch": 0.006494385062914355,
"grad_norm": 2.2149112224578857,
"learning_rate": 2.0739404869251576e-06,
"loss": 3.121,
"step": 24
},
{
"epoch": 0.007035583818157218,
"grad_norm": 1.8048591613769531,
"learning_rate": 2.254283137962128e-06,
"loss": 3.0857,
"step": 26
},
{
"epoch": 0.0075767825734000815,
"grad_norm": 1.7466434240341187,
"learning_rate": 2.4346257889990986e-06,
"loss": 3.0489,
"step": 28
},
{
"epoch": 0.008117981328642944,
"grad_norm": 2.1722524166107178,
"learning_rate": 2.6149684400360686e-06,
"loss": 3.0016,
"step": 30
},
{
"epoch": 0.008659180083885807,
"grad_norm": 1.364578366279602,
"learning_rate": 2.7953110910730386e-06,
"loss": 2.9587,
"step": 32
},
{
"epoch": 0.00920037883912867,
"grad_norm": 1.5823427438735962,
"learning_rate": 2.9756537421100095e-06,
"loss": 2.931,
"step": 34
},
{
"epoch": 0.009741577594371532,
"grad_norm": 1.2367908954620361,
"learning_rate": 3.1559963931469796e-06,
"loss": 2.8953,
"step": 36
},
{
"epoch": 0.010282776349614395,
"grad_norm": 1.0437366962432861,
"learning_rate": 3.3363390441839496e-06,
"loss": 2.8412,
"step": 38
},
{
"epoch": 0.01082397510485726,
"grad_norm": 1.081803798675537,
"learning_rate": 3.5166816952209197e-06,
"loss": 2.7832,
"step": 40
},
{
"epoch": 0.011365173860100122,
"grad_norm": 0.9715840220451355,
"learning_rate": 3.69702434625789e-06,
"loss": 2.7729,
"step": 42
},
{
"epoch": 0.011906372615342985,
"grad_norm": 0.8603936433792114,
"learning_rate": 3.877366997294861e-06,
"loss": 2.6904,
"step": 44
},
{
"epoch": 0.012447571370585848,
"grad_norm": 0.8236231803894043,
"learning_rate": 4.057709648331831e-06,
"loss": 2.6908,
"step": 46
},
{
"epoch": 0.01298877012582871,
"grad_norm": 0.7681186199188232,
"learning_rate": 4.2380522993688015e-06,
"loss": 2.6212,
"step": 48
},
{
"epoch": 0.013529968881071573,
"grad_norm": 0.8002827167510986,
"learning_rate": 4.4183949504057716e-06,
"loss": 2.6035,
"step": 50
},
{
"epoch": 0.014071167636314436,
"grad_norm": 0.6757120490074158,
"learning_rate": 4.598737601442742e-06,
"loss": 2.595,
"step": 52
},
{
"epoch": 0.014612366391557299,
"grad_norm": 0.6619369387626648,
"learning_rate": 4.779080252479712e-06,
"loss": 2.5522,
"step": 54
},
{
"epoch": 0.015153565146800163,
"grad_norm": 0.6247105598449707,
"learning_rate": 4.959422903516682e-06,
"loss": 2.5079,
"step": 56
},
{
"epoch": 0.015694763902043024,
"grad_norm": 0.6559263467788696,
"learning_rate": 5.139765554553652e-06,
"loss": 2.5009,
"step": 58
},
{
"epoch": 0.01623596265728589,
"grad_norm": 0.6590877175331116,
"learning_rate": 5.320108205590623e-06,
"loss": 2.4648,
"step": 60
},
{
"epoch": 0.01677716141252875,
"grad_norm": 0.6045516133308411,
"learning_rate": 5.500450856627593e-06,
"loss": 2.421,
"step": 62
},
{
"epoch": 0.017318360167771614,
"grad_norm": 0.6533932089805603,
"learning_rate": 5.680793507664563e-06,
"loss": 2.3966,
"step": 64
},
{
"epoch": 0.01785955892301448,
"grad_norm": 0.6478094458580017,
"learning_rate": 5.861136158701533e-06,
"loss": 2.3903,
"step": 66
},
{
"epoch": 0.01840075767825734,
"grad_norm": 0.7349300980567932,
"learning_rate": 6.041478809738504e-06,
"loss": 2.3552,
"step": 68
},
{
"epoch": 0.018941956433500204,
"grad_norm": 0.6454821825027466,
"learning_rate": 6.221821460775474e-06,
"loss": 2.3262,
"step": 70
},
{
"epoch": 0.019483155188743065,
"grad_norm": 0.7321672439575195,
"learning_rate": 6.402164111812444e-06,
"loss": 2.3197,
"step": 72
},
{
"epoch": 0.02002435394398593,
"grad_norm": 0.7664237022399902,
"learning_rate": 6.582506762849414e-06,
"loss": 2.2992,
"step": 74
},
{
"epoch": 0.02056555269922879,
"grad_norm": 0.6843811869621277,
"learning_rate": 6.762849413886384e-06,
"loss": 2.2927,
"step": 76
},
{
"epoch": 0.021106751454471655,
"grad_norm": 0.7199612259864807,
"learning_rate": 6.9431920649233556e-06,
"loss": 2.2525,
"step": 78
},
{
"epoch": 0.02164795020971452,
"grad_norm": 0.778446614742279,
"learning_rate": 7.123534715960326e-06,
"loss": 2.2267,
"step": 80
},
{
"epoch": 0.02218914896495738,
"grad_norm": 0.9287930727005005,
"learning_rate": 7.303877366997296e-06,
"loss": 2.2206,
"step": 82
},
{
"epoch": 0.022730347720200245,
"grad_norm": 1.033782958984375,
"learning_rate": 7.484220018034266e-06,
"loss": 2.2063,
"step": 84
},
{
"epoch": 0.023271546475443106,
"grad_norm": 1.0132615566253662,
"learning_rate": 7.664562669071236e-06,
"loss": 2.1677,
"step": 86
},
{
"epoch": 0.02381274523068597,
"grad_norm": 0.9043529033660889,
"learning_rate": 7.844905320108207e-06,
"loss": 2.1696,
"step": 88
},
{
"epoch": 0.02435394398592883,
"grad_norm": 0.6718290448188782,
"learning_rate": 8.025247971145176e-06,
"loss": 2.1492,
"step": 90
},
{
"epoch": 0.024895142741171696,
"grad_norm": 0.9615944027900696,
"learning_rate": 8.205590622182147e-06,
"loss": 2.1452,
"step": 92
},
{
"epoch": 0.02543634149641456,
"grad_norm": 0.9435996413230896,
"learning_rate": 8.385933273219116e-06,
"loss": 2.1098,
"step": 94
},
{
"epoch": 0.02597754025165742,
"grad_norm": 0.7614261507987976,
"learning_rate": 8.566275924256087e-06,
"loss": 2.1286,
"step": 96
},
{
"epoch": 0.026518739006900285,
"grad_norm": 0.9416339993476868,
"learning_rate": 8.746618575293058e-06,
"loss": 2.1092,
"step": 98
},
{
"epoch": 0.027059937762143146,
"grad_norm": 0.9229443073272705,
"learning_rate": 8.926961226330027e-06,
"loss": 2.0932,
"step": 100
},
{
"epoch": 0.02760113651738601,
"grad_norm": 0.7135593295097351,
"learning_rate": 9.107303877366998e-06,
"loss": 2.0699,
"step": 102
},
{
"epoch": 0.028142335272628872,
"grad_norm": 1.0263723134994507,
"learning_rate": 9.287646528403967e-06,
"loss": 2.0445,
"step": 104
},
{
"epoch": 0.028683534027871736,
"grad_norm": 1.0300300121307373,
"learning_rate": 9.467989179440938e-06,
"loss": 2.0463,
"step": 106
},
{
"epoch": 0.029224732783114597,
"grad_norm": 0.8331286311149597,
"learning_rate": 9.648331830477909e-06,
"loss": 2.0381,
"step": 108
},
{
"epoch": 0.02976593153835746,
"grad_norm": 0.7501435875892639,
"learning_rate": 9.828674481514878e-06,
"loss": 2.0411,
"step": 110
},
{
"epoch": 0.030307130293600326,
"grad_norm": 0.6895191073417664,
"learning_rate": 1.0009017132551849e-05,
"loss": 2.0475,
"step": 112
},
{
"epoch": 0.030848329048843187,
"grad_norm": 0.95854252576828,
"learning_rate": 1.018935978358882e-05,
"loss": 2.0071,
"step": 114
},
{
"epoch": 0.03138952780408605,
"grad_norm": 1.1303929090499878,
"learning_rate": 1.036970243462579e-05,
"loss": 2.0008,
"step": 116
},
{
"epoch": 0.031930726559328916,
"grad_norm": 0.7708876729011536,
"learning_rate": 1.055004508566276e-05,
"loss": 2.0061,
"step": 118
},
{
"epoch": 0.03247192531457178,
"grad_norm": 0.9773860573768616,
"learning_rate": 1.073038773669973e-05,
"loss": 2.0096,
"step": 120
},
{
"epoch": 0.03301312406981464,
"grad_norm": 1.118385910987854,
"learning_rate": 1.09107303877367e-05,
"loss": 1.9939,
"step": 122
},
{
"epoch": 0.0335543228250575,
"grad_norm": 0.7215014696121216,
"learning_rate": 1.109107303877367e-05,
"loss": 1.9515,
"step": 124
},
{
"epoch": 0.03409552158030037,
"grad_norm": 0.9696834683418274,
"learning_rate": 1.1271415689810642e-05,
"loss": 1.9639,
"step": 126
},
{
"epoch": 0.03463672033554323,
"grad_norm": 0.945482611656189,
"learning_rate": 1.1451758340847611e-05,
"loss": 1.9397,
"step": 128
},
{
"epoch": 0.03517791909078609,
"grad_norm": 0.7454535365104675,
"learning_rate": 1.1632100991884582e-05,
"loss": 1.9353,
"step": 130
},
{
"epoch": 0.03571911784602896,
"grad_norm": 0.7824187278747559,
"learning_rate": 1.1812443642921551e-05,
"loss": 1.9227,
"step": 132
},
{
"epoch": 0.03626031660127182,
"grad_norm": 0.7939879894256592,
"learning_rate": 1.1992786293958522e-05,
"loss": 1.9126,
"step": 134
},
{
"epoch": 0.03680151535651468,
"grad_norm": 0.7776147723197937,
"learning_rate": 1.2173128944995491e-05,
"loss": 1.9002,
"step": 136
},
{
"epoch": 0.03734271411175754,
"grad_norm": 0.6580236554145813,
"learning_rate": 1.2353471596032462e-05,
"loss": 1.9121,
"step": 138
},
{
"epoch": 0.03788391286700041,
"grad_norm": 0.7200301289558411,
"learning_rate": 1.2533814247069433e-05,
"loss": 1.8885,
"step": 140
},
{
"epoch": 0.03842511162224327,
"grad_norm": 0.7958497405052185,
"learning_rate": 1.2714156898106402e-05,
"loss": 1.9095,
"step": 142
},
{
"epoch": 0.03896631037748613,
"grad_norm": 0.9120681881904602,
"learning_rate": 1.2894499549143375e-05,
"loss": 1.884,
"step": 144
},
{
"epoch": 0.039507509132729,
"grad_norm": 0.8108247518539429,
"learning_rate": 1.3074842200180342e-05,
"loss": 1.8656,
"step": 146
},
{
"epoch": 0.04004870788797186,
"grad_norm": 0.7010449171066284,
"learning_rate": 1.3255184851217315e-05,
"loss": 1.8635,
"step": 148
},
{
"epoch": 0.04058990664321472,
"grad_norm": 0.8178524374961853,
"learning_rate": 1.3435527502254284e-05,
"loss": 1.8933,
"step": 150
},
{
"epoch": 0.04113110539845758,
"grad_norm": 1.0447405576705933,
"learning_rate": 1.3615870153291255e-05,
"loss": 1.8523,
"step": 152
},
{
"epoch": 0.04167230415370045,
"grad_norm": 0.8516271710395813,
"learning_rate": 1.3796212804328224e-05,
"loss": 1.8528,
"step": 154
},
{
"epoch": 0.04221350290894331,
"grad_norm": 0.8437328934669495,
"learning_rate": 1.3976555455365195e-05,
"loss": 1.861,
"step": 156
},
{
"epoch": 0.04275470166418617,
"grad_norm": 0.851265549659729,
"learning_rate": 1.4156898106402164e-05,
"loss": 1.8315,
"step": 158
},
{
"epoch": 0.04329590041942904,
"grad_norm": 0.7337156534194946,
"learning_rate": 1.4337240757439135e-05,
"loss": 1.8354,
"step": 160
},
{
"epoch": 0.0438370991746719,
"grad_norm": 0.9754143357276917,
"learning_rate": 1.4517583408476104e-05,
"loss": 1.8252,
"step": 162
},
{
"epoch": 0.04437829792991476,
"grad_norm": 0.6172115802764893,
"learning_rate": 1.4697926059513075e-05,
"loss": 1.8094,
"step": 164
},
{
"epoch": 0.04491949668515762,
"grad_norm": 0.8304158449172974,
"learning_rate": 1.4878268710550044e-05,
"loss": 1.8078,
"step": 166
},
{
"epoch": 0.04546069544040049,
"grad_norm": 0.6388853788375854,
"learning_rate": 1.5058611361587017e-05,
"loss": 1.8106,
"step": 168
},
{
"epoch": 0.04600189419564335,
"grad_norm": 0.743231475353241,
"learning_rate": 1.5238954012623984e-05,
"loss": 1.8144,
"step": 170
},
{
"epoch": 0.04654309295088621,
"grad_norm": 0.6442289352416992,
"learning_rate": 1.5419296663660955e-05,
"loss": 1.7831,
"step": 172
},
{
"epoch": 0.04708429170612908,
"grad_norm": 0.6877187490463257,
"learning_rate": 1.559963931469793e-05,
"loss": 1.8043,
"step": 174
},
{
"epoch": 0.04762549046137194,
"grad_norm": 0.9389640688896179,
"learning_rate": 1.5779981965734897e-05,
"loss": 1.7869,
"step": 176
},
{
"epoch": 0.0481666892166148,
"grad_norm": 1.0456589460372925,
"learning_rate": 1.5960324616771868e-05,
"loss": 1.7681,
"step": 178
},
{
"epoch": 0.04870788797185766,
"grad_norm": 0.9617791175842285,
"learning_rate": 1.614066726780884e-05,
"loss": 1.7668,
"step": 180
},
{
"epoch": 0.04924908672710053,
"grad_norm": 0.9334360361099243,
"learning_rate": 1.632100991884581e-05,
"loss": 1.7893,
"step": 182
},
{
"epoch": 0.04979028548234339,
"grad_norm": 0.8952531814575195,
"learning_rate": 1.6501352569882777e-05,
"loss": 1.7758,
"step": 184
},
{
"epoch": 0.05033148423758625,
"grad_norm": 0.8544924855232239,
"learning_rate": 1.6681695220919748e-05,
"loss": 1.793,
"step": 186
},
{
"epoch": 0.05087268299282912,
"grad_norm": 0.7782765030860901,
"learning_rate": 1.686203787195672e-05,
"loss": 1.768,
"step": 188
},
{
"epoch": 0.05141388174807198,
"grad_norm": 0.7119695544242859,
"learning_rate": 1.704238052299369e-05,
"loss": 1.7685,
"step": 190
},
{
"epoch": 0.05195508050331484,
"grad_norm": 0.9119647145271301,
"learning_rate": 1.7222723174030657e-05,
"loss": 1.7706,
"step": 192
},
{
"epoch": 0.0524962792585577,
"grad_norm": 0.6414957642555237,
"learning_rate": 1.7403065825067628e-05,
"loss": 1.7626,
"step": 194
},
{
"epoch": 0.05303747801380057,
"grad_norm": 0.8069677352905273,
"learning_rate": 1.75834084761046e-05,
"loss": 1.7423,
"step": 196
},
{
"epoch": 0.05357867676904343,
"grad_norm": 0.6549937725067139,
"learning_rate": 1.776375112714157e-05,
"loss": 1.7428,
"step": 198
},
{
"epoch": 0.05411987552428629,
"grad_norm": 0.8064024448394775,
"learning_rate": 1.7944093778178538e-05,
"loss": 1.7448,
"step": 200
},
{
"epoch": 0.054661074279529154,
"grad_norm": 0.7182701826095581,
"learning_rate": 1.8124436429215512e-05,
"loss": 1.7248,
"step": 202
},
{
"epoch": 0.05520227303477202,
"grad_norm": 0.6997919678688049,
"learning_rate": 1.830477908025248e-05,
"loss": 1.7281,
"step": 204
},
{
"epoch": 0.05574347179001488,
"grad_norm": 0.7071277499198914,
"learning_rate": 1.848512173128945e-05,
"loss": 1.714,
"step": 206
},
{
"epoch": 0.056284670545257744,
"grad_norm": 0.6344273090362549,
"learning_rate": 1.866546438232642e-05,
"loss": 1.7463,
"step": 208
},
{
"epoch": 0.05682586930050061,
"grad_norm": 0.7192733883857727,
"learning_rate": 1.8845807033363392e-05,
"loss": 1.737,
"step": 210
},
{
"epoch": 0.05736706805574347,
"grad_norm": 0.7418521642684937,
"learning_rate": 1.9026149684400363e-05,
"loss": 1.7197,
"step": 212
},
{
"epoch": 0.057908266810986334,
"grad_norm": 0.875845730304718,
"learning_rate": 1.920649233543733e-05,
"loss": 1.6968,
"step": 214
},
{
"epoch": 0.058449465566229195,
"grad_norm": 0.7394037842750549,
"learning_rate": 1.9386834986474305e-05,
"loss": 1.7051,
"step": 216
},
{
"epoch": 0.05899066432147206,
"grad_norm": 0.6689572930335999,
"learning_rate": 1.9567177637511272e-05,
"loss": 1.7152,
"step": 218
},
{
"epoch": 0.05953186307671492,
"grad_norm": 0.7955539226531982,
"learning_rate": 1.9747520288548243e-05,
"loss": 1.7136,
"step": 220
},
{
"epoch": 0.060073061831957784,
"grad_norm": 0.7005388140678406,
"learning_rate": 1.9927862939585214e-05,
"loss": 1.7152,
"step": 222
},
{
"epoch": 0.06061426058720065,
"grad_norm": 0.6205731630325317,
"learning_rate": 2.0108205590622185e-05,
"loss": 1.6901,
"step": 224
},
{
"epoch": 0.06115545934244351,
"grad_norm": 0.7079929709434509,
"learning_rate": 2.0288548241659152e-05,
"loss": 1.6905,
"step": 226
},
{
"epoch": 0.061696658097686374,
"grad_norm": 0.6871302723884583,
"learning_rate": 2.0468890892696123e-05,
"loss": 1.6867,
"step": 228
},
{
"epoch": 0.062237856852929235,
"grad_norm": 0.7172162532806396,
"learning_rate": 2.0649233543733094e-05,
"loss": 1.685,
"step": 230
},
{
"epoch": 0.0627790556081721,
"grad_norm": 0.6729004979133606,
"learning_rate": 2.0829576194770065e-05,
"loss": 1.6961,
"step": 232
},
{
"epoch": 0.06332025436341496,
"grad_norm": 0.7335099577903748,
"learning_rate": 2.1009918845807033e-05,
"loss": 1.6797,
"step": 234
},
{
"epoch": 0.06386145311865783,
"grad_norm": 0.6398060321807861,
"learning_rate": 2.1190261496844003e-05,
"loss": 1.7037,
"step": 236
},
{
"epoch": 0.0644026518739007,
"grad_norm": 0.7026365399360657,
"learning_rate": 2.1370604147880974e-05,
"loss": 1.6698,
"step": 238
},
{
"epoch": 0.06494385062914355,
"grad_norm": 0.7972332239151001,
"learning_rate": 2.1550946798917945e-05,
"loss": 1.6866,
"step": 240
},
{
"epoch": 0.06548504938438642,
"grad_norm": 0.7363021969795227,
"learning_rate": 2.1731289449954913e-05,
"loss": 1.6879,
"step": 242
},
{
"epoch": 0.06602624813962928,
"grad_norm": 0.7071017026901245,
"learning_rate": 2.1911632100991887e-05,
"loss": 1.6898,
"step": 244
},
{
"epoch": 0.06656744689487214,
"grad_norm": 0.8030880093574524,
"learning_rate": 2.2091974752028858e-05,
"loss": 1.6734,
"step": 246
},
{
"epoch": 0.067108645650115,
"grad_norm": 0.7429569363594055,
"learning_rate": 2.2272317403065825e-05,
"loss": 1.6722,
"step": 248
},
{
"epoch": 0.06764984440535787,
"grad_norm": 0.6807804107666016,
"learning_rate": 2.2452660054102796e-05,
"loss": 1.6697,
"step": 250
},
{
"epoch": 0.06819104316060073,
"grad_norm": 0.6632562875747681,
"learning_rate": 2.2633002705139767e-05,
"loss": 1.6453,
"step": 252
},
{
"epoch": 0.0687322419158436,
"grad_norm": 0.6661680340766907,
"learning_rate": 2.2813345356176738e-05,
"loss": 1.6701,
"step": 254
},
{
"epoch": 0.06927344067108646,
"grad_norm": 0.6747105121612549,
"learning_rate": 2.2993688007213706e-05,
"loss": 1.6729,
"step": 256
},
{
"epoch": 0.06981463942632932,
"grad_norm": 0.7698473334312439,
"learning_rate": 2.317403065825068e-05,
"loss": 1.6528,
"step": 258
},
{
"epoch": 0.07035583818157218,
"grad_norm": 0.6111325621604919,
"learning_rate": 2.3354373309287647e-05,
"loss": 1.6412,
"step": 260
},
{
"epoch": 0.07089703693681504,
"grad_norm": 0.7405019998550415,
"learning_rate": 2.3534715960324618e-05,
"loss": 1.6564,
"step": 262
},
{
"epoch": 0.07143823569205791,
"grad_norm": 0.6702501773834229,
"learning_rate": 2.371505861136159e-05,
"loss": 1.654,
"step": 264
},
{
"epoch": 0.07197943444730077,
"grad_norm": 0.7076373100280762,
"learning_rate": 2.389540126239856e-05,
"loss": 1.6301,
"step": 266
},
{
"epoch": 0.07252063320254364,
"grad_norm": 0.7239627242088318,
"learning_rate": 2.4075743913435528e-05,
"loss": 1.6575,
"step": 268
},
{
"epoch": 0.0730618319577865,
"grad_norm": 0.753480076789856,
"learning_rate": 2.42560865644725e-05,
"loss": 1.6603,
"step": 270
},
{
"epoch": 0.07360303071302936,
"grad_norm": 0.7261641025543213,
"learning_rate": 2.443642921550947e-05,
"loss": 1.6449,
"step": 272
},
{
"epoch": 0.07414422946827222,
"grad_norm": 0.6315119862556458,
"learning_rate": 2.461677186654644e-05,
"loss": 1.6538,
"step": 274
},
{
"epoch": 0.07468542822351508,
"grad_norm": 0.5698412656784058,
"learning_rate": 2.4797114517583408e-05,
"loss": 1.6663,
"step": 276
},
{
"epoch": 0.07522662697875795,
"grad_norm": 0.5968983173370361,
"learning_rate": 2.497745716862038e-05,
"loss": 1.643,
"step": 278
},
{
"epoch": 0.07576782573400082,
"grad_norm": 0.561126172542572,
"learning_rate": 2.5157799819657353e-05,
"loss": 1.6301,
"step": 280
},
{
"epoch": 0.07630902448924368,
"grad_norm": 0.7290865778923035,
"learning_rate": 2.533814247069432e-05,
"loss": 1.6412,
"step": 282
},
{
"epoch": 0.07685022324448654,
"grad_norm": 0.7629122138023376,
"learning_rate": 2.5518485121731288e-05,
"loss": 1.6335,
"step": 284
},
{
"epoch": 0.0773914219997294,
"grad_norm": 0.5383496284484863,
"learning_rate": 2.5698827772768262e-05,
"loss": 1.6226,
"step": 286
},
{
"epoch": 0.07793262075497226,
"grad_norm": 0.7778373956680298,
"learning_rate": 2.5879170423805233e-05,
"loss": 1.6333,
"step": 288
},
{
"epoch": 0.07847381951021512,
"grad_norm": 0.6851366758346558,
"learning_rate": 2.60595130748422e-05,
"loss": 1.6251,
"step": 290
},
{
"epoch": 0.079015018265458,
"grad_norm": 0.5947225689888,
"learning_rate": 2.623985572587917e-05,
"loss": 1.6298,
"step": 292
},
{
"epoch": 0.07955621702070086,
"grad_norm": 0.9742544889450073,
"learning_rate": 2.6420198376916146e-05,
"loss": 1.6252,
"step": 294
},
{
"epoch": 0.08009741577594372,
"grad_norm": 1.2064323425292969,
"learning_rate": 2.6600541027953113e-05,
"loss": 1.6152,
"step": 296
},
{
"epoch": 0.08063861453118658,
"grad_norm": 1.0506716966629028,
"learning_rate": 2.678088367899008e-05,
"loss": 1.6351,
"step": 298
},
{
"epoch": 0.08117981328642944,
"grad_norm": 1.2992738485336304,
"learning_rate": 2.696122633002705e-05,
"loss": 1.6193,
"step": 300
},
{
"epoch": 0.0817210120416723,
"grad_norm": 1.0616599321365356,
"learning_rate": 2.7141568981064026e-05,
"loss": 1.6135,
"step": 302
},
{
"epoch": 0.08226221079691516,
"grad_norm": 1.037997841835022,
"learning_rate": 2.7321911632100993e-05,
"loss": 1.6344,
"step": 304
},
{
"epoch": 0.08280340955215804,
"grad_norm": 0.8937569856643677,
"learning_rate": 2.7502254283137964e-05,
"loss": 1.6077,
"step": 306
},
{
"epoch": 0.0833446083074009,
"grad_norm": 1.1334234476089478,
"learning_rate": 2.7682596934174932e-05,
"loss": 1.6193,
"step": 308
},
{
"epoch": 0.08388580706264376,
"grad_norm": 0.8336219191551208,
"learning_rate": 2.7862939585211906e-05,
"loss": 1.5948,
"step": 310
},
{
"epoch": 0.08442700581788662,
"grad_norm": 1.1825398206710815,
"learning_rate": 2.8043282236248874e-05,
"loss": 1.6239,
"step": 312
},
{
"epoch": 0.08496820457312948,
"grad_norm": 0.7945433259010315,
"learning_rate": 2.8223624887285844e-05,
"loss": 1.6119,
"step": 314
},
{
"epoch": 0.08550940332837234,
"grad_norm": 0.6971009969711304,
"learning_rate": 2.8403967538322812e-05,
"loss": 1.5822,
"step": 316
},
{
"epoch": 0.0860506020836152,
"grad_norm": 0.6050766706466675,
"learning_rate": 2.8584310189359786e-05,
"loss": 1.6161,
"step": 318
},
{
"epoch": 0.08659180083885808,
"grad_norm": 0.6123189330101013,
"learning_rate": 2.8764652840396754e-05,
"loss": 1.5941,
"step": 320
},
{
"epoch": 0.08713299959410094,
"grad_norm": 0.5471253395080566,
"learning_rate": 2.8944995491433725e-05,
"loss": 1.603,
"step": 322
},
{
"epoch": 0.0876741983493438,
"grad_norm": 0.5793882608413696,
"learning_rate": 2.91253381424707e-05,
"loss": 1.6076,
"step": 324
},
{
"epoch": 0.08821539710458666,
"grad_norm": 0.5409413576126099,
"learning_rate": 2.9305680793507666e-05,
"loss": 1.5825,
"step": 326
},
{
"epoch": 0.08875659585982952,
"grad_norm": 6.757148265838623,
"learning_rate": 2.9486023444544637e-05,
"loss": 1.5942,
"step": 328
},
{
"epoch": 0.08929779461507238,
"grad_norm": 1.3357856273651123,
"learning_rate": 2.9666366095581605e-05,
"loss": 1.642,
"step": 330
},
{
"epoch": 0.08983899337031524,
"grad_norm": 0.8245829939842224,
"learning_rate": 2.984670874661858e-05,
"loss": 1.6062,
"step": 332
},
{
"epoch": 0.09038019212555812,
"grad_norm": 0.8888993263244629,
"learning_rate": 3.0027051397655547e-05,
"loss": 1.5952,
"step": 334
},
{
"epoch": 0.09092139088080098,
"grad_norm": 0.8923915028572083,
"learning_rate": 3.0207394048692517e-05,
"loss": 1.5977,
"step": 336
},
{
"epoch": 0.09146258963604384,
"grad_norm": 0.7443459033966064,
"learning_rate": 3.0387736699729485e-05,
"loss": 1.5738,
"step": 338
},
{
"epoch": 0.0920037883912867,
"grad_norm": 0.7297430038452148,
"learning_rate": 3.056807935076646e-05,
"loss": 1.5907,
"step": 340
},
{
"epoch": 0.09254498714652956,
"grad_norm": 0.6882812976837158,
"learning_rate": 3.074842200180343e-05,
"loss": 1.5767,
"step": 342
},
{
"epoch": 0.09308618590177242,
"grad_norm": 0.6150392889976501,
"learning_rate": 3.0928764652840394e-05,
"loss": 1.5747,
"step": 344
},
{
"epoch": 0.09362738465701528,
"grad_norm": 0.6230599284172058,
"learning_rate": 3.110910730387737e-05,
"loss": 1.583,
"step": 346
},
{
"epoch": 0.09416858341225816,
"grad_norm": 0.6081874966621399,
"learning_rate": 3.128944995491434e-05,
"loss": 1.5875,
"step": 348
},
{
"epoch": 0.09470978216750102,
"grad_norm": 0.5467821955680847,
"learning_rate": 3.146979260595131e-05,
"loss": 1.575,
"step": 350
},
{
"epoch": 0.09525098092274388,
"grad_norm": 0.5629361271858215,
"learning_rate": 3.165013525698828e-05,
"loss": 1.5828,
"step": 352
},
{
"epoch": 0.09579217967798674,
"grad_norm": 0.5995283126831055,
"learning_rate": 3.1830477908025245e-05,
"loss": 1.5872,
"step": 354
},
{
"epoch": 0.0963333784332296,
"grad_norm": 0.556450366973877,
"learning_rate": 3.201082055906222e-05,
"loss": 1.553,
"step": 356
},
{
"epoch": 0.09687457718847246,
"grad_norm": 0.6498537063598633,
"learning_rate": 3.219116321009919e-05,
"loss": 1.5667,
"step": 358
},
{
"epoch": 0.09741577594371532,
"grad_norm": 0.5891172885894775,
"learning_rate": 3.237150586113616e-05,
"loss": 1.5818,
"step": 360
},
{
"epoch": 0.0979569746989582,
"grad_norm": 0.6487797498703003,
"learning_rate": 3.2551848512173136e-05,
"loss": 1.5582,
"step": 362
},
{
"epoch": 0.09849817345420106,
"grad_norm": 0.5860658884048462,
"learning_rate": 3.27321911632101e-05,
"loss": 1.5725,
"step": 364
},
{
"epoch": 0.09903937220944392,
"grad_norm": 0.5619581937789917,
"learning_rate": 3.291253381424707e-05,
"loss": 1.5779,
"step": 366
},
{
"epoch": 0.09958057096468678,
"grad_norm": 0.7147429585456848,
"learning_rate": 3.309287646528404e-05,
"loss": 1.5766,
"step": 368
},
{
"epoch": 0.10012176971992964,
"grad_norm": 0.5840562582015991,
"learning_rate": 3.327321911632101e-05,
"loss": 1.5609,
"step": 370
},
{
"epoch": 0.1006629684751725,
"grad_norm": 0.6277860403060913,
"learning_rate": 3.345356176735798e-05,
"loss": 1.5645,
"step": 372
},
{
"epoch": 0.10120416723041536,
"grad_norm": 0.6395567655563354,
"learning_rate": 3.3633904418394954e-05,
"loss": 1.545,
"step": 374
},
{
"epoch": 0.10174536598565824,
"grad_norm": 0.6651553511619568,
"learning_rate": 3.381424706943192e-05,
"loss": 1.5643,
"step": 376
},
{
"epoch": 0.1022865647409011,
"grad_norm": 0.6691033244132996,
"learning_rate": 3.3994589720468896e-05,
"loss": 1.5705,
"step": 378
},
{
"epoch": 0.10282776349614396,
"grad_norm": 0.5426511764526367,
"learning_rate": 3.4174932371505863e-05,
"loss": 1.536,
"step": 380
},
{
"epoch": 0.10336896225138682,
"grad_norm": 0.6677694916725159,
"learning_rate": 3.435527502254283e-05,
"loss": 1.5664,
"step": 382
},
{
"epoch": 0.10391016100662968,
"grad_norm": 0.5283762216567993,
"learning_rate": 3.45356176735798e-05,
"loss": 1.5474,
"step": 384
},
{
"epoch": 0.10445135976187254,
"grad_norm": 0.652812659740448,
"learning_rate": 3.471596032461677e-05,
"loss": 1.5509,
"step": 386
},
{
"epoch": 0.1049925585171154,
"grad_norm": 0.8639987111091614,
"learning_rate": 3.489630297565375e-05,
"loss": 1.5563,
"step": 388
},
{
"epoch": 0.10553375727235827,
"grad_norm": 0.7726946473121643,
"learning_rate": 3.5076645626690715e-05,
"loss": 1.5682,
"step": 390
},
{
"epoch": 0.10607495602760114,
"grad_norm": 0.6511155962944031,
"learning_rate": 3.525698827772768e-05,
"loss": 1.5571,
"step": 392
},
{
"epoch": 0.106616154782844,
"grad_norm": 0.6578395962715149,
"learning_rate": 3.5437330928764656e-05,
"loss": 1.5452,
"step": 394
},
{
"epoch": 0.10715735353808686,
"grad_norm": 0.642919659614563,
"learning_rate": 3.5617673579801624e-05,
"loss": 1.5508,
"step": 396
},
{
"epoch": 0.10769855229332972,
"grad_norm": 0.5190348029136658,
"learning_rate": 3.579801623083859e-05,
"loss": 1.5432,
"step": 398
},
{
"epoch": 0.10823975104857259,
"grad_norm": 0.48932549357414246,
"learning_rate": 3.5978358881875566e-05,
"loss": 1.5544,
"step": 400
},
{
"epoch": 0.10878094980381545,
"grad_norm": 0.5018340945243835,
"learning_rate": 3.615870153291254e-05,
"loss": 1.5322,
"step": 402
},
{
"epoch": 0.10932214855905831,
"grad_norm": 0.5701499581336975,
"learning_rate": 3.633904418394951e-05,
"loss": 1.5288,
"step": 404
},
{
"epoch": 0.10986334731430118,
"grad_norm": 0.6049205660820007,
"learning_rate": 3.6519386834986475e-05,
"loss": 1.5627,
"step": 406
},
{
"epoch": 0.11040454606954404,
"grad_norm": 0.5781517028808594,
"learning_rate": 3.669972948602345e-05,
"loss": 1.542,
"step": 408
},
{
"epoch": 0.1109457448247869,
"grad_norm": 0.5594660043716431,
"learning_rate": 3.688007213706042e-05,
"loss": 1.5461,
"step": 410
},
{
"epoch": 0.11148694358002977,
"grad_norm": 0.5319619178771973,
"learning_rate": 3.7060414788097384e-05,
"loss": 1.5668,
"step": 412
},
{
"epoch": 0.11202814233527263,
"grad_norm": 0.5311123728752136,
"learning_rate": 3.724075743913435e-05,
"loss": 1.528,
"step": 414
},
{
"epoch": 0.11256934109051549,
"grad_norm": 0.5555101633071899,
"learning_rate": 3.7421100090171326e-05,
"loss": 1.5392,
"step": 416
},
{
"epoch": 0.11311053984575835,
"grad_norm": 0.5486223101615906,
"learning_rate": 3.76014427412083e-05,
"loss": 1.5337,
"step": 418
},
{
"epoch": 0.11365173860100122,
"grad_norm": 0.5156669020652771,
"learning_rate": 3.778178539224527e-05,
"loss": 1.5105,
"step": 420
},
{
"epoch": 0.11419293735624408,
"grad_norm": 0.49596554040908813,
"learning_rate": 3.7962128043282235e-05,
"loss": 1.515,
"step": 422
},
{
"epoch": 0.11473413611148695,
"grad_norm": 0.641333281993866,
"learning_rate": 3.814247069431921e-05,
"loss": 1.5328,
"step": 424
},
{
"epoch": 0.1152753348667298,
"grad_norm": 0.6106113195419312,
"learning_rate": 3.832281334535618e-05,
"loss": 1.5189,
"step": 426
},
{
"epoch": 0.11581653362197267,
"grad_norm": 0.5619134306907654,
"learning_rate": 3.8503155996393145e-05,
"loss": 1.5295,
"step": 428
},
{
"epoch": 0.11635773237721553,
"grad_norm": 0.5396978259086609,
"learning_rate": 3.868349864743012e-05,
"loss": 1.5173,
"step": 430
},
{
"epoch": 0.11689893113245839,
"grad_norm": 0.5466894507408142,
"learning_rate": 3.886384129846709e-05,
"loss": 1.5191,
"step": 432
},
{
"epoch": 0.11744012988770126,
"grad_norm": 0.5601218342781067,
"learning_rate": 3.904418394950406e-05,
"loss": 1.5285,
"step": 434
},
{
"epoch": 0.11798132864294412,
"grad_norm": 0.6620492935180664,
"learning_rate": 3.922452660054103e-05,
"loss": 1.4946,
"step": 436
},
{
"epoch": 0.11852252739818699,
"grad_norm": 0.49140048027038574,
"learning_rate": 3.9404869251578e-05,
"loss": 1.512,
"step": 438
},
{
"epoch": 0.11906372615342985,
"grad_norm": 0.5824118256568909,
"learning_rate": 3.958521190261497e-05,
"loss": 1.5244,
"step": 440
},
{
"epoch": 0.11960492490867271,
"grad_norm": 0.4967150092124939,
"learning_rate": 3.976555455365194e-05,
"loss": 1.5273,
"step": 442
},
{
"epoch": 0.12014612366391557,
"grad_norm": 0.5089767575263977,
"learning_rate": 3.994589720468891e-05,
"loss": 1.5119,
"step": 444
},
{
"epoch": 0.12068732241915843,
"grad_norm": 0.5404312014579773,
"learning_rate": 4.0126239855725886e-05,
"loss": 1.5072,
"step": 446
},
{
"epoch": 0.1212285211744013,
"grad_norm": 0.5239550471305847,
"learning_rate": 4.0306582506762853e-05,
"loss": 1.5336,
"step": 448
},
{
"epoch": 0.12176971992964417,
"grad_norm": 0.4974781274795532,
"learning_rate": 4.048692515779982e-05,
"loss": 1.5225,
"step": 450
},
{
"epoch": 0.12231091868488703,
"grad_norm": 0.5363791584968567,
"learning_rate": 4.066726780883679e-05,
"loss": 1.5176,
"step": 452
},
{
"epoch": 0.12285211744012989,
"grad_norm": 0.5095157027244568,
"learning_rate": 4.084761045987376e-05,
"loss": 1.4936,
"step": 454
},
{
"epoch": 0.12339331619537275,
"grad_norm": 0.4920356869697571,
"learning_rate": 4.102795311091073e-05,
"loss": 1.5269,
"step": 456
},
{
"epoch": 0.12393451495061561,
"grad_norm": 0.4940793514251709,
"learning_rate": 4.1208295761947705e-05,
"loss": 1.5072,
"step": 458
},
{
"epoch": 0.12447571370585847,
"grad_norm": 0.4805227220058441,
"learning_rate": 4.138863841298467e-05,
"loss": 1.4987,
"step": 460
},
{
"epoch": 0.12501691246110133,
"grad_norm": 0.49683934450149536,
"learning_rate": 4.1568981064021646e-05,
"loss": 1.5008,
"step": 462
},
{
"epoch": 0.1255581112163442,
"grad_norm": 0.5283801555633545,
"learning_rate": 4.1749323715058614e-05,
"loss": 1.5177,
"step": 464
},
{
"epoch": 0.12609930997158705,
"grad_norm": 0.5395119190216064,
"learning_rate": 4.192966636609558e-05,
"loss": 1.5106,
"step": 466
},
{
"epoch": 0.12664050872682991,
"grad_norm": 0.5403693914413452,
"learning_rate": 4.211000901713255e-05,
"loss": 1.4854,
"step": 468
},
{
"epoch": 0.1271817074820728,
"grad_norm": 0.4690951406955719,
"learning_rate": 4.229035166816952e-05,
"loss": 1.5079,
"step": 470
},
{
"epoch": 0.12772290623731566,
"grad_norm": 0.5077293515205383,
"learning_rate": 4.24706943192065e-05,
"loss": 1.4953,
"step": 472
},
{
"epoch": 0.12826410499255853,
"grad_norm": 0.440019816160202,
"learning_rate": 4.2651036970243465e-05,
"loss": 1.4864,
"step": 474
},
{
"epoch": 0.1288053037478014,
"grad_norm": 0.48672759532928467,
"learning_rate": 4.283137962128044e-05,
"loss": 1.5205,
"step": 476
},
{
"epoch": 0.12934650250304425,
"grad_norm": 0.4732811450958252,
"learning_rate": 4.301172227231741e-05,
"loss": 1.4998,
"step": 478
},
{
"epoch": 0.1298877012582871,
"grad_norm": 0.46713048219680786,
"learning_rate": 4.3192064923354374e-05,
"loss": 1.4893,
"step": 480
},
{
"epoch": 0.13042890001352997,
"grad_norm": 0.502356231212616,
"learning_rate": 4.337240757439134e-05,
"loss": 1.5125,
"step": 482
},
{
"epoch": 0.13097009876877283,
"grad_norm": 0.45067864656448364,
"learning_rate": 4.3552750225428316e-05,
"loss": 1.4978,
"step": 484
},
{
"epoch": 0.1315112975240157,
"grad_norm": 0.46964120864868164,
"learning_rate": 4.373309287646529e-05,
"loss": 1.5006,
"step": 486
},
{
"epoch": 0.13205249627925855,
"grad_norm": 0.47723180055618286,
"learning_rate": 4.391343552750226e-05,
"loss": 1.513,
"step": 488
},
{
"epoch": 0.1325936950345014,
"grad_norm": 0.5100542306900024,
"learning_rate": 4.4093778178539225e-05,
"loss": 1.5279,
"step": 490
},
{
"epoch": 0.13313489378974427,
"grad_norm": 0.5344257354736328,
"learning_rate": 4.42741208295762e-05,
"loss": 1.5193,
"step": 492
},
{
"epoch": 0.13367609254498714,
"grad_norm": 0.5867893695831299,
"learning_rate": 4.445446348061317e-05,
"loss": 1.512,
"step": 494
},
{
"epoch": 0.13421729130023,
"grad_norm": 0.7811394929885864,
"learning_rate": 4.4634806131650134e-05,
"loss": 1.5038,
"step": 496
},
{
"epoch": 0.13475849005547288,
"grad_norm": 0.8505339622497559,
"learning_rate": 4.48151487826871e-05,
"loss": 1.5169,
"step": 498
},
{
"epoch": 0.13529968881071575,
"grad_norm": 0.6337641477584839,
"learning_rate": 4.4995491433724076e-05,
"loss": 1.4951,
"step": 500
},
{
"epoch": 0.1358408875659586,
"grad_norm": 0.7979961633682251,
"learning_rate": 4.517583408476105e-05,
"loss": 1.5031,
"step": 502
},
{
"epoch": 0.13638208632120147,
"grad_norm": 0.6946894526481628,
"learning_rate": 4.535617673579802e-05,
"loss": 1.501,
"step": 504
},
{
"epoch": 0.13692328507644433,
"grad_norm": 0.6830259561538696,
"learning_rate": 4.5536519386834986e-05,
"loss": 1.4896,
"step": 506
},
{
"epoch": 0.1374644838316872,
"grad_norm": 0.5908662676811218,
"learning_rate": 4.571686203787196e-05,
"loss": 1.4992,
"step": 508
},
{
"epoch": 0.13800568258693005,
"grad_norm": 0.7655865550041199,
"learning_rate": 4.589720468890893e-05,
"loss": 1.4911,
"step": 510
},
{
"epoch": 0.1385468813421729,
"grad_norm": 0.5924785733222961,
"learning_rate": 4.6077547339945895e-05,
"loss": 1.4719,
"step": 512
},
{
"epoch": 0.13908808009741577,
"grad_norm": 0.6654263138771057,
"learning_rate": 4.625788999098287e-05,
"loss": 1.5109,
"step": 514
},
{
"epoch": 0.13962927885265863,
"grad_norm": 0.5296297073364258,
"learning_rate": 4.6438232642019843e-05,
"loss": 1.4934,
"step": 516
},
{
"epoch": 0.1401704776079015,
"grad_norm": 0.5698690414428711,
"learning_rate": 4.661857529305681e-05,
"loss": 1.4954,
"step": 518
},
{
"epoch": 0.14071167636314436,
"grad_norm": 0.5790325403213501,
"learning_rate": 4.679891794409378e-05,
"loss": 1.4673,
"step": 520
},
{
"epoch": 0.14125287511838722,
"grad_norm": 0.551480770111084,
"learning_rate": 4.697926059513075e-05,
"loss": 1.476,
"step": 522
},
{
"epoch": 0.14179407387363008,
"grad_norm": 0.5201780796051025,
"learning_rate": 4.715960324616772e-05,
"loss": 1.4701,
"step": 524
},
{
"epoch": 0.14233527262887297,
"grad_norm": 0.46442562341690063,
"learning_rate": 4.733994589720469e-05,
"loss": 1.4831,
"step": 526
},
{
"epoch": 0.14287647138411583,
"grad_norm": 0.5558522939682007,
"learning_rate": 4.752028854824166e-05,
"loss": 1.4729,
"step": 528
},
{
"epoch": 0.1434176701393587,
"grad_norm": 0.48511791229248047,
"learning_rate": 4.7700631199278636e-05,
"loss": 1.4742,
"step": 530
},
{
"epoch": 0.14395886889460155,
"grad_norm": 0.5244829058647156,
"learning_rate": 4.7880973850315604e-05,
"loss": 1.4928,
"step": 532
},
{
"epoch": 0.1445000676498444,
"grad_norm": 0.48878946900367737,
"learning_rate": 4.806131650135257e-05,
"loss": 1.4921,
"step": 534
},
{
"epoch": 0.14504126640508727,
"grad_norm": 0.5348760485649109,
"learning_rate": 4.824165915238954e-05,
"loss": 1.4917,
"step": 536
},
{
"epoch": 0.14558246516033013,
"grad_norm": 0.5444923639297485,
"learning_rate": 4.842200180342651e-05,
"loss": 1.4546,
"step": 538
},
{
"epoch": 0.146123663915573,
"grad_norm": 0.494761198759079,
"learning_rate": 4.860234445446348e-05,
"loss": 1.4751,
"step": 540
},
{
"epoch": 0.14666486267081585,
"grad_norm": 0.4921441674232483,
"learning_rate": 4.8782687105500455e-05,
"loss": 1.4767,
"step": 542
},
{
"epoch": 0.14720606142605872,
"grad_norm": 0.48382577300071716,
"learning_rate": 4.896302975653742e-05,
"loss": 1.485,
"step": 544
},
{
"epoch": 0.14774726018130158,
"grad_norm": 0.4616708755493164,
"learning_rate": 4.9143372407574397e-05,
"loss": 1.4732,
"step": 546
},
{
"epoch": 0.14828845893654444,
"grad_norm": 0.5030043125152588,
"learning_rate": 4.9323715058611364e-05,
"loss": 1.4799,
"step": 548
},
{
"epoch": 0.1488296576917873,
"grad_norm": 0.467230886220932,
"learning_rate": 4.950405770964833e-05,
"loss": 1.4594,
"step": 550
},
{
"epoch": 0.14937085644703016,
"grad_norm": 0.42864304780960083,
"learning_rate": 4.9684400360685306e-05,
"loss": 1.4748,
"step": 552
},
{
"epoch": 0.14991205520227305,
"grad_norm": 0.43733683228492737,
"learning_rate": 4.986474301172227e-05,
"loss": 1.462,
"step": 554
},
{
"epoch": 0.1504532539575159,
"grad_norm": 0.45550286769866943,
"learning_rate": 5.004508566275925e-05,
"loss": 1.475,
"step": 556
},
{
"epoch": 0.15099445271275877,
"grad_norm": 0.44999995827674866,
"learning_rate": 5.022542831379622e-05,
"loss": 1.4794,
"step": 558
},
{
"epoch": 0.15153565146800163,
"grad_norm": 0.5035279989242554,
"learning_rate": 5.040577096483319e-05,
"loss": 1.471,
"step": 560
},
{
"epoch": 0.1520768502232445,
"grad_norm": 0.44605591893196106,
"learning_rate": 5.058611361587016e-05,
"loss": 1.4461,
"step": 562
},
{
"epoch": 0.15261804897848735,
"grad_norm": 0.5482723712921143,
"learning_rate": 5.0766456266907124e-05,
"loss": 1.4597,
"step": 564
},
{
"epoch": 0.1531592477337302,
"grad_norm": 0.5323627591133118,
"learning_rate": 5.094679891794409e-05,
"loss": 1.4743,
"step": 566
},
{
"epoch": 0.15370044648897307,
"grad_norm": 0.5289944410324097,
"learning_rate": 5.1127141568981066e-05,
"loss": 1.5,
"step": 568
},
{
"epoch": 0.15424164524421594,
"grad_norm": 0.5446243286132812,
"learning_rate": 5.1307484220018034e-05,
"loss": 1.4751,
"step": 570
},
{
"epoch": 0.1547828439994588,
"grad_norm": 0.525830090045929,
"learning_rate": 5.1487826871055015e-05,
"loss": 1.4639,
"step": 572
},
{
"epoch": 0.15532404275470166,
"grad_norm": 0.48129191994667053,
"learning_rate": 5.166816952209198e-05,
"loss": 1.4652,
"step": 574
},
{
"epoch": 0.15586524150994452,
"grad_norm": 0.47915297746658325,
"learning_rate": 5.184851217312895e-05,
"loss": 1.4627,
"step": 576
},
{
"epoch": 0.15640644026518738,
"grad_norm": 0.5229325294494629,
"learning_rate": 5.202885482416592e-05,
"loss": 1.4525,
"step": 578
},
{
"epoch": 0.15694763902043024,
"grad_norm": 0.5452600121498108,
"learning_rate": 5.2209197475202885e-05,
"loss": 1.458,
"step": 580
},
{
"epoch": 0.15748883777567313,
"grad_norm": 0.427432656288147,
"learning_rate": 5.238954012623985e-05,
"loss": 1.4773,
"step": 582
},
{
"epoch": 0.158030036530916,
"grad_norm": 0.450712114572525,
"learning_rate": 5.2569882777276827e-05,
"loss": 1.469,
"step": 584
},
{
"epoch": 0.15857123528615885,
"grad_norm": 0.5500516891479492,
"learning_rate": 5.27502254283138e-05,
"loss": 1.4603,
"step": 586
},
{
"epoch": 0.1591124340414017,
"grad_norm": 0.457157164812088,
"learning_rate": 5.2930568079350775e-05,
"loss": 1.4785,
"step": 588
},
{
"epoch": 0.15965363279664457,
"grad_norm": 0.49750396609306335,
"learning_rate": 5.311091073038774e-05,
"loss": 1.4603,
"step": 590
},
{
"epoch": 0.16019483155188743,
"grad_norm": 0.5720525979995728,
"learning_rate": 5.329125338142471e-05,
"loss": 1.4753,
"step": 592
},
{
"epoch": 0.1607360303071303,
"grad_norm": 0.4425548315048218,
"learning_rate": 5.347159603246168e-05,
"loss": 1.462,
"step": 594
},
{
"epoch": 0.16127722906237316,
"grad_norm": 0.5064132809638977,
"learning_rate": 5.3651938683498645e-05,
"loss": 1.4596,
"step": 596
},
{
"epoch": 0.16181842781761602,
"grad_norm": 0.518460750579834,
"learning_rate": 5.383228133453562e-05,
"loss": 1.4763,
"step": 598
},
{
"epoch": 0.16235962657285888,
"grad_norm": 0.4613576829433441,
"learning_rate": 5.401262398557259e-05,
"loss": 1.4487,
"step": 600
},
{
"epoch": 0.16290082532810174,
"grad_norm": 0.7046213746070862,
"learning_rate": 5.419296663660957e-05,
"loss": 1.472,
"step": 602
},
{
"epoch": 0.1634420240833446,
"grad_norm": 0.6164196133613586,
"learning_rate": 5.4373309287646535e-05,
"loss": 1.4424,
"step": 604
},
{
"epoch": 0.16398322283858746,
"grad_norm": 0.5106020569801331,
"learning_rate": 5.45536519386835e-05,
"loss": 1.4567,
"step": 606
},
{
"epoch": 0.16452442159383032,
"grad_norm": 0.4291236400604248,
"learning_rate": 5.473399458972047e-05,
"loss": 1.4514,
"step": 608
},
{
"epoch": 0.16506562034907318,
"grad_norm": 0.46577414870262146,
"learning_rate": 5.491433724075744e-05,
"loss": 1.4408,
"step": 610
},
{
"epoch": 0.16560681910431607,
"grad_norm": 0.4729917049407959,
"learning_rate": 5.509467989179441e-05,
"loss": 1.4493,
"step": 612
},
{
"epoch": 0.16614801785955893,
"grad_norm": 0.4651925563812256,
"learning_rate": 5.527502254283138e-05,
"loss": 1.465,
"step": 614
},
{
"epoch": 0.1666892166148018,
"grad_norm": 0.4756859540939331,
"learning_rate": 5.545536519386835e-05,
"loss": 1.4641,
"step": 616
},
{
"epoch": 0.16723041537004465,
"grad_norm": 0.42555975914001465,
"learning_rate": 5.563570784490533e-05,
"loss": 1.4569,
"step": 618
},
{
"epoch": 0.16777161412528752,
"grad_norm": 0.5162522196769714,
"learning_rate": 5.5816050495942296e-05,
"loss": 1.4344,
"step": 620
},
{
"epoch": 0.16831281288053038,
"grad_norm": 0.5867063999176025,
"learning_rate": 5.599639314697926e-05,
"loss": 1.4647,
"step": 622
},
{
"epoch": 0.16885401163577324,
"grad_norm": 0.6629165410995483,
"learning_rate": 5.617673579801623e-05,
"loss": 1.473,
"step": 624
},
{
"epoch": 0.1693952103910161,
"grad_norm": 0.5905330777168274,
"learning_rate": 5.6357078449053205e-05,
"loss": 1.4459,
"step": 626
},
{
"epoch": 0.16993640914625896,
"grad_norm": 0.7457858324050903,
"learning_rate": 5.653742110009017e-05,
"loss": 1.4603,
"step": 628
},
{
"epoch": 0.17047760790150182,
"grad_norm": 0.5977684855461121,
"learning_rate": 5.671776375112714e-05,
"loss": 1.4621,
"step": 630
},
{
"epoch": 0.17101880665674468,
"grad_norm": 0.7097992897033691,
"learning_rate": 5.689810640216412e-05,
"loss": 1.4646,
"step": 632
},
{
"epoch": 0.17156000541198754,
"grad_norm": 0.5895450711250305,
"learning_rate": 5.707844905320109e-05,
"loss": 1.4338,
"step": 634
},
{
"epoch": 0.1721012041672304,
"grad_norm": 0.576877772808075,
"learning_rate": 5.7258791704238056e-05,
"loss": 1.4666,
"step": 636
},
{
"epoch": 0.17264240292247326,
"grad_norm": 0.541110098361969,
"learning_rate": 5.7439134355275024e-05,
"loss": 1.4624,
"step": 638
},
{
"epoch": 0.17318360167771615,
"grad_norm": 0.5172320604324341,
"learning_rate": 5.7619477006312e-05,
"loss": 1.473,
"step": 640
},
{
"epoch": 0.17372480043295901,
"grad_norm": 0.47511357069015503,
"learning_rate": 5.7799819657348965e-05,
"loss": 1.446,
"step": 642
},
{
"epoch": 0.17426599918820188,
"grad_norm": 0.48614808917045593,
"learning_rate": 5.798016230838593e-05,
"loss": 1.4394,
"step": 644
},
{
"epoch": 0.17480719794344474,
"grad_norm": 0.4435577094554901,
"learning_rate": 5.81605049594229e-05,
"loss": 1.43,
"step": 646
},
{
"epoch": 0.1753483966986876,
"grad_norm": 0.4458653926849365,
"learning_rate": 5.834084761045988e-05,
"loss": 1.46,
"step": 648
},
{
"epoch": 0.17588959545393046,
"grad_norm": 0.40675726532936096,
"learning_rate": 5.852119026149685e-05,
"loss": 1.4565,
"step": 650
},
{
"epoch": 0.17643079420917332,
"grad_norm": 0.4132504165172577,
"learning_rate": 5.8701532912533817e-05,
"loss": 1.4522,
"step": 652
},
{
"epoch": 0.17697199296441618,
"grad_norm": 0.40881386399269104,
"learning_rate": 5.888187556357079e-05,
"loss": 1.4232,
"step": 654
},
{
"epoch": 0.17751319171965904,
"grad_norm": 0.40527868270874023,
"learning_rate": 5.906221821460776e-05,
"loss": 1.441,
"step": 656
},
{
"epoch": 0.1780543904749019,
"grad_norm": 0.40227004885673523,
"learning_rate": 5.9242560865644726e-05,
"loss": 1.4259,
"step": 658
},
{
"epoch": 0.17859558923014476,
"grad_norm": 0.4043656289577484,
"learning_rate": 5.942290351668169e-05,
"loss": 1.4298,
"step": 660
},
{
"epoch": 0.17913678798538762,
"grad_norm": 0.4288482666015625,
"learning_rate": 5.9603246167718674e-05,
"loss": 1.4439,
"step": 662
},
{
"epoch": 0.17967798674063049,
"grad_norm": 0.4385060966014862,
"learning_rate": 5.978358881875564e-05,
"loss": 1.4237,
"step": 664
},
{
"epoch": 0.18021918549587335,
"grad_norm": 0.396980345249176,
"learning_rate": 5.996393146979261e-05,
"loss": 1.4174,
"step": 666
},
{
"epoch": 0.18076038425111624,
"grad_norm": 0.4060603678226471,
"learning_rate": 6.014427412082958e-05,
"loss": 1.4479,
"step": 668
},
{
"epoch": 0.1813015830063591,
"grad_norm": 0.4485025703907013,
"learning_rate": 6.032461677186655e-05,
"loss": 1.4493,
"step": 670
},
{
"epoch": 0.18184278176160196,
"grad_norm": 0.44034305214881897,
"learning_rate": 6.050495942290352e-05,
"loss": 1.4461,
"step": 672
},
{
"epoch": 0.18238398051684482,
"grad_norm": 0.418074369430542,
"learning_rate": 6.0685302073940486e-05,
"loss": 1.4287,
"step": 674
},
{
"epoch": 0.18292517927208768,
"grad_norm": 0.41937318444252014,
"learning_rate": 6.0865644724977454e-05,
"loss": 1.4338,
"step": 676
},
{
"epoch": 0.18346637802733054,
"grad_norm": 0.4103530943393707,
"learning_rate": 6.104598737601444e-05,
"loss": 1.4391,
"step": 678
},
{
"epoch": 0.1840075767825734,
"grad_norm": 0.4066039025783539,
"learning_rate": 6.122633002705141e-05,
"loss": 1.4357,
"step": 680
},
{
"epoch": 0.18454877553781626,
"grad_norm": 0.36903437972068787,
"learning_rate": 6.140667267808838e-05,
"loss": 1.4111,
"step": 682
},
{
"epoch": 0.18508997429305912,
"grad_norm": 0.37125757336616516,
"learning_rate": 6.158701532912534e-05,
"loss": 1.4233,
"step": 684
},
{
"epoch": 0.18563117304830198,
"grad_norm": 0.44102513790130615,
"learning_rate": 6.176735798016231e-05,
"loss": 1.4437,
"step": 686
},
{
"epoch": 0.18617237180354484,
"grad_norm": 0.4337277114391327,
"learning_rate": 6.194770063119928e-05,
"loss": 1.4425,
"step": 688
},
{
"epoch": 0.1867135705587877,
"grad_norm": 0.37394315004348755,
"learning_rate": 6.212804328223625e-05,
"loss": 1.4452,
"step": 690
},
{
"epoch": 0.18725476931403057,
"grad_norm": 0.41764944791793823,
"learning_rate": 6.230838593327321e-05,
"loss": 1.4535,
"step": 692
},
{
"epoch": 0.18779596806927343,
"grad_norm": 0.4214741289615631,
"learning_rate": 6.24887285843102e-05,
"loss": 1.4391,
"step": 694
},
{
"epoch": 0.18833716682451632,
"grad_norm": 0.4159027338027954,
"learning_rate": 6.266907123534716e-05,
"loss": 1.4197,
"step": 696
},
{
"epoch": 0.18887836557975918,
"grad_norm": 0.38865673542022705,
"learning_rate": 6.284941388638413e-05,
"loss": 1.4329,
"step": 698
},
{
"epoch": 0.18941956433500204,
"grad_norm": 0.43646490573883057,
"learning_rate": 6.30297565374211e-05,
"loss": 1.4147,
"step": 700
},
{
"epoch": 0.1899607630902449,
"grad_norm": 0.41997334361076355,
"learning_rate": 6.321009918845807e-05,
"loss": 1.4275,
"step": 702
},
{
"epoch": 0.19050196184548776,
"grad_norm": 0.38556602597236633,
"learning_rate": 6.339044183949505e-05,
"loss": 1.4258,
"step": 704
},
{
"epoch": 0.19104316060073062,
"grad_norm": 0.42955082654953003,
"learning_rate": 6.357078449053201e-05,
"loss": 1.4201,
"step": 706
},
{
"epoch": 0.19158435935597348,
"grad_norm": 0.3844427764415741,
"learning_rate": 6.3751127141569e-05,
"loss": 1.4448,
"step": 708
},
{
"epoch": 0.19212555811121634,
"grad_norm": 0.4312956929206848,
"learning_rate": 6.393146979260596e-05,
"loss": 1.4051,
"step": 710
},
{
"epoch": 0.1926667568664592,
"grad_norm": 0.4556865394115448,
"learning_rate": 6.411181244364293e-05,
"loss": 1.4305,
"step": 712
},
{
"epoch": 0.19320795562170207,
"grad_norm": 0.37053731083869934,
"learning_rate": 6.42921550946799e-05,
"loss": 1.4301,
"step": 714
},
{
"epoch": 0.19374915437694493,
"grad_norm": 0.3996010720729828,
"learning_rate": 6.447249774571686e-05,
"loss": 1.4282,
"step": 716
},
{
"epoch": 0.1942903531321878,
"grad_norm": 0.37610816955566406,
"learning_rate": 6.465284039675383e-05,
"loss": 1.4277,
"step": 718
},
{
"epoch": 0.19483155188743065,
"grad_norm": 0.3677166998386383,
"learning_rate": 6.48331830477908e-05,
"loss": 1.4029,
"step": 720
},
{
"epoch": 0.1953727506426735,
"grad_norm": 0.3841564357280731,
"learning_rate": 6.501352569882777e-05,
"loss": 1.4144,
"step": 722
},
{
"epoch": 0.1959139493979164,
"grad_norm": 0.3687719404697418,
"learning_rate": 6.519386834986475e-05,
"loss": 1.4079,
"step": 724
},
{
"epoch": 0.19645514815315926,
"grad_norm": 0.38350847363471985,
"learning_rate": 6.537421100090172e-05,
"loss": 1.4269,
"step": 726
},
{
"epoch": 0.19699634690840212,
"grad_norm": 0.39060813188552856,
"learning_rate": 6.555455365193868e-05,
"loss": 1.4265,
"step": 728
},
{
"epoch": 0.19753754566364498,
"grad_norm": 0.36068469285964966,
"learning_rate": 6.573489630297565e-05,
"loss": 1.4325,
"step": 730
},
{
"epoch": 0.19807874441888784,
"grad_norm": 0.41185086965560913,
"learning_rate": 6.591523895401263e-05,
"loss": 1.4348,
"step": 732
},
{
"epoch": 0.1986199431741307,
"grad_norm": 0.4441224932670593,
"learning_rate": 6.60955816050496e-05,
"loss": 1.4103,
"step": 734
},
{
"epoch": 0.19916114192937356,
"grad_norm": 0.3727317452430725,
"learning_rate": 6.627592425608657e-05,
"loss": 1.4188,
"step": 736
},
{
"epoch": 0.19970234068461643,
"grad_norm": 0.394972562789917,
"learning_rate": 6.645626690712355e-05,
"loss": 1.4095,
"step": 738
},
{
"epoch": 0.20024353943985929,
"grad_norm": 0.40716880559921265,
"learning_rate": 6.663660955816052e-05,
"loss": 1.4127,
"step": 740
},
{
"epoch": 0.20078473819510215,
"grad_norm": 0.4156644344329834,
"learning_rate": 6.681695220919748e-05,
"loss": 1.4189,
"step": 742
},
{
"epoch": 0.201325936950345,
"grad_norm": 0.3787958323955536,
"learning_rate": 6.699729486023445e-05,
"loss": 1.4221,
"step": 744
},
{
"epoch": 0.20186713570558787,
"grad_norm": 0.42427608370780945,
"learning_rate": 6.717763751127142e-05,
"loss": 1.4192,
"step": 746
},
{
"epoch": 0.20240833446083073,
"grad_norm": 0.4778277277946472,
"learning_rate": 6.735798016230839e-05,
"loss": 1.4024,
"step": 748
},
{
"epoch": 0.2029495332160736,
"grad_norm": 0.44801151752471924,
"learning_rate": 6.753832281334535e-05,
"loss": 1.4222,
"step": 750
},
{
"epoch": 0.20349073197131648,
"grad_norm": 0.46737611293792725,
"learning_rate": 6.771866546438232e-05,
"loss": 1.4117,
"step": 752
},
{
"epoch": 0.20403193072655934,
"grad_norm": 0.4184872806072235,
"learning_rate": 6.78990081154193e-05,
"loss": 1.4066,
"step": 754
},
{
"epoch": 0.2045731294818022,
"grad_norm": 0.40458211302757263,
"learning_rate": 6.807935076645627e-05,
"loss": 1.4274,
"step": 756
},
{
"epoch": 0.20511432823704506,
"grad_norm": 0.43926185369491577,
"learning_rate": 6.825969341749324e-05,
"loss": 1.4231,
"step": 758
},
{
"epoch": 0.20565552699228792,
"grad_norm": 0.4434867203235626,
"learning_rate": 6.844003606853022e-05,
"loss": 1.4121,
"step": 760
},
{
"epoch": 0.20619672574753078,
"grad_norm": 0.4500143826007843,
"learning_rate": 6.862037871956719e-05,
"loss": 1.4179,
"step": 762
},
{
"epoch": 0.20673792450277365,
"grad_norm": 0.45456650853157043,
"learning_rate": 6.880072137060415e-05,
"loss": 1.3912,
"step": 764
},
{
"epoch": 0.2072791232580165,
"grad_norm": 0.4214187264442444,
"learning_rate": 6.898106402164112e-05,
"loss": 1.3962,
"step": 766
},
{
"epoch": 0.20782032201325937,
"grad_norm": 0.427682101726532,
"learning_rate": 6.916140667267809e-05,
"loss": 1.4316,
"step": 768
},
{
"epoch": 0.20836152076850223,
"grad_norm": 0.44491469860076904,
"learning_rate": 6.934174932371507e-05,
"loss": 1.4218,
"step": 770
},
{
"epoch": 0.2089027195237451,
"grad_norm": 0.42736080288887024,
"learning_rate": 6.952209197475204e-05,
"loss": 1.3931,
"step": 772
},
{
"epoch": 0.20944391827898795,
"grad_norm": 0.4041571021080017,
"learning_rate": 6.9702434625789e-05,
"loss": 1.4201,
"step": 774
},
{
"epoch": 0.2099851170342308,
"grad_norm": 0.4250961244106293,
"learning_rate": 6.988277727682597e-05,
"loss": 1.4299,
"step": 776
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.4335261881351471,
"learning_rate": 7.006311992786294e-05,
"loss": 1.4125,
"step": 778
},
{
"epoch": 0.21106751454471653,
"grad_norm": 0.42000851035118103,
"learning_rate": 7.02434625788999e-05,
"loss": 1.3969,
"step": 780
},
{
"epoch": 0.21160871329995942,
"grad_norm": 0.38111838698387146,
"learning_rate": 7.042380522993687e-05,
"loss": 1.3795,
"step": 782
},
{
"epoch": 0.21214991205520228,
"grad_norm": 0.38366812467575073,
"learning_rate": 7.060414788097385e-05,
"loss": 1.4041,
"step": 784
},
{
"epoch": 0.21269111081044514,
"grad_norm": 0.4334602355957031,
"learning_rate": 7.078449053201082e-05,
"loss": 1.415,
"step": 786
},
{
"epoch": 0.213232309565688,
"grad_norm": 0.40296411514282227,
"learning_rate": 7.096483318304779e-05,
"loss": 1.4052,
"step": 788
},
{
"epoch": 0.21377350832093087,
"grad_norm": 0.4197232723236084,
"learning_rate": 7.114517583408477e-05,
"loss": 1.4205,
"step": 790
},
{
"epoch": 0.21431470707617373,
"grad_norm": 0.40287715196609497,
"learning_rate": 7.132551848512174e-05,
"loss": 1.4047,
"step": 792
},
{
"epoch": 0.2148559058314166,
"grad_norm": 0.37324196100234985,
"learning_rate": 7.15058611361587e-05,
"loss": 1.4398,
"step": 794
},
{
"epoch": 0.21539710458665945,
"grad_norm": 0.4409985840320587,
"learning_rate": 7.168620378719567e-05,
"loss": 1.3873,
"step": 796
},
{
"epoch": 0.2159383033419023,
"grad_norm": 0.41441893577575684,
"learning_rate": 7.186654643823264e-05,
"loss": 1.4174,
"step": 798
},
{
"epoch": 0.21647950209714517,
"grad_norm": 0.4271719455718994,
"learning_rate": 7.204688908926962e-05,
"loss": 1.3987,
"step": 800
},
{
"epoch": 0.21702070085238803,
"grad_norm": 0.4969992935657501,
"learning_rate": 7.222723174030659e-05,
"loss": 1.4049,
"step": 802
},
{
"epoch": 0.2175618996076309,
"grad_norm": 0.45711180567741394,
"learning_rate": 7.240757439134356e-05,
"loss": 1.4061,
"step": 804
},
{
"epoch": 0.21810309836287375,
"grad_norm": 0.4479979872703552,
"learning_rate": 7.258791704238052e-05,
"loss": 1.4049,
"step": 806
},
{
"epoch": 0.21864429711811662,
"grad_norm": 0.4708006978034973,
"learning_rate": 7.276825969341749e-05,
"loss": 1.3971,
"step": 808
},
{
"epoch": 0.2191854958733595,
"grad_norm": 0.4387456774711609,
"learning_rate": 7.294860234445446e-05,
"loss": 1.4272,
"step": 810
},
{
"epoch": 0.21972669462860236,
"grad_norm": 0.5285756587982178,
"learning_rate": 7.312894499549143e-05,
"loss": 1.3902,
"step": 812
},
{
"epoch": 0.22026789338384523,
"grad_norm": 0.5111876726150513,
"learning_rate": 7.330928764652841e-05,
"loss": 1.4176,
"step": 814
},
{
"epoch": 0.2208090921390881,
"grad_norm": 0.4643821716308594,
"learning_rate": 7.348963029756538e-05,
"loss": 1.4216,
"step": 816
},
{
"epoch": 0.22135029089433095,
"grad_norm": 0.5162214040756226,
"learning_rate": 7.366997294860236e-05,
"loss": 1.4025,
"step": 818
},
{
"epoch": 0.2218914896495738,
"grad_norm": 0.4296860992908478,
"learning_rate": 7.385031559963932e-05,
"loss": 1.3919,
"step": 820
},
{
"epoch": 0.22243268840481667,
"grad_norm": 0.4449775815010071,
"learning_rate": 7.403065825067629e-05,
"loss": 1.4002,
"step": 822
},
{
"epoch": 0.22297388716005953,
"grad_norm": 0.39713212847709656,
"learning_rate": 7.421100090171326e-05,
"loss": 1.4012,
"step": 824
},
{
"epoch": 0.2235150859153024,
"grad_norm": 0.41655346751213074,
"learning_rate": 7.439134355275023e-05,
"loss": 1.4155,
"step": 826
},
{
"epoch": 0.22405628467054525,
"grad_norm": 0.3751365542411804,
"learning_rate": 7.45716862037872e-05,
"loss": 1.4021,
"step": 828
},
{
"epoch": 0.2245974834257881,
"grad_norm": 0.41483408212661743,
"learning_rate": 7.475202885482417e-05,
"loss": 1.4207,
"step": 830
},
{
"epoch": 0.22513868218103097,
"grad_norm": 0.397360235452652,
"learning_rate": 7.493237150586114e-05,
"loss": 1.392,
"step": 832
},
{
"epoch": 0.22567988093627384,
"grad_norm": 0.3874877691268921,
"learning_rate": 7.511271415689811e-05,
"loss": 1.4143,
"step": 834
},
{
"epoch": 0.2262210796915167,
"grad_norm": 0.4382254481315613,
"learning_rate": 7.529305680793508e-05,
"loss": 1.4109,
"step": 836
},
{
"epoch": 0.22676227844675959,
"grad_norm": 0.3728530704975128,
"learning_rate": 7.547339945897204e-05,
"loss": 1.4215,
"step": 838
},
{
"epoch": 0.22730347720200245,
"grad_norm": 0.41155338287353516,
"learning_rate": 7.565374211000901e-05,
"loss": 1.3963,
"step": 840
},
{
"epoch": 0.2278446759572453,
"grad_norm": 0.3550320267677307,
"learning_rate": 7.5834084761046e-05,
"loss": 1.3998,
"step": 842
},
{
"epoch": 0.22838587471248817,
"grad_norm": 0.3858035206794739,
"learning_rate": 7.601442741208296e-05,
"loss": 1.387,
"step": 844
},
{
"epoch": 0.22892707346773103,
"grad_norm": 0.38636457920074463,
"learning_rate": 7.619477006311994e-05,
"loss": 1.387,
"step": 846
},
{
"epoch": 0.2294682722229739,
"grad_norm": 0.41915518045425415,
"learning_rate": 7.637511271415691e-05,
"loss": 1.3917,
"step": 848
},
{
"epoch": 0.23000947097821675,
"grad_norm": 0.35796865820884705,
"learning_rate": 7.655545536519388e-05,
"loss": 1.406,
"step": 850
},
{
"epoch": 0.2305506697334596,
"grad_norm": 0.35221853852272034,
"learning_rate": 7.673579801623084e-05,
"loss": 1.3892,
"step": 852
},
{
"epoch": 0.23109186848870247,
"grad_norm": 0.3815077245235443,
"learning_rate": 7.691614066726781e-05,
"loss": 1.3845,
"step": 854
},
{
"epoch": 0.23163306724394533,
"grad_norm": 0.3554491400718689,
"learning_rate": 7.709648331830478e-05,
"loss": 1.3644,
"step": 856
},
{
"epoch": 0.2321742659991882,
"grad_norm": 0.3762814998626709,
"learning_rate": 7.727682596934175e-05,
"loss": 1.3976,
"step": 858
},
{
"epoch": 0.23271546475443106,
"grad_norm": 0.34575173258781433,
"learning_rate": 7.745716862037873e-05,
"loss": 1.3925,
"step": 860
},
{
"epoch": 0.23325666350967392,
"grad_norm": 0.37864556908607483,
"learning_rate": 7.76375112714157e-05,
"loss": 1.3993,
"step": 862
},
{
"epoch": 0.23379786226491678,
"grad_norm": 0.34448474645614624,
"learning_rate": 7.781785392245266e-05,
"loss": 1.3855,
"step": 864
},
{
"epoch": 0.23433906102015967,
"grad_norm": 0.40932390093803406,
"learning_rate": 7.799819657348963e-05,
"loss": 1.395,
"step": 866
},
{
"epoch": 0.23488025977540253,
"grad_norm": 0.3737650513648987,
"learning_rate": 7.81785392245266e-05,
"loss": 1.3918,
"step": 868
},
{
"epoch": 0.2354214585306454,
"grad_norm": 0.42988118529319763,
"learning_rate": 7.835888187556357e-05,
"loss": 1.3837,
"step": 870
},
{
"epoch": 0.23596265728588825,
"grad_norm": 0.3865496814250946,
"learning_rate": 7.853922452660055e-05,
"loss": 1.3976,
"step": 872
},
{
"epoch": 0.2365038560411311,
"grad_norm": 0.3682670295238495,
"learning_rate": 7.871956717763751e-05,
"loss": 1.3792,
"step": 874
},
{
"epoch": 0.23704505479637397,
"grad_norm": 0.4236462712287903,
"learning_rate": 7.88999098286745e-05,
"loss": 1.4032,
"step": 876
},
{
"epoch": 0.23758625355161683,
"grad_norm": 0.3742213249206543,
"learning_rate": 7.908025247971146e-05,
"loss": 1.3709,
"step": 878
},
{
"epoch": 0.2381274523068597,
"grad_norm": 0.38234424591064453,
"learning_rate": 7.926059513074843e-05,
"loss": 1.3862,
"step": 880
},
{
"epoch": 0.23866865106210255,
"grad_norm": 0.37414151430130005,
"learning_rate": 7.94409377817854e-05,
"loss": 1.3751,
"step": 882
},
{
"epoch": 0.23920984981734542,
"grad_norm": 0.3838132619857788,
"learning_rate": 7.962128043282237e-05,
"loss": 1.3805,
"step": 884
},
{
"epoch": 0.23975104857258828,
"grad_norm": 0.3818622827529907,
"learning_rate": 7.980162308385933e-05,
"loss": 1.3735,
"step": 886
},
{
"epoch": 0.24029224732783114,
"grad_norm": 0.38791927695274353,
"learning_rate": 7.99819657348963e-05,
"loss": 1.3958,
"step": 888
},
{
"epoch": 0.240833446083074,
"grad_norm": 0.4164978861808777,
"learning_rate": 8.016230838593328e-05,
"loss": 1.421,
"step": 890
},
{
"epoch": 0.24137464483831686,
"grad_norm": 0.3721414804458618,
"learning_rate": 8.034265103697025e-05,
"loss": 1.3977,
"step": 892
},
{
"epoch": 0.24191584359355975,
"grad_norm": 0.37698984146118164,
"learning_rate": 8.052299368800722e-05,
"loss": 1.3854,
"step": 894
},
{
"epoch": 0.2424570423488026,
"grad_norm": 0.3553116023540497,
"learning_rate": 8.070333633904418e-05,
"loss": 1.3925,
"step": 896
},
{
"epoch": 0.24299824110404547,
"grad_norm": 0.37809059023857117,
"learning_rate": 8.088367899008115e-05,
"loss": 1.368,
"step": 898
},
{
"epoch": 0.24353943985928833,
"grad_norm": 0.3835943043231964,
"learning_rate": 8.106402164111813e-05,
"loss": 1.3992,
"step": 900
},
{
"epoch": 0.2440806386145312,
"grad_norm": 0.4013379216194153,
"learning_rate": 8.12443642921551e-05,
"loss": 1.3912,
"step": 902
},
{
"epoch": 0.24462183736977405,
"grad_norm": 0.37845560908317566,
"learning_rate": 8.142470694319207e-05,
"loss": 1.3934,
"step": 904
},
{
"epoch": 0.24516303612501691,
"grad_norm": 0.39762255549430847,
"learning_rate": 8.160504959422905e-05,
"loss": 1.3782,
"step": 906
},
{
"epoch": 0.24570423488025978,
"grad_norm": 0.36652496457099915,
"learning_rate": 8.178539224526602e-05,
"loss": 1.3787,
"step": 908
},
{
"epoch": 0.24624543363550264,
"grad_norm": 0.39953047037124634,
"learning_rate": 8.196573489630298e-05,
"loss": 1.3752,
"step": 910
},
{
"epoch": 0.2467866323907455,
"grad_norm": 0.35875022411346436,
"learning_rate": 8.214607754733995e-05,
"loss": 1.3768,
"step": 912
},
{
"epoch": 0.24732783114598836,
"grad_norm": 0.3617067337036133,
"learning_rate": 8.232642019837692e-05,
"loss": 1.3859,
"step": 914
},
{
"epoch": 0.24786902990123122,
"grad_norm": 0.38250839710235596,
"learning_rate": 8.250676284941389e-05,
"loss": 1.3897,
"step": 916
},
{
"epoch": 0.24841022865647408,
"grad_norm": 0.3404116928577423,
"learning_rate": 8.268710550045085e-05,
"loss": 1.3933,
"step": 918
},
{
"epoch": 0.24895142741171694,
"grad_norm": 0.3547706604003906,
"learning_rate": 8.286744815148782e-05,
"loss": 1.3787,
"step": 920
},
{
"epoch": 0.2494926261669598,
"grad_norm": 0.32752275466918945,
"learning_rate": 8.30477908025248e-05,
"loss": 1.3905,
"step": 922
},
{
"epoch": 0.25003382492220266,
"grad_norm": 0.3413980007171631,
"learning_rate": 8.322813345356177e-05,
"loss": 1.385,
"step": 924
},
{
"epoch": 0.25057502367744555,
"grad_norm": 0.5574982762336731,
"learning_rate": 8.340847610459874e-05,
"loss": 1.3869,
"step": 926
},
{
"epoch": 0.2511162224326884,
"grad_norm": 0.41128844022750854,
"learning_rate": 8.358881875563572e-05,
"loss": 1.3583,
"step": 928
},
{
"epoch": 0.2516574211879313,
"grad_norm": 0.3476073145866394,
"learning_rate": 8.376916140667269e-05,
"loss": 1.3832,
"step": 930
},
{
"epoch": 0.2521986199431741,
"grad_norm": 0.34838998317718506,
"learning_rate": 8.394950405770965e-05,
"loss": 1.3748,
"step": 932
},
{
"epoch": 0.252739818698417,
"grad_norm": 0.3552824556827545,
"learning_rate": 8.412984670874662e-05,
"loss": 1.3936,
"step": 934
},
{
"epoch": 0.25328101745365983,
"grad_norm": 0.34918278455734253,
"learning_rate": 8.43101893597836e-05,
"loss": 1.3733,
"step": 936
},
{
"epoch": 0.2538222162089027,
"grad_norm": 0.431455135345459,
"learning_rate": 8.449053201082057e-05,
"loss": 1.3924,
"step": 938
},
{
"epoch": 0.2543634149641456,
"grad_norm": 0.37811046838760376,
"learning_rate": 8.467087466185754e-05,
"loss": 1.3861,
"step": 940
},
{
"epoch": 0.25490461371938844,
"grad_norm": 0.35659778118133545,
"learning_rate": 8.48512173128945e-05,
"loss": 1.3736,
"step": 942
},
{
"epoch": 0.25544581247463133,
"grad_norm": 0.4327319264411926,
"learning_rate": 8.503155996393147e-05,
"loss": 1.3883,
"step": 944
},
{
"epoch": 0.25598701122987416,
"grad_norm": 0.39134231209754944,
"learning_rate": 8.521190261496844e-05,
"loss": 1.3704,
"step": 946
},
{
"epoch": 0.25652820998511705,
"grad_norm": 0.39573270082473755,
"learning_rate": 8.53922452660054e-05,
"loss": 1.4047,
"step": 948
},
{
"epoch": 0.2570694087403599,
"grad_norm": 0.3299993872642517,
"learning_rate": 8.557258791704237e-05,
"loss": 1.3778,
"step": 950
},
{
"epoch": 0.2576106074956028,
"grad_norm": 0.3559456765651703,
"learning_rate": 8.575293056807936e-05,
"loss": 1.3794,
"step": 952
},
{
"epoch": 0.2581518062508456,
"grad_norm": 0.36347028613090515,
"learning_rate": 8.593327321911632e-05,
"loss": 1.3817,
"step": 954
},
{
"epoch": 0.2586930050060885,
"grad_norm": 0.39882585406303406,
"learning_rate": 8.611361587015329e-05,
"loss": 1.3565,
"step": 956
},
{
"epoch": 0.2592342037613313,
"grad_norm": 0.3932117223739624,
"learning_rate": 8.629395852119027e-05,
"loss": 1.396,
"step": 958
},
{
"epoch": 0.2597754025165742,
"grad_norm": 0.3526294231414795,
"learning_rate": 8.647430117222724e-05,
"loss": 1.3624,
"step": 960
},
{
"epoch": 0.26031660127181705,
"grad_norm": 0.3804738223552704,
"learning_rate": 8.66546438232642e-05,
"loss": 1.3616,
"step": 962
},
{
"epoch": 0.26085780002705994,
"grad_norm": 0.36557725071907043,
"learning_rate": 8.683498647430117e-05,
"loss": 1.3997,
"step": 964
},
{
"epoch": 0.2613989987823028,
"grad_norm": 0.3574380874633789,
"learning_rate": 8.701532912533815e-05,
"loss": 1.3901,
"step": 966
},
{
"epoch": 0.26194019753754566,
"grad_norm": 0.4025056064128876,
"learning_rate": 8.719567177637512e-05,
"loss": 1.3707,
"step": 968
},
{
"epoch": 0.26248139629278855,
"grad_norm": 0.3687063157558441,
"learning_rate": 8.737601442741209e-05,
"loss": 1.3679,
"step": 970
},
{
"epoch": 0.2630225950480314,
"grad_norm": 0.3697878420352936,
"learning_rate": 8.755635707844906e-05,
"loss": 1.3981,
"step": 972
},
{
"epoch": 0.26356379380327427,
"grad_norm": 0.34241798520088196,
"learning_rate": 8.773669972948602e-05,
"loss": 1.3728,
"step": 974
},
{
"epoch": 0.2641049925585171,
"grad_norm": 0.40002745389938354,
"learning_rate": 8.791704238052299e-05,
"loss": 1.3732,
"step": 976
},
{
"epoch": 0.26464619131376,
"grad_norm": 0.42943906784057617,
"learning_rate": 8.809738503155996e-05,
"loss": 1.3731,
"step": 978
},
{
"epoch": 0.2651873900690028,
"grad_norm": 0.37437063455581665,
"learning_rate": 8.827772768259693e-05,
"loss": 1.372,
"step": 980
},
{
"epoch": 0.2657285888242457,
"grad_norm": 0.3378891944885254,
"learning_rate": 8.845807033363391e-05,
"loss": 1.3777,
"step": 982
},
{
"epoch": 0.26626978757948855,
"grad_norm": 0.32884734869003296,
"learning_rate": 8.863841298467088e-05,
"loss": 1.3639,
"step": 984
},
{
"epoch": 0.26681098633473144,
"grad_norm": 0.3945903480052948,
"learning_rate": 8.881875563570786e-05,
"loss": 1.3722,
"step": 986
},
{
"epoch": 0.26735218508997427,
"grad_norm": 0.39569205045700073,
"learning_rate": 8.899909828674482e-05,
"loss": 1.376,
"step": 988
},
{
"epoch": 0.26789338384521716,
"grad_norm": 0.31659135222435,
"learning_rate": 8.917944093778179e-05,
"loss": 1.3807,
"step": 990
},
{
"epoch": 0.26843458260046,
"grad_norm": 0.44032666087150574,
"learning_rate": 8.935978358881876e-05,
"loss": 1.3986,
"step": 992
},
{
"epoch": 0.2689757813557029,
"grad_norm": 0.3445993661880493,
"learning_rate": 8.954012623985573e-05,
"loss": 1.3589,
"step": 994
},
{
"epoch": 0.26951698011094577,
"grad_norm": 0.3693557679653168,
"learning_rate": 8.97204688908927e-05,
"loss": 1.3593,
"step": 996
},
{
"epoch": 0.2700581788661886,
"grad_norm": 0.3965442478656769,
"learning_rate": 8.990081154192968e-05,
"loss": 1.3909,
"step": 998
},
{
"epoch": 0.2705993776214315,
"grad_norm": 0.4038390815258026,
"learning_rate": 9.008115419296664e-05,
"loss": 1.3629,
"step": 1000
},
{
"epoch": 0.2711405763766743,
"grad_norm": 0.36394256353378296,
"learning_rate": 9.026149684400361e-05,
"loss": 1.3812,
"step": 1002
},
{
"epoch": 0.2716817751319172,
"grad_norm": 0.4527181386947632,
"learning_rate": 9.044183949504058e-05,
"loss": 1.3692,
"step": 1004
},
{
"epoch": 0.27222297388716005,
"grad_norm": 0.37700143456459045,
"learning_rate": 9.062218214607755e-05,
"loss": 1.3652,
"step": 1006
},
{
"epoch": 0.27276417264240294,
"grad_norm": 0.45016244053840637,
"learning_rate": 9.080252479711451e-05,
"loss": 1.3657,
"step": 1008
},
{
"epoch": 0.27330537139764577,
"grad_norm": 0.42159709334373474,
"learning_rate": 9.09828674481515e-05,
"loss": 1.3702,
"step": 1010
},
{
"epoch": 0.27384657015288866,
"grad_norm": 0.3884572982788086,
"learning_rate": 9.116321009918846e-05,
"loss": 1.3535,
"step": 1012
},
{
"epoch": 0.2743877689081315,
"grad_norm": 0.37507420778274536,
"learning_rate": 9.134355275022544e-05,
"loss": 1.3659,
"step": 1014
},
{
"epoch": 0.2749289676633744,
"grad_norm": 0.35269656777381897,
"learning_rate": 9.152389540126241e-05,
"loss": 1.3623,
"step": 1016
},
{
"epoch": 0.2754701664186172,
"grad_norm": 0.3543412387371063,
"learning_rate": 9.170423805229938e-05,
"loss": 1.3695,
"step": 1018
},
{
"epoch": 0.2760113651738601,
"grad_norm": 0.3173674941062927,
"learning_rate": 9.188458070333635e-05,
"loss": 1.3572,
"step": 1020
},
{
"epoch": 0.276552563929103,
"grad_norm": 0.3729746341705322,
"learning_rate": 9.206492335437331e-05,
"loss": 1.3888,
"step": 1022
},
{
"epoch": 0.2770937626843458,
"grad_norm": 0.33210429549217224,
"learning_rate": 9.224526600541028e-05,
"loss": 1.3395,
"step": 1024
},
{
"epoch": 0.2776349614395887,
"grad_norm": 0.338366836309433,
"learning_rate": 9.242560865644725e-05,
"loss": 1.3498,
"step": 1026
},
{
"epoch": 0.27817616019483155,
"grad_norm": 0.3367864191532135,
"learning_rate": 9.260595130748423e-05,
"loss": 1.3548,
"step": 1028
},
{
"epoch": 0.27871735895007443,
"grad_norm": 0.40313002467155457,
"learning_rate": 9.27862939585212e-05,
"loss": 1.4059,
"step": 1030
},
{
"epoch": 0.27925855770531727,
"grad_norm": 0.3434394299983978,
"learning_rate": 9.296663660955816e-05,
"loss": 1.3522,
"step": 1032
},
{
"epoch": 0.27979975646056016,
"grad_norm": 0.35454580187797546,
"learning_rate": 9.314697926059513e-05,
"loss": 1.3838,
"step": 1034
},
{
"epoch": 0.280340955215803,
"grad_norm": 0.3280038833618164,
"learning_rate": 9.33273219116321e-05,
"loss": 1.3753,
"step": 1036
},
{
"epoch": 0.2808821539710459,
"grad_norm": 0.4306875169277191,
"learning_rate": 9.350766456266907e-05,
"loss": 1.3807,
"step": 1038
},
{
"epoch": 0.2814233527262887,
"grad_norm": 0.3500923812389374,
"learning_rate": 9.368800721370605e-05,
"loss": 1.36,
"step": 1040
},
{
"epoch": 0.2819645514815316,
"grad_norm": 0.3702130913734436,
"learning_rate": 9.386834986474301e-05,
"loss": 1.3919,
"step": 1042
},
{
"epoch": 0.28250575023677443,
"grad_norm": 0.3651416599750519,
"learning_rate": 9.404869251578e-05,
"loss": 1.3805,
"step": 1044
},
{
"epoch": 0.2830469489920173,
"grad_norm": 0.35927796363830566,
"learning_rate": 9.422903516681696e-05,
"loss": 1.3507,
"step": 1046
},
{
"epoch": 0.28358814774726016,
"grad_norm": 0.36750975251197815,
"learning_rate": 9.440937781785393e-05,
"loss": 1.3475,
"step": 1048
},
{
"epoch": 0.28412934650250304,
"grad_norm": 0.31946998834609985,
"learning_rate": 9.45897204688909e-05,
"loss": 1.3708,
"step": 1050
},
{
"epoch": 0.28467054525774593,
"grad_norm": 0.3447932302951813,
"learning_rate": 9.477006311992787e-05,
"loss": 1.3519,
"step": 1052
},
{
"epoch": 0.28521174401298877,
"grad_norm": 0.31405511498451233,
"learning_rate": 9.495040577096483e-05,
"loss": 1.3806,
"step": 1054
},
{
"epoch": 0.28575294276823165,
"grad_norm": 0.3198442757129669,
"learning_rate": 9.51307484220018e-05,
"loss": 1.368,
"step": 1056
},
{
"epoch": 0.2862941415234745,
"grad_norm": 0.33328956365585327,
"learning_rate": 9.531109107303878e-05,
"loss": 1.3429,
"step": 1058
},
{
"epoch": 0.2868353402787174,
"grad_norm": 0.29432907700538635,
"learning_rate": 9.549143372407575e-05,
"loss": 1.3698,
"step": 1060
},
{
"epoch": 0.2873765390339602,
"grad_norm": 0.3468937575817108,
"learning_rate": 9.567177637511272e-05,
"loss": 1.356,
"step": 1062
},
{
"epoch": 0.2879177377892031,
"grad_norm": 0.3619658350944519,
"learning_rate": 9.585211902614968e-05,
"loss": 1.3596,
"step": 1064
},
{
"epoch": 0.28845893654444593,
"grad_norm": 0.3384917378425598,
"learning_rate": 9.603246167718665e-05,
"loss": 1.3693,
"step": 1066
},
{
"epoch": 0.2890001352996888,
"grad_norm": 0.3724029064178467,
"learning_rate": 9.621280432822363e-05,
"loss": 1.3639,
"step": 1068
},
{
"epoch": 0.28954133405493165,
"grad_norm": 0.7029115557670593,
"learning_rate": 9.63931469792606e-05,
"loss": 1.3557,
"step": 1070
},
{
"epoch": 0.29008253281017454,
"grad_norm": 0.5529230833053589,
"learning_rate": 9.657348963029757e-05,
"loss": 1.3657,
"step": 1072
},
{
"epoch": 0.2906237315654174,
"grad_norm": 0.4254820644855499,
"learning_rate": 9.675383228133455e-05,
"loss": 1.3633,
"step": 1074
},
{
"epoch": 0.29116493032066026,
"grad_norm": 0.4930615723133087,
"learning_rate": 9.693417493237152e-05,
"loss": 1.3714,
"step": 1076
},
{
"epoch": 0.2917061290759031,
"grad_norm": 0.4455857574939728,
"learning_rate": 9.711451758340848e-05,
"loss": 1.3615,
"step": 1078
},
{
"epoch": 0.292247327831146,
"grad_norm": 0.4171796441078186,
"learning_rate": 9.729486023444545e-05,
"loss": 1.3673,
"step": 1080
},
{
"epoch": 0.2927885265863889,
"grad_norm": 0.37810683250427246,
"learning_rate": 9.747520288548242e-05,
"loss": 1.3683,
"step": 1082
},
{
"epoch": 0.2933297253416317,
"grad_norm": 0.4057900905609131,
"learning_rate": 9.765554553651939e-05,
"loss": 1.3674,
"step": 1084
},
{
"epoch": 0.2938709240968746,
"grad_norm": 0.40583640336990356,
"learning_rate": 9.783588818755635e-05,
"loss": 1.3566,
"step": 1086
},
{
"epoch": 0.29441212285211743,
"grad_norm": 0.39454150199890137,
"learning_rate": 9.801623083859334e-05,
"loss": 1.3611,
"step": 1088
},
{
"epoch": 0.2949533216073603,
"grad_norm": 0.42229679226875305,
"learning_rate": 9.81965734896303e-05,
"loss": 1.3726,
"step": 1090
},
{
"epoch": 0.29549452036260315,
"grad_norm": 0.3274170160293579,
"learning_rate": 9.837691614066727e-05,
"loss": 1.3375,
"step": 1092
},
{
"epoch": 0.29603571911784604,
"grad_norm": 0.40999388694763184,
"learning_rate": 9.855725879170424e-05,
"loss": 1.3548,
"step": 1094
},
{
"epoch": 0.2965769178730889,
"grad_norm": 0.33515796065330505,
"learning_rate": 9.873760144274122e-05,
"loss": 1.3903,
"step": 1096
},
{
"epoch": 0.29711811662833176,
"grad_norm": 0.3834095597267151,
"learning_rate": 9.891794409377819e-05,
"loss": 1.3653,
"step": 1098
},
{
"epoch": 0.2976593153835746,
"grad_norm": 0.34850651025772095,
"learning_rate": 9.909828674481515e-05,
"loss": 1.3573,
"step": 1100
},
{
"epoch": 0.2982005141388175,
"grad_norm": 0.3811749815940857,
"learning_rate": 9.927862939585212e-05,
"loss": 1.3843,
"step": 1102
},
{
"epoch": 0.2987417128940603,
"grad_norm": 0.3308597803115845,
"learning_rate": 9.94589720468891e-05,
"loss": 1.3492,
"step": 1104
},
{
"epoch": 0.2992829116493032,
"grad_norm": 0.31952470541000366,
"learning_rate": 9.963931469792607e-05,
"loss": 1.3586,
"step": 1106
},
{
"epoch": 0.2998241104045461,
"grad_norm": 0.3433592915534973,
"learning_rate": 9.981965734896304e-05,
"loss": 1.3524,
"step": 1108
},
{
"epoch": 0.30036530915978893,
"grad_norm": 0.4547680914402008,
"learning_rate": 0.0001,
"loss": 1.3562,
"step": 1110
},
{
"epoch": 0.3009065079150318,
"grad_norm": 0.4963592290878296,
"learning_rate": 9.999999008881264e-05,
"loss": 1.3452,
"step": 1112
},
{
"epoch": 0.30144770667027465,
"grad_norm": 1.1111193895339966,
"learning_rate": 9.999996035525452e-05,
"loss": 1.3732,
"step": 1114
},
{
"epoch": 0.30198890542551754,
"grad_norm": 0.6860964298248291,
"learning_rate": 9.999991079933739e-05,
"loss": 1.3689,
"step": 1116
},
{
"epoch": 0.3025301041807604,
"grad_norm": 0.7344204783439636,
"learning_rate": 9.999984142108093e-05,
"loss": 1.3575,
"step": 1118
},
{
"epoch": 0.30307130293600326,
"grad_norm": 0.6534725427627563,
"learning_rate": 9.999975222051263e-05,
"loss": 1.376,
"step": 1120
},
{
"epoch": 0.3036125016912461,
"grad_norm": 0.5108229517936707,
"learning_rate": 9.999964319766785e-05,
"loss": 1.3741,
"step": 1122
},
{
"epoch": 0.304153700446489,
"grad_norm": 0.4888688325881958,
"learning_rate": 9.99995143525898e-05,
"loss": 1.3555,
"step": 1124
},
{
"epoch": 0.3046948992017318,
"grad_norm": 0.42808806896209717,
"learning_rate": 9.999936568532962e-05,
"loss": 1.3548,
"step": 1126
},
{
"epoch": 0.3052360979569747,
"grad_norm": 0.3921727240085602,
"learning_rate": 9.999919719594617e-05,
"loss": 1.3559,
"step": 1128
},
{
"epoch": 0.30577729671221754,
"grad_norm": 0.3473529517650604,
"learning_rate": 9.999900888450628e-05,
"loss": 1.3603,
"step": 1130
},
{
"epoch": 0.3063184954674604,
"grad_norm": 0.3337381184101105,
"learning_rate": 9.999880075108464e-05,
"loss": 1.3642,
"step": 1132
},
{
"epoch": 0.30685969422270326,
"grad_norm": 0.3363231122493744,
"learning_rate": 9.99985727957637e-05,
"loss": 1.3606,
"step": 1134
},
{
"epoch": 0.30740089297794615,
"grad_norm": 0.32726484537124634,
"learning_rate": 9.999832501863386e-05,
"loss": 1.3493,
"step": 1136
},
{
"epoch": 0.30794209173318904,
"grad_norm": 0.3190646767616272,
"learning_rate": 9.999805741979338e-05,
"loss": 1.3518,
"step": 1138
},
{
"epoch": 0.30848329048843187,
"grad_norm": 0.31244540214538574,
"learning_rate": 9.999776999934831e-05,
"loss": 1.3495,
"step": 1140
},
{
"epoch": 0.30902448924367476,
"grad_norm": 0.3286384344100952,
"learning_rate": 9.999746275741261e-05,
"loss": 1.3517,
"step": 1142
},
{
"epoch": 0.3095656879989176,
"grad_norm": 0.3630046546459198,
"learning_rate": 9.99971356941081e-05,
"loss": 1.3641,
"step": 1144
},
{
"epoch": 0.3101068867541605,
"grad_norm": 0.30771151185035706,
"learning_rate": 9.999678880956443e-05,
"loss": 1.3571,
"step": 1146
},
{
"epoch": 0.3106480855094033,
"grad_norm": 0.30026301741600037,
"learning_rate": 9.99964221039191e-05,
"loss": 1.3541,
"step": 1148
},
{
"epoch": 0.3111892842646462,
"grad_norm": 0.3128298223018646,
"learning_rate": 9.999603557731754e-05,
"loss": 1.3556,
"step": 1150
},
{
"epoch": 0.31173048301988904,
"grad_norm": 0.30185452103614807,
"learning_rate": 9.999562922991293e-05,
"loss": 1.3484,
"step": 1152
},
{
"epoch": 0.3122716817751319,
"grad_norm": 0.3274635076522827,
"learning_rate": 9.99952030618664e-05,
"loss": 1.3729,
"step": 1154
},
{
"epoch": 0.31281288053037476,
"grad_norm": 0.30549076199531555,
"learning_rate": 9.999475707334692e-05,
"loss": 1.3642,
"step": 1156
},
{
"epoch": 0.31335407928561765,
"grad_norm": 0.3147718906402588,
"learning_rate": 9.999429126453126e-05,
"loss": 1.3493,
"step": 1158
},
{
"epoch": 0.3138952780408605,
"grad_norm": 0.6205586791038513,
"learning_rate": 9.99938056356041e-05,
"loss": 1.3623,
"step": 1160
},
{
"epoch": 0.31443647679610337,
"grad_norm": 0.3471706211566925,
"learning_rate": 9.999330018675798e-05,
"loss": 1.3533,
"step": 1162
},
{
"epoch": 0.31497767555134626,
"grad_norm": 1.3515815734863281,
"learning_rate": 9.999277491819328e-05,
"loss": 1.3565,
"step": 1164
},
{
"epoch": 0.3155188743065891,
"grad_norm": 733.9155883789062,
"learning_rate": 9.999222983011824e-05,
"loss": 5.2143,
"step": 1166
},
{
"epoch": 0.316060073061832,
"grad_norm": 2.9439170360565186,
"learning_rate": 9.999166492274894e-05,
"loss": 1.4438,
"step": 1168
},
{
"epoch": 0.3166012718170748,
"grad_norm": 1.5871142148971558,
"learning_rate": 9.999108019630938e-05,
"loss": 1.4426,
"step": 1170
},
{
"epoch": 0.3171424705723177,
"grad_norm": 711.9217529296875,
"learning_rate": 9.999047565103132e-05,
"loss": 3.6935,
"step": 1172
},
{
"epoch": 0.31768366932756054,
"grad_norm": 100.76264953613281,
"learning_rate": 9.998985128715448e-05,
"loss": 4.2396,
"step": 1174
},
{
"epoch": 0.3182248680828034,
"grad_norm": 108.88189697265625,
"learning_rate": 9.998920710492634e-05,
"loss": 4.9929,
"step": 1176
},
{
"epoch": 0.31876606683804626,
"grad_norm": 72.18595123291016,
"learning_rate": 9.998854310460233e-05,
"loss": 6.0375,
"step": 1178
},
{
"epoch": 0.31930726559328915,
"grad_norm": 59.48538589477539,
"learning_rate": 9.998785928644567e-05,
"loss": 5.8932,
"step": 1180
},
{
"epoch": 0.319848464348532,
"grad_norm": 36.32703399658203,
"learning_rate": 9.998715565072744e-05,
"loss": 6.5369,
"step": 1182
},
{
"epoch": 0.32038966310377487,
"grad_norm": 18.565351486206055,
"learning_rate": 9.998643219772664e-05,
"loss": 6.1671,
"step": 1184
},
{
"epoch": 0.3209308618590177,
"grad_norm": 45.84898376464844,
"learning_rate": 9.998568892773003e-05,
"loss": 5.9379,
"step": 1186
},
{
"epoch": 0.3214720606142606,
"grad_norm": 66.2480239868164,
"learning_rate": 9.998492584103232e-05,
"loss": 5.7071,
"step": 1188
},
{
"epoch": 0.3220132593695034,
"grad_norm": 41.693092346191406,
"learning_rate": 9.998414293793599e-05,
"loss": 6.3198,
"step": 1190
},
{
"epoch": 0.3225544581247463,
"grad_norm": 19.323413848876953,
"learning_rate": 9.998334021875147e-05,
"loss": 5.377,
"step": 1192
},
{
"epoch": 0.3230956568799892,
"grad_norm": 15.907301902770996,
"learning_rate": 9.998251768379696e-05,
"loss": 4.5293,
"step": 1194
},
{
"epoch": 0.32363685563523203,
"grad_norm": 80.1374740600586,
"learning_rate": 9.998167533339857e-05,
"loss": 4.3471,
"step": 1196
},
{
"epoch": 0.3241780543904749,
"grad_norm": 23.298336029052734,
"learning_rate": 9.998081316789024e-05,
"loss": 3.7461,
"step": 1198
},
{
"epoch": 0.32471925314571776,
"grad_norm": 82.48027801513672,
"learning_rate": 9.997993118761378e-05,
"loss": 4.1647,
"step": 1200
},
{
"epoch": 0.32526045190096065,
"grad_norm": 27.916913986206055,
"learning_rate": 9.997902939291883e-05,
"loss": 3.9092,
"step": 1202
},
{
"epoch": 0.3258016506562035,
"grad_norm": 15.70148754119873,
"learning_rate": 9.997810778416293e-05,
"loss": 3.1628,
"step": 1204
},
{
"epoch": 0.32634284941144637,
"grad_norm": 18.33330535888672,
"learning_rate": 9.997716636171142e-05,
"loss": 2.8777,
"step": 1206
},
{
"epoch": 0.3268840481666892,
"grad_norm": 10.6620512008667,
"learning_rate": 9.997620512593755e-05,
"loss": 2.3009,
"step": 1208
},
{
"epoch": 0.3274252469219321,
"grad_norm": 32.01799011230469,
"learning_rate": 9.99752240772224e-05,
"loss": 1.9617,
"step": 1210
},
{
"epoch": 0.3279664456771749,
"grad_norm": 5.677090644836426,
"learning_rate": 9.997422321595488e-05,
"loss": 1.8401,
"step": 1212
},
{
"epoch": 0.3285076444324178,
"grad_norm": 8.914667129516602,
"learning_rate": 9.997320254253179e-05,
"loss": 1.6707,
"step": 1214
},
{
"epoch": 0.32904884318766064,
"grad_norm": 2.3725008964538574,
"learning_rate": 9.997216205735779e-05,
"loss": 1.5757,
"step": 1216
},
{
"epoch": 0.32959004194290353,
"grad_norm": 2.418389320373535,
"learning_rate": 9.997110176084538e-05,
"loss": 1.5154,
"step": 1218
},
{
"epoch": 0.33013124069814637,
"grad_norm": 2.802185297012329,
"learning_rate": 9.997002165341487e-05,
"loss": 1.4883,
"step": 1220
},
{
"epoch": 0.33067243945338926,
"grad_norm": 2.1769211292266846,
"learning_rate": 9.996892173549452e-05,
"loss": 1.445,
"step": 1222
},
{
"epoch": 0.33121363820863214,
"grad_norm": 1.799670934677124,
"learning_rate": 9.996780200752035e-05,
"loss": 1.4276,
"step": 1224
},
{
"epoch": 0.331754836963875,
"grad_norm": 3.2545313835144043,
"learning_rate": 9.996666246993627e-05,
"loss": 1.4394,
"step": 1226
},
{
"epoch": 0.33229603571911787,
"grad_norm": 1.1922351121902466,
"learning_rate": 9.996550312319408e-05,
"loss": 1.4359,
"step": 1228
},
{
"epoch": 0.3328372344743607,
"grad_norm": 2.6813228130340576,
"learning_rate": 9.996432396775339e-05,
"loss": 1.4229,
"step": 1230
},
{
"epoch": 0.3333784332296036,
"grad_norm": 1.6968843936920166,
"learning_rate": 9.996312500408165e-05,
"loss": 1.4281,
"step": 1232
},
{
"epoch": 0.3339196319848464,
"grad_norm": 1.3502254486083984,
"learning_rate": 9.996190623265421e-05,
"loss": 1.408,
"step": 1234
},
{
"epoch": 0.3344608307400893,
"grad_norm": 1.2809518575668335,
"learning_rate": 9.996066765395424e-05,
"loss": 1.4176,
"step": 1236
},
{
"epoch": 0.33500202949533214,
"grad_norm": 1.0455057621002197,
"learning_rate": 9.995940926847279e-05,
"loss": 1.4056,
"step": 1238
},
{
"epoch": 0.33554322825057503,
"grad_norm": 1.3292824029922485,
"learning_rate": 9.99581310767087e-05,
"loss": 1.4033,
"step": 1240
},
{
"epoch": 0.33608442700581787,
"grad_norm": 1.5960067510604858,
"learning_rate": 9.995683307916875e-05,
"loss": 1.379,
"step": 1242
},
{
"epoch": 0.33662562576106075,
"grad_norm": 1.0471105575561523,
"learning_rate": 9.99555152763675e-05,
"loss": 1.3823,
"step": 1244
},
{
"epoch": 0.3371668245163036,
"grad_norm": 2.339273452758789,
"learning_rate": 9.99541776688274e-05,
"loss": 1.3698,
"step": 1246
},
{
"epoch": 0.3377080232715465,
"grad_norm": 0.81674724817276,
"learning_rate": 9.995282025707875e-05,
"loss": 1.4154,
"step": 1248
},
{
"epoch": 0.33824922202678936,
"grad_norm": 0.6240290999412537,
"learning_rate": 9.995144304165968e-05,
"loss": 1.4035,
"step": 1250
},
{
"epoch": 0.3387904207820322,
"grad_norm": 2.281787872314453,
"learning_rate": 9.995004602311619e-05,
"loss": 1.3906,
"step": 1252
},
{
"epoch": 0.3393316195372751,
"grad_norm": 0.6818395853042603,
"learning_rate": 9.99486292020021e-05,
"loss": 1.3853,
"step": 1254
},
{
"epoch": 0.3398728182925179,
"grad_norm": 6.299881935119629,
"learning_rate": 9.994719257887915e-05,
"loss": 1.3856,
"step": 1256
},
{
"epoch": 0.3404140170477608,
"grad_norm": 0.8173750638961792,
"learning_rate": 9.994573615431686e-05,
"loss": 1.3871,
"step": 1258
},
{
"epoch": 0.34095521580300364,
"grad_norm": 2.155395746231079,
"learning_rate": 9.994425992889262e-05,
"loss": 1.3382,
"step": 1260
},
{
"epoch": 0.34149641455824653,
"grad_norm": 0.5846114754676819,
"learning_rate": 9.99427639031917e-05,
"loss": 1.3978,
"step": 1262
},
{
"epoch": 0.34203761331348936,
"grad_norm": 0.6624069213867188,
"learning_rate": 9.994124807780717e-05,
"loss": 1.3792,
"step": 1264
},
{
"epoch": 0.34257881206873225,
"grad_norm": 0.5708588361740112,
"learning_rate": 9.993971245333998e-05,
"loss": 1.3677,
"step": 1266
},
{
"epoch": 0.3431200108239751,
"grad_norm": 0.5245474576950073,
"learning_rate": 9.993815703039894e-05,
"loss": 1.3672,
"step": 1268
},
{
"epoch": 0.343661209579218,
"grad_norm": 0.501871645450592,
"learning_rate": 9.993658180960069e-05,
"loss": 1.3674,
"step": 1270
},
{
"epoch": 0.3442024083344608,
"grad_norm": 0.5990382432937622,
"learning_rate": 9.993498679156969e-05,
"loss": 1.3804,
"step": 1272
},
{
"epoch": 0.3447436070897037,
"grad_norm": 0.42392146587371826,
"learning_rate": 9.993337197693833e-05,
"loss": 1.3628,
"step": 1274
},
{
"epoch": 0.34528480584494653,
"grad_norm": 0.46936917304992676,
"learning_rate": 9.993173736634676e-05,
"loss": 1.3696,
"step": 1276
},
{
"epoch": 0.3458260046001894,
"grad_norm": 0.52222740650177,
"learning_rate": 9.993008296044304e-05,
"loss": 1.3697,
"step": 1278
},
{
"epoch": 0.3463672033554323,
"grad_norm": 0.3582518398761749,
"learning_rate": 9.992840875988305e-05,
"loss": 1.3825,
"step": 1280
},
{
"epoch": 0.34690840211067514,
"grad_norm": 0.3533988296985626,
"learning_rate": 9.99267147653305e-05,
"loss": 1.361,
"step": 1282
},
{
"epoch": 0.34744960086591803,
"grad_norm": 0.35905274748802185,
"learning_rate": 9.992500097745702e-05,
"loss": 1.3721,
"step": 1284
},
{
"epoch": 0.34799079962116086,
"grad_norm": 0.3057416081428528,
"learning_rate": 9.9923267396942e-05,
"loss": 1.369,
"step": 1286
},
{
"epoch": 0.34853199837640375,
"grad_norm": 0.3299311101436615,
"learning_rate": 9.992151402447272e-05,
"loss": 1.358,
"step": 1288
},
{
"epoch": 0.3490731971316466,
"grad_norm": 0.3086453080177307,
"learning_rate": 9.99197408607443e-05,
"loss": 1.3534,
"step": 1290
},
{
"epoch": 0.3496143958868895,
"grad_norm": 0.3111782968044281,
"learning_rate": 9.991794790645969e-05,
"loss": 1.3605,
"step": 1292
},
{
"epoch": 0.3501555946421323,
"grad_norm": 0.3231568932533264,
"learning_rate": 9.991613516232974e-05,
"loss": 1.3543,
"step": 1294
},
{
"epoch": 0.3506967933973752,
"grad_norm": 0.3288814425468445,
"learning_rate": 9.991430262907309e-05,
"loss": 1.3521,
"step": 1296
},
{
"epoch": 0.35123799215261803,
"grad_norm": 0.3239436745643616,
"learning_rate": 9.991245030741622e-05,
"loss": 1.3335,
"step": 1298
},
{
"epoch": 0.3517791909078609,
"grad_norm": 0.3560773730278015,
"learning_rate": 9.991057819809353e-05,
"loss": 1.3487,
"step": 1300
},
{
"epoch": 0.35232038966310375,
"grad_norm": 0.4387347400188446,
"learning_rate": 9.990868630184716e-05,
"loss": 1.3548,
"step": 1302
},
{
"epoch": 0.35286158841834664,
"grad_norm": 0.32067278027534485,
"learning_rate": 9.990677461942717e-05,
"loss": 1.3471,
"step": 1304
},
{
"epoch": 0.3534027871735895,
"grad_norm": 0.4399580955505371,
"learning_rate": 9.990484315159146e-05,
"loss": 1.3588,
"step": 1306
},
{
"epoch": 0.35394398592883236,
"grad_norm": 0.9175602793693542,
"learning_rate": 9.990289189910571e-05,
"loss": 1.3432,
"step": 1308
},
{
"epoch": 0.35448518468407525,
"grad_norm": 0.45273318886756897,
"learning_rate": 9.990092086274352e-05,
"loss": 1.3434,
"step": 1310
},
{
"epoch": 0.3550263834393181,
"grad_norm": 0.3346487879753113,
"learning_rate": 9.989893004328632e-05,
"loss": 1.3339,
"step": 1312
},
{
"epoch": 0.35556758219456097,
"grad_norm": 0.4779951870441437,
"learning_rate": 9.989691944152333e-05,
"loss": 1.3561,
"step": 1314
},
{
"epoch": 0.3561087809498038,
"grad_norm": 0.6359366774559021,
"learning_rate": 9.989488905825166e-05,
"loss": 1.3499,
"step": 1316
},
{
"epoch": 0.3566499797050467,
"grad_norm": 0.5867050290107727,
"learning_rate": 9.989283889427625e-05,
"loss": 1.3791,
"step": 1318
},
{
"epoch": 0.3571911784602895,
"grad_norm": 1.869691014289856,
"learning_rate": 9.989076895040989e-05,
"loss": 1.3663,
"step": 1320
},
{
"epoch": 0.3577323772155324,
"grad_norm": 2.7147843837738037,
"learning_rate": 9.98886792274732e-05,
"loss": 1.358,
"step": 1322
},
{
"epoch": 0.35827357597077525,
"grad_norm": 0.8717885613441467,
"learning_rate": 9.988656972629465e-05,
"loss": 1.34,
"step": 1324
},
{
"epoch": 0.35881477472601814,
"grad_norm": 0.7126337885856628,
"learning_rate": 9.988444044771054e-05,
"loss": 1.3281,
"step": 1326
},
{
"epoch": 0.35935597348126097,
"grad_norm": 0.7409217357635498,
"learning_rate": 9.988229139256502e-05,
"loss": 1.3571,
"step": 1328
},
{
"epoch": 0.35989717223650386,
"grad_norm": 0.5892549157142639,
"learning_rate": 9.988012256171006e-05,
"loss": 1.3269,
"step": 1330
},
{
"epoch": 0.3604383709917467,
"grad_norm": 0.4858717620372772,
"learning_rate": 9.98779339560055e-05,
"loss": 1.3506,
"step": 1332
},
{
"epoch": 0.3609795697469896,
"grad_norm": 0.37409740686416626,
"learning_rate": 9.987572557631903e-05,
"loss": 1.3339,
"step": 1334
},
{
"epoch": 0.36152076850223247,
"grad_norm": 0.38315168023109436,
"learning_rate": 9.987349742352611e-05,
"loss": 1.3404,
"step": 1336
},
{
"epoch": 0.3620619672574753,
"grad_norm": 0.32702726125717163,
"learning_rate": 9.987124949851014e-05,
"loss": 1.3595,
"step": 1338
},
{
"epoch": 0.3626031660127182,
"grad_norm": 0.3133656680583954,
"learning_rate": 9.986898180216226e-05,
"loss": 1.3428,
"step": 1340
},
{
"epoch": 0.363144364767961,
"grad_norm": 0.2916230857372284,
"learning_rate": 9.986669433538152e-05,
"loss": 1.3381,
"step": 1342
},
{
"epoch": 0.3636855635232039,
"grad_norm": 0.28036215901374817,
"learning_rate": 9.986438709907476e-05,
"loss": 1.3447,
"step": 1344
},
{
"epoch": 0.36422676227844675,
"grad_norm": 0.30352699756622314,
"learning_rate": 9.98620600941567e-05,
"loss": 1.3427,
"step": 1346
},
{
"epoch": 0.36476796103368964,
"grad_norm": 0.3100769519805908,
"learning_rate": 9.985971332154984e-05,
"loss": 1.3603,
"step": 1348
},
{
"epoch": 0.36530915978893247,
"grad_norm": 0.2933647930622101,
"learning_rate": 9.98573467821846e-05,
"loss": 1.3646,
"step": 1350
},
{
"epoch": 0.36585035854417536,
"grad_norm": 0.2938663959503174,
"learning_rate": 9.985496047699916e-05,
"loss": 1.3763,
"step": 1352
},
{
"epoch": 0.3663915572994182,
"grad_norm": 0.2916519343852997,
"learning_rate": 9.985255440693955e-05,
"loss": 1.3431,
"step": 1354
},
{
"epoch": 0.3669327560546611,
"grad_norm": 0.2954147756099701,
"learning_rate": 9.985012857295968e-05,
"loss": 1.338,
"step": 1356
},
{
"epoch": 0.3674739548099039,
"grad_norm": 0.2839341163635254,
"learning_rate": 9.984768297602125e-05,
"loss": 1.3653,
"step": 1358
},
{
"epoch": 0.3680151535651468,
"grad_norm": 0.2878473699092865,
"learning_rate": 9.984521761709382e-05,
"loss": 1.3302,
"step": 1360
},
{
"epoch": 0.3685563523203897,
"grad_norm": 0.2859325408935547,
"learning_rate": 9.984273249715478e-05,
"loss": 1.3273,
"step": 1362
},
{
"epoch": 0.3690975510756325,
"grad_norm": 0.28399959206581116,
"learning_rate": 9.984022761718933e-05,
"loss": 1.3516,
"step": 1364
},
{
"epoch": 0.3696387498308754,
"grad_norm": 0.29740169644355774,
"learning_rate": 9.983770297819052e-05,
"loss": 1.3389,
"step": 1366
},
{
"epoch": 0.37017994858611825,
"grad_norm": 0.3143361806869507,
"learning_rate": 9.983515858115928e-05,
"loss": 1.3557,
"step": 1368
},
{
"epoch": 0.37072114734136113,
"grad_norm": 0.30783936381340027,
"learning_rate": 9.983259442710429e-05,
"loss": 1.3498,
"step": 1370
},
{
"epoch": 0.37126234609660397,
"grad_norm": 0.297091543674469,
"learning_rate": 9.983001051704211e-05,
"loss": 1.3308,
"step": 1372
},
{
"epoch": 0.37180354485184686,
"grad_norm": 0.3118893504142761,
"learning_rate": 9.982740685199712e-05,
"loss": 1.3372,
"step": 1374
},
{
"epoch": 0.3723447436070897,
"grad_norm": 0.2826865017414093,
"learning_rate": 9.982478343300155e-05,
"loss": 1.3488,
"step": 1376
},
{
"epoch": 0.3728859423623326,
"grad_norm": 0.2829175889492035,
"learning_rate": 9.982214026109544e-05,
"loss": 1.3693,
"step": 1378
},
{
"epoch": 0.3734271411175754,
"grad_norm": 0.3026389479637146,
"learning_rate": 9.981947733732668e-05,
"loss": 1.3276,
"step": 1380
},
{
"epoch": 0.3739683398728183,
"grad_norm": 0.30112889409065247,
"learning_rate": 9.981679466275096e-05,
"loss": 1.3441,
"step": 1382
},
{
"epoch": 0.37450953862806113,
"grad_norm": 0.27241262793540955,
"learning_rate": 9.981409223843183e-05,
"loss": 1.3373,
"step": 1384
},
{
"epoch": 0.375050737383304,
"grad_norm": 0.2804114520549774,
"learning_rate": 9.981137006544066e-05,
"loss": 1.344,
"step": 1386
},
{
"epoch": 0.37559193613854686,
"grad_norm": 0.27698764204978943,
"learning_rate": 9.980862814485665e-05,
"loss": 1.3543,
"step": 1388
},
{
"epoch": 0.37613313489378974,
"grad_norm": 0.29283177852630615,
"learning_rate": 9.980586647776681e-05,
"loss": 1.3332,
"step": 1390
},
{
"epoch": 0.37667433364903263,
"grad_norm": 0.2896028459072113,
"learning_rate": 9.980308506526604e-05,
"loss": 1.3392,
"step": 1392
},
{
"epoch": 0.37721553240427547,
"grad_norm": 0.27882838249206543,
"learning_rate": 9.980028390845697e-05,
"loss": 1.336,
"step": 1394
},
{
"epoch": 0.37775673115951836,
"grad_norm": 0.2886262834072113,
"learning_rate": 9.979746300845015e-05,
"loss": 1.3331,
"step": 1396
},
{
"epoch": 0.3782979299147612,
"grad_norm": 0.3085189163684845,
"learning_rate": 9.97946223663639e-05,
"loss": 1.3296,
"step": 1398
},
{
"epoch": 0.3788391286700041,
"grad_norm": 0.3342386484146118,
"learning_rate": 9.97917619833244e-05,
"loss": 1.351,
"step": 1400
},
{
"epoch": 0.3793803274252469,
"grad_norm": 0.3263756036758423,
"learning_rate": 9.978888186046562e-05,
"loss": 1.3526,
"step": 1402
},
{
"epoch": 0.3799215261804898,
"grad_norm": 0.292346715927124,
"learning_rate": 9.97859819989294e-05,
"loss": 1.3498,
"step": 1404
},
{
"epoch": 0.38046272493573263,
"grad_norm": 0.29072263836860657,
"learning_rate": 9.978306239986536e-05,
"loss": 1.3423,
"step": 1406
},
{
"epoch": 0.3810039236909755,
"grad_norm": 0.3350834548473358,
"learning_rate": 9.978012306443101e-05,
"loss": 1.3559,
"step": 1408
},
{
"epoch": 0.38154512244621835,
"grad_norm": 0.28721559047698975,
"learning_rate": 9.977716399379157e-05,
"loss": 1.3294,
"step": 1410
},
{
"epoch": 0.38208632120146124,
"grad_norm": 0.3062276244163513,
"learning_rate": 9.977418518912023e-05,
"loss": 1.3457,
"step": 1412
},
{
"epoch": 0.3826275199567041,
"grad_norm": 0.30255332589149475,
"learning_rate": 9.977118665159791e-05,
"loss": 1.3371,
"step": 1414
},
{
"epoch": 0.38316871871194697,
"grad_norm": 0.2800199091434479,
"learning_rate": 9.976816838241334e-05,
"loss": 1.3439,
"step": 1416
},
{
"epoch": 0.3837099174671898,
"grad_norm": 0.2754746675491333,
"learning_rate": 9.976513038276312e-05,
"loss": 1.3303,
"step": 1418
},
{
"epoch": 0.3842511162224327,
"grad_norm": 0.29933616518974304,
"learning_rate": 9.976207265385168e-05,
"loss": 1.3365,
"step": 1420
},
{
"epoch": 0.3847923149776756,
"grad_norm": 0.3023386001586914,
"learning_rate": 9.975899519689122e-05,
"loss": 1.3164,
"step": 1422
},
{
"epoch": 0.3853335137329184,
"grad_norm": 0.2901383936405182,
"learning_rate": 9.975589801310181e-05,
"loss": 1.3209,
"step": 1424
},
{
"epoch": 0.3858747124881613,
"grad_norm": 0.28566035628318787,
"learning_rate": 9.975278110371131e-05,
"loss": 1.3301,
"step": 1426
},
{
"epoch": 0.38641591124340413,
"grad_norm": 0.3010505735874176,
"learning_rate": 9.974964446995543e-05,
"loss": 1.319,
"step": 1428
},
{
"epoch": 0.386957109998647,
"grad_norm": 0.2977135479450226,
"learning_rate": 9.974648811307766e-05,
"loss": 1.3311,
"step": 1430
},
{
"epoch": 0.38749830875388985,
"grad_norm": 0.28914034366607666,
"learning_rate": 9.974331203432932e-05,
"loss": 1.343,
"step": 1432
},
{
"epoch": 0.38803950750913274,
"grad_norm": 0.2842980623245239,
"learning_rate": 9.974011623496958e-05,
"loss": 1.3162,
"step": 1434
},
{
"epoch": 0.3885807062643756,
"grad_norm": 0.3048929274082184,
"learning_rate": 9.97369007162654e-05,
"loss": 1.3166,
"step": 1436
},
{
"epoch": 0.38912190501961846,
"grad_norm": 0.3024531304836273,
"learning_rate": 9.973366547949157e-05,
"loss": 1.3156,
"step": 1438
},
{
"epoch": 0.3896631037748613,
"grad_norm": 0.2911103367805481,
"learning_rate": 9.973041052593068e-05,
"loss": 1.3314,
"step": 1440
},
{
"epoch": 0.3902043025301042,
"grad_norm": 0.30932334065437317,
"learning_rate": 9.972713585687317e-05,
"loss": 1.3144,
"step": 1442
},
{
"epoch": 0.390745501285347,
"grad_norm": 0.302971750497818,
"learning_rate": 9.972384147361725e-05,
"loss": 1.3431,
"step": 1444
},
{
"epoch": 0.3912867000405899,
"grad_norm": 0.32412296533584595,
"learning_rate": 9.972052737746898e-05,
"loss": 1.3167,
"step": 1446
},
{
"epoch": 0.3918278987958328,
"grad_norm": 0.4637945890426636,
"learning_rate": 9.97171935697422e-05,
"loss": 1.3433,
"step": 1448
},
{
"epoch": 0.39236909755107563,
"grad_norm": 0.32690081000328064,
"learning_rate": 9.971384005175864e-05,
"loss": 1.3327,
"step": 1450
},
{
"epoch": 0.3929102963063185,
"grad_norm": 0.3049994111061096,
"learning_rate": 9.971046682484776e-05,
"loss": 1.3401,
"step": 1452
},
{
"epoch": 0.39345149506156135,
"grad_norm": 0.306095689535141,
"learning_rate": 9.970707389034688e-05,
"loss": 1.3205,
"step": 1454
},
{
"epoch": 0.39399269381680424,
"grad_norm": 0.3375592529773712,
"learning_rate": 9.970366124960111e-05,
"loss": 1.3243,
"step": 1456
},
{
"epoch": 0.3945338925720471,
"grad_norm": 0.30508387088775635,
"learning_rate": 9.970022890396338e-05,
"loss": 1.3342,
"step": 1458
},
{
"epoch": 0.39507509132728996,
"grad_norm": 0.2996918261051178,
"learning_rate": 9.969677685479444e-05,
"loss": 1.3457,
"step": 1460
},
{
"epoch": 0.3956162900825328,
"grad_norm": 0.29500269889831543,
"learning_rate": 9.969330510346286e-05,
"loss": 1.3306,
"step": 1462
},
{
"epoch": 0.3961574888377757,
"grad_norm": 0.28392598032951355,
"learning_rate": 9.9689813651345e-05,
"loss": 1.3347,
"step": 1464
},
{
"epoch": 0.3966986875930185,
"grad_norm": 0.2859434485435486,
"learning_rate": 9.968630249982503e-05,
"loss": 1.3342,
"step": 1466
},
{
"epoch": 0.3972398863482614,
"grad_norm": 0.3038876950740814,
"learning_rate": 9.968277165029494e-05,
"loss": 1.3248,
"step": 1468
},
{
"epoch": 0.39778108510350424,
"grad_norm": 0.3060581088066101,
"learning_rate": 9.967922110415454e-05,
"loss": 1.3403,
"step": 1470
},
{
"epoch": 0.39832228385874713,
"grad_norm": 0.30475133657455444,
"learning_rate": 9.96756508628114e-05,
"loss": 1.3338,
"step": 1472
},
{
"epoch": 0.39886348261398996,
"grad_norm": 0.33263343572616577,
"learning_rate": 9.967206092768095e-05,
"loss": 1.3209,
"step": 1474
},
{
"epoch": 0.39940468136923285,
"grad_norm": 0.2895435094833374,
"learning_rate": 9.966845130018645e-05,
"loss": 1.3352,
"step": 1476
},
{
"epoch": 0.39994588012447574,
"grad_norm": 0.27237775921821594,
"learning_rate": 9.966482198175886e-05,
"loss": 1.3239,
"step": 1478
},
{
"epoch": 0.40048707887971857,
"grad_norm": 0.2740168571472168,
"learning_rate": 9.966117297383707e-05,
"loss": 1.3371,
"step": 1480
},
{
"epoch": 0.40102827763496146,
"grad_norm": 0.30601269006729126,
"learning_rate": 9.965750427786768e-05,
"loss": 1.343,
"step": 1482
},
{
"epoch": 0.4015694763902043,
"grad_norm": 0.28768840432167053,
"learning_rate": 9.965381589530518e-05,
"loss": 1.3442,
"step": 1484
},
{
"epoch": 0.4021106751454472,
"grad_norm": 0.28244882822036743,
"learning_rate": 9.965010782761177e-05,
"loss": 1.3336,
"step": 1486
},
{
"epoch": 0.40265187390069,
"grad_norm": 0.2694818079471588,
"learning_rate": 9.964638007625754e-05,
"loss": 1.3448,
"step": 1488
},
{
"epoch": 0.4031930726559329,
"grad_norm": 0.29507288336753845,
"learning_rate": 9.964263264272033e-05,
"loss": 1.327,
"step": 1490
},
{
"epoch": 0.40373427141117574,
"grad_norm": 0.3036315143108368,
"learning_rate": 9.963886552848581e-05,
"loss": 1.3289,
"step": 1492
},
{
"epoch": 0.4042754701664186,
"grad_norm": 0.2737107574939728,
"learning_rate": 9.963507873504744e-05,
"loss": 1.3281,
"step": 1494
},
{
"epoch": 0.40481666892166146,
"grad_norm": 0.29833105206489563,
"learning_rate": 9.963127226390647e-05,
"loss": 1.3378,
"step": 1496
},
{
"epoch": 0.40535786767690435,
"grad_norm": 0.32203689217567444,
"learning_rate": 9.9627446116572e-05,
"loss": 1.3158,
"step": 1498
},
{
"epoch": 0.4058990664321472,
"grad_norm": 0.27837038040161133,
"learning_rate": 9.962360029456086e-05,
"loss": 1.3051,
"step": 1500
},
{
"epoch": 0.40644026518739007,
"grad_norm": 0.2688932418823242,
"learning_rate": 9.961973479939774e-05,
"loss": 1.339,
"step": 1502
},
{
"epoch": 0.40698146394263296,
"grad_norm": 0.2779388725757599,
"learning_rate": 9.96158496326151e-05,
"loss": 1.3264,
"step": 1504
},
{
"epoch": 0.4075226626978758,
"grad_norm": 0.27401190996170044,
"learning_rate": 9.961194479575321e-05,
"loss": 1.3139,
"step": 1506
},
{
"epoch": 0.4080638614531187,
"grad_norm": 0.270448237657547,
"learning_rate": 9.960802029036012e-05,
"loss": 1.3253,
"step": 1508
},
{
"epoch": 0.4086050602083615,
"grad_norm": 0.29150158166885376,
"learning_rate": 9.96040761179917e-05,
"loss": 1.3324,
"step": 1510
},
{
"epoch": 0.4091462589636044,
"grad_norm": 0.2666511833667755,
"learning_rate": 9.960011228021159e-05,
"loss": 1.325,
"step": 1512
},
{
"epoch": 0.40968745771884724,
"grad_norm": 0.2782241106033325,
"learning_rate": 9.959612877859125e-05,
"loss": 1.3162,
"step": 1514
},
{
"epoch": 0.4102286564740901,
"grad_norm": 0.2845720946788788,
"learning_rate": 9.959212561470996e-05,
"loss": 1.3316,
"step": 1516
},
{
"epoch": 0.41076985522933296,
"grad_norm": 0.27991780638694763,
"learning_rate": 9.958810279015473e-05,
"loss": 1.3121,
"step": 1518
},
{
"epoch": 0.41131105398457585,
"grad_norm": 0.2804965674877167,
"learning_rate": 9.958406030652043e-05,
"loss": 1.3246,
"step": 1520
},
{
"epoch": 0.4118522527398187,
"grad_norm": 0.2732795178890228,
"learning_rate": 9.957999816540965e-05,
"loss": 1.3217,
"step": 1522
},
{
"epoch": 0.41239345149506157,
"grad_norm": 0.28181079030036926,
"learning_rate": 9.957591636843284e-05,
"loss": 1.3374,
"step": 1524
},
{
"epoch": 0.4129346502503044,
"grad_norm": 0.3096240162849426,
"learning_rate": 9.957181491720822e-05,
"loss": 1.3324,
"step": 1526
},
{
"epoch": 0.4134758490055473,
"grad_norm": 0.2709742486476898,
"learning_rate": 9.95676938133618e-05,
"loss": 1.3055,
"step": 1528
},
{
"epoch": 0.4140170477607901,
"grad_norm": 0.27309080958366394,
"learning_rate": 9.956355305852736e-05,
"loss": 1.313,
"step": 1530
},
{
"epoch": 0.414558246516033,
"grad_norm": 0.29801151156425476,
"learning_rate": 9.955939265434652e-05,
"loss": 1.3185,
"step": 1532
},
{
"epoch": 0.4150994452712759,
"grad_norm": 0.28698021173477173,
"learning_rate": 9.955521260246865e-05,
"loss": 1.3214,
"step": 1534
},
{
"epoch": 0.41564064402651874,
"grad_norm": 0.2641914188861847,
"learning_rate": 9.955101290455093e-05,
"loss": 1.317,
"step": 1536
},
{
"epoch": 0.4161818427817616,
"grad_norm": 0.26065558195114136,
"learning_rate": 9.954679356225832e-05,
"loss": 1.3253,
"step": 1538
},
{
"epoch": 0.41672304153700446,
"grad_norm": 0.27157294750213623,
"learning_rate": 9.954255457726354e-05,
"loss": 1.3218,
"step": 1540
},
{
"epoch": 0.41726424029224735,
"grad_norm": 0.2833496630191803,
"learning_rate": 9.953829595124715e-05,
"loss": 1.32,
"step": 1542
},
{
"epoch": 0.4178054390474902,
"grad_norm": 0.2757824659347534,
"learning_rate": 9.953401768589745e-05,
"loss": 1.3165,
"step": 1544
},
{
"epoch": 0.41834663780273307,
"grad_norm": 0.2609362304210663,
"learning_rate": 9.952971978291059e-05,
"loss": 1.3229,
"step": 1546
},
{
"epoch": 0.4188878365579759,
"grad_norm": 0.2863214313983917,
"learning_rate": 9.952540224399043e-05,
"loss": 1.3217,
"step": 1548
},
{
"epoch": 0.4194290353132188,
"grad_norm": 0.27573657035827637,
"learning_rate": 9.952106507084864e-05,
"loss": 1.3151,
"step": 1550
},
{
"epoch": 0.4199702340684616,
"grad_norm": 0.26843398809432983,
"learning_rate": 9.95167082652047e-05,
"loss": 1.3185,
"step": 1552
},
{
"epoch": 0.4205114328237045,
"grad_norm": 0.25903749465942383,
"learning_rate": 9.951233182878585e-05,
"loss": 1.3142,
"step": 1554
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.27221450209617615,
"learning_rate": 9.950793576332713e-05,
"loss": 1.3119,
"step": 1556
},
{
"epoch": 0.42159383033419023,
"grad_norm": 0.2897038161754608,
"learning_rate": 9.950352007057134e-05,
"loss": 1.3217,
"step": 1558
},
{
"epoch": 0.42213502908943307,
"grad_norm": 0.2515231668949127,
"learning_rate": 9.949908475226905e-05,
"loss": 1.3263,
"step": 1560
},
{
"epoch": 0.42267622784467596,
"grad_norm": 0.26686710119247437,
"learning_rate": 9.949462981017865e-05,
"loss": 1.3269,
"step": 1562
},
{
"epoch": 0.42321742659991884,
"grad_norm": 0.2747204899787903,
"learning_rate": 9.949015524606629e-05,
"loss": 1.3208,
"step": 1564
},
{
"epoch": 0.4237586253551617,
"grad_norm": 0.25866395235061646,
"learning_rate": 9.948566106170589e-05,
"loss": 1.3273,
"step": 1566
},
{
"epoch": 0.42429982411040457,
"grad_norm": 0.2659189999103546,
"learning_rate": 9.948114725887918e-05,
"loss": 1.2955,
"step": 1568
},
{
"epoch": 0.4248410228656474,
"grad_norm": 0.25262853503227234,
"learning_rate": 9.947661383937563e-05,
"loss": 1.284,
"step": 1570
},
{
"epoch": 0.4253822216208903,
"grad_norm": 0.24780422449111938,
"learning_rate": 9.94720608049925e-05,
"loss": 1.3168,
"step": 1572
},
{
"epoch": 0.4259234203761331,
"grad_norm": 0.2663845121860504,
"learning_rate": 9.946748815753484e-05,
"loss": 1.313,
"step": 1574
},
{
"epoch": 0.426464619131376,
"grad_norm": 0.2906511425971985,
"learning_rate": 9.946289589881545e-05,
"loss": 1.3197,
"step": 1576
},
{
"epoch": 0.42700581788661884,
"grad_norm": 0.28401264548301697,
"learning_rate": 9.945828403065493e-05,
"loss": 1.3254,
"step": 1578
},
{
"epoch": 0.42754701664186173,
"grad_norm": 0.27820122241973877,
"learning_rate": 9.945365255488164e-05,
"loss": 1.3153,
"step": 1580
},
{
"epoch": 0.42808821539710457,
"grad_norm": 0.2573559880256653,
"learning_rate": 9.944900147333173e-05,
"loss": 1.3144,
"step": 1582
},
{
"epoch": 0.42862941415234745,
"grad_norm": 0.2536357343196869,
"learning_rate": 9.944433078784909e-05,
"loss": 1.3172,
"step": 1584
},
{
"epoch": 0.4291706129075903,
"grad_norm": 0.2745160758495331,
"learning_rate": 9.94396405002854e-05,
"loss": 1.3023,
"step": 1586
},
{
"epoch": 0.4297118116628332,
"grad_norm": 0.290393203496933,
"learning_rate": 9.943493061250013e-05,
"loss": 1.3095,
"step": 1588
},
{
"epoch": 0.43025301041807607,
"grad_norm": 0.29357218742370605,
"learning_rate": 9.94302011263605e-05,
"loss": 1.3232,
"step": 1590
},
{
"epoch": 0.4307942091733189,
"grad_norm": 0.2756180167198181,
"learning_rate": 9.94254520437415e-05,
"loss": 1.3179,
"step": 1592
},
{
"epoch": 0.4313354079285618,
"grad_norm": 0.30225417017936707,
"learning_rate": 9.942068336652589e-05,
"loss": 1.3353,
"step": 1594
},
{
"epoch": 0.4318766066838046,
"grad_norm": 0.26694637537002563,
"learning_rate": 9.94158950966042e-05,
"loss": 1.318,
"step": 1596
},
{
"epoch": 0.4324178054390475,
"grad_norm": 0.2528863549232483,
"learning_rate": 9.941108723587471e-05,
"loss": 1.3282,
"step": 1598
},
{
"epoch": 0.43295900419429034,
"grad_norm": 0.25261232256889343,
"learning_rate": 9.940625978624353e-05,
"loss": 1.3178,
"step": 1600
},
{
"epoch": 0.43350020294953323,
"grad_norm": 0.2624775767326355,
"learning_rate": 9.940141274962444e-05,
"loss": 1.31,
"step": 1602
},
{
"epoch": 0.43404140170477606,
"grad_norm": 0.260810524225235,
"learning_rate": 9.939654612793908e-05,
"loss": 1.3162,
"step": 1604
},
{
"epoch": 0.43458260046001895,
"grad_norm": 0.2815745174884796,
"learning_rate": 9.939165992311676e-05,
"loss": 1.3112,
"step": 1606
},
{
"epoch": 0.4351237992152618,
"grad_norm": 0.2773973345756531,
"learning_rate": 9.938675413709466e-05,
"loss": 1.3,
"step": 1608
},
{
"epoch": 0.4356649979705047,
"grad_norm": 0.26486915349960327,
"learning_rate": 9.938182877181763e-05,
"loss": 1.3193,
"step": 1610
},
{
"epoch": 0.4362061967257475,
"grad_norm": 0.26103830337524414,
"learning_rate": 9.937688382923832e-05,
"loss": 1.3244,
"step": 1612
},
{
"epoch": 0.4367473954809904,
"grad_norm": 0.2556493878364563,
"learning_rate": 9.937191931131716e-05,
"loss": 1.3087,
"step": 1614
},
{
"epoch": 0.43728859423623323,
"grad_norm": 0.2739090919494629,
"learning_rate": 9.93669352200223e-05,
"loss": 1.3009,
"step": 1616
},
{
"epoch": 0.4378297929914761,
"grad_norm": 0.26297444105148315,
"learning_rate": 9.936193155732967e-05,
"loss": 1.2971,
"step": 1618
},
{
"epoch": 0.438370991746719,
"grad_norm": 0.2587411403656006,
"learning_rate": 9.935690832522297e-05,
"loss": 1.3259,
"step": 1620
},
{
"epoch": 0.43891219050196184,
"grad_norm": 0.2419731616973877,
"learning_rate": 9.935186552569366e-05,
"loss": 1.3123,
"step": 1622
},
{
"epoch": 0.43945338925720473,
"grad_norm": 0.27424389123916626,
"learning_rate": 9.934680316074092e-05,
"loss": 1.3196,
"step": 1624
},
{
"epoch": 0.43999458801244756,
"grad_norm": 0.258242666721344,
"learning_rate": 9.934172123237173e-05,
"loss": 1.3044,
"step": 1626
},
{
"epoch": 0.44053578676769045,
"grad_norm": 0.2621035575866699,
"learning_rate": 9.933661974260078e-05,
"loss": 1.3111,
"step": 1628
},
{
"epoch": 0.4410769855229333,
"grad_norm": 0.25349390506744385,
"learning_rate": 9.93314986934506e-05,
"loss": 1.3025,
"step": 1630
},
{
"epoch": 0.4416181842781762,
"grad_norm": 0.2615620195865631,
"learning_rate": 9.932635808695136e-05,
"loss": 1.3291,
"step": 1632
},
{
"epoch": 0.442159383033419,
"grad_norm": 0.2933880686759949,
"learning_rate": 9.932119792514105e-05,
"loss": 1.3327,
"step": 1634
},
{
"epoch": 0.4427005817886619,
"grad_norm": 0.2584700286388397,
"learning_rate": 9.931601821006544e-05,
"loss": 1.3031,
"step": 1636
},
{
"epoch": 0.44324178054390473,
"grad_norm": 0.2718084156513214,
"learning_rate": 9.931081894377797e-05,
"loss": 1.3053,
"step": 1638
},
{
"epoch": 0.4437829792991476,
"grad_norm": 0.27105703949928284,
"learning_rate": 9.93056001283399e-05,
"loss": 1.3012,
"step": 1640
},
{
"epoch": 0.44432417805439045,
"grad_norm": 0.27265292406082153,
"learning_rate": 9.930036176582021e-05,
"loss": 1.2957,
"step": 1642
},
{
"epoch": 0.44486537680963334,
"grad_norm": 0.26121169328689575,
"learning_rate": 9.929510385829564e-05,
"loss": 1.3062,
"step": 1644
},
{
"epoch": 0.44540657556487623,
"grad_norm": 0.26841971278190613,
"learning_rate": 9.928982640785067e-05,
"loss": 1.3192,
"step": 1646
},
{
"epoch": 0.44594777432011906,
"grad_norm": 0.27634862065315247,
"learning_rate": 9.928452941657755e-05,
"loss": 1.3005,
"step": 1648
},
{
"epoch": 0.44648897307536195,
"grad_norm": 0.25527122616767883,
"learning_rate": 9.927921288657623e-05,
"loss": 1.3121,
"step": 1650
},
{
"epoch": 0.4470301718306048,
"grad_norm": 0.2733294665813446,
"learning_rate": 9.927387681995443e-05,
"loss": 1.3051,
"step": 1652
},
{
"epoch": 0.44757137058584767,
"grad_norm": 0.2783257067203522,
"learning_rate": 9.926852121882766e-05,
"loss": 1.2947,
"step": 1654
},
{
"epoch": 0.4481125693410905,
"grad_norm": 0.2672583758831024,
"learning_rate": 9.926314608531911e-05,
"loss": 1.3272,
"step": 1656
},
{
"epoch": 0.4486537680963334,
"grad_norm": 0.2568219304084778,
"learning_rate": 9.925775142155974e-05,
"loss": 1.3025,
"step": 1658
},
{
"epoch": 0.4491949668515762,
"grad_norm": 0.2576539218425751,
"learning_rate": 9.925233722968826e-05,
"loss": 1.2715,
"step": 1660
},
{
"epoch": 0.4497361656068191,
"grad_norm": 0.25898897647857666,
"learning_rate": 9.924690351185109e-05,
"loss": 1.3039,
"step": 1662
},
{
"epoch": 0.45027736436206195,
"grad_norm": 0.25795668363571167,
"learning_rate": 9.924145027020242e-05,
"loss": 1.3115,
"step": 1664
},
{
"epoch": 0.45081856311730484,
"grad_norm": 0.2781166136264801,
"learning_rate": 9.92359775069042e-05,
"loss": 1.3017,
"step": 1666
},
{
"epoch": 0.45135976187254767,
"grad_norm": 0.2871512770652771,
"learning_rate": 9.923048522412608e-05,
"loss": 1.3206,
"step": 1668
},
{
"epoch": 0.45190096062779056,
"grad_norm": 0.27760595083236694,
"learning_rate": 9.922497342404544e-05,
"loss": 1.3214,
"step": 1670
},
{
"epoch": 0.4524421593830334,
"grad_norm": 0.26959067583084106,
"learning_rate": 9.921944210884746e-05,
"loss": 1.3144,
"step": 1672
},
{
"epoch": 0.4529833581382763,
"grad_norm": 0.2662011384963989,
"learning_rate": 9.921389128072498e-05,
"loss": 1.3022,
"step": 1674
},
{
"epoch": 0.45352455689351917,
"grad_norm": 0.28014811873435974,
"learning_rate": 9.920832094187861e-05,
"loss": 1.3104,
"step": 1676
},
{
"epoch": 0.454065755648762,
"grad_norm": 0.2560974955558777,
"learning_rate": 9.920273109451673e-05,
"loss": 1.3113,
"step": 1678
},
{
"epoch": 0.4546069544040049,
"grad_norm": 0.285339891910553,
"learning_rate": 9.91971217408554e-05,
"loss": 1.3126,
"step": 1680
},
{
"epoch": 0.4551481531592477,
"grad_norm": 0.29105204343795776,
"learning_rate": 9.919149288311843e-05,
"loss": 1.3248,
"step": 1682
},
{
"epoch": 0.4556893519144906,
"grad_norm": 0.2868146002292633,
"learning_rate": 9.918584452353739e-05,
"loss": 1.3217,
"step": 1684
},
{
"epoch": 0.45623055066973345,
"grad_norm": 0.26717278361320496,
"learning_rate": 9.918017666435152e-05,
"loss": 1.2991,
"step": 1686
},
{
"epoch": 0.45677174942497634,
"grad_norm": 0.2560403048992157,
"learning_rate": 9.917448930780786e-05,
"loss": 1.3091,
"step": 1688
},
{
"epoch": 0.45731294818021917,
"grad_norm": 0.2610042989253998,
"learning_rate": 9.916878245616114e-05,
"loss": 1.2948,
"step": 1690
},
{
"epoch": 0.45785414693546206,
"grad_norm": 0.27322304248809814,
"learning_rate": 9.916305611167382e-05,
"loss": 1.3121,
"step": 1692
},
{
"epoch": 0.4583953456907049,
"grad_norm": 0.26559844613075256,
"learning_rate": 9.91573102766161e-05,
"loss": 1.307,
"step": 1694
},
{
"epoch": 0.4589365444459478,
"grad_norm": 0.2677384316921234,
"learning_rate": 9.91515449532659e-05,
"loss": 1.2925,
"step": 1696
},
{
"epoch": 0.4594777432011906,
"grad_norm": 0.2670448422431946,
"learning_rate": 9.914576014390888e-05,
"loss": 1.3051,
"step": 1698
},
{
"epoch": 0.4600189419564335,
"grad_norm": 0.2537919878959656,
"learning_rate": 9.91399558508384e-05,
"loss": 1.3047,
"step": 1700
},
{
"epoch": 0.46056014071167634,
"grad_norm": 0.2712916433811188,
"learning_rate": 9.913413207635555e-05,
"loss": 1.2949,
"step": 1702
},
{
"epoch": 0.4611013394669192,
"grad_norm": 0.27910125255584717,
"learning_rate": 9.912828882276917e-05,
"loss": 1.336,
"step": 1704
},
{
"epoch": 0.4616425382221621,
"grad_norm": 0.25917065143585205,
"learning_rate": 9.91224260923958e-05,
"loss": 1.2938,
"step": 1706
},
{
"epoch": 0.46218373697740495,
"grad_norm": 0.265024334192276,
"learning_rate": 9.91165438875597e-05,
"loss": 1.2876,
"step": 1708
},
{
"epoch": 0.46272493573264784,
"grad_norm": 0.2637651860713959,
"learning_rate": 9.911064221059286e-05,
"loss": 1.3128,
"step": 1710
},
{
"epoch": 0.46326613448789067,
"grad_norm": 0.25448864698410034,
"learning_rate": 9.910472106383495e-05,
"loss": 1.3289,
"step": 1712
},
{
"epoch": 0.46380733324313356,
"grad_norm": 0.24903124570846558,
"learning_rate": 9.909878044963346e-05,
"loss": 1.3013,
"step": 1714
},
{
"epoch": 0.4643485319983764,
"grad_norm": 0.258848637342453,
"learning_rate": 9.909282037034347e-05,
"loss": 1.3052,
"step": 1716
},
{
"epoch": 0.4648897307536193,
"grad_norm": 0.25806304812431335,
"learning_rate": 9.908684082832787e-05,
"loss": 1.286,
"step": 1718
},
{
"epoch": 0.4654309295088621,
"grad_norm": 0.26794132590293884,
"learning_rate": 9.908084182595723e-05,
"loss": 1.3069,
"step": 1720
},
{
"epoch": 0.465972128264105,
"grad_norm": 0.26079118251800537,
"learning_rate": 9.907482336560983e-05,
"loss": 1.3145,
"step": 1722
},
{
"epoch": 0.46651332701934783,
"grad_norm": 0.25958481431007385,
"learning_rate": 9.906878544967169e-05,
"loss": 1.3098,
"step": 1724
},
{
"epoch": 0.4670545257745907,
"grad_norm": 0.2390812784433365,
"learning_rate": 9.906272808053652e-05,
"loss": 1.3085,
"step": 1726
},
{
"epoch": 0.46759572452983356,
"grad_norm": 0.263637900352478,
"learning_rate": 9.905665126060574e-05,
"loss": 1.2933,
"step": 1728
},
{
"epoch": 0.46813692328507645,
"grad_norm": 0.2462746798992157,
"learning_rate": 9.90505549922885e-05,
"loss": 1.2877,
"step": 1730
},
{
"epoch": 0.46867812204031933,
"grad_norm": 0.244845911860466,
"learning_rate": 9.904443927800164e-05,
"loss": 1.325,
"step": 1732
},
{
"epoch": 0.46921932079556217,
"grad_norm": 0.28249332308769226,
"learning_rate": 9.903830412016974e-05,
"loss": 1.313,
"step": 1734
},
{
"epoch": 0.46976051955080506,
"grad_norm": 0.29556336998939514,
"learning_rate": 9.903214952122504e-05,
"loss": 1.3142,
"step": 1736
},
{
"epoch": 0.4703017183060479,
"grad_norm": 0.2746431827545166,
"learning_rate": 9.902597548360754e-05,
"loss": 1.3096,
"step": 1738
},
{
"epoch": 0.4708429170612908,
"grad_norm": 0.2979538142681122,
"learning_rate": 9.901978200976492e-05,
"loss": 1.2849,
"step": 1740
},
{
"epoch": 0.4713841158165336,
"grad_norm": 0.2766527235507965,
"learning_rate": 9.901356910215255e-05,
"loss": 1.3089,
"step": 1742
},
{
"epoch": 0.4719253145717765,
"grad_norm": 0.25000783801078796,
"learning_rate": 9.900733676323353e-05,
"loss": 1.308,
"step": 1744
},
{
"epoch": 0.47246651332701933,
"grad_norm": 0.26226234436035156,
"learning_rate": 9.900108499547864e-05,
"loss": 1.3041,
"step": 1746
},
{
"epoch": 0.4730077120822622,
"grad_norm": 0.2794544994831085,
"learning_rate": 9.899481380136642e-05,
"loss": 1.3312,
"step": 1748
},
{
"epoch": 0.47354891083750505,
"grad_norm": 0.24771127104759216,
"learning_rate": 9.898852318338303e-05,
"loss": 1.2853,
"step": 1750
},
{
"epoch": 0.47409010959274794,
"grad_norm": 0.2811632752418518,
"learning_rate": 9.898221314402238e-05,
"loss": 1.3019,
"step": 1752
},
{
"epoch": 0.4746313083479908,
"grad_norm": 0.2812533378601074,
"learning_rate": 9.897588368578608e-05,
"loss": 1.3298,
"step": 1754
},
{
"epoch": 0.47517250710323367,
"grad_norm": 0.25955653190612793,
"learning_rate": 9.896953481118341e-05,
"loss": 1.3093,
"step": 1756
},
{
"epoch": 0.4757137058584765,
"grad_norm": 0.2653108537197113,
"learning_rate": 9.896316652273136e-05,
"loss": 1.2898,
"step": 1758
},
{
"epoch": 0.4762549046137194,
"grad_norm": 0.27985796332359314,
"learning_rate": 9.895677882295466e-05,
"loss": 1.2928,
"step": 1760
},
{
"epoch": 0.4767961033689623,
"grad_norm": 0.2889133393764496,
"learning_rate": 9.895037171438568e-05,
"loss": 1.3088,
"step": 1762
},
{
"epoch": 0.4773373021242051,
"grad_norm": 0.2615009546279907,
"learning_rate": 9.894394519956448e-05,
"loss": 1.3212,
"step": 1764
},
{
"epoch": 0.477878500879448,
"grad_norm": 0.24938960373401642,
"learning_rate": 9.893749928103885e-05,
"loss": 1.2982,
"step": 1766
},
{
"epoch": 0.47841969963469083,
"grad_norm": 0.27132853865623474,
"learning_rate": 9.893103396136427e-05,
"loss": 1.294,
"step": 1768
},
{
"epoch": 0.4789608983899337,
"grad_norm": 0.2632822096347809,
"learning_rate": 9.89245492431039e-05,
"loss": 1.2852,
"step": 1770
},
{
"epoch": 0.47950209714517655,
"grad_norm": 0.27269670367240906,
"learning_rate": 9.891804512882856e-05,
"loss": 1.2934,
"step": 1772
},
{
"epoch": 0.48004329590041944,
"grad_norm": 0.2572595179080963,
"learning_rate": 9.891152162111683e-05,
"loss": 1.2719,
"step": 1774
},
{
"epoch": 0.4805844946556623,
"grad_norm": 0.2708267867565155,
"learning_rate": 9.890497872255489e-05,
"loss": 1.2907,
"step": 1776
},
{
"epoch": 0.48112569341090516,
"grad_norm": 0.28407028317451477,
"learning_rate": 9.889841643573671e-05,
"loss": 1.2977,
"step": 1778
},
{
"epoch": 0.481666892166148,
"grad_norm": 0.26248103380203247,
"learning_rate": 9.889183476326386e-05,
"loss": 1.2993,
"step": 1780
},
{
"epoch": 0.4822080909213909,
"grad_norm": 0.26148512959480286,
"learning_rate": 9.888523370774563e-05,
"loss": 1.2893,
"step": 1782
},
{
"epoch": 0.4827492896766337,
"grad_norm": 0.2815425395965576,
"learning_rate": 9.8878613271799e-05,
"loss": 1.3015,
"step": 1784
},
{
"epoch": 0.4832904884318766,
"grad_norm": 0.26061713695526123,
"learning_rate": 9.887197345804862e-05,
"loss": 1.2781,
"step": 1786
},
{
"epoch": 0.4838316871871195,
"grad_norm": 0.2641533613204956,
"learning_rate": 9.886531426912683e-05,
"loss": 1.2993,
"step": 1788
},
{
"epoch": 0.48437288594236233,
"grad_norm": 0.25920137763023376,
"learning_rate": 9.885863570767364e-05,
"loss": 1.2955,
"step": 1790
},
{
"epoch": 0.4849140846976052,
"grad_norm": 0.24002158641815186,
"learning_rate": 9.885193777633676e-05,
"loss": 1.2932,
"step": 1792
},
{
"epoch": 0.48545528345284805,
"grad_norm": 0.2643393576145172,
"learning_rate": 9.884522047777157e-05,
"loss": 1.2963,
"step": 1794
},
{
"epoch": 0.48599648220809094,
"grad_norm": 0.2522197663784027,
"learning_rate": 9.883848381464112e-05,
"loss": 1.2947,
"step": 1796
},
{
"epoch": 0.4865376809633338,
"grad_norm": 0.2431286871433258,
"learning_rate": 9.883172778961613e-05,
"loss": 1.3112,
"step": 1798
},
{
"epoch": 0.48707887971857666,
"grad_norm": 0.26892608404159546,
"learning_rate": 9.882495240537505e-05,
"loss": 1.2904,
"step": 1800
},
{
"epoch": 0.4876200784738195,
"grad_norm": 0.2528528571128845,
"learning_rate": 9.881815766460392e-05,
"loss": 1.2949,
"step": 1802
},
{
"epoch": 0.4881612772290624,
"grad_norm": 0.2614927291870117,
"learning_rate": 9.881134356999652e-05,
"loss": 1.288,
"step": 1804
},
{
"epoch": 0.4887024759843052,
"grad_norm": 0.2523605227470398,
"learning_rate": 9.880451012425426e-05,
"loss": 1.3029,
"step": 1806
},
{
"epoch": 0.4892436747395481,
"grad_norm": 0.24303248524665833,
"learning_rate": 9.879765733008627e-05,
"loss": 1.3107,
"step": 1808
},
{
"epoch": 0.48978487349479094,
"grad_norm": 0.2470557987689972,
"learning_rate": 9.879078519020933e-05,
"loss": 1.2856,
"step": 1810
},
{
"epoch": 0.49032607225003383,
"grad_norm": 0.2526317536830902,
"learning_rate": 9.878389370734784e-05,
"loss": 1.2965,
"step": 1812
},
{
"epoch": 0.49086727100527666,
"grad_norm": 0.2483314871788025,
"learning_rate": 9.877698288423394e-05,
"loss": 1.3016,
"step": 1814
},
{
"epoch": 0.49140846976051955,
"grad_norm": 0.24746839702129364,
"learning_rate": 9.877005272360741e-05,
"loss": 1.2944,
"step": 1816
},
{
"epoch": 0.49194966851576244,
"grad_norm": 0.24739988148212433,
"learning_rate": 9.876310322821568e-05,
"loss": 1.3037,
"step": 1818
},
{
"epoch": 0.4924908672710053,
"grad_norm": 0.2740204632282257,
"learning_rate": 9.875613440081387e-05,
"loss": 1.3116,
"step": 1820
},
{
"epoch": 0.49303206602624816,
"grad_norm": 0.27116379141807556,
"learning_rate": 9.874914624416475e-05,
"loss": 1.288,
"step": 1822
},
{
"epoch": 0.493573264781491,
"grad_norm": 0.24231554567813873,
"learning_rate": 9.874213876103878e-05,
"loss": 1.2975,
"step": 1824
},
{
"epoch": 0.4941144635367339,
"grad_norm": 0.2590995728969574,
"learning_rate": 9.873511195421402e-05,
"loss": 1.2678,
"step": 1826
},
{
"epoch": 0.4946556622919767,
"grad_norm": 0.25694531202316284,
"learning_rate": 9.872806582647625e-05,
"loss": 1.28,
"step": 1828
},
{
"epoch": 0.4951968610472196,
"grad_norm": 0.25455620884895325,
"learning_rate": 9.87210003806189e-05,
"loss": 1.2942,
"step": 1830
},
{
"epoch": 0.49573805980246244,
"grad_norm": 0.2639889121055603,
"learning_rate": 9.871391561944302e-05,
"loss": 1.3161,
"step": 1832
},
{
"epoch": 0.4962792585577053,
"grad_norm": 0.271282821893692,
"learning_rate": 9.870681154575737e-05,
"loss": 1.3071,
"step": 1834
},
{
"epoch": 0.49682045731294816,
"grad_norm": 0.26479372382164,
"learning_rate": 9.869968816237833e-05,
"loss": 1.2841,
"step": 1836
},
{
"epoch": 0.49736165606819105,
"grad_norm": 0.26040130853652954,
"learning_rate": 9.869254547212997e-05,
"loss": 1.2989,
"step": 1838
},
{
"epoch": 0.4979028548234339,
"grad_norm": 0.26563623547554016,
"learning_rate": 9.868538347784396e-05,
"loss": 1.2965,
"step": 1840
},
{
"epoch": 0.49844405357867677,
"grad_norm": 0.26089224219322205,
"learning_rate": 9.867820218235969e-05,
"loss": 1.3071,
"step": 1842
},
{
"epoch": 0.4989852523339196,
"grad_norm": 0.27151811122894287,
"learning_rate": 9.867100158852412e-05,
"loss": 1.287,
"step": 1844
},
{
"epoch": 0.4995264510891625,
"grad_norm": 0.2477792203426361,
"learning_rate": 9.866378169919192e-05,
"loss": 1.2894,
"step": 1846
},
{
"epoch": 0.5000676498444053,
"grad_norm": 0.24871942400932312,
"learning_rate": 9.865654251722545e-05,
"loss": 1.3024,
"step": 1848
},
{
"epoch": 0.5006088485996483,
"grad_norm": 0.26377877593040466,
"learning_rate": 9.86492840454946e-05,
"loss": 1.2939,
"step": 1850
},
{
"epoch": 0.5011500473548911,
"grad_norm": 0.258228063583374,
"learning_rate": 9.8642006286877e-05,
"loss": 1.291,
"step": 1852
},
{
"epoch": 0.5016912461101339,
"grad_norm": 0.26982301473617554,
"learning_rate": 9.86347092442579e-05,
"loss": 1.2845,
"step": 1854
},
{
"epoch": 0.5022324448653768,
"grad_norm": 0.24094600975513458,
"learning_rate": 9.862739292053021e-05,
"loss": 1.2744,
"step": 1856
},
{
"epoch": 0.5027736436206197,
"grad_norm": 0.25840380787849426,
"learning_rate": 9.862005731859442e-05,
"loss": 1.2966,
"step": 1858
},
{
"epoch": 0.5033148423758625,
"grad_norm": 0.26734429597854614,
"learning_rate": 9.861270244135877e-05,
"loss": 1.2856,
"step": 1860
},
{
"epoch": 0.5038560411311054,
"grad_norm": 0.24431397020816803,
"learning_rate": 9.860532829173903e-05,
"loss": 1.2871,
"step": 1862
},
{
"epoch": 0.5043972398863482,
"grad_norm": 0.25425857305526733,
"learning_rate": 9.859793487265869e-05,
"loss": 1.2822,
"step": 1864
},
{
"epoch": 0.5049384386415912,
"grad_norm": 0.25332111120224,
"learning_rate": 9.859052218704885e-05,
"loss": 1.2723,
"step": 1866
},
{
"epoch": 0.505479637396834,
"grad_norm": 0.24775418639183044,
"learning_rate": 9.858309023784826e-05,
"loss": 1.2934,
"step": 1868
},
{
"epoch": 0.5060208361520768,
"grad_norm": 0.24880458414554596,
"learning_rate": 9.857563902800328e-05,
"loss": 1.3041,
"step": 1870
},
{
"epoch": 0.5065620349073197,
"grad_norm": 0.2574135959148407,
"learning_rate": 9.856816856046793e-05,
"loss": 1.2855,
"step": 1872
},
{
"epoch": 0.5071032336625626,
"grad_norm": 0.26873350143432617,
"learning_rate": 9.856067883820386e-05,
"loss": 1.3055,
"step": 1874
},
{
"epoch": 0.5076444324178054,
"grad_norm": 0.23742420971393585,
"learning_rate": 9.855316986418036e-05,
"loss": 1.3029,
"step": 1876
},
{
"epoch": 0.5081856311730483,
"grad_norm": 0.2398921549320221,
"learning_rate": 9.854564164137432e-05,
"loss": 1.2849,
"step": 1878
},
{
"epoch": 0.5087268299282912,
"grad_norm": 0.25182288885116577,
"learning_rate": 9.85380941727703e-05,
"loss": 1.2981,
"step": 1880
},
{
"epoch": 0.509268028683534,
"grad_norm": 0.23373378813266754,
"learning_rate": 9.853052746136048e-05,
"loss": 1.2772,
"step": 1882
},
{
"epoch": 0.5098092274387769,
"grad_norm": 0.2581213712692261,
"learning_rate": 9.852294151014466e-05,
"loss": 1.3147,
"step": 1884
},
{
"epoch": 0.5103504261940197,
"grad_norm": 0.26642751693725586,
"learning_rate": 9.851533632213028e-05,
"loss": 1.2885,
"step": 1886
},
{
"epoch": 0.5108916249492627,
"grad_norm": 0.24029181897640228,
"learning_rate": 9.850771190033237e-05,
"loss": 1.297,
"step": 1888
},
{
"epoch": 0.5114328237045055,
"grad_norm": 0.2555221915245056,
"learning_rate": 9.850006824777364e-05,
"loss": 1.284,
"step": 1890
},
{
"epoch": 0.5119740224597483,
"grad_norm": 0.2723660171031952,
"learning_rate": 9.849240536748439e-05,
"loss": 1.2821,
"step": 1892
},
{
"epoch": 0.5125152212149912,
"grad_norm": 0.24772705137729645,
"learning_rate": 9.848472326250253e-05,
"loss": 1.2743,
"step": 1894
},
{
"epoch": 0.5130564199702341,
"grad_norm": 0.2344834804534912,
"learning_rate": 9.847702193587365e-05,
"loss": 1.286,
"step": 1896
},
{
"epoch": 0.5135976187254769,
"grad_norm": 0.23948362469673157,
"learning_rate": 9.846930139065088e-05,
"loss": 1.2673,
"step": 1898
},
{
"epoch": 0.5141388174807198,
"grad_norm": 0.27207908034324646,
"learning_rate": 9.846156162989503e-05,
"loss": 1.3041,
"step": 1900
},
{
"epoch": 0.5146800162359627,
"grad_norm": 0.2407965511083603,
"learning_rate": 9.845380265667454e-05,
"loss": 1.2875,
"step": 1902
},
{
"epoch": 0.5152212149912055,
"grad_norm": 0.2517203688621521,
"learning_rate": 9.844602447406538e-05,
"loss": 1.2855,
"step": 1904
},
{
"epoch": 0.5157624137464484,
"grad_norm": 0.24267178773880005,
"learning_rate": 9.843822708515123e-05,
"loss": 1.2711,
"step": 1906
},
{
"epoch": 0.5163036125016912,
"grad_norm": 0.23933006823062897,
"learning_rate": 9.843041049302331e-05,
"loss": 1.3094,
"step": 1908
},
{
"epoch": 0.5168448112569342,
"grad_norm": 0.21948301792144775,
"learning_rate": 9.842257470078054e-05,
"loss": 1.2686,
"step": 1910
},
{
"epoch": 0.517386010012177,
"grad_norm": 0.239594966173172,
"learning_rate": 9.841471971152933e-05,
"loss": 1.2959,
"step": 1912
},
{
"epoch": 0.5179272087674198,
"grad_norm": 0.26850634813308716,
"learning_rate": 9.840684552838385e-05,
"loss": 1.2969,
"step": 1914
},
{
"epoch": 0.5184684075226627,
"grad_norm": 0.26066869497299194,
"learning_rate": 9.839895215446573e-05,
"loss": 1.2935,
"step": 1916
},
{
"epoch": 0.5190096062779056,
"grad_norm": 0.25288596749305725,
"learning_rate": 9.839103959290433e-05,
"loss": 1.2922,
"step": 1918
},
{
"epoch": 0.5195508050331484,
"grad_norm": 0.24453966319561005,
"learning_rate": 9.838310784683655e-05,
"loss": 1.3058,
"step": 1920
},
{
"epoch": 0.5200920037883913,
"grad_norm": 0.25353509187698364,
"learning_rate": 9.837515691940689e-05,
"loss": 1.3161,
"step": 1922
},
{
"epoch": 0.5206332025436341,
"grad_norm": 0.24898375570774078,
"learning_rate": 9.836718681376749e-05,
"loss": 1.2925,
"step": 1924
},
{
"epoch": 0.521174401298877,
"grad_norm": 0.2576977014541626,
"learning_rate": 9.835919753307807e-05,
"loss": 1.2916,
"step": 1926
},
{
"epoch": 0.5217156000541199,
"grad_norm": 0.25432518124580383,
"learning_rate": 9.8351189080506e-05,
"loss": 1.2866,
"step": 1928
},
{
"epoch": 0.5222567988093627,
"grad_norm": 0.2504200339317322,
"learning_rate": 9.834316145922615e-05,
"loss": 1.2728,
"step": 1930
},
{
"epoch": 0.5227979975646057,
"grad_norm": 0.2627692222595215,
"learning_rate": 9.83351146724211e-05,
"loss": 1.2853,
"step": 1932
},
{
"epoch": 0.5233391963198485,
"grad_norm": 0.2776716351509094,
"learning_rate": 9.832704872328094e-05,
"loss": 1.2881,
"step": 1934
},
{
"epoch": 0.5238803950750913,
"grad_norm": 0.24669450521469116,
"learning_rate": 9.831896361500344e-05,
"loss": 1.2681,
"step": 1936
},
{
"epoch": 0.5244215938303342,
"grad_norm": 0.24949464201927185,
"learning_rate": 9.831085935079387e-05,
"loss": 1.2851,
"step": 1938
},
{
"epoch": 0.5249627925855771,
"grad_norm": 0.2585392892360687,
"learning_rate": 9.830273593386518e-05,
"loss": 1.2796,
"step": 1940
},
{
"epoch": 0.5255039913408199,
"grad_norm": 0.26086801290512085,
"learning_rate": 9.829459336743787e-05,
"loss": 1.293,
"step": 1942
},
{
"epoch": 0.5260451900960628,
"grad_norm": 0.25490057468414307,
"learning_rate": 9.828643165474006e-05,
"loss": 1.2824,
"step": 1944
},
{
"epoch": 0.5265863888513056,
"grad_norm": 0.24865177273750305,
"learning_rate": 9.827825079900739e-05,
"loss": 1.2835,
"step": 1946
},
{
"epoch": 0.5271275876065485,
"grad_norm": 0.25498902797698975,
"learning_rate": 9.827005080348317e-05,
"loss": 1.2931,
"step": 1948
},
{
"epoch": 0.5276687863617914,
"grad_norm": 0.2585375905036926,
"learning_rate": 9.826183167141828e-05,
"loss": 1.2659,
"step": 1950
},
{
"epoch": 0.5282099851170342,
"grad_norm": 0.2300305813550949,
"learning_rate": 9.825359340607116e-05,
"loss": 1.3019,
"step": 1952
},
{
"epoch": 0.528751183872277,
"grad_norm": 0.24674038589000702,
"learning_rate": 9.824533601070784e-05,
"loss": 1.2784,
"step": 1954
},
{
"epoch": 0.52929238262752,
"grad_norm": 0.23458759486675262,
"learning_rate": 9.823705948860195e-05,
"loss": 1.2779,
"step": 1956
},
{
"epoch": 0.5298335813827628,
"grad_norm": 0.24736309051513672,
"learning_rate": 9.822876384303472e-05,
"loss": 1.3083,
"step": 1958
},
{
"epoch": 0.5303747801380057,
"grad_norm": 0.25108450651168823,
"learning_rate": 9.82204490772949e-05,
"loss": 1.3044,
"step": 1960
},
{
"epoch": 0.5309159788932486,
"grad_norm": 0.23308375477790833,
"learning_rate": 9.82121151946789e-05,
"loss": 1.2694,
"step": 1962
},
{
"epoch": 0.5314571776484914,
"grad_norm": 0.2283206284046173,
"learning_rate": 9.820376219849064e-05,
"loss": 1.2735,
"step": 1964
},
{
"epoch": 0.5319983764037343,
"grad_norm": 0.24121573567390442,
"learning_rate": 9.819539009204164e-05,
"loss": 1.2799,
"step": 1966
},
{
"epoch": 0.5325395751589771,
"grad_norm": 0.24135661125183105,
"learning_rate": 9.8186998878651e-05,
"loss": 1.295,
"step": 1968
},
{
"epoch": 0.53308077391422,
"grad_norm": 0.24390241503715515,
"learning_rate": 9.817858856164542e-05,
"loss": 1.2812,
"step": 1970
},
{
"epoch": 0.5336219726694629,
"grad_norm": 0.24739502370357513,
"learning_rate": 9.817015914435913e-05,
"loss": 1.2872,
"step": 1972
},
{
"epoch": 0.5341631714247057,
"grad_norm": 0.25517916679382324,
"learning_rate": 9.816171063013395e-05,
"loss": 1.2718,
"step": 1974
},
{
"epoch": 0.5347043701799485,
"grad_norm": 0.25479528307914734,
"learning_rate": 9.815324302231928e-05,
"loss": 1.2952,
"step": 1976
},
{
"epoch": 0.5352455689351915,
"grad_norm": 0.24998174607753754,
"learning_rate": 9.814475632427206e-05,
"loss": 1.2914,
"step": 1978
},
{
"epoch": 0.5357867676904343,
"grad_norm": 0.2341603934764862,
"learning_rate": 9.813625053935686e-05,
"loss": 1.2793,
"step": 1980
},
{
"epoch": 0.5363279664456772,
"grad_norm": 0.23716285824775696,
"learning_rate": 9.812772567094574e-05,
"loss": 1.2872,
"step": 1982
},
{
"epoch": 0.53686916520092,
"grad_norm": 0.2324230819940567,
"learning_rate": 9.81191817224184e-05,
"loss": 1.2604,
"step": 1984
},
{
"epoch": 0.5374103639561629,
"grad_norm": 0.24399405717849731,
"learning_rate": 9.811061869716205e-05,
"loss": 1.2972,
"step": 1986
},
{
"epoch": 0.5379515627114058,
"grad_norm": 0.24572497606277466,
"learning_rate": 9.810203659857145e-05,
"loss": 1.2784,
"step": 1988
},
{
"epoch": 0.5384927614666486,
"grad_norm": 0.22993844747543335,
"learning_rate": 9.8093435430049e-05,
"loss": 1.2886,
"step": 1990
},
{
"epoch": 0.5390339602218915,
"grad_norm": 0.24518661201000214,
"learning_rate": 9.808481519500458e-05,
"loss": 1.2622,
"step": 1992
},
{
"epoch": 0.5395751589771344,
"grad_norm": 0.2601888179779053,
"learning_rate": 9.807617589685568e-05,
"loss": 1.2739,
"step": 1994
},
{
"epoch": 0.5401163577323772,
"grad_norm": 0.24736261367797852,
"learning_rate": 9.80675175390273e-05,
"loss": 1.2748,
"step": 1996
},
{
"epoch": 0.54065755648762,
"grad_norm": 0.2332574725151062,
"learning_rate": 9.805884012495203e-05,
"loss": 1.2639,
"step": 1998
},
{
"epoch": 0.541198755242863,
"grad_norm": 0.2662294805049896,
"learning_rate": 9.805014365807004e-05,
"loss": 1.2914,
"step": 2000
},
{
"epoch": 0.5417399539981058,
"grad_norm": 0.28600943088531494,
"learning_rate": 9.804142814182902e-05,
"loss": 1.2657,
"step": 2002
},
{
"epoch": 0.5422811527533486,
"grad_norm": 0.2814892530441284,
"learning_rate": 9.803269357968416e-05,
"loss": 1.2839,
"step": 2004
},
{
"epoch": 0.5428223515085915,
"grad_norm": 0.24939605593681335,
"learning_rate": 9.802393997509833e-05,
"loss": 1.2692,
"step": 2006
},
{
"epoch": 0.5433635502638344,
"grad_norm": 0.2562806308269501,
"learning_rate": 9.801516733154181e-05,
"loss": 1.291,
"step": 2008
},
{
"epoch": 0.5439047490190773,
"grad_norm": 0.2617442011833191,
"learning_rate": 9.800637565249255e-05,
"loss": 1.2808,
"step": 2010
},
{
"epoch": 0.5444459477743201,
"grad_norm": 0.2421412616968155,
"learning_rate": 9.799756494143593e-05,
"loss": 1.2733,
"step": 2012
},
{
"epoch": 0.5449871465295629,
"grad_norm": 0.25231024622917175,
"learning_rate": 9.798873520186497e-05,
"loss": 1.2695,
"step": 2014
},
{
"epoch": 0.5455283452848059,
"grad_norm": 0.25108659267425537,
"learning_rate": 9.79798864372802e-05,
"loss": 1.298,
"step": 2016
},
{
"epoch": 0.5460695440400487,
"grad_norm": 0.24615678191184998,
"learning_rate": 9.79710186511897e-05,
"loss": 1.3127,
"step": 2018
},
{
"epoch": 0.5466107427952915,
"grad_norm": 0.23436503112316132,
"learning_rate": 9.796213184710904e-05,
"loss": 1.2896,
"step": 2020
},
{
"epoch": 0.5471519415505345,
"grad_norm": 0.23453901708126068,
"learning_rate": 9.79532260285614e-05,
"loss": 1.2761,
"step": 2022
},
{
"epoch": 0.5476931403057773,
"grad_norm": 0.2413233071565628,
"learning_rate": 9.794430119907748e-05,
"loss": 1.2744,
"step": 2024
},
{
"epoch": 0.5482343390610201,
"grad_norm": 0.2426893562078476,
"learning_rate": 9.793535736219546e-05,
"loss": 1.2615,
"step": 2026
},
{
"epoch": 0.548775537816263,
"grad_norm": 0.23853014409542084,
"learning_rate": 9.792639452146115e-05,
"loss": 1.2897,
"step": 2028
},
{
"epoch": 0.5493167365715059,
"grad_norm": 0.24866445362567902,
"learning_rate": 9.791741268042784e-05,
"loss": 1.2957,
"step": 2030
},
{
"epoch": 0.5498579353267488,
"grad_norm": 0.24467822909355164,
"learning_rate": 9.790841184265633e-05,
"loss": 1.2867,
"step": 2032
},
{
"epoch": 0.5503991340819916,
"grad_norm": 0.2393324077129364,
"learning_rate": 9.7899392011715e-05,
"loss": 1.3061,
"step": 2034
},
{
"epoch": 0.5509403328372344,
"grad_norm": 0.23834531009197235,
"learning_rate": 9.789035319117974e-05,
"loss": 1.2957,
"step": 2036
},
{
"epoch": 0.5514815315924774,
"grad_norm": 0.2603852450847626,
"learning_rate": 9.788129538463397e-05,
"loss": 1.2897,
"step": 2038
},
{
"epoch": 0.5520227303477202,
"grad_norm": 0.26540425419807434,
"learning_rate": 9.787221859566861e-05,
"loss": 1.2829,
"step": 2040
},
{
"epoch": 0.552563929102963,
"grad_norm": 0.25125250220298767,
"learning_rate": 9.786312282788216e-05,
"loss": 1.2708,
"step": 2042
},
{
"epoch": 0.553105127858206,
"grad_norm": 0.23911471664905548,
"learning_rate": 9.785400808488061e-05,
"loss": 1.2949,
"step": 2044
},
{
"epoch": 0.5536463266134488,
"grad_norm": 0.23871150612831116,
"learning_rate": 9.784487437027746e-05,
"loss": 1.2863,
"step": 2046
},
{
"epoch": 0.5541875253686916,
"grad_norm": 0.25253376364707947,
"learning_rate": 9.783572168769376e-05,
"loss": 1.2797,
"step": 2048
},
{
"epoch": 0.5547287241239345,
"grad_norm": 0.25140559673309326,
"learning_rate": 9.782655004075807e-05,
"loss": 1.2666,
"step": 2050
},
{
"epoch": 0.5552699228791774,
"grad_norm": 0.25297242403030396,
"learning_rate": 9.781735943310646e-05,
"loss": 1.2935,
"step": 2052
},
{
"epoch": 0.5558111216344203,
"grad_norm": 0.28536322712898254,
"learning_rate": 9.780814986838252e-05,
"loss": 1.2891,
"step": 2054
},
{
"epoch": 0.5563523203896631,
"grad_norm": 0.28267911076545715,
"learning_rate": 9.779892135023738e-05,
"loss": 1.2846,
"step": 2056
},
{
"epoch": 0.5568935191449059,
"grad_norm": 0.24850498139858246,
"learning_rate": 9.778967388232964e-05,
"loss": 1.2823,
"step": 2058
},
{
"epoch": 0.5574347179001489,
"grad_norm": 0.4929364025592804,
"learning_rate": 9.778040746832544e-05,
"loss": 1.2681,
"step": 2060
},
{
"epoch": 0.5579759166553917,
"grad_norm": 0.25423306226730347,
"learning_rate": 9.777112211189843e-05,
"loss": 1.2765,
"step": 2062
},
{
"epoch": 0.5585171154106345,
"grad_norm": 0.23608753085136414,
"learning_rate": 9.776181781672977e-05,
"loss": 1.2756,
"step": 2064
},
{
"epoch": 0.5590583141658774,
"grad_norm": 0.3117451071739197,
"learning_rate": 9.775249458650812e-05,
"loss": 1.2731,
"step": 2066
},
{
"epoch": 0.5595995129211203,
"grad_norm": 0.2454603612422943,
"learning_rate": 9.774315242492965e-05,
"loss": 1.2821,
"step": 2068
},
{
"epoch": 0.5601407116763631,
"grad_norm": 0.3214171528816223,
"learning_rate": 9.773379133569804e-05,
"loss": 1.2964,
"step": 2070
},
{
"epoch": 0.560681910431606,
"grad_norm": 0.23589906096458435,
"learning_rate": 9.772441132252448e-05,
"loss": 1.2794,
"step": 2072
},
{
"epoch": 0.5612231091868489,
"grad_norm": 0.23020370304584503,
"learning_rate": 9.771501238912763e-05,
"loss": 1.2753,
"step": 2074
},
{
"epoch": 0.5617643079420918,
"grad_norm": 0.2368050515651703,
"learning_rate": 9.77055945392337e-05,
"loss": 1.3048,
"step": 2076
},
{
"epoch": 0.5623055066973346,
"grad_norm": 0.2581866383552551,
"learning_rate": 9.769615777657633e-05,
"loss": 1.2765,
"step": 2078
},
{
"epoch": 0.5628467054525774,
"grad_norm": 0.2481439858675003,
"learning_rate": 9.768670210489675e-05,
"loss": 1.2957,
"step": 2080
},
{
"epoch": 0.5633879042078204,
"grad_norm": 0.2861919701099396,
"learning_rate": 9.767722752794361e-05,
"loss": 1.2647,
"step": 2082
},
{
"epoch": 0.5639291029630632,
"grad_norm": 0.2552880346775055,
"learning_rate": 9.766773404947309e-05,
"loss": 1.2675,
"step": 2084
},
{
"epoch": 0.564470301718306,
"grad_norm": 0.251891165971756,
"learning_rate": 9.765822167324885e-05,
"loss": 1.2799,
"step": 2086
},
{
"epoch": 0.5650115004735489,
"grad_norm": 0.25395113229751587,
"learning_rate": 9.764869040304205e-05,
"loss": 1.2916,
"step": 2088
},
{
"epoch": 0.5655526992287918,
"grad_norm": 0.2496347427368164,
"learning_rate": 9.763914024263136e-05,
"loss": 1.2722,
"step": 2090
},
{
"epoch": 0.5660938979840346,
"grad_norm": 0.24722573161125183,
"learning_rate": 9.762957119580287e-05,
"loss": 1.2722,
"step": 2092
},
{
"epoch": 0.5666350967392775,
"grad_norm": 0.23567502200603485,
"learning_rate": 9.761998326635026e-05,
"loss": 1.2681,
"step": 2094
},
{
"epoch": 0.5671762954945203,
"grad_norm": 0.2396802455186844,
"learning_rate": 9.76103764580746e-05,
"loss": 1.2509,
"step": 2096
},
{
"epoch": 0.5677174942497633,
"grad_norm": 0.24394263327121735,
"learning_rate": 9.76007507747845e-05,
"loss": 1.2863,
"step": 2098
},
{
"epoch": 0.5682586930050061,
"grad_norm": 0.23184406757354736,
"learning_rate": 9.759110622029604e-05,
"loss": 1.2827,
"step": 2100
}
],
"logging_steps": 2,
"max_steps": 11088,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.729660443758428e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}