diff --git "a/l2-7b-eu/checkpoint-10200/trainer_state.json" "b/l2-7b-eu/checkpoint-10200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/l2-7b-eu/checkpoint-10200/trainer_state.json" @@ -0,0 +1,35741 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.7598430523609796, + "eval_steps": 500, + "global_step": 10200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027059937762143147, + "grad_norm": 4.086390018463135, + "learning_rate": 0.0, + "loss": 3.2754, + "step": 1 + }, + { + "epoch": 0.0005411987552428629, + "grad_norm": 3.758815288543701, + "learning_rate": 9.017132551848513e-08, + "loss": 3.2863, + "step": 2 + }, + { + "epoch": 0.0010823975104857259, + "grad_norm": 3.8250608444213867, + "learning_rate": 2.705139765554554e-07, + "loss": 3.3425, + "step": 4 + }, + { + "epoch": 0.0016235962657285888, + "grad_norm": 3.8092095851898193, + "learning_rate": 4.5085662759242564e-07, + "loss": 3.3165, + "step": 6 + }, + { + "epoch": 0.0021647950209714517, + "grad_norm": 3.7621052265167236, + "learning_rate": 6.311992786293959e-07, + "loss": 3.3295, + "step": 8 + }, + { + "epoch": 0.002705993776214315, + "grad_norm": 3.4136276245117188, + "learning_rate": 8.115419296663661e-07, + "loss": 3.3073, + "step": 10 + }, + { + "epoch": 0.0032471925314571776, + "grad_norm": 2.855100393295288, + "learning_rate": 9.918845807033363e-07, + "loss": 3.3031, + "step": 12 + }, + { + "epoch": 0.0037883912867000408, + "grad_norm": 2.491767406463623, + "learning_rate": 1.1722272317403068e-06, + "loss": 3.2943, + "step": 14 + }, + { + "epoch": 0.0043295900419429035, + "grad_norm": 2.359778642654419, + "learning_rate": 1.3525698827772768e-06, + "loss": 3.2622, + "step": 16 + }, + { + "epoch": 0.004870788797185766, + "grad_norm": 2.037504196166992, + "learning_rate": 1.5329125338142473e-06, + "loss": 3.239, + "step": 18 + }, + { + "epoch": 0.00541198755242863, + "grad_norm": 2.8542497158050537, + "learning_rate": 1.7132551848512173e-06, + "loss": 3.2031, + "step": 20 + }, + { + "epoch": 0.0059531863076714925, + "grad_norm": 2.297046661376953, + "learning_rate": 1.8935978358881876e-06, + "loss": 3.1721, + "step": 22 + }, + { + "epoch": 0.006494385062914355, + "grad_norm": 2.2149112224578857, + "learning_rate": 2.0739404869251576e-06, + "loss": 3.121, + "step": 24 + }, + { + "epoch": 0.007035583818157218, + "grad_norm": 1.8048591613769531, + "learning_rate": 2.254283137962128e-06, + "loss": 3.0857, + "step": 26 + }, + { + "epoch": 0.0075767825734000815, + "grad_norm": 1.7466434240341187, + "learning_rate": 2.4346257889990986e-06, + "loss": 3.0489, + "step": 28 + }, + { + "epoch": 0.008117981328642944, + "grad_norm": 2.1722524166107178, + "learning_rate": 2.6149684400360686e-06, + "loss": 3.0016, + "step": 30 + }, + { + "epoch": 0.008659180083885807, + "grad_norm": 1.364578366279602, + "learning_rate": 2.7953110910730386e-06, + "loss": 2.9587, + "step": 32 + }, + { + "epoch": 0.00920037883912867, + "grad_norm": 1.5823427438735962, + "learning_rate": 2.9756537421100095e-06, + "loss": 2.931, + "step": 34 + }, + { + "epoch": 0.009741577594371532, + "grad_norm": 1.2367908954620361, + "learning_rate": 3.1559963931469796e-06, + "loss": 2.8953, + "step": 36 + }, + { + "epoch": 0.010282776349614395, + "grad_norm": 1.0437366962432861, + "learning_rate": 3.3363390441839496e-06, + "loss": 2.8412, + "step": 38 + }, + { + "epoch": 0.01082397510485726, + "grad_norm": 1.081803798675537, + "learning_rate": 3.5166816952209197e-06, + "loss": 2.7832, + "step": 40 + }, + { + "epoch": 0.011365173860100122, + "grad_norm": 0.9715840220451355, + "learning_rate": 3.69702434625789e-06, + "loss": 2.7729, + "step": 42 + }, + { + "epoch": 0.011906372615342985, + "grad_norm": 0.8603936433792114, + "learning_rate": 3.877366997294861e-06, + "loss": 2.6904, + "step": 44 + }, + { + "epoch": 0.012447571370585848, + "grad_norm": 0.8236231803894043, + "learning_rate": 4.057709648331831e-06, + "loss": 2.6908, + "step": 46 + }, + { + "epoch": 0.01298877012582871, + "grad_norm": 0.7681186199188232, + "learning_rate": 4.2380522993688015e-06, + "loss": 2.6212, + "step": 48 + }, + { + "epoch": 0.013529968881071573, + "grad_norm": 0.8002827167510986, + "learning_rate": 4.4183949504057716e-06, + "loss": 2.6035, + "step": 50 + }, + { + "epoch": 0.014071167636314436, + "grad_norm": 0.6757120490074158, + "learning_rate": 4.598737601442742e-06, + "loss": 2.595, + "step": 52 + }, + { + "epoch": 0.014612366391557299, + "grad_norm": 0.6619369387626648, + "learning_rate": 4.779080252479712e-06, + "loss": 2.5522, + "step": 54 + }, + { + "epoch": 0.015153565146800163, + "grad_norm": 0.6247105598449707, + "learning_rate": 4.959422903516682e-06, + "loss": 2.5079, + "step": 56 + }, + { + "epoch": 0.015694763902043024, + "grad_norm": 0.6559263467788696, + "learning_rate": 5.139765554553652e-06, + "loss": 2.5009, + "step": 58 + }, + { + "epoch": 0.01623596265728589, + "grad_norm": 0.6590877175331116, + "learning_rate": 5.320108205590623e-06, + "loss": 2.4648, + "step": 60 + }, + { + "epoch": 0.01677716141252875, + "grad_norm": 0.6045516133308411, + "learning_rate": 5.500450856627593e-06, + "loss": 2.421, + "step": 62 + }, + { + "epoch": 0.017318360167771614, + "grad_norm": 0.6533932089805603, + "learning_rate": 5.680793507664563e-06, + "loss": 2.3966, + "step": 64 + }, + { + "epoch": 0.01785955892301448, + "grad_norm": 0.6478094458580017, + "learning_rate": 5.861136158701533e-06, + "loss": 2.3903, + "step": 66 + }, + { + "epoch": 0.01840075767825734, + "grad_norm": 0.7349300980567932, + "learning_rate": 6.041478809738504e-06, + "loss": 2.3552, + "step": 68 + }, + { + "epoch": 0.018941956433500204, + "grad_norm": 0.6454821825027466, + "learning_rate": 6.221821460775474e-06, + "loss": 2.3262, + "step": 70 + }, + { + "epoch": 0.019483155188743065, + "grad_norm": 0.7321672439575195, + "learning_rate": 6.402164111812444e-06, + "loss": 2.3197, + "step": 72 + }, + { + "epoch": 0.02002435394398593, + "grad_norm": 0.7664237022399902, + "learning_rate": 6.582506762849414e-06, + "loss": 2.2992, + "step": 74 + }, + { + "epoch": 0.02056555269922879, + "grad_norm": 0.6843811869621277, + "learning_rate": 6.762849413886384e-06, + "loss": 2.2927, + "step": 76 + }, + { + "epoch": 0.021106751454471655, + "grad_norm": 0.7199612259864807, + "learning_rate": 6.9431920649233556e-06, + "loss": 2.2525, + "step": 78 + }, + { + "epoch": 0.02164795020971452, + "grad_norm": 0.778446614742279, + "learning_rate": 7.123534715960326e-06, + "loss": 2.2267, + "step": 80 + }, + { + "epoch": 0.02218914896495738, + "grad_norm": 0.9287930727005005, + "learning_rate": 7.303877366997296e-06, + "loss": 2.2206, + "step": 82 + }, + { + "epoch": 0.022730347720200245, + "grad_norm": 1.033782958984375, + "learning_rate": 7.484220018034266e-06, + "loss": 2.2063, + "step": 84 + }, + { + "epoch": 0.023271546475443106, + "grad_norm": 1.0132615566253662, + "learning_rate": 7.664562669071236e-06, + "loss": 2.1677, + "step": 86 + }, + { + "epoch": 0.02381274523068597, + "grad_norm": 0.9043529033660889, + "learning_rate": 7.844905320108207e-06, + "loss": 2.1696, + "step": 88 + }, + { + "epoch": 0.02435394398592883, + "grad_norm": 0.6718290448188782, + "learning_rate": 8.025247971145176e-06, + "loss": 2.1492, + "step": 90 + }, + { + "epoch": 0.024895142741171696, + "grad_norm": 0.9615944027900696, + "learning_rate": 8.205590622182147e-06, + "loss": 2.1452, + "step": 92 + }, + { + "epoch": 0.02543634149641456, + "grad_norm": 0.9435996413230896, + "learning_rate": 8.385933273219116e-06, + "loss": 2.1098, + "step": 94 + }, + { + "epoch": 0.02597754025165742, + "grad_norm": 0.7614261507987976, + "learning_rate": 8.566275924256087e-06, + "loss": 2.1286, + "step": 96 + }, + { + "epoch": 0.026518739006900285, + "grad_norm": 0.9416339993476868, + "learning_rate": 8.746618575293058e-06, + "loss": 2.1092, + "step": 98 + }, + { + "epoch": 0.027059937762143146, + "grad_norm": 0.9229443073272705, + "learning_rate": 8.926961226330027e-06, + "loss": 2.0932, + "step": 100 + }, + { + "epoch": 0.02760113651738601, + "grad_norm": 0.7135593295097351, + "learning_rate": 9.107303877366998e-06, + "loss": 2.0699, + "step": 102 + }, + { + "epoch": 0.028142335272628872, + "grad_norm": 1.0263723134994507, + "learning_rate": 9.287646528403967e-06, + "loss": 2.0445, + "step": 104 + }, + { + "epoch": 0.028683534027871736, + "grad_norm": 1.0300300121307373, + "learning_rate": 9.467989179440938e-06, + "loss": 2.0463, + "step": 106 + }, + { + "epoch": 0.029224732783114597, + "grad_norm": 0.8331286311149597, + "learning_rate": 9.648331830477909e-06, + "loss": 2.0381, + "step": 108 + }, + { + "epoch": 0.02976593153835746, + "grad_norm": 0.7501435875892639, + "learning_rate": 9.828674481514878e-06, + "loss": 2.0411, + "step": 110 + }, + { + "epoch": 0.030307130293600326, + "grad_norm": 0.6895191073417664, + "learning_rate": 1.0009017132551849e-05, + "loss": 2.0475, + "step": 112 + }, + { + "epoch": 0.030848329048843187, + "grad_norm": 0.95854252576828, + "learning_rate": 1.018935978358882e-05, + "loss": 2.0071, + "step": 114 + }, + { + "epoch": 0.03138952780408605, + "grad_norm": 1.1303929090499878, + "learning_rate": 1.036970243462579e-05, + "loss": 2.0008, + "step": 116 + }, + { + "epoch": 0.031930726559328916, + "grad_norm": 0.7708876729011536, + "learning_rate": 1.055004508566276e-05, + "loss": 2.0061, + "step": 118 + }, + { + "epoch": 0.03247192531457178, + "grad_norm": 0.9773860573768616, + "learning_rate": 1.073038773669973e-05, + "loss": 2.0096, + "step": 120 + }, + { + "epoch": 0.03301312406981464, + "grad_norm": 1.118385910987854, + "learning_rate": 1.09107303877367e-05, + "loss": 1.9939, + "step": 122 + }, + { + "epoch": 0.0335543228250575, + "grad_norm": 0.7215014696121216, + "learning_rate": 1.109107303877367e-05, + "loss": 1.9515, + "step": 124 + }, + { + "epoch": 0.03409552158030037, + "grad_norm": 0.9696834683418274, + "learning_rate": 1.1271415689810642e-05, + "loss": 1.9639, + "step": 126 + }, + { + "epoch": 0.03463672033554323, + "grad_norm": 0.945482611656189, + "learning_rate": 1.1451758340847611e-05, + "loss": 1.9397, + "step": 128 + }, + { + "epoch": 0.03517791909078609, + "grad_norm": 0.7454535365104675, + "learning_rate": 1.1632100991884582e-05, + "loss": 1.9353, + "step": 130 + }, + { + "epoch": 0.03571911784602896, + "grad_norm": 0.7824187278747559, + "learning_rate": 1.1812443642921551e-05, + "loss": 1.9227, + "step": 132 + }, + { + "epoch": 0.03626031660127182, + "grad_norm": 0.7939879894256592, + "learning_rate": 1.1992786293958522e-05, + "loss": 1.9126, + "step": 134 + }, + { + "epoch": 0.03680151535651468, + "grad_norm": 0.7776147723197937, + "learning_rate": 1.2173128944995491e-05, + "loss": 1.9002, + "step": 136 + }, + { + "epoch": 0.03734271411175754, + "grad_norm": 0.6580236554145813, + "learning_rate": 1.2353471596032462e-05, + "loss": 1.9121, + "step": 138 + }, + { + "epoch": 0.03788391286700041, + "grad_norm": 0.7200301289558411, + "learning_rate": 1.2533814247069433e-05, + "loss": 1.8885, + "step": 140 + }, + { + "epoch": 0.03842511162224327, + "grad_norm": 0.7958497405052185, + "learning_rate": 1.2714156898106402e-05, + "loss": 1.9095, + "step": 142 + }, + { + "epoch": 0.03896631037748613, + "grad_norm": 0.9120681881904602, + "learning_rate": 1.2894499549143375e-05, + "loss": 1.884, + "step": 144 + }, + { + "epoch": 0.039507509132729, + "grad_norm": 0.8108247518539429, + "learning_rate": 1.3074842200180342e-05, + "loss": 1.8656, + "step": 146 + }, + { + "epoch": 0.04004870788797186, + "grad_norm": 0.7010449171066284, + "learning_rate": 1.3255184851217315e-05, + "loss": 1.8635, + "step": 148 + }, + { + "epoch": 0.04058990664321472, + "grad_norm": 0.8178524374961853, + "learning_rate": 1.3435527502254284e-05, + "loss": 1.8933, + "step": 150 + }, + { + "epoch": 0.04113110539845758, + "grad_norm": 1.0447405576705933, + "learning_rate": 1.3615870153291255e-05, + "loss": 1.8523, + "step": 152 + }, + { + "epoch": 0.04167230415370045, + "grad_norm": 0.8516271710395813, + "learning_rate": 1.3796212804328224e-05, + "loss": 1.8528, + "step": 154 + }, + { + "epoch": 0.04221350290894331, + "grad_norm": 0.8437328934669495, + "learning_rate": 1.3976555455365195e-05, + "loss": 1.861, + "step": 156 + }, + { + "epoch": 0.04275470166418617, + "grad_norm": 0.851265549659729, + "learning_rate": 1.4156898106402164e-05, + "loss": 1.8315, + "step": 158 + }, + { + "epoch": 0.04329590041942904, + "grad_norm": 0.7337156534194946, + "learning_rate": 1.4337240757439135e-05, + "loss": 1.8354, + "step": 160 + }, + { + "epoch": 0.0438370991746719, + "grad_norm": 0.9754143357276917, + "learning_rate": 1.4517583408476104e-05, + "loss": 1.8252, + "step": 162 + }, + { + "epoch": 0.04437829792991476, + "grad_norm": 0.6172115802764893, + "learning_rate": 1.4697926059513075e-05, + "loss": 1.8094, + "step": 164 + }, + { + "epoch": 0.04491949668515762, + "grad_norm": 0.8304158449172974, + "learning_rate": 1.4878268710550044e-05, + "loss": 1.8078, + "step": 166 + }, + { + "epoch": 0.04546069544040049, + "grad_norm": 0.6388853788375854, + "learning_rate": 1.5058611361587017e-05, + "loss": 1.8106, + "step": 168 + }, + { + "epoch": 0.04600189419564335, + "grad_norm": 0.743231475353241, + "learning_rate": 1.5238954012623984e-05, + "loss": 1.8144, + "step": 170 + }, + { + "epoch": 0.04654309295088621, + "grad_norm": 0.6442289352416992, + "learning_rate": 1.5419296663660955e-05, + "loss": 1.7831, + "step": 172 + }, + { + "epoch": 0.04708429170612908, + "grad_norm": 0.6877187490463257, + "learning_rate": 1.559963931469793e-05, + "loss": 1.8043, + "step": 174 + }, + { + "epoch": 0.04762549046137194, + "grad_norm": 0.9389640688896179, + "learning_rate": 1.5779981965734897e-05, + "loss": 1.7869, + "step": 176 + }, + { + "epoch": 0.0481666892166148, + "grad_norm": 1.0456589460372925, + "learning_rate": 1.5960324616771868e-05, + "loss": 1.7681, + "step": 178 + }, + { + "epoch": 0.04870788797185766, + "grad_norm": 0.9617791175842285, + "learning_rate": 1.614066726780884e-05, + "loss": 1.7668, + "step": 180 + }, + { + "epoch": 0.04924908672710053, + "grad_norm": 0.9334360361099243, + "learning_rate": 1.632100991884581e-05, + "loss": 1.7893, + "step": 182 + }, + { + "epoch": 0.04979028548234339, + "grad_norm": 0.8952531814575195, + "learning_rate": 1.6501352569882777e-05, + "loss": 1.7758, + "step": 184 + }, + { + "epoch": 0.05033148423758625, + "grad_norm": 0.8544924855232239, + "learning_rate": 1.6681695220919748e-05, + "loss": 1.793, + "step": 186 + }, + { + "epoch": 0.05087268299282912, + "grad_norm": 0.7782765030860901, + "learning_rate": 1.686203787195672e-05, + "loss": 1.768, + "step": 188 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.7119695544242859, + "learning_rate": 1.704238052299369e-05, + "loss": 1.7685, + "step": 190 + }, + { + "epoch": 0.05195508050331484, + "grad_norm": 0.9119647145271301, + "learning_rate": 1.7222723174030657e-05, + "loss": 1.7706, + "step": 192 + }, + { + "epoch": 0.0524962792585577, + "grad_norm": 0.6414957642555237, + "learning_rate": 1.7403065825067628e-05, + "loss": 1.7626, + "step": 194 + }, + { + "epoch": 0.05303747801380057, + "grad_norm": 0.8069677352905273, + "learning_rate": 1.75834084761046e-05, + "loss": 1.7423, + "step": 196 + }, + { + "epoch": 0.05357867676904343, + "grad_norm": 0.6549937725067139, + "learning_rate": 1.776375112714157e-05, + "loss": 1.7428, + "step": 198 + }, + { + "epoch": 0.05411987552428629, + "grad_norm": 0.8064024448394775, + "learning_rate": 1.7944093778178538e-05, + "loss": 1.7448, + "step": 200 + }, + { + "epoch": 0.054661074279529154, + "grad_norm": 0.7182701826095581, + "learning_rate": 1.8124436429215512e-05, + "loss": 1.7248, + "step": 202 + }, + { + "epoch": 0.05520227303477202, + "grad_norm": 0.6997919678688049, + "learning_rate": 1.830477908025248e-05, + "loss": 1.7281, + "step": 204 + }, + { + "epoch": 0.05574347179001488, + "grad_norm": 0.7071277499198914, + "learning_rate": 1.848512173128945e-05, + "loss": 1.714, + "step": 206 + }, + { + "epoch": 0.056284670545257744, + "grad_norm": 0.6344273090362549, + "learning_rate": 1.866546438232642e-05, + "loss": 1.7463, + "step": 208 + }, + { + "epoch": 0.05682586930050061, + "grad_norm": 0.7192733883857727, + "learning_rate": 1.8845807033363392e-05, + "loss": 1.737, + "step": 210 + }, + { + "epoch": 0.05736706805574347, + "grad_norm": 0.7418521642684937, + "learning_rate": 1.9026149684400363e-05, + "loss": 1.7197, + "step": 212 + }, + { + "epoch": 0.057908266810986334, + "grad_norm": 0.875845730304718, + "learning_rate": 1.920649233543733e-05, + "loss": 1.6968, + "step": 214 + }, + { + "epoch": 0.058449465566229195, + "grad_norm": 0.7394037842750549, + "learning_rate": 1.9386834986474305e-05, + "loss": 1.7051, + "step": 216 + }, + { + "epoch": 0.05899066432147206, + "grad_norm": 0.6689572930335999, + "learning_rate": 1.9567177637511272e-05, + "loss": 1.7152, + "step": 218 + }, + { + "epoch": 0.05953186307671492, + "grad_norm": 0.7955539226531982, + "learning_rate": 1.9747520288548243e-05, + "loss": 1.7136, + "step": 220 + }, + { + "epoch": 0.060073061831957784, + "grad_norm": 0.7005388140678406, + "learning_rate": 1.9927862939585214e-05, + "loss": 1.7152, + "step": 222 + }, + { + "epoch": 0.06061426058720065, + "grad_norm": 0.6205731630325317, + "learning_rate": 2.0108205590622185e-05, + "loss": 1.6901, + "step": 224 + }, + { + "epoch": 0.06115545934244351, + "grad_norm": 0.7079929709434509, + "learning_rate": 2.0288548241659152e-05, + "loss": 1.6905, + "step": 226 + }, + { + "epoch": 0.061696658097686374, + "grad_norm": 0.6871302723884583, + "learning_rate": 2.0468890892696123e-05, + "loss": 1.6867, + "step": 228 + }, + { + "epoch": 0.062237856852929235, + "grad_norm": 0.7172162532806396, + "learning_rate": 2.0649233543733094e-05, + "loss": 1.685, + "step": 230 + }, + { + "epoch": 0.0627790556081721, + "grad_norm": 0.6729004979133606, + "learning_rate": 2.0829576194770065e-05, + "loss": 1.6961, + "step": 232 + }, + { + "epoch": 0.06332025436341496, + "grad_norm": 0.7335099577903748, + "learning_rate": 2.1009918845807033e-05, + "loss": 1.6797, + "step": 234 + }, + { + "epoch": 0.06386145311865783, + "grad_norm": 0.6398060321807861, + "learning_rate": 2.1190261496844003e-05, + "loss": 1.7037, + "step": 236 + }, + { + "epoch": 0.0644026518739007, + "grad_norm": 0.7026365399360657, + "learning_rate": 2.1370604147880974e-05, + "loss": 1.6698, + "step": 238 + }, + { + "epoch": 0.06494385062914355, + "grad_norm": 0.7972332239151001, + "learning_rate": 2.1550946798917945e-05, + "loss": 1.6866, + "step": 240 + }, + { + "epoch": 0.06548504938438642, + "grad_norm": 0.7363021969795227, + "learning_rate": 2.1731289449954913e-05, + "loss": 1.6879, + "step": 242 + }, + { + "epoch": 0.06602624813962928, + "grad_norm": 0.7071017026901245, + "learning_rate": 2.1911632100991887e-05, + "loss": 1.6898, + "step": 244 + }, + { + "epoch": 0.06656744689487214, + "grad_norm": 0.8030880093574524, + "learning_rate": 2.2091974752028858e-05, + "loss": 1.6734, + "step": 246 + }, + { + "epoch": 0.067108645650115, + "grad_norm": 0.7429569363594055, + "learning_rate": 2.2272317403065825e-05, + "loss": 1.6722, + "step": 248 + }, + { + "epoch": 0.06764984440535787, + "grad_norm": 0.6807804107666016, + "learning_rate": 2.2452660054102796e-05, + "loss": 1.6697, + "step": 250 + }, + { + "epoch": 0.06819104316060073, + "grad_norm": 0.6632562875747681, + "learning_rate": 2.2633002705139767e-05, + "loss": 1.6453, + "step": 252 + }, + { + "epoch": 0.0687322419158436, + "grad_norm": 0.6661680340766907, + "learning_rate": 2.2813345356176738e-05, + "loss": 1.6701, + "step": 254 + }, + { + "epoch": 0.06927344067108646, + "grad_norm": 0.6747105121612549, + "learning_rate": 2.2993688007213706e-05, + "loss": 1.6729, + "step": 256 + }, + { + "epoch": 0.06981463942632932, + "grad_norm": 0.7698473334312439, + "learning_rate": 2.317403065825068e-05, + "loss": 1.6528, + "step": 258 + }, + { + "epoch": 0.07035583818157218, + "grad_norm": 0.6111325621604919, + "learning_rate": 2.3354373309287647e-05, + "loss": 1.6412, + "step": 260 + }, + { + "epoch": 0.07089703693681504, + "grad_norm": 0.7405019998550415, + "learning_rate": 2.3534715960324618e-05, + "loss": 1.6564, + "step": 262 + }, + { + "epoch": 0.07143823569205791, + "grad_norm": 0.6702501773834229, + "learning_rate": 2.371505861136159e-05, + "loss": 1.654, + "step": 264 + }, + { + "epoch": 0.07197943444730077, + "grad_norm": 0.7076373100280762, + "learning_rate": 2.389540126239856e-05, + "loss": 1.6301, + "step": 266 + }, + { + "epoch": 0.07252063320254364, + "grad_norm": 0.7239627242088318, + "learning_rate": 2.4075743913435528e-05, + "loss": 1.6575, + "step": 268 + }, + { + "epoch": 0.0730618319577865, + "grad_norm": 0.753480076789856, + "learning_rate": 2.42560865644725e-05, + "loss": 1.6603, + "step": 270 + }, + { + "epoch": 0.07360303071302936, + "grad_norm": 0.7261641025543213, + "learning_rate": 2.443642921550947e-05, + "loss": 1.6449, + "step": 272 + }, + { + "epoch": 0.07414422946827222, + "grad_norm": 0.6315119862556458, + "learning_rate": 2.461677186654644e-05, + "loss": 1.6538, + "step": 274 + }, + { + "epoch": 0.07468542822351508, + "grad_norm": 0.5698412656784058, + "learning_rate": 2.4797114517583408e-05, + "loss": 1.6663, + "step": 276 + }, + { + "epoch": 0.07522662697875795, + "grad_norm": 0.5968983173370361, + "learning_rate": 2.497745716862038e-05, + "loss": 1.643, + "step": 278 + }, + { + "epoch": 0.07576782573400082, + "grad_norm": 0.561126172542572, + "learning_rate": 2.5157799819657353e-05, + "loss": 1.6301, + "step": 280 + }, + { + "epoch": 0.07630902448924368, + "grad_norm": 0.7290865778923035, + "learning_rate": 2.533814247069432e-05, + "loss": 1.6412, + "step": 282 + }, + { + "epoch": 0.07685022324448654, + "grad_norm": 0.7629122138023376, + "learning_rate": 2.5518485121731288e-05, + "loss": 1.6335, + "step": 284 + }, + { + "epoch": 0.0773914219997294, + "grad_norm": 0.5383496284484863, + "learning_rate": 2.5698827772768262e-05, + "loss": 1.6226, + "step": 286 + }, + { + "epoch": 0.07793262075497226, + "grad_norm": 0.7778373956680298, + "learning_rate": 2.5879170423805233e-05, + "loss": 1.6333, + "step": 288 + }, + { + "epoch": 0.07847381951021512, + "grad_norm": 0.6851366758346558, + "learning_rate": 2.60595130748422e-05, + "loss": 1.6251, + "step": 290 + }, + { + "epoch": 0.079015018265458, + "grad_norm": 0.5947225689888, + "learning_rate": 2.623985572587917e-05, + "loss": 1.6298, + "step": 292 + }, + { + "epoch": 0.07955621702070086, + "grad_norm": 0.9742544889450073, + "learning_rate": 2.6420198376916146e-05, + "loss": 1.6252, + "step": 294 + }, + { + "epoch": 0.08009741577594372, + "grad_norm": 1.2064323425292969, + "learning_rate": 2.6600541027953113e-05, + "loss": 1.6152, + "step": 296 + }, + { + "epoch": 0.08063861453118658, + "grad_norm": 1.0506716966629028, + "learning_rate": 2.678088367899008e-05, + "loss": 1.6351, + "step": 298 + }, + { + "epoch": 0.08117981328642944, + "grad_norm": 1.2992738485336304, + "learning_rate": 2.696122633002705e-05, + "loss": 1.6193, + "step": 300 + }, + { + "epoch": 0.0817210120416723, + "grad_norm": 1.0616599321365356, + "learning_rate": 2.7141568981064026e-05, + "loss": 1.6135, + "step": 302 + }, + { + "epoch": 0.08226221079691516, + "grad_norm": 1.037997841835022, + "learning_rate": 2.7321911632100993e-05, + "loss": 1.6344, + "step": 304 + }, + { + "epoch": 0.08280340955215804, + "grad_norm": 0.8937569856643677, + "learning_rate": 2.7502254283137964e-05, + "loss": 1.6077, + "step": 306 + }, + { + "epoch": 0.0833446083074009, + "grad_norm": 1.1334234476089478, + "learning_rate": 2.7682596934174932e-05, + "loss": 1.6193, + "step": 308 + }, + { + "epoch": 0.08388580706264376, + "grad_norm": 0.8336219191551208, + "learning_rate": 2.7862939585211906e-05, + "loss": 1.5948, + "step": 310 + }, + { + "epoch": 0.08442700581788662, + "grad_norm": 1.1825398206710815, + "learning_rate": 2.8043282236248874e-05, + "loss": 1.6239, + "step": 312 + }, + { + "epoch": 0.08496820457312948, + "grad_norm": 0.7945433259010315, + "learning_rate": 2.8223624887285844e-05, + "loss": 1.6119, + "step": 314 + }, + { + "epoch": 0.08550940332837234, + "grad_norm": 0.6971009969711304, + "learning_rate": 2.8403967538322812e-05, + "loss": 1.5822, + "step": 316 + }, + { + "epoch": 0.0860506020836152, + "grad_norm": 0.6050766706466675, + "learning_rate": 2.8584310189359786e-05, + "loss": 1.6161, + "step": 318 + }, + { + "epoch": 0.08659180083885808, + "grad_norm": 0.6123189330101013, + "learning_rate": 2.8764652840396754e-05, + "loss": 1.5941, + "step": 320 + }, + { + "epoch": 0.08713299959410094, + "grad_norm": 0.5471253395080566, + "learning_rate": 2.8944995491433725e-05, + "loss": 1.603, + "step": 322 + }, + { + "epoch": 0.0876741983493438, + "grad_norm": 0.5793882608413696, + "learning_rate": 2.91253381424707e-05, + "loss": 1.6076, + "step": 324 + }, + { + "epoch": 0.08821539710458666, + "grad_norm": 0.5409413576126099, + "learning_rate": 2.9305680793507666e-05, + "loss": 1.5825, + "step": 326 + }, + { + "epoch": 0.08875659585982952, + "grad_norm": 6.757148265838623, + "learning_rate": 2.9486023444544637e-05, + "loss": 1.5942, + "step": 328 + }, + { + "epoch": 0.08929779461507238, + "grad_norm": 1.3357856273651123, + "learning_rate": 2.9666366095581605e-05, + "loss": 1.642, + "step": 330 + }, + { + "epoch": 0.08983899337031524, + "grad_norm": 0.8245829939842224, + "learning_rate": 2.984670874661858e-05, + "loss": 1.6062, + "step": 332 + }, + { + "epoch": 0.09038019212555812, + "grad_norm": 0.8888993263244629, + "learning_rate": 3.0027051397655547e-05, + "loss": 1.5952, + "step": 334 + }, + { + "epoch": 0.09092139088080098, + "grad_norm": 0.8923915028572083, + "learning_rate": 3.0207394048692517e-05, + "loss": 1.5977, + "step": 336 + }, + { + "epoch": 0.09146258963604384, + "grad_norm": 0.7443459033966064, + "learning_rate": 3.0387736699729485e-05, + "loss": 1.5738, + "step": 338 + }, + { + "epoch": 0.0920037883912867, + "grad_norm": 0.7297430038452148, + "learning_rate": 3.056807935076646e-05, + "loss": 1.5907, + "step": 340 + }, + { + "epoch": 0.09254498714652956, + "grad_norm": 0.6882812976837158, + "learning_rate": 3.074842200180343e-05, + "loss": 1.5767, + "step": 342 + }, + { + "epoch": 0.09308618590177242, + "grad_norm": 0.6150392889976501, + "learning_rate": 3.0928764652840394e-05, + "loss": 1.5747, + "step": 344 + }, + { + "epoch": 0.09362738465701528, + "grad_norm": 0.6230599284172058, + "learning_rate": 3.110910730387737e-05, + "loss": 1.583, + "step": 346 + }, + { + "epoch": 0.09416858341225816, + "grad_norm": 0.6081874966621399, + "learning_rate": 3.128944995491434e-05, + "loss": 1.5875, + "step": 348 + }, + { + "epoch": 0.09470978216750102, + "grad_norm": 0.5467821955680847, + "learning_rate": 3.146979260595131e-05, + "loss": 1.575, + "step": 350 + }, + { + "epoch": 0.09525098092274388, + "grad_norm": 0.5629361271858215, + "learning_rate": 3.165013525698828e-05, + "loss": 1.5828, + "step": 352 + }, + { + "epoch": 0.09579217967798674, + "grad_norm": 0.5995283126831055, + "learning_rate": 3.1830477908025245e-05, + "loss": 1.5872, + "step": 354 + }, + { + "epoch": 0.0963333784332296, + "grad_norm": 0.556450366973877, + "learning_rate": 3.201082055906222e-05, + "loss": 1.553, + "step": 356 + }, + { + "epoch": 0.09687457718847246, + "grad_norm": 0.6498537063598633, + "learning_rate": 3.219116321009919e-05, + "loss": 1.5667, + "step": 358 + }, + { + "epoch": 0.09741577594371532, + "grad_norm": 0.5891172885894775, + "learning_rate": 3.237150586113616e-05, + "loss": 1.5818, + "step": 360 + }, + { + "epoch": 0.0979569746989582, + "grad_norm": 0.6487797498703003, + "learning_rate": 3.2551848512173136e-05, + "loss": 1.5582, + "step": 362 + }, + { + "epoch": 0.09849817345420106, + "grad_norm": 0.5860658884048462, + "learning_rate": 3.27321911632101e-05, + "loss": 1.5725, + "step": 364 + }, + { + "epoch": 0.09903937220944392, + "grad_norm": 0.5619581937789917, + "learning_rate": 3.291253381424707e-05, + "loss": 1.5779, + "step": 366 + }, + { + "epoch": 0.09958057096468678, + "grad_norm": 0.7147429585456848, + "learning_rate": 3.309287646528404e-05, + "loss": 1.5766, + "step": 368 + }, + { + "epoch": 0.10012176971992964, + "grad_norm": 0.5840562582015991, + "learning_rate": 3.327321911632101e-05, + "loss": 1.5609, + "step": 370 + }, + { + "epoch": 0.1006629684751725, + "grad_norm": 0.6277860403060913, + "learning_rate": 3.345356176735798e-05, + "loss": 1.5645, + "step": 372 + }, + { + "epoch": 0.10120416723041536, + "grad_norm": 0.6395567655563354, + "learning_rate": 3.3633904418394954e-05, + "loss": 1.545, + "step": 374 + }, + { + "epoch": 0.10174536598565824, + "grad_norm": 0.6651553511619568, + "learning_rate": 3.381424706943192e-05, + "loss": 1.5643, + "step": 376 + }, + { + "epoch": 0.1022865647409011, + "grad_norm": 0.6691033244132996, + "learning_rate": 3.3994589720468896e-05, + "loss": 1.5705, + "step": 378 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 0.5426511764526367, + "learning_rate": 3.4174932371505863e-05, + "loss": 1.536, + "step": 380 + }, + { + "epoch": 0.10336896225138682, + "grad_norm": 0.6677694916725159, + "learning_rate": 3.435527502254283e-05, + "loss": 1.5664, + "step": 382 + }, + { + "epoch": 0.10391016100662968, + "grad_norm": 0.5283762216567993, + "learning_rate": 3.45356176735798e-05, + "loss": 1.5474, + "step": 384 + }, + { + "epoch": 0.10445135976187254, + "grad_norm": 0.652812659740448, + "learning_rate": 3.471596032461677e-05, + "loss": 1.5509, + "step": 386 + }, + { + "epoch": 0.1049925585171154, + "grad_norm": 0.8639987111091614, + "learning_rate": 3.489630297565375e-05, + "loss": 1.5563, + "step": 388 + }, + { + "epoch": 0.10553375727235827, + "grad_norm": 0.7726946473121643, + "learning_rate": 3.5076645626690715e-05, + "loss": 1.5682, + "step": 390 + }, + { + "epoch": 0.10607495602760114, + "grad_norm": 0.6511155962944031, + "learning_rate": 3.525698827772768e-05, + "loss": 1.5571, + "step": 392 + }, + { + "epoch": 0.106616154782844, + "grad_norm": 0.6578395962715149, + "learning_rate": 3.5437330928764656e-05, + "loss": 1.5452, + "step": 394 + }, + { + "epoch": 0.10715735353808686, + "grad_norm": 0.642919659614563, + "learning_rate": 3.5617673579801624e-05, + "loss": 1.5508, + "step": 396 + }, + { + "epoch": 0.10769855229332972, + "grad_norm": 0.5190348029136658, + "learning_rate": 3.579801623083859e-05, + "loss": 1.5432, + "step": 398 + }, + { + "epoch": 0.10823975104857259, + "grad_norm": 0.48932549357414246, + "learning_rate": 3.5978358881875566e-05, + "loss": 1.5544, + "step": 400 + }, + { + "epoch": 0.10878094980381545, + "grad_norm": 0.5018340945243835, + "learning_rate": 3.615870153291254e-05, + "loss": 1.5322, + "step": 402 + }, + { + "epoch": 0.10932214855905831, + "grad_norm": 0.5701499581336975, + "learning_rate": 3.633904418394951e-05, + "loss": 1.5288, + "step": 404 + }, + { + "epoch": 0.10986334731430118, + "grad_norm": 0.6049205660820007, + "learning_rate": 3.6519386834986475e-05, + "loss": 1.5627, + "step": 406 + }, + { + "epoch": 0.11040454606954404, + "grad_norm": 0.5781517028808594, + "learning_rate": 3.669972948602345e-05, + "loss": 1.542, + "step": 408 + }, + { + "epoch": 0.1109457448247869, + "grad_norm": 0.5594660043716431, + "learning_rate": 3.688007213706042e-05, + "loss": 1.5461, + "step": 410 + }, + { + "epoch": 0.11148694358002977, + "grad_norm": 0.5319619178771973, + "learning_rate": 3.7060414788097384e-05, + "loss": 1.5668, + "step": 412 + }, + { + "epoch": 0.11202814233527263, + "grad_norm": 0.5311123728752136, + "learning_rate": 3.724075743913435e-05, + "loss": 1.528, + "step": 414 + }, + { + "epoch": 0.11256934109051549, + "grad_norm": 0.5555101633071899, + "learning_rate": 3.7421100090171326e-05, + "loss": 1.5392, + "step": 416 + }, + { + "epoch": 0.11311053984575835, + "grad_norm": 0.5486223101615906, + "learning_rate": 3.76014427412083e-05, + "loss": 1.5337, + "step": 418 + }, + { + "epoch": 0.11365173860100122, + "grad_norm": 0.5156669020652771, + "learning_rate": 3.778178539224527e-05, + "loss": 1.5105, + "step": 420 + }, + { + "epoch": 0.11419293735624408, + "grad_norm": 0.49596554040908813, + "learning_rate": 3.7962128043282235e-05, + "loss": 1.515, + "step": 422 + }, + { + "epoch": 0.11473413611148695, + "grad_norm": 0.641333281993866, + "learning_rate": 3.814247069431921e-05, + "loss": 1.5328, + "step": 424 + }, + { + "epoch": 0.1152753348667298, + "grad_norm": 0.6106113195419312, + "learning_rate": 3.832281334535618e-05, + "loss": 1.5189, + "step": 426 + }, + { + "epoch": 0.11581653362197267, + "grad_norm": 0.5619134306907654, + "learning_rate": 3.8503155996393145e-05, + "loss": 1.5295, + "step": 428 + }, + { + "epoch": 0.11635773237721553, + "grad_norm": 0.5396978259086609, + "learning_rate": 3.868349864743012e-05, + "loss": 1.5173, + "step": 430 + }, + { + "epoch": 0.11689893113245839, + "grad_norm": 0.5466894507408142, + "learning_rate": 3.886384129846709e-05, + "loss": 1.5191, + "step": 432 + }, + { + "epoch": 0.11744012988770126, + "grad_norm": 0.5601218342781067, + "learning_rate": 3.904418394950406e-05, + "loss": 1.5285, + "step": 434 + }, + { + "epoch": 0.11798132864294412, + "grad_norm": 0.6620492935180664, + "learning_rate": 3.922452660054103e-05, + "loss": 1.4946, + "step": 436 + }, + { + "epoch": 0.11852252739818699, + "grad_norm": 0.49140048027038574, + "learning_rate": 3.9404869251578e-05, + "loss": 1.512, + "step": 438 + }, + { + "epoch": 0.11906372615342985, + "grad_norm": 0.5824118256568909, + "learning_rate": 3.958521190261497e-05, + "loss": 1.5244, + "step": 440 + }, + { + "epoch": 0.11960492490867271, + "grad_norm": 0.4967150092124939, + "learning_rate": 3.976555455365194e-05, + "loss": 1.5273, + "step": 442 + }, + { + "epoch": 0.12014612366391557, + "grad_norm": 0.5089767575263977, + "learning_rate": 3.994589720468891e-05, + "loss": 1.5119, + "step": 444 + }, + { + "epoch": 0.12068732241915843, + "grad_norm": 0.5404312014579773, + "learning_rate": 4.0126239855725886e-05, + "loss": 1.5072, + "step": 446 + }, + { + "epoch": 0.1212285211744013, + "grad_norm": 0.5239550471305847, + "learning_rate": 4.0306582506762853e-05, + "loss": 1.5336, + "step": 448 + }, + { + "epoch": 0.12176971992964417, + "grad_norm": 0.4974781274795532, + "learning_rate": 4.048692515779982e-05, + "loss": 1.5225, + "step": 450 + }, + { + "epoch": 0.12231091868488703, + "grad_norm": 0.5363791584968567, + "learning_rate": 4.066726780883679e-05, + "loss": 1.5176, + "step": 452 + }, + { + "epoch": 0.12285211744012989, + "grad_norm": 0.5095157027244568, + "learning_rate": 4.084761045987376e-05, + "loss": 1.4936, + "step": 454 + }, + { + "epoch": 0.12339331619537275, + "grad_norm": 0.4920356869697571, + "learning_rate": 4.102795311091073e-05, + "loss": 1.5269, + "step": 456 + }, + { + "epoch": 0.12393451495061561, + "grad_norm": 0.4940793514251709, + "learning_rate": 4.1208295761947705e-05, + "loss": 1.5072, + "step": 458 + }, + { + "epoch": 0.12447571370585847, + "grad_norm": 0.4805227220058441, + "learning_rate": 4.138863841298467e-05, + "loss": 1.4987, + "step": 460 + }, + { + "epoch": 0.12501691246110133, + "grad_norm": 0.49683934450149536, + "learning_rate": 4.1568981064021646e-05, + "loss": 1.5008, + "step": 462 + }, + { + "epoch": 0.1255581112163442, + "grad_norm": 0.5283801555633545, + "learning_rate": 4.1749323715058614e-05, + "loss": 1.5177, + "step": 464 + }, + { + "epoch": 0.12609930997158705, + "grad_norm": 0.5395119190216064, + "learning_rate": 4.192966636609558e-05, + "loss": 1.5106, + "step": 466 + }, + { + "epoch": 0.12664050872682991, + "grad_norm": 0.5403693914413452, + "learning_rate": 4.211000901713255e-05, + "loss": 1.4854, + "step": 468 + }, + { + "epoch": 0.1271817074820728, + "grad_norm": 0.4690951406955719, + "learning_rate": 4.229035166816952e-05, + "loss": 1.5079, + "step": 470 + }, + { + "epoch": 0.12772290623731566, + "grad_norm": 0.5077293515205383, + "learning_rate": 4.24706943192065e-05, + "loss": 1.4953, + "step": 472 + }, + { + "epoch": 0.12826410499255853, + "grad_norm": 0.440019816160202, + "learning_rate": 4.2651036970243465e-05, + "loss": 1.4864, + "step": 474 + }, + { + "epoch": 0.1288053037478014, + "grad_norm": 0.48672759532928467, + "learning_rate": 4.283137962128044e-05, + "loss": 1.5205, + "step": 476 + }, + { + "epoch": 0.12934650250304425, + "grad_norm": 0.4732811450958252, + "learning_rate": 4.301172227231741e-05, + "loss": 1.4998, + "step": 478 + }, + { + "epoch": 0.1298877012582871, + "grad_norm": 0.46713048219680786, + "learning_rate": 4.3192064923354374e-05, + "loss": 1.4893, + "step": 480 + }, + { + "epoch": 0.13042890001352997, + "grad_norm": 0.502356231212616, + "learning_rate": 4.337240757439134e-05, + "loss": 1.5125, + "step": 482 + }, + { + "epoch": 0.13097009876877283, + "grad_norm": 0.45067864656448364, + "learning_rate": 4.3552750225428316e-05, + "loss": 1.4978, + "step": 484 + }, + { + "epoch": 0.1315112975240157, + "grad_norm": 0.46964120864868164, + "learning_rate": 4.373309287646529e-05, + "loss": 1.5006, + "step": 486 + }, + { + "epoch": 0.13205249627925855, + "grad_norm": 0.47723180055618286, + "learning_rate": 4.391343552750226e-05, + "loss": 1.513, + "step": 488 + }, + { + "epoch": 0.1325936950345014, + "grad_norm": 0.5100542306900024, + "learning_rate": 4.4093778178539225e-05, + "loss": 1.5279, + "step": 490 + }, + { + "epoch": 0.13313489378974427, + "grad_norm": 0.5344257354736328, + "learning_rate": 4.42741208295762e-05, + "loss": 1.5193, + "step": 492 + }, + { + "epoch": 0.13367609254498714, + "grad_norm": 0.5867893695831299, + "learning_rate": 4.445446348061317e-05, + "loss": 1.512, + "step": 494 + }, + { + "epoch": 0.13421729130023, + "grad_norm": 0.7811394929885864, + "learning_rate": 4.4634806131650134e-05, + "loss": 1.5038, + "step": 496 + }, + { + "epoch": 0.13475849005547288, + "grad_norm": 0.8505339622497559, + "learning_rate": 4.48151487826871e-05, + "loss": 1.5169, + "step": 498 + }, + { + "epoch": 0.13529968881071575, + "grad_norm": 0.6337641477584839, + "learning_rate": 4.4995491433724076e-05, + "loss": 1.4951, + "step": 500 + }, + { + "epoch": 0.1358408875659586, + "grad_norm": 0.7979961633682251, + "learning_rate": 4.517583408476105e-05, + "loss": 1.5031, + "step": 502 + }, + { + "epoch": 0.13638208632120147, + "grad_norm": 0.6946894526481628, + "learning_rate": 4.535617673579802e-05, + "loss": 1.501, + "step": 504 + }, + { + "epoch": 0.13692328507644433, + "grad_norm": 0.6830259561538696, + "learning_rate": 4.5536519386834986e-05, + "loss": 1.4896, + "step": 506 + }, + { + "epoch": 0.1374644838316872, + "grad_norm": 0.5908662676811218, + "learning_rate": 4.571686203787196e-05, + "loss": 1.4992, + "step": 508 + }, + { + "epoch": 0.13800568258693005, + "grad_norm": 0.7655865550041199, + "learning_rate": 4.589720468890893e-05, + "loss": 1.4911, + "step": 510 + }, + { + "epoch": 0.1385468813421729, + "grad_norm": 0.5924785733222961, + "learning_rate": 4.6077547339945895e-05, + "loss": 1.4719, + "step": 512 + }, + { + "epoch": 0.13908808009741577, + "grad_norm": 0.6654263138771057, + "learning_rate": 4.625788999098287e-05, + "loss": 1.5109, + "step": 514 + }, + { + "epoch": 0.13962927885265863, + "grad_norm": 0.5296297073364258, + "learning_rate": 4.6438232642019843e-05, + "loss": 1.4934, + "step": 516 + }, + { + "epoch": 0.1401704776079015, + "grad_norm": 0.5698690414428711, + "learning_rate": 4.661857529305681e-05, + "loss": 1.4954, + "step": 518 + }, + { + "epoch": 0.14071167636314436, + "grad_norm": 0.5790325403213501, + "learning_rate": 4.679891794409378e-05, + "loss": 1.4673, + "step": 520 + }, + { + "epoch": 0.14125287511838722, + "grad_norm": 0.551480770111084, + "learning_rate": 4.697926059513075e-05, + "loss": 1.476, + "step": 522 + }, + { + "epoch": 0.14179407387363008, + "grad_norm": 0.5201780796051025, + "learning_rate": 4.715960324616772e-05, + "loss": 1.4701, + "step": 524 + }, + { + "epoch": 0.14233527262887297, + "grad_norm": 0.46442562341690063, + "learning_rate": 4.733994589720469e-05, + "loss": 1.4831, + "step": 526 + }, + { + "epoch": 0.14287647138411583, + "grad_norm": 0.5558522939682007, + "learning_rate": 4.752028854824166e-05, + "loss": 1.4729, + "step": 528 + }, + { + "epoch": 0.1434176701393587, + "grad_norm": 0.48511791229248047, + "learning_rate": 4.7700631199278636e-05, + "loss": 1.4742, + "step": 530 + }, + { + "epoch": 0.14395886889460155, + "grad_norm": 0.5244829058647156, + "learning_rate": 4.7880973850315604e-05, + "loss": 1.4928, + "step": 532 + }, + { + "epoch": 0.1445000676498444, + "grad_norm": 0.48878946900367737, + "learning_rate": 4.806131650135257e-05, + "loss": 1.4921, + "step": 534 + }, + { + "epoch": 0.14504126640508727, + "grad_norm": 0.5348760485649109, + "learning_rate": 4.824165915238954e-05, + "loss": 1.4917, + "step": 536 + }, + { + "epoch": 0.14558246516033013, + "grad_norm": 0.5444923639297485, + "learning_rate": 4.842200180342651e-05, + "loss": 1.4546, + "step": 538 + }, + { + "epoch": 0.146123663915573, + "grad_norm": 0.494761198759079, + "learning_rate": 4.860234445446348e-05, + "loss": 1.4751, + "step": 540 + }, + { + "epoch": 0.14666486267081585, + "grad_norm": 0.4921441674232483, + "learning_rate": 4.8782687105500455e-05, + "loss": 1.4767, + "step": 542 + }, + { + "epoch": 0.14720606142605872, + "grad_norm": 0.48382577300071716, + "learning_rate": 4.896302975653742e-05, + "loss": 1.485, + "step": 544 + }, + { + "epoch": 0.14774726018130158, + "grad_norm": 0.4616708755493164, + "learning_rate": 4.9143372407574397e-05, + "loss": 1.4732, + "step": 546 + }, + { + "epoch": 0.14828845893654444, + "grad_norm": 0.5030043125152588, + "learning_rate": 4.9323715058611364e-05, + "loss": 1.4799, + "step": 548 + }, + { + "epoch": 0.1488296576917873, + "grad_norm": 0.467230886220932, + "learning_rate": 4.950405770964833e-05, + "loss": 1.4594, + "step": 550 + }, + { + "epoch": 0.14937085644703016, + "grad_norm": 0.42864304780960083, + "learning_rate": 4.9684400360685306e-05, + "loss": 1.4748, + "step": 552 + }, + { + "epoch": 0.14991205520227305, + "grad_norm": 0.43733683228492737, + "learning_rate": 4.986474301172227e-05, + "loss": 1.462, + "step": 554 + }, + { + "epoch": 0.1504532539575159, + "grad_norm": 0.45550286769866943, + "learning_rate": 5.004508566275925e-05, + "loss": 1.475, + "step": 556 + }, + { + "epoch": 0.15099445271275877, + "grad_norm": 0.44999995827674866, + "learning_rate": 5.022542831379622e-05, + "loss": 1.4794, + "step": 558 + }, + { + "epoch": 0.15153565146800163, + "grad_norm": 0.5035279989242554, + "learning_rate": 5.040577096483319e-05, + "loss": 1.471, + "step": 560 + }, + { + "epoch": 0.1520768502232445, + "grad_norm": 0.44605591893196106, + "learning_rate": 5.058611361587016e-05, + "loss": 1.4461, + "step": 562 + }, + { + "epoch": 0.15261804897848735, + "grad_norm": 0.5482723712921143, + "learning_rate": 5.0766456266907124e-05, + "loss": 1.4597, + "step": 564 + }, + { + "epoch": 0.1531592477337302, + "grad_norm": 0.5323627591133118, + "learning_rate": 5.094679891794409e-05, + "loss": 1.4743, + "step": 566 + }, + { + "epoch": 0.15370044648897307, + "grad_norm": 0.5289944410324097, + "learning_rate": 5.1127141568981066e-05, + "loss": 1.5, + "step": 568 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.5446243286132812, + "learning_rate": 5.1307484220018034e-05, + "loss": 1.4751, + "step": 570 + }, + { + "epoch": 0.1547828439994588, + "grad_norm": 0.525830090045929, + "learning_rate": 5.1487826871055015e-05, + "loss": 1.4639, + "step": 572 + }, + { + "epoch": 0.15532404275470166, + "grad_norm": 0.48129191994667053, + "learning_rate": 5.166816952209198e-05, + "loss": 1.4652, + "step": 574 + }, + { + "epoch": 0.15586524150994452, + "grad_norm": 0.47915297746658325, + "learning_rate": 5.184851217312895e-05, + "loss": 1.4627, + "step": 576 + }, + { + "epoch": 0.15640644026518738, + "grad_norm": 0.5229325294494629, + "learning_rate": 5.202885482416592e-05, + "loss": 1.4525, + "step": 578 + }, + { + "epoch": 0.15694763902043024, + "grad_norm": 0.5452600121498108, + "learning_rate": 5.2209197475202885e-05, + "loss": 1.458, + "step": 580 + }, + { + "epoch": 0.15748883777567313, + "grad_norm": 0.427432656288147, + "learning_rate": 5.238954012623985e-05, + "loss": 1.4773, + "step": 582 + }, + { + "epoch": 0.158030036530916, + "grad_norm": 0.450712114572525, + "learning_rate": 5.2569882777276827e-05, + "loss": 1.469, + "step": 584 + }, + { + "epoch": 0.15857123528615885, + "grad_norm": 0.5500516891479492, + "learning_rate": 5.27502254283138e-05, + "loss": 1.4603, + "step": 586 + }, + { + "epoch": 0.1591124340414017, + "grad_norm": 0.457157164812088, + "learning_rate": 5.2930568079350775e-05, + "loss": 1.4785, + "step": 588 + }, + { + "epoch": 0.15965363279664457, + "grad_norm": 0.49750396609306335, + "learning_rate": 5.311091073038774e-05, + "loss": 1.4603, + "step": 590 + }, + { + "epoch": 0.16019483155188743, + "grad_norm": 0.5720525979995728, + "learning_rate": 5.329125338142471e-05, + "loss": 1.4753, + "step": 592 + }, + { + "epoch": 0.1607360303071303, + "grad_norm": 0.4425548315048218, + "learning_rate": 5.347159603246168e-05, + "loss": 1.462, + "step": 594 + }, + { + "epoch": 0.16127722906237316, + "grad_norm": 0.5064132809638977, + "learning_rate": 5.3651938683498645e-05, + "loss": 1.4596, + "step": 596 + }, + { + "epoch": 0.16181842781761602, + "grad_norm": 0.518460750579834, + "learning_rate": 5.383228133453562e-05, + "loss": 1.4763, + "step": 598 + }, + { + "epoch": 0.16235962657285888, + "grad_norm": 0.4613576829433441, + "learning_rate": 5.401262398557259e-05, + "loss": 1.4487, + "step": 600 + }, + { + "epoch": 0.16290082532810174, + "grad_norm": 0.7046213746070862, + "learning_rate": 5.419296663660957e-05, + "loss": 1.472, + "step": 602 + }, + { + "epoch": 0.1634420240833446, + "grad_norm": 0.6164196133613586, + "learning_rate": 5.4373309287646535e-05, + "loss": 1.4424, + "step": 604 + }, + { + "epoch": 0.16398322283858746, + "grad_norm": 0.5106020569801331, + "learning_rate": 5.45536519386835e-05, + "loss": 1.4567, + "step": 606 + }, + { + "epoch": 0.16452442159383032, + "grad_norm": 0.4291236400604248, + "learning_rate": 5.473399458972047e-05, + "loss": 1.4514, + "step": 608 + }, + { + "epoch": 0.16506562034907318, + "grad_norm": 0.46577414870262146, + "learning_rate": 5.491433724075744e-05, + "loss": 1.4408, + "step": 610 + }, + { + "epoch": 0.16560681910431607, + "grad_norm": 0.4729917049407959, + "learning_rate": 5.509467989179441e-05, + "loss": 1.4493, + "step": 612 + }, + { + "epoch": 0.16614801785955893, + "grad_norm": 0.4651925563812256, + "learning_rate": 5.527502254283138e-05, + "loss": 1.465, + "step": 614 + }, + { + "epoch": 0.1666892166148018, + "grad_norm": 0.4756859540939331, + "learning_rate": 5.545536519386835e-05, + "loss": 1.4641, + "step": 616 + }, + { + "epoch": 0.16723041537004465, + "grad_norm": 0.42555975914001465, + "learning_rate": 5.563570784490533e-05, + "loss": 1.4569, + "step": 618 + }, + { + "epoch": 0.16777161412528752, + "grad_norm": 0.5162522196769714, + "learning_rate": 5.5816050495942296e-05, + "loss": 1.4344, + "step": 620 + }, + { + "epoch": 0.16831281288053038, + "grad_norm": 0.5867063999176025, + "learning_rate": 5.599639314697926e-05, + "loss": 1.4647, + "step": 622 + }, + { + "epoch": 0.16885401163577324, + "grad_norm": 0.6629165410995483, + "learning_rate": 5.617673579801623e-05, + "loss": 1.473, + "step": 624 + }, + { + "epoch": 0.1693952103910161, + "grad_norm": 0.5905330777168274, + "learning_rate": 5.6357078449053205e-05, + "loss": 1.4459, + "step": 626 + }, + { + "epoch": 0.16993640914625896, + "grad_norm": 0.7457858324050903, + "learning_rate": 5.653742110009017e-05, + "loss": 1.4603, + "step": 628 + }, + { + "epoch": 0.17047760790150182, + "grad_norm": 0.5977684855461121, + "learning_rate": 5.671776375112714e-05, + "loss": 1.4621, + "step": 630 + }, + { + "epoch": 0.17101880665674468, + "grad_norm": 0.7097992897033691, + "learning_rate": 5.689810640216412e-05, + "loss": 1.4646, + "step": 632 + }, + { + "epoch": 0.17156000541198754, + "grad_norm": 0.5895450711250305, + "learning_rate": 5.707844905320109e-05, + "loss": 1.4338, + "step": 634 + }, + { + "epoch": 0.1721012041672304, + "grad_norm": 0.576877772808075, + "learning_rate": 5.7258791704238056e-05, + "loss": 1.4666, + "step": 636 + }, + { + "epoch": 0.17264240292247326, + "grad_norm": 0.541110098361969, + "learning_rate": 5.7439134355275024e-05, + "loss": 1.4624, + "step": 638 + }, + { + "epoch": 0.17318360167771615, + "grad_norm": 0.5172320604324341, + "learning_rate": 5.7619477006312e-05, + "loss": 1.473, + "step": 640 + }, + { + "epoch": 0.17372480043295901, + "grad_norm": 0.47511357069015503, + "learning_rate": 5.7799819657348965e-05, + "loss": 1.446, + "step": 642 + }, + { + "epoch": 0.17426599918820188, + "grad_norm": 0.48614808917045593, + "learning_rate": 5.798016230838593e-05, + "loss": 1.4394, + "step": 644 + }, + { + "epoch": 0.17480719794344474, + "grad_norm": 0.4435577094554901, + "learning_rate": 5.81605049594229e-05, + "loss": 1.43, + "step": 646 + }, + { + "epoch": 0.1753483966986876, + "grad_norm": 0.4458653926849365, + "learning_rate": 5.834084761045988e-05, + "loss": 1.46, + "step": 648 + }, + { + "epoch": 0.17588959545393046, + "grad_norm": 0.40675726532936096, + "learning_rate": 5.852119026149685e-05, + "loss": 1.4565, + "step": 650 + }, + { + "epoch": 0.17643079420917332, + "grad_norm": 0.4132504165172577, + "learning_rate": 5.8701532912533817e-05, + "loss": 1.4522, + "step": 652 + }, + { + "epoch": 0.17697199296441618, + "grad_norm": 0.40881386399269104, + "learning_rate": 5.888187556357079e-05, + "loss": 1.4232, + "step": 654 + }, + { + "epoch": 0.17751319171965904, + "grad_norm": 0.40527868270874023, + "learning_rate": 5.906221821460776e-05, + "loss": 1.441, + "step": 656 + }, + { + "epoch": 0.1780543904749019, + "grad_norm": 0.40227004885673523, + "learning_rate": 5.9242560865644726e-05, + "loss": 1.4259, + "step": 658 + }, + { + "epoch": 0.17859558923014476, + "grad_norm": 0.4043656289577484, + "learning_rate": 5.942290351668169e-05, + "loss": 1.4298, + "step": 660 + }, + { + "epoch": 0.17913678798538762, + "grad_norm": 0.4288482666015625, + "learning_rate": 5.9603246167718674e-05, + "loss": 1.4439, + "step": 662 + }, + { + "epoch": 0.17967798674063049, + "grad_norm": 0.4385060966014862, + "learning_rate": 5.978358881875564e-05, + "loss": 1.4237, + "step": 664 + }, + { + "epoch": 0.18021918549587335, + "grad_norm": 0.396980345249176, + "learning_rate": 5.996393146979261e-05, + "loss": 1.4174, + "step": 666 + }, + { + "epoch": 0.18076038425111624, + "grad_norm": 0.4060603678226471, + "learning_rate": 6.014427412082958e-05, + "loss": 1.4479, + "step": 668 + }, + { + "epoch": 0.1813015830063591, + "grad_norm": 0.4485025703907013, + "learning_rate": 6.032461677186655e-05, + "loss": 1.4493, + "step": 670 + }, + { + "epoch": 0.18184278176160196, + "grad_norm": 0.44034305214881897, + "learning_rate": 6.050495942290352e-05, + "loss": 1.4461, + "step": 672 + }, + { + "epoch": 0.18238398051684482, + "grad_norm": 0.418074369430542, + "learning_rate": 6.0685302073940486e-05, + "loss": 1.4287, + "step": 674 + }, + { + "epoch": 0.18292517927208768, + "grad_norm": 0.41937318444252014, + "learning_rate": 6.0865644724977454e-05, + "loss": 1.4338, + "step": 676 + }, + { + "epoch": 0.18346637802733054, + "grad_norm": 0.4103530943393707, + "learning_rate": 6.104598737601444e-05, + "loss": 1.4391, + "step": 678 + }, + { + "epoch": 0.1840075767825734, + "grad_norm": 0.4066039025783539, + "learning_rate": 6.122633002705141e-05, + "loss": 1.4357, + "step": 680 + }, + { + "epoch": 0.18454877553781626, + "grad_norm": 0.36903437972068787, + "learning_rate": 6.140667267808838e-05, + "loss": 1.4111, + "step": 682 + }, + { + "epoch": 0.18508997429305912, + "grad_norm": 0.37125757336616516, + "learning_rate": 6.158701532912534e-05, + "loss": 1.4233, + "step": 684 + }, + { + "epoch": 0.18563117304830198, + "grad_norm": 0.44102513790130615, + "learning_rate": 6.176735798016231e-05, + "loss": 1.4437, + "step": 686 + }, + { + "epoch": 0.18617237180354484, + "grad_norm": 0.4337277114391327, + "learning_rate": 6.194770063119928e-05, + "loss": 1.4425, + "step": 688 + }, + { + "epoch": 0.1867135705587877, + "grad_norm": 0.37394315004348755, + "learning_rate": 6.212804328223625e-05, + "loss": 1.4452, + "step": 690 + }, + { + "epoch": 0.18725476931403057, + "grad_norm": 0.41764944791793823, + "learning_rate": 6.230838593327321e-05, + "loss": 1.4535, + "step": 692 + }, + { + "epoch": 0.18779596806927343, + "grad_norm": 0.4214741289615631, + "learning_rate": 6.24887285843102e-05, + "loss": 1.4391, + "step": 694 + }, + { + "epoch": 0.18833716682451632, + "grad_norm": 0.4159027338027954, + "learning_rate": 6.266907123534716e-05, + "loss": 1.4197, + "step": 696 + }, + { + "epoch": 0.18887836557975918, + "grad_norm": 0.38865673542022705, + "learning_rate": 6.284941388638413e-05, + "loss": 1.4329, + "step": 698 + }, + { + "epoch": 0.18941956433500204, + "grad_norm": 0.43646490573883057, + "learning_rate": 6.30297565374211e-05, + "loss": 1.4147, + "step": 700 + }, + { + "epoch": 0.1899607630902449, + "grad_norm": 0.41997334361076355, + "learning_rate": 6.321009918845807e-05, + "loss": 1.4275, + "step": 702 + }, + { + "epoch": 0.19050196184548776, + "grad_norm": 0.38556602597236633, + "learning_rate": 6.339044183949505e-05, + "loss": 1.4258, + "step": 704 + }, + { + "epoch": 0.19104316060073062, + "grad_norm": 0.42955082654953003, + "learning_rate": 6.357078449053201e-05, + "loss": 1.4201, + "step": 706 + }, + { + "epoch": 0.19158435935597348, + "grad_norm": 0.3844427764415741, + "learning_rate": 6.3751127141569e-05, + "loss": 1.4448, + "step": 708 + }, + { + "epoch": 0.19212555811121634, + "grad_norm": 0.4312956929206848, + "learning_rate": 6.393146979260596e-05, + "loss": 1.4051, + "step": 710 + }, + { + "epoch": 0.1926667568664592, + "grad_norm": 0.4556865394115448, + "learning_rate": 6.411181244364293e-05, + "loss": 1.4305, + "step": 712 + }, + { + "epoch": 0.19320795562170207, + "grad_norm": 0.37053731083869934, + "learning_rate": 6.42921550946799e-05, + "loss": 1.4301, + "step": 714 + }, + { + "epoch": 0.19374915437694493, + "grad_norm": 0.3996010720729828, + "learning_rate": 6.447249774571686e-05, + "loss": 1.4282, + "step": 716 + }, + { + "epoch": 0.1942903531321878, + "grad_norm": 0.37610816955566406, + "learning_rate": 6.465284039675383e-05, + "loss": 1.4277, + "step": 718 + }, + { + "epoch": 0.19483155188743065, + "grad_norm": 0.3677166998386383, + "learning_rate": 6.48331830477908e-05, + "loss": 1.4029, + "step": 720 + }, + { + "epoch": 0.1953727506426735, + "grad_norm": 0.3841564357280731, + "learning_rate": 6.501352569882777e-05, + "loss": 1.4144, + "step": 722 + }, + { + "epoch": 0.1959139493979164, + "grad_norm": 0.3687719404697418, + "learning_rate": 6.519386834986475e-05, + "loss": 1.4079, + "step": 724 + }, + { + "epoch": 0.19645514815315926, + "grad_norm": 0.38350847363471985, + "learning_rate": 6.537421100090172e-05, + "loss": 1.4269, + "step": 726 + }, + { + "epoch": 0.19699634690840212, + "grad_norm": 0.39060813188552856, + "learning_rate": 6.555455365193868e-05, + "loss": 1.4265, + "step": 728 + }, + { + "epoch": 0.19753754566364498, + "grad_norm": 0.36068469285964966, + "learning_rate": 6.573489630297565e-05, + "loss": 1.4325, + "step": 730 + }, + { + "epoch": 0.19807874441888784, + "grad_norm": 0.41185086965560913, + "learning_rate": 6.591523895401263e-05, + "loss": 1.4348, + "step": 732 + }, + { + "epoch": 0.1986199431741307, + "grad_norm": 0.4441224932670593, + "learning_rate": 6.60955816050496e-05, + "loss": 1.4103, + "step": 734 + }, + { + "epoch": 0.19916114192937356, + "grad_norm": 0.3727317452430725, + "learning_rate": 6.627592425608657e-05, + "loss": 1.4188, + "step": 736 + }, + { + "epoch": 0.19970234068461643, + "grad_norm": 0.394972562789917, + "learning_rate": 6.645626690712355e-05, + "loss": 1.4095, + "step": 738 + }, + { + "epoch": 0.20024353943985929, + "grad_norm": 0.40716880559921265, + "learning_rate": 6.663660955816052e-05, + "loss": 1.4127, + "step": 740 + }, + { + "epoch": 0.20078473819510215, + "grad_norm": 0.4156644344329834, + "learning_rate": 6.681695220919748e-05, + "loss": 1.4189, + "step": 742 + }, + { + "epoch": 0.201325936950345, + "grad_norm": 0.3787958323955536, + "learning_rate": 6.699729486023445e-05, + "loss": 1.4221, + "step": 744 + }, + { + "epoch": 0.20186713570558787, + "grad_norm": 0.42427608370780945, + "learning_rate": 6.717763751127142e-05, + "loss": 1.4192, + "step": 746 + }, + { + "epoch": 0.20240833446083073, + "grad_norm": 0.4778277277946472, + "learning_rate": 6.735798016230839e-05, + "loss": 1.4024, + "step": 748 + }, + { + "epoch": 0.2029495332160736, + "grad_norm": 0.44801151752471924, + "learning_rate": 6.753832281334535e-05, + "loss": 1.4222, + "step": 750 + }, + { + "epoch": 0.20349073197131648, + "grad_norm": 0.46737611293792725, + "learning_rate": 6.771866546438232e-05, + "loss": 1.4117, + "step": 752 + }, + { + "epoch": 0.20403193072655934, + "grad_norm": 0.4184872806072235, + "learning_rate": 6.78990081154193e-05, + "loss": 1.4066, + "step": 754 + }, + { + "epoch": 0.2045731294818022, + "grad_norm": 0.40458211302757263, + "learning_rate": 6.807935076645627e-05, + "loss": 1.4274, + "step": 756 + }, + { + "epoch": 0.20511432823704506, + "grad_norm": 0.43926185369491577, + "learning_rate": 6.825969341749324e-05, + "loss": 1.4231, + "step": 758 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.4434867203235626, + "learning_rate": 6.844003606853022e-05, + "loss": 1.4121, + "step": 760 + }, + { + "epoch": 0.20619672574753078, + "grad_norm": 0.4500143826007843, + "learning_rate": 6.862037871956719e-05, + "loss": 1.4179, + "step": 762 + }, + { + "epoch": 0.20673792450277365, + "grad_norm": 0.45456650853157043, + "learning_rate": 6.880072137060415e-05, + "loss": 1.3912, + "step": 764 + }, + { + "epoch": 0.2072791232580165, + "grad_norm": 0.4214187264442444, + "learning_rate": 6.898106402164112e-05, + "loss": 1.3962, + "step": 766 + }, + { + "epoch": 0.20782032201325937, + "grad_norm": 0.427682101726532, + "learning_rate": 6.916140667267809e-05, + "loss": 1.4316, + "step": 768 + }, + { + "epoch": 0.20836152076850223, + "grad_norm": 0.44491469860076904, + "learning_rate": 6.934174932371507e-05, + "loss": 1.4218, + "step": 770 + }, + { + "epoch": 0.2089027195237451, + "grad_norm": 0.42736080288887024, + "learning_rate": 6.952209197475204e-05, + "loss": 1.3931, + "step": 772 + }, + { + "epoch": 0.20944391827898795, + "grad_norm": 0.4041571021080017, + "learning_rate": 6.9702434625789e-05, + "loss": 1.4201, + "step": 774 + }, + { + "epoch": 0.2099851170342308, + "grad_norm": 0.4250961244106293, + "learning_rate": 6.988277727682597e-05, + "loss": 1.4299, + "step": 776 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.4335261881351471, + "learning_rate": 7.006311992786294e-05, + "loss": 1.4125, + "step": 778 + }, + { + "epoch": 0.21106751454471653, + "grad_norm": 0.42000851035118103, + "learning_rate": 7.02434625788999e-05, + "loss": 1.3969, + "step": 780 + }, + { + "epoch": 0.21160871329995942, + "grad_norm": 0.38111838698387146, + "learning_rate": 7.042380522993687e-05, + "loss": 1.3795, + "step": 782 + }, + { + "epoch": 0.21214991205520228, + "grad_norm": 0.38366812467575073, + "learning_rate": 7.060414788097385e-05, + "loss": 1.4041, + "step": 784 + }, + { + "epoch": 0.21269111081044514, + "grad_norm": 0.4334602355957031, + "learning_rate": 7.078449053201082e-05, + "loss": 1.415, + "step": 786 + }, + { + "epoch": 0.213232309565688, + "grad_norm": 0.40296411514282227, + "learning_rate": 7.096483318304779e-05, + "loss": 1.4052, + "step": 788 + }, + { + "epoch": 0.21377350832093087, + "grad_norm": 0.4197232723236084, + "learning_rate": 7.114517583408477e-05, + "loss": 1.4205, + "step": 790 + }, + { + "epoch": 0.21431470707617373, + "grad_norm": 0.40287715196609497, + "learning_rate": 7.132551848512174e-05, + "loss": 1.4047, + "step": 792 + }, + { + "epoch": 0.2148559058314166, + "grad_norm": 0.37324196100234985, + "learning_rate": 7.15058611361587e-05, + "loss": 1.4398, + "step": 794 + }, + { + "epoch": 0.21539710458665945, + "grad_norm": 0.4409985840320587, + "learning_rate": 7.168620378719567e-05, + "loss": 1.3873, + "step": 796 + }, + { + "epoch": 0.2159383033419023, + "grad_norm": 0.41441893577575684, + "learning_rate": 7.186654643823264e-05, + "loss": 1.4174, + "step": 798 + }, + { + "epoch": 0.21647950209714517, + "grad_norm": 0.4271719455718994, + "learning_rate": 7.204688908926962e-05, + "loss": 1.3987, + "step": 800 + }, + { + "epoch": 0.21702070085238803, + "grad_norm": 0.4969992935657501, + "learning_rate": 7.222723174030659e-05, + "loss": 1.4049, + "step": 802 + }, + { + "epoch": 0.2175618996076309, + "grad_norm": 0.45711180567741394, + "learning_rate": 7.240757439134356e-05, + "loss": 1.4061, + "step": 804 + }, + { + "epoch": 0.21810309836287375, + "grad_norm": 0.4479979872703552, + "learning_rate": 7.258791704238052e-05, + "loss": 1.4049, + "step": 806 + }, + { + "epoch": 0.21864429711811662, + "grad_norm": 0.4708006978034973, + "learning_rate": 7.276825969341749e-05, + "loss": 1.3971, + "step": 808 + }, + { + "epoch": 0.2191854958733595, + "grad_norm": 0.4387456774711609, + "learning_rate": 7.294860234445446e-05, + "loss": 1.4272, + "step": 810 + }, + { + "epoch": 0.21972669462860236, + "grad_norm": 0.5285756587982178, + "learning_rate": 7.312894499549143e-05, + "loss": 1.3902, + "step": 812 + }, + { + "epoch": 0.22026789338384523, + "grad_norm": 0.5111876726150513, + "learning_rate": 7.330928764652841e-05, + "loss": 1.4176, + "step": 814 + }, + { + "epoch": 0.2208090921390881, + "grad_norm": 0.4643821716308594, + "learning_rate": 7.348963029756538e-05, + "loss": 1.4216, + "step": 816 + }, + { + "epoch": 0.22135029089433095, + "grad_norm": 0.5162214040756226, + "learning_rate": 7.366997294860236e-05, + "loss": 1.4025, + "step": 818 + }, + { + "epoch": 0.2218914896495738, + "grad_norm": 0.4296860992908478, + "learning_rate": 7.385031559963932e-05, + "loss": 1.3919, + "step": 820 + }, + { + "epoch": 0.22243268840481667, + "grad_norm": 0.4449775815010071, + "learning_rate": 7.403065825067629e-05, + "loss": 1.4002, + "step": 822 + }, + { + "epoch": 0.22297388716005953, + "grad_norm": 0.39713212847709656, + "learning_rate": 7.421100090171326e-05, + "loss": 1.4012, + "step": 824 + }, + { + "epoch": 0.2235150859153024, + "grad_norm": 0.41655346751213074, + "learning_rate": 7.439134355275023e-05, + "loss": 1.4155, + "step": 826 + }, + { + "epoch": 0.22405628467054525, + "grad_norm": 0.3751365542411804, + "learning_rate": 7.45716862037872e-05, + "loss": 1.4021, + "step": 828 + }, + { + "epoch": 0.2245974834257881, + "grad_norm": 0.41483408212661743, + "learning_rate": 7.475202885482417e-05, + "loss": 1.4207, + "step": 830 + }, + { + "epoch": 0.22513868218103097, + "grad_norm": 0.397360235452652, + "learning_rate": 7.493237150586114e-05, + "loss": 1.392, + "step": 832 + }, + { + "epoch": 0.22567988093627384, + "grad_norm": 0.3874877691268921, + "learning_rate": 7.511271415689811e-05, + "loss": 1.4143, + "step": 834 + }, + { + "epoch": 0.2262210796915167, + "grad_norm": 0.4382254481315613, + "learning_rate": 7.529305680793508e-05, + "loss": 1.4109, + "step": 836 + }, + { + "epoch": 0.22676227844675959, + "grad_norm": 0.3728530704975128, + "learning_rate": 7.547339945897204e-05, + "loss": 1.4215, + "step": 838 + }, + { + "epoch": 0.22730347720200245, + "grad_norm": 0.41155338287353516, + "learning_rate": 7.565374211000901e-05, + "loss": 1.3963, + "step": 840 + }, + { + "epoch": 0.2278446759572453, + "grad_norm": 0.3550320267677307, + "learning_rate": 7.5834084761046e-05, + "loss": 1.3998, + "step": 842 + }, + { + "epoch": 0.22838587471248817, + "grad_norm": 0.3858035206794739, + "learning_rate": 7.601442741208296e-05, + "loss": 1.387, + "step": 844 + }, + { + "epoch": 0.22892707346773103, + "grad_norm": 0.38636457920074463, + "learning_rate": 7.619477006311994e-05, + "loss": 1.387, + "step": 846 + }, + { + "epoch": 0.2294682722229739, + "grad_norm": 0.41915518045425415, + "learning_rate": 7.637511271415691e-05, + "loss": 1.3917, + "step": 848 + }, + { + "epoch": 0.23000947097821675, + "grad_norm": 0.35796865820884705, + "learning_rate": 7.655545536519388e-05, + "loss": 1.406, + "step": 850 + }, + { + "epoch": 0.2305506697334596, + "grad_norm": 0.35221853852272034, + "learning_rate": 7.673579801623084e-05, + "loss": 1.3892, + "step": 852 + }, + { + "epoch": 0.23109186848870247, + "grad_norm": 0.3815077245235443, + "learning_rate": 7.691614066726781e-05, + "loss": 1.3845, + "step": 854 + }, + { + "epoch": 0.23163306724394533, + "grad_norm": 0.3554491400718689, + "learning_rate": 7.709648331830478e-05, + "loss": 1.3644, + "step": 856 + }, + { + "epoch": 0.2321742659991882, + "grad_norm": 0.3762814998626709, + "learning_rate": 7.727682596934175e-05, + "loss": 1.3976, + "step": 858 + }, + { + "epoch": 0.23271546475443106, + "grad_norm": 0.34575173258781433, + "learning_rate": 7.745716862037873e-05, + "loss": 1.3925, + "step": 860 + }, + { + "epoch": 0.23325666350967392, + "grad_norm": 0.37864556908607483, + "learning_rate": 7.76375112714157e-05, + "loss": 1.3993, + "step": 862 + }, + { + "epoch": 0.23379786226491678, + "grad_norm": 0.34448474645614624, + "learning_rate": 7.781785392245266e-05, + "loss": 1.3855, + "step": 864 + }, + { + "epoch": 0.23433906102015967, + "grad_norm": 0.40932390093803406, + "learning_rate": 7.799819657348963e-05, + "loss": 1.395, + "step": 866 + }, + { + "epoch": 0.23488025977540253, + "grad_norm": 0.3737650513648987, + "learning_rate": 7.81785392245266e-05, + "loss": 1.3918, + "step": 868 + }, + { + "epoch": 0.2354214585306454, + "grad_norm": 0.42988118529319763, + "learning_rate": 7.835888187556357e-05, + "loss": 1.3837, + "step": 870 + }, + { + "epoch": 0.23596265728588825, + "grad_norm": 0.3865496814250946, + "learning_rate": 7.853922452660055e-05, + "loss": 1.3976, + "step": 872 + }, + { + "epoch": 0.2365038560411311, + "grad_norm": 0.3682670295238495, + "learning_rate": 7.871956717763751e-05, + "loss": 1.3792, + "step": 874 + }, + { + "epoch": 0.23704505479637397, + "grad_norm": 0.4236462712287903, + "learning_rate": 7.88999098286745e-05, + "loss": 1.4032, + "step": 876 + }, + { + "epoch": 0.23758625355161683, + "grad_norm": 0.3742213249206543, + "learning_rate": 7.908025247971146e-05, + "loss": 1.3709, + "step": 878 + }, + { + "epoch": 0.2381274523068597, + "grad_norm": 0.38234424591064453, + "learning_rate": 7.926059513074843e-05, + "loss": 1.3862, + "step": 880 + }, + { + "epoch": 0.23866865106210255, + "grad_norm": 0.37414151430130005, + "learning_rate": 7.94409377817854e-05, + "loss": 1.3751, + "step": 882 + }, + { + "epoch": 0.23920984981734542, + "grad_norm": 0.3838132619857788, + "learning_rate": 7.962128043282237e-05, + "loss": 1.3805, + "step": 884 + }, + { + "epoch": 0.23975104857258828, + "grad_norm": 0.3818622827529907, + "learning_rate": 7.980162308385933e-05, + "loss": 1.3735, + "step": 886 + }, + { + "epoch": 0.24029224732783114, + "grad_norm": 0.38791927695274353, + "learning_rate": 7.99819657348963e-05, + "loss": 1.3958, + "step": 888 + }, + { + "epoch": 0.240833446083074, + "grad_norm": 0.4164978861808777, + "learning_rate": 8.016230838593328e-05, + "loss": 1.421, + "step": 890 + }, + { + "epoch": 0.24137464483831686, + "grad_norm": 0.3721414804458618, + "learning_rate": 8.034265103697025e-05, + "loss": 1.3977, + "step": 892 + }, + { + "epoch": 0.24191584359355975, + "grad_norm": 0.37698984146118164, + "learning_rate": 8.052299368800722e-05, + "loss": 1.3854, + "step": 894 + }, + { + "epoch": 0.2424570423488026, + "grad_norm": 0.3553116023540497, + "learning_rate": 8.070333633904418e-05, + "loss": 1.3925, + "step": 896 + }, + { + "epoch": 0.24299824110404547, + "grad_norm": 0.37809059023857117, + "learning_rate": 8.088367899008115e-05, + "loss": 1.368, + "step": 898 + }, + { + "epoch": 0.24353943985928833, + "grad_norm": 0.3835943043231964, + "learning_rate": 8.106402164111813e-05, + "loss": 1.3992, + "step": 900 + }, + { + "epoch": 0.2440806386145312, + "grad_norm": 0.4013379216194153, + "learning_rate": 8.12443642921551e-05, + "loss": 1.3912, + "step": 902 + }, + { + "epoch": 0.24462183736977405, + "grad_norm": 0.37845560908317566, + "learning_rate": 8.142470694319207e-05, + "loss": 1.3934, + "step": 904 + }, + { + "epoch": 0.24516303612501691, + "grad_norm": 0.39762255549430847, + "learning_rate": 8.160504959422905e-05, + "loss": 1.3782, + "step": 906 + }, + { + "epoch": 0.24570423488025978, + "grad_norm": 0.36652496457099915, + "learning_rate": 8.178539224526602e-05, + "loss": 1.3787, + "step": 908 + }, + { + "epoch": 0.24624543363550264, + "grad_norm": 0.39953047037124634, + "learning_rate": 8.196573489630298e-05, + "loss": 1.3752, + "step": 910 + }, + { + "epoch": 0.2467866323907455, + "grad_norm": 0.35875022411346436, + "learning_rate": 8.214607754733995e-05, + "loss": 1.3768, + "step": 912 + }, + { + "epoch": 0.24732783114598836, + "grad_norm": 0.3617067337036133, + "learning_rate": 8.232642019837692e-05, + "loss": 1.3859, + "step": 914 + }, + { + "epoch": 0.24786902990123122, + "grad_norm": 0.38250839710235596, + "learning_rate": 8.250676284941389e-05, + "loss": 1.3897, + "step": 916 + }, + { + "epoch": 0.24841022865647408, + "grad_norm": 0.3404116928577423, + "learning_rate": 8.268710550045085e-05, + "loss": 1.3933, + "step": 918 + }, + { + "epoch": 0.24895142741171694, + "grad_norm": 0.3547706604003906, + "learning_rate": 8.286744815148782e-05, + "loss": 1.3787, + "step": 920 + }, + { + "epoch": 0.2494926261669598, + "grad_norm": 0.32752275466918945, + "learning_rate": 8.30477908025248e-05, + "loss": 1.3905, + "step": 922 + }, + { + "epoch": 0.25003382492220266, + "grad_norm": 0.3413980007171631, + "learning_rate": 8.322813345356177e-05, + "loss": 1.385, + "step": 924 + }, + { + "epoch": 0.25057502367744555, + "grad_norm": 0.5574982762336731, + "learning_rate": 8.340847610459874e-05, + "loss": 1.3869, + "step": 926 + }, + { + "epoch": 0.2511162224326884, + "grad_norm": 0.41128844022750854, + "learning_rate": 8.358881875563572e-05, + "loss": 1.3583, + "step": 928 + }, + { + "epoch": 0.2516574211879313, + "grad_norm": 0.3476073145866394, + "learning_rate": 8.376916140667269e-05, + "loss": 1.3832, + "step": 930 + }, + { + "epoch": 0.2521986199431741, + "grad_norm": 0.34838998317718506, + "learning_rate": 8.394950405770965e-05, + "loss": 1.3748, + "step": 932 + }, + { + "epoch": 0.252739818698417, + "grad_norm": 0.3552824556827545, + "learning_rate": 8.412984670874662e-05, + "loss": 1.3936, + "step": 934 + }, + { + "epoch": 0.25328101745365983, + "grad_norm": 0.34918278455734253, + "learning_rate": 8.43101893597836e-05, + "loss": 1.3733, + "step": 936 + }, + { + "epoch": 0.2538222162089027, + "grad_norm": 0.431455135345459, + "learning_rate": 8.449053201082057e-05, + "loss": 1.3924, + "step": 938 + }, + { + "epoch": 0.2543634149641456, + "grad_norm": 0.37811046838760376, + "learning_rate": 8.467087466185754e-05, + "loss": 1.3861, + "step": 940 + }, + { + "epoch": 0.25490461371938844, + "grad_norm": 0.35659778118133545, + "learning_rate": 8.48512173128945e-05, + "loss": 1.3736, + "step": 942 + }, + { + "epoch": 0.25544581247463133, + "grad_norm": 0.4327319264411926, + "learning_rate": 8.503155996393147e-05, + "loss": 1.3883, + "step": 944 + }, + { + "epoch": 0.25598701122987416, + "grad_norm": 0.39134231209754944, + "learning_rate": 8.521190261496844e-05, + "loss": 1.3704, + "step": 946 + }, + { + "epoch": 0.25652820998511705, + "grad_norm": 0.39573270082473755, + "learning_rate": 8.53922452660054e-05, + "loss": 1.4047, + "step": 948 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.3299993872642517, + "learning_rate": 8.557258791704237e-05, + "loss": 1.3778, + "step": 950 + }, + { + "epoch": 0.2576106074956028, + "grad_norm": 0.3559456765651703, + "learning_rate": 8.575293056807936e-05, + "loss": 1.3794, + "step": 952 + }, + { + "epoch": 0.2581518062508456, + "grad_norm": 0.36347028613090515, + "learning_rate": 8.593327321911632e-05, + "loss": 1.3817, + "step": 954 + }, + { + "epoch": 0.2586930050060885, + "grad_norm": 0.39882585406303406, + "learning_rate": 8.611361587015329e-05, + "loss": 1.3565, + "step": 956 + }, + { + "epoch": 0.2592342037613313, + "grad_norm": 0.3932117223739624, + "learning_rate": 8.629395852119027e-05, + "loss": 1.396, + "step": 958 + }, + { + "epoch": 0.2597754025165742, + "grad_norm": 0.3526294231414795, + "learning_rate": 8.647430117222724e-05, + "loss": 1.3624, + "step": 960 + }, + { + "epoch": 0.26031660127181705, + "grad_norm": 0.3804738223552704, + "learning_rate": 8.66546438232642e-05, + "loss": 1.3616, + "step": 962 + }, + { + "epoch": 0.26085780002705994, + "grad_norm": 0.36557725071907043, + "learning_rate": 8.683498647430117e-05, + "loss": 1.3997, + "step": 964 + }, + { + "epoch": 0.2613989987823028, + "grad_norm": 0.3574380874633789, + "learning_rate": 8.701532912533815e-05, + "loss": 1.3901, + "step": 966 + }, + { + "epoch": 0.26194019753754566, + "grad_norm": 0.4025056064128876, + "learning_rate": 8.719567177637512e-05, + "loss": 1.3707, + "step": 968 + }, + { + "epoch": 0.26248139629278855, + "grad_norm": 0.3687063157558441, + "learning_rate": 8.737601442741209e-05, + "loss": 1.3679, + "step": 970 + }, + { + "epoch": 0.2630225950480314, + "grad_norm": 0.3697878420352936, + "learning_rate": 8.755635707844906e-05, + "loss": 1.3981, + "step": 972 + }, + { + "epoch": 0.26356379380327427, + "grad_norm": 0.34241798520088196, + "learning_rate": 8.773669972948602e-05, + "loss": 1.3728, + "step": 974 + }, + { + "epoch": 0.2641049925585171, + "grad_norm": 0.40002745389938354, + "learning_rate": 8.791704238052299e-05, + "loss": 1.3732, + "step": 976 + }, + { + "epoch": 0.26464619131376, + "grad_norm": 0.42943906784057617, + "learning_rate": 8.809738503155996e-05, + "loss": 1.3731, + "step": 978 + }, + { + "epoch": 0.2651873900690028, + "grad_norm": 0.37437063455581665, + "learning_rate": 8.827772768259693e-05, + "loss": 1.372, + "step": 980 + }, + { + "epoch": 0.2657285888242457, + "grad_norm": 0.3378891944885254, + "learning_rate": 8.845807033363391e-05, + "loss": 1.3777, + "step": 982 + }, + { + "epoch": 0.26626978757948855, + "grad_norm": 0.32884734869003296, + "learning_rate": 8.863841298467088e-05, + "loss": 1.3639, + "step": 984 + }, + { + "epoch": 0.26681098633473144, + "grad_norm": 0.3945903480052948, + "learning_rate": 8.881875563570786e-05, + "loss": 1.3722, + "step": 986 + }, + { + "epoch": 0.26735218508997427, + "grad_norm": 0.39569205045700073, + "learning_rate": 8.899909828674482e-05, + "loss": 1.376, + "step": 988 + }, + { + "epoch": 0.26789338384521716, + "grad_norm": 0.31659135222435, + "learning_rate": 8.917944093778179e-05, + "loss": 1.3807, + "step": 990 + }, + { + "epoch": 0.26843458260046, + "grad_norm": 0.44032666087150574, + "learning_rate": 8.935978358881876e-05, + "loss": 1.3986, + "step": 992 + }, + { + "epoch": 0.2689757813557029, + "grad_norm": 0.3445993661880493, + "learning_rate": 8.954012623985573e-05, + "loss": 1.3589, + "step": 994 + }, + { + "epoch": 0.26951698011094577, + "grad_norm": 0.3693557679653168, + "learning_rate": 8.97204688908927e-05, + "loss": 1.3593, + "step": 996 + }, + { + "epoch": 0.2700581788661886, + "grad_norm": 0.3965442478656769, + "learning_rate": 8.990081154192968e-05, + "loss": 1.3909, + "step": 998 + }, + { + "epoch": 0.2705993776214315, + "grad_norm": 0.4038390815258026, + "learning_rate": 9.008115419296664e-05, + "loss": 1.3629, + "step": 1000 + }, + { + "epoch": 0.2711405763766743, + "grad_norm": 0.36394256353378296, + "learning_rate": 9.026149684400361e-05, + "loss": 1.3812, + "step": 1002 + }, + { + "epoch": 0.2716817751319172, + "grad_norm": 0.4527181386947632, + "learning_rate": 9.044183949504058e-05, + "loss": 1.3692, + "step": 1004 + }, + { + "epoch": 0.27222297388716005, + "grad_norm": 0.37700143456459045, + "learning_rate": 9.062218214607755e-05, + "loss": 1.3652, + "step": 1006 + }, + { + "epoch": 0.27276417264240294, + "grad_norm": 0.45016244053840637, + "learning_rate": 9.080252479711451e-05, + "loss": 1.3657, + "step": 1008 + }, + { + "epoch": 0.27330537139764577, + "grad_norm": 0.42159709334373474, + "learning_rate": 9.09828674481515e-05, + "loss": 1.3702, + "step": 1010 + }, + { + "epoch": 0.27384657015288866, + "grad_norm": 0.3884572982788086, + "learning_rate": 9.116321009918846e-05, + "loss": 1.3535, + "step": 1012 + }, + { + "epoch": 0.2743877689081315, + "grad_norm": 0.37507420778274536, + "learning_rate": 9.134355275022544e-05, + "loss": 1.3659, + "step": 1014 + }, + { + "epoch": 0.2749289676633744, + "grad_norm": 0.35269656777381897, + "learning_rate": 9.152389540126241e-05, + "loss": 1.3623, + "step": 1016 + }, + { + "epoch": 0.2754701664186172, + "grad_norm": 0.3543412387371063, + "learning_rate": 9.170423805229938e-05, + "loss": 1.3695, + "step": 1018 + }, + { + "epoch": 0.2760113651738601, + "grad_norm": 0.3173674941062927, + "learning_rate": 9.188458070333635e-05, + "loss": 1.3572, + "step": 1020 + }, + { + "epoch": 0.276552563929103, + "grad_norm": 0.3729746341705322, + "learning_rate": 9.206492335437331e-05, + "loss": 1.3888, + "step": 1022 + }, + { + "epoch": 0.2770937626843458, + "grad_norm": 0.33210429549217224, + "learning_rate": 9.224526600541028e-05, + "loss": 1.3395, + "step": 1024 + }, + { + "epoch": 0.2776349614395887, + "grad_norm": 0.338366836309433, + "learning_rate": 9.242560865644725e-05, + "loss": 1.3498, + "step": 1026 + }, + { + "epoch": 0.27817616019483155, + "grad_norm": 0.3367864191532135, + "learning_rate": 9.260595130748423e-05, + "loss": 1.3548, + "step": 1028 + }, + { + "epoch": 0.27871735895007443, + "grad_norm": 0.40313002467155457, + "learning_rate": 9.27862939585212e-05, + "loss": 1.4059, + "step": 1030 + }, + { + "epoch": 0.27925855770531727, + "grad_norm": 0.3434394299983978, + "learning_rate": 9.296663660955816e-05, + "loss": 1.3522, + "step": 1032 + }, + { + "epoch": 0.27979975646056016, + "grad_norm": 0.35454580187797546, + "learning_rate": 9.314697926059513e-05, + "loss": 1.3838, + "step": 1034 + }, + { + "epoch": 0.280340955215803, + "grad_norm": 0.3280038833618164, + "learning_rate": 9.33273219116321e-05, + "loss": 1.3753, + "step": 1036 + }, + { + "epoch": 0.2808821539710459, + "grad_norm": 0.4306875169277191, + "learning_rate": 9.350766456266907e-05, + "loss": 1.3807, + "step": 1038 + }, + { + "epoch": 0.2814233527262887, + "grad_norm": 0.3500923812389374, + "learning_rate": 9.368800721370605e-05, + "loss": 1.36, + "step": 1040 + }, + { + "epoch": 0.2819645514815316, + "grad_norm": 0.3702130913734436, + "learning_rate": 9.386834986474301e-05, + "loss": 1.3919, + "step": 1042 + }, + { + "epoch": 0.28250575023677443, + "grad_norm": 0.3651416599750519, + "learning_rate": 9.404869251578e-05, + "loss": 1.3805, + "step": 1044 + }, + { + "epoch": 0.2830469489920173, + "grad_norm": 0.35927796363830566, + "learning_rate": 9.422903516681696e-05, + "loss": 1.3507, + "step": 1046 + }, + { + "epoch": 0.28358814774726016, + "grad_norm": 0.36750975251197815, + "learning_rate": 9.440937781785393e-05, + "loss": 1.3475, + "step": 1048 + }, + { + "epoch": 0.28412934650250304, + "grad_norm": 0.31946998834609985, + "learning_rate": 9.45897204688909e-05, + "loss": 1.3708, + "step": 1050 + }, + { + "epoch": 0.28467054525774593, + "grad_norm": 0.3447932302951813, + "learning_rate": 9.477006311992787e-05, + "loss": 1.3519, + "step": 1052 + }, + { + "epoch": 0.28521174401298877, + "grad_norm": 0.31405511498451233, + "learning_rate": 9.495040577096483e-05, + "loss": 1.3806, + "step": 1054 + }, + { + "epoch": 0.28575294276823165, + "grad_norm": 0.3198442757129669, + "learning_rate": 9.51307484220018e-05, + "loss": 1.368, + "step": 1056 + }, + { + "epoch": 0.2862941415234745, + "grad_norm": 0.33328956365585327, + "learning_rate": 9.531109107303878e-05, + "loss": 1.3429, + "step": 1058 + }, + { + "epoch": 0.2868353402787174, + "grad_norm": 0.29432907700538635, + "learning_rate": 9.549143372407575e-05, + "loss": 1.3698, + "step": 1060 + }, + { + "epoch": 0.2873765390339602, + "grad_norm": 0.3468937575817108, + "learning_rate": 9.567177637511272e-05, + "loss": 1.356, + "step": 1062 + }, + { + "epoch": 0.2879177377892031, + "grad_norm": 0.3619658350944519, + "learning_rate": 9.585211902614968e-05, + "loss": 1.3596, + "step": 1064 + }, + { + "epoch": 0.28845893654444593, + "grad_norm": 0.3384917378425598, + "learning_rate": 9.603246167718665e-05, + "loss": 1.3693, + "step": 1066 + }, + { + "epoch": 0.2890001352996888, + "grad_norm": 0.3724029064178467, + "learning_rate": 9.621280432822363e-05, + "loss": 1.3639, + "step": 1068 + }, + { + "epoch": 0.28954133405493165, + "grad_norm": 0.7029115557670593, + "learning_rate": 9.63931469792606e-05, + "loss": 1.3557, + "step": 1070 + }, + { + "epoch": 0.29008253281017454, + "grad_norm": 0.5529230833053589, + "learning_rate": 9.657348963029757e-05, + "loss": 1.3657, + "step": 1072 + }, + { + "epoch": 0.2906237315654174, + "grad_norm": 0.4254820644855499, + "learning_rate": 9.675383228133455e-05, + "loss": 1.3633, + "step": 1074 + }, + { + "epoch": 0.29116493032066026, + "grad_norm": 0.4930615723133087, + "learning_rate": 9.693417493237152e-05, + "loss": 1.3714, + "step": 1076 + }, + { + "epoch": 0.2917061290759031, + "grad_norm": 0.4455857574939728, + "learning_rate": 9.711451758340848e-05, + "loss": 1.3615, + "step": 1078 + }, + { + "epoch": 0.292247327831146, + "grad_norm": 0.4171796441078186, + "learning_rate": 9.729486023444545e-05, + "loss": 1.3673, + "step": 1080 + }, + { + "epoch": 0.2927885265863889, + "grad_norm": 0.37810683250427246, + "learning_rate": 9.747520288548242e-05, + "loss": 1.3683, + "step": 1082 + }, + { + "epoch": 0.2933297253416317, + "grad_norm": 0.4057900905609131, + "learning_rate": 9.765554553651939e-05, + "loss": 1.3674, + "step": 1084 + }, + { + "epoch": 0.2938709240968746, + "grad_norm": 0.40583640336990356, + "learning_rate": 9.783588818755635e-05, + "loss": 1.3566, + "step": 1086 + }, + { + "epoch": 0.29441212285211743, + "grad_norm": 0.39454150199890137, + "learning_rate": 9.801623083859334e-05, + "loss": 1.3611, + "step": 1088 + }, + { + "epoch": 0.2949533216073603, + "grad_norm": 0.42229679226875305, + "learning_rate": 9.81965734896303e-05, + "loss": 1.3726, + "step": 1090 + }, + { + "epoch": 0.29549452036260315, + "grad_norm": 0.3274170160293579, + "learning_rate": 9.837691614066727e-05, + "loss": 1.3375, + "step": 1092 + }, + { + "epoch": 0.29603571911784604, + "grad_norm": 0.40999388694763184, + "learning_rate": 9.855725879170424e-05, + "loss": 1.3548, + "step": 1094 + }, + { + "epoch": 0.2965769178730889, + "grad_norm": 0.33515796065330505, + "learning_rate": 9.873760144274122e-05, + "loss": 1.3903, + "step": 1096 + }, + { + "epoch": 0.29711811662833176, + "grad_norm": 0.3834095597267151, + "learning_rate": 9.891794409377819e-05, + "loss": 1.3653, + "step": 1098 + }, + { + "epoch": 0.2976593153835746, + "grad_norm": 0.34850651025772095, + "learning_rate": 9.909828674481515e-05, + "loss": 1.3573, + "step": 1100 + }, + { + "epoch": 0.2982005141388175, + "grad_norm": 0.3811749815940857, + "learning_rate": 9.927862939585212e-05, + "loss": 1.3843, + "step": 1102 + }, + { + "epoch": 0.2987417128940603, + "grad_norm": 0.3308597803115845, + "learning_rate": 9.94589720468891e-05, + "loss": 1.3492, + "step": 1104 + }, + { + "epoch": 0.2992829116493032, + "grad_norm": 0.31952470541000366, + "learning_rate": 9.963931469792607e-05, + "loss": 1.3586, + "step": 1106 + }, + { + "epoch": 0.2998241104045461, + "grad_norm": 0.3433592915534973, + "learning_rate": 9.981965734896304e-05, + "loss": 1.3524, + "step": 1108 + }, + { + "epoch": 0.30036530915978893, + "grad_norm": 0.4547680914402008, + "learning_rate": 0.0001, + "loss": 1.3562, + "step": 1110 + }, + { + "epoch": 0.3009065079150318, + "grad_norm": 0.4963592290878296, + "learning_rate": 9.999999008881264e-05, + "loss": 1.3452, + "step": 1112 + }, + { + "epoch": 0.30144770667027465, + "grad_norm": 1.1111193895339966, + "learning_rate": 9.999996035525452e-05, + "loss": 1.3732, + "step": 1114 + }, + { + "epoch": 0.30198890542551754, + "grad_norm": 0.6860964298248291, + "learning_rate": 9.999991079933739e-05, + "loss": 1.3689, + "step": 1116 + }, + { + "epoch": 0.3025301041807604, + "grad_norm": 0.7344204783439636, + "learning_rate": 9.999984142108093e-05, + "loss": 1.3575, + "step": 1118 + }, + { + "epoch": 0.30307130293600326, + "grad_norm": 0.6534725427627563, + "learning_rate": 9.999975222051263e-05, + "loss": 1.376, + "step": 1120 + }, + { + "epoch": 0.3036125016912461, + "grad_norm": 0.5108229517936707, + "learning_rate": 9.999964319766785e-05, + "loss": 1.3741, + "step": 1122 + }, + { + "epoch": 0.304153700446489, + "grad_norm": 0.4888688325881958, + "learning_rate": 9.99995143525898e-05, + "loss": 1.3555, + "step": 1124 + }, + { + "epoch": 0.3046948992017318, + "grad_norm": 0.42808806896209717, + "learning_rate": 9.999936568532962e-05, + "loss": 1.3548, + "step": 1126 + }, + { + "epoch": 0.3052360979569747, + "grad_norm": 0.3921727240085602, + "learning_rate": 9.999919719594617e-05, + "loss": 1.3559, + "step": 1128 + }, + { + "epoch": 0.30577729671221754, + "grad_norm": 0.3473529517650604, + "learning_rate": 9.999900888450628e-05, + "loss": 1.3603, + "step": 1130 + }, + { + "epoch": 0.3063184954674604, + "grad_norm": 0.3337381184101105, + "learning_rate": 9.999880075108464e-05, + "loss": 1.3642, + "step": 1132 + }, + { + "epoch": 0.30685969422270326, + "grad_norm": 0.3363231122493744, + "learning_rate": 9.99985727957637e-05, + "loss": 1.3606, + "step": 1134 + }, + { + "epoch": 0.30740089297794615, + "grad_norm": 0.32726484537124634, + "learning_rate": 9.999832501863386e-05, + "loss": 1.3493, + "step": 1136 + }, + { + "epoch": 0.30794209173318904, + "grad_norm": 0.3190646767616272, + "learning_rate": 9.999805741979338e-05, + "loss": 1.3518, + "step": 1138 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.31244540214538574, + "learning_rate": 9.999776999934831e-05, + "loss": 1.3495, + "step": 1140 + }, + { + "epoch": 0.30902448924367476, + "grad_norm": 0.3286384344100952, + "learning_rate": 9.999746275741261e-05, + "loss": 1.3517, + "step": 1142 + }, + { + "epoch": 0.3095656879989176, + "grad_norm": 0.3630046546459198, + "learning_rate": 9.99971356941081e-05, + "loss": 1.3641, + "step": 1144 + }, + { + "epoch": 0.3101068867541605, + "grad_norm": 0.30771151185035706, + "learning_rate": 9.999678880956443e-05, + "loss": 1.3571, + "step": 1146 + }, + { + "epoch": 0.3106480855094033, + "grad_norm": 0.30026301741600037, + "learning_rate": 9.99964221039191e-05, + "loss": 1.3541, + "step": 1148 + }, + { + "epoch": 0.3111892842646462, + "grad_norm": 0.3128298223018646, + "learning_rate": 9.999603557731754e-05, + "loss": 1.3556, + "step": 1150 + }, + { + "epoch": 0.31173048301988904, + "grad_norm": 0.30185452103614807, + "learning_rate": 9.999562922991293e-05, + "loss": 1.3484, + "step": 1152 + }, + { + "epoch": 0.3122716817751319, + "grad_norm": 0.3274635076522827, + "learning_rate": 9.99952030618664e-05, + "loss": 1.3729, + "step": 1154 + }, + { + "epoch": 0.31281288053037476, + "grad_norm": 0.30549076199531555, + "learning_rate": 9.999475707334692e-05, + "loss": 1.3642, + "step": 1156 + }, + { + "epoch": 0.31335407928561765, + "grad_norm": 0.3147718906402588, + "learning_rate": 9.999429126453126e-05, + "loss": 1.3493, + "step": 1158 + }, + { + "epoch": 0.3138952780408605, + "grad_norm": 0.6205586791038513, + "learning_rate": 9.99938056356041e-05, + "loss": 1.3623, + "step": 1160 + }, + { + "epoch": 0.31443647679610337, + "grad_norm": 0.3471706211566925, + "learning_rate": 9.999330018675798e-05, + "loss": 1.3533, + "step": 1162 + }, + { + "epoch": 0.31497767555134626, + "grad_norm": 1.3515815734863281, + "learning_rate": 9.999277491819328e-05, + "loss": 1.3565, + "step": 1164 + }, + { + "epoch": 0.3155188743065891, + "grad_norm": 733.9155883789062, + "learning_rate": 9.999222983011824e-05, + "loss": 5.2143, + "step": 1166 + }, + { + "epoch": 0.316060073061832, + "grad_norm": 2.9439170360565186, + "learning_rate": 9.999166492274894e-05, + "loss": 1.4438, + "step": 1168 + }, + { + "epoch": 0.3166012718170748, + "grad_norm": 1.5871142148971558, + "learning_rate": 9.999108019630938e-05, + "loss": 1.4426, + "step": 1170 + }, + { + "epoch": 0.3171424705723177, + "grad_norm": 711.9217529296875, + "learning_rate": 9.999047565103132e-05, + "loss": 3.6935, + "step": 1172 + }, + { + "epoch": 0.31768366932756054, + "grad_norm": 100.76264953613281, + "learning_rate": 9.998985128715448e-05, + "loss": 4.2396, + "step": 1174 + }, + { + "epoch": 0.3182248680828034, + "grad_norm": 108.88189697265625, + "learning_rate": 9.998920710492634e-05, + "loss": 4.9929, + "step": 1176 + }, + { + "epoch": 0.31876606683804626, + "grad_norm": 72.18595123291016, + "learning_rate": 9.998854310460233e-05, + "loss": 6.0375, + "step": 1178 + }, + { + "epoch": 0.31930726559328915, + "grad_norm": 59.48538589477539, + "learning_rate": 9.998785928644567e-05, + "loss": 5.8932, + "step": 1180 + }, + { + "epoch": 0.319848464348532, + "grad_norm": 36.32703399658203, + "learning_rate": 9.998715565072744e-05, + "loss": 6.5369, + "step": 1182 + }, + { + "epoch": 0.32038966310377487, + "grad_norm": 18.565351486206055, + "learning_rate": 9.998643219772664e-05, + "loss": 6.1671, + "step": 1184 + }, + { + "epoch": 0.3209308618590177, + "grad_norm": 45.84898376464844, + "learning_rate": 9.998568892773003e-05, + "loss": 5.9379, + "step": 1186 + }, + { + "epoch": 0.3214720606142606, + "grad_norm": 66.2480239868164, + "learning_rate": 9.998492584103232e-05, + "loss": 5.7071, + "step": 1188 + }, + { + "epoch": 0.3220132593695034, + "grad_norm": 41.693092346191406, + "learning_rate": 9.998414293793599e-05, + "loss": 6.3198, + "step": 1190 + }, + { + "epoch": 0.3225544581247463, + "grad_norm": 19.323413848876953, + "learning_rate": 9.998334021875147e-05, + "loss": 5.377, + "step": 1192 + }, + { + "epoch": 0.3230956568799892, + "grad_norm": 15.907301902770996, + "learning_rate": 9.998251768379696e-05, + "loss": 4.5293, + "step": 1194 + }, + { + "epoch": 0.32363685563523203, + "grad_norm": 80.1374740600586, + "learning_rate": 9.998167533339857e-05, + "loss": 4.3471, + "step": 1196 + }, + { + "epoch": 0.3241780543904749, + "grad_norm": 23.298336029052734, + "learning_rate": 9.998081316789024e-05, + "loss": 3.7461, + "step": 1198 + }, + { + "epoch": 0.32471925314571776, + "grad_norm": 82.48027801513672, + "learning_rate": 9.997993118761378e-05, + "loss": 4.1647, + "step": 1200 + }, + { + "epoch": 0.32526045190096065, + "grad_norm": 27.916913986206055, + "learning_rate": 9.997902939291883e-05, + "loss": 3.9092, + "step": 1202 + }, + { + "epoch": 0.3258016506562035, + "grad_norm": 15.70148754119873, + "learning_rate": 9.997810778416293e-05, + "loss": 3.1628, + "step": 1204 + }, + { + "epoch": 0.32634284941144637, + "grad_norm": 18.33330535888672, + "learning_rate": 9.997716636171142e-05, + "loss": 2.8777, + "step": 1206 + }, + { + "epoch": 0.3268840481666892, + "grad_norm": 10.6620512008667, + "learning_rate": 9.997620512593755e-05, + "loss": 2.3009, + "step": 1208 + }, + { + "epoch": 0.3274252469219321, + "grad_norm": 32.01799011230469, + "learning_rate": 9.99752240772224e-05, + "loss": 1.9617, + "step": 1210 + }, + { + "epoch": 0.3279664456771749, + "grad_norm": 5.677090644836426, + "learning_rate": 9.997422321595488e-05, + "loss": 1.8401, + "step": 1212 + }, + { + "epoch": 0.3285076444324178, + "grad_norm": 8.914667129516602, + "learning_rate": 9.997320254253179e-05, + "loss": 1.6707, + "step": 1214 + }, + { + "epoch": 0.32904884318766064, + "grad_norm": 2.3725008964538574, + "learning_rate": 9.997216205735779e-05, + "loss": 1.5757, + "step": 1216 + }, + { + "epoch": 0.32959004194290353, + "grad_norm": 2.418389320373535, + "learning_rate": 9.997110176084538e-05, + "loss": 1.5154, + "step": 1218 + }, + { + "epoch": 0.33013124069814637, + "grad_norm": 2.802185297012329, + "learning_rate": 9.997002165341487e-05, + "loss": 1.4883, + "step": 1220 + }, + { + "epoch": 0.33067243945338926, + "grad_norm": 2.1769211292266846, + "learning_rate": 9.996892173549452e-05, + "loss": 1.445, + "step": 1222 + }, + { + "epoch": 0.33121363820863214, + "grad_norm": 1.799670934677124, + "learning_rate": 9.996780200752035e-05, + "loss": 1.4276, + "step": 1224 + }, + { + "epoch": 0.331754836963875, + "grad_norm": 3.2545313835144043, + "learning_rate": 9.996666246993627e-05, + "loss": 1.4394, + "step": 1226 + }, + { + "epoch": 0.33229603571911787, + "grad_norm": 1.1922351121902466, + "learning_rate": 9.996550312319408e-05, + "loss": 1.4359, + "step": 1228 + }, + { + "epoch": 0.3328372344743607, + "grad_norm": 2.6813228130340576, + "learning_rate": 9.996432396775339e-05, + "loss": 1.4229, + "step": 1230 + }, + { + "epoch": 0.3333784332296036, + "grad_norm": 1.6968843936920166, + "learning_rate": 9.996312500408165e-05, + "loss": 1.4281, + "step": 1232 + }, + { + "epoch": 0.3339196319848464, + "grad_norm": 1.3502254486083984, + "learning_rate": 9.996190623265421e-05, + "loss": 1.408, + "step": 1234 + }, + { + "epoch": 0.3344608307400893, + "grad_norm": 1.2809518575668335, + "learning_rate": 9.996066765395424e-05, + "loss": 1.4176, + "step": 1236 + }, + { + "epoch": 0.33500202949533214, + "grad_norm": 1.0455057621002197, + "learning_rate": 9.995940926847279e-05, + "loss": 1.4056, + "step": 1238 + }, + { + "epoch": 0.33554322825057503, + "grad_norm": 1.3292824029922485, + "learning_rate": 9.99581310767087e-05, + "loss": 1.4033, + "step": 1240 + }, + { + "epoch": 0.33608442700581787, + "grad_norm": 1.5960067510604858, + "learning_rate": 9.995683307916875e-05, + "loss": 1.379, + "step": 1242 + }, + { + "epoch": 0.33662562576106075, + "grad_norm": 1.0471105575561523, + "learning_rate": 9.99555152763675e-05, + "loss": 1.3823, + "step": 1244 + }, + { + "epoch": 0.3371668245163036, + "grad_norm": 2.339273452758789, + "learning_rate": 9.99541776688274e-05, + "loss": 1.3698, + "step": 1246 + }, + { + "epoch": 0.3377080232715465, + "grad_norm": 0.81674724817276, + "learning_rate": 9.995282025707875e-05, + "loss": 1.4154, + "step": 1248 + }, + { + "epoch": 0.33824922202678936, + "grad_norm": 0.6240290999412537, + "learning_rate": 9.995144304165968e-05, + "loss": 1.4035, + "step": 1250 + }, + { + "epoch": 0.3387904207820322, + "grad_norm": 2.281787872314453, + "learning_rate": 9.995004602311619e-05, + "loss": 1.3906, + "step": 1252 + }, + { + "epoch": 0.3393316195372751, + "grad_norm": 0.6818395853042603, + "learning_rate": 9.99486292020021e-05, + "loss": 1.3853, + "step": 1254 + }, + { + "epoch": 0.3398728182925179, + "grad_norm": 6.299881935119629, + "learning_rate": 9.994719257887915e-05, + "loss": 1.3856, + "step": 1256 + }, + { + "epoch": 0.3404140170477608, + "grad_norm": 0.8173750638961792, + "learning_rate": 9.994573615431686e-05, + "loss": 1.3871, + "step": 1258 + }, + { + "epoch": 0.34095521580300364, + "grad_norm": 2.155395746231079, + "learning_rate": 9.994425992889262e-05, + "loss": 1.3382, + "step": 1260 + }, + { + "epoch": 0.34149641455824653, + "grad_norm": 0.5846114754676819, + "learning_rate": 9.99427639031917e-05, + "loss": 1.3978, + "step": 1262 + }, + { + "epoch": 0.34203761331348936, + "grad_norm": 0.6624069213867188, + "learning_rate": 9.994124807780717e-05, + "loss": 1.3792, + "step": 1264 + }, + { + "epoch": 0.34257881206873225, + "grad_norm": 0.5708588361740112, + "learning_rate": 9.993971245333998e-05, + "loss": 1.3677, + "step": 1266 + }, + { + "epoch": 0.3431200108239751, + "grad_norm": 0.5245474576950073, + "learning_rate": 9.993815703039894e-05, + "loss": 1.3672, + "step": 1268 + }, + { + "epoch": 0.343661209579218, + "grad_norm": 0.501871645450592, + "learning_rate": 9.993658180960069e-05, + "loss": 1.3674, + "step": 1270 + }, + { + "epoch": 0.3442024083344608, + "grad_norm": 0.5990382432937622, + "learning_rate": 9.993498679156969e-05, + "loss": 1.3804, + "step": 1272 + }, + { + "epoch": 0.3447436070897037, + "grad_norm": 0.42392146587371826, + "learning_rate": 9.993337197693833e-05, + "loss": 1.3628, + "step": 1274 + }, + { + "epoch": 0.34528480584494653, + "grad_norm": 0.46936917304992676, + "learning_rate": 9.993173736634676e-05, + "loss": 1.3696, + "step": 1276 + }, + { + "epoch": 0.3458260046001894, + "grad_norm": 0.52222740650177, + "learning_rate": 9.993008296044304e-05, + "loss": 1.3697, + "step": 1278 + }, + { + "epoch": 0.3463672033554323, + "grad_norm": 0.3582518398761749, + "learning_rate": 9.992840875988305e-05, + "loss": 1.3825, + "step": 1280 + }, + { + "epoch": 0.34690840211067514, + "grad_norm": 0.3533988296985626, + "learning_rate": 9.99267147653305e-05, + "loss": 1.361, + "step": 1282 + }, + { + "epoch": 0.34744960086591803, + "grad_norm": 0.35905274748802185, + "learning_rate": 9.992500097745702e-05, + "loss": 1.3721, + "step": 1284 + }, + { + "epoch": 0.34799079962116086, + "grad_norm": 0.3057416081428528, + "learning_rate": 9.9923267396942e-05, + "loss": 1.369, + "step": 1286 + }, + { + "epoch": 0.34853199837640375, + "grad_norm": 0.3299311101436615, + "learning_rate": 9.992151402447272e-05, + "loss": 1.358, + "step": 1288 + }, + { + "epoch": 0.3490731971316466, + "grad_norm": 0.3086453080177307, + "learning_rate": 9.99197408607443e-05, + "loss": 1.3534, + "step": 1290 + }, + { + "epoch": 0.3496143958868895, + "grad_norm": 0.3111782968044281, + "learning_rate": 9.991794790645969e-05, + "loss": 1.3605, + "step": 1292 + }, + { + "epoch": 0.3501555946421323, + "grad_norm": 0.3231568932533264, + "learning_rate": 9.991613516232974e-05, + "loss": 1.3543, + "step": 1294 + }, + { + "epoch": 0.3506967933973752, + "grad_norm": 0.3288814425468445, + "learning_rate": 9.991430262907309e-05, + "loss": 1.3521, + "step": 1296 + }, + { + "epoch": 0.35123799215261803, + "grad_norm": 0.3239436745643616, + "learning_rate": 9.991245030741622e-05, + "loss": 1.3335, + "step": 1298 + }, + { + "epoch": 0.3517791909078609, + "grad_norm": 0.3560773730278015, + "learning_rate": 9.991057819809353e-05, + "loss": 1.3487, + "step": 1300 + }, + { + "epoch": 0.35232038966310375, + "grad_norm": 0.4387347400188446, + "learning_rate": 9.990868630184716e-05, + "loss": 1.3548, + "step": 1302 + }, + { + "epoch": 0.35286158841834664, + "grad_norm": 0.32067278027534485, + "learning_rate": 9.990677461942717e-05, + "loss": 1.3471, + "step": 1304 + }, + { + "epoch": 0.3534027871735895, + "grad_norm": 0.4399580955505371, + "learning_rate": 9.990484315159146e-05, + "loss": 1.3588, + "step": 1306 + }, + { + "epoch": 0.35394398592883236, + "grad_norm": 0.9175602793693542, + "learning_rate": 9.990289189910571e-05, + "loss": 1.3432, + "step": 1308 + }, + { + "epoch": 0.35448518468407525, + "grad_norm": 0.45273318886756897, + "learning_rate": 9.990092086274352e-05, + "loss": 1.3434, + "step": 1310 + }, + { + "epoch": 0.3550263834393181, + "grad_norm": 0.3346487879753113, + "learning_rate": 9.989893004328632e-05, + "loss": 1.3339, + "step": 1312 + }, + { + "epoch": 0.35556758219456097, + "grad_norm": 0.4779951870441437, + "learning_rate": 9.989691944152333e-05, + "loss": 1.3561, + "step": 1314 + }, + { + "epoch": 0.3561087809498038, + "grad_norm": 0.6359366774559021, + "learning_rate": 9.989488905825166e-05, + "loss": 1.3499, + "step": 1316 + }, + { + "epoch": 0.3566499797050467, + "grad_norm": 0.5867050290107727, + "learning_rate": 9.989283889427625e-05, + "loss": 1.3791, + "step": 1318 + }, + { + "epoch": 0.3571911784602895, + "grad_norm": 1.869691014289856, + "learning_rate": 9.989076895040989e-05, + "loss": 1.3663, + "step": 1320 + }, + { + "epoch": 0.3577323772155324, + "grad_norm": 2.7147843837738037, + "learning_rate": 9.98886792274732e-05, + "loss": 1.358, + "step": 1322 + }, + { + "epoch": 0.35827357597077525, + "grad_norm": 0.8717885613441467, + "learning_rate": 9.988656972629465e-05, + "loss": 1.34, + "step": 1324 + }, + { + "epoch": 0.35881477472601814, + "grad_norm": 0.7126337885856628, + "learning_rate": 9.988444044771054e-05, + "loss": 1.3281, + "step": 1326 + }, + { + "epoch": 0.35935597348126097, + "grad_norm": 0.7409217357635498, + "learning_rate": 9.988229139256502e-05, + "loss": 1.3571, + "step": 1328 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.5892549157142639, + "learning_rate": 9.988012256171006e-05, + "loss": 1.3269, + "step": 1330 + }, + { + "epoch": 0.3604383709917467, + "grad_norm": 0.4858717620372772, + "learning_rate": 9.98779339560055e-05, + "loss": 1.3506, + "step": 1332 + }, + { + "epoch": 0.3609795697469896, + "grad_norm": 0.37409740686416626, + "learning_rate": 9.987572557631903e-05, + "loss": 1.3339, + "step": 1334 + }, + { + "epoch": 0.36152076850223247, + "grad_norm": 0.38315168023109436, + "learning_rate": 9.987349742352611e-05, + "loss": 1.3404, + "step": 1336 + }, + { + "epoch": 0.3620619672574753, + "grad_norm": 0.32702726125717163, + "learning_rate": 9.987124949851014e-05, + "loss": 1.3595, + "step": 1338 + }, + { + "epoch": 0.3626031660127182, + "grad_norm": 0.3133656680583954, + "learning_rate": 9.986898180216226e-05, + "loss": 1.3428, + "step": 1340 + }, + { + "epoch": 0.363144364767961, + "grad_norm": 0.2916230857372284, + "learning_rate": 9.986669433538152e-05, + "loss": 1.3381, + "step": 1342 + }, + { + "epoch": 0.3636855635232039, + "grad_norm": 0.28036215901374817, + "learning_rate": 9.986438709907476e-05, + "loss": 1.3447, + "step": 1344 + }, + { + "epoch": 0.36422676227844675, + "grad_norm": 0.30352699756622314, + "learning_rate": 9.98620600941567e-05, + "loss": 1.3427, + "step": 1346 + }, + { + "epoch": 0.36476796103368964, + "grad_norm": 0.3100769519805908, + "learning_rate": 9.985971332154984e-05, + "loss": 1.3603, + "step": 1348 + }, + { + "epoch": 0.36530915978893247, + "grad_norm": 0.2933647930622101, + "learning_rate": 9.98573467821846e-05, + "loss": 1.3646, + "step": 1350 + }, + { + "epoch": 0.36585035854417536, + "grad_norm": 0.2938663959503174, + "learning_rate": 9.985496047699916e-05, + "loss": 1.3763, + "step": 1352 + }, + { + "epoch": 0.3663915572994182, + "grad_norm": 0.2916519343852997, + "learning_rate": 9.985255440693955e-05, + "loss": 1.3431, + "step": 1354 + }, + { + "epoch": 0.3669327560546611, + "grad_norm": 0.2954147756099701, + "learning_rate": 9.985012857295968e-05, + "loss": 1.338, + "step": 1356 + }, + { + "epoch": 0.3674739548099039, + "grad_norm": 0.2839341163635254, + "learning_rate": 9.984768297602125e-05, + "loss": 1.3653, + "step": 1358 + }, + { + "epoch": 0.3680151535651468, + "grad_norm": 0.2878473699092865, + "learning_rate": 9.984521761709382e-05, + "loss": 1.3302, + "step": 1360 + }, + { + "epoch": 0.3685563523203897, + "grad_norm": 0.2859325408935547, + "learning_rate": 9.984273249715478e-05, + "loss": 1.3273, + "step": 1362 + }, + { + "epoch": 0.3690975510756325, + "grad_norm": 0.28399959206581116, + "learning_rate": 9.984022761718933e-05, + "loss": 1.3516, + "step": 1364 + }, + { + "epoch": 0.3696387498308754, + "grad_norm": 0.29740169644355774, + "learning_rate": 9.983770297819052e-05, + "loss": 1.3389, + "step": 1366 + }, + { + "epoch": 0.37017994858611825, + "grad_norm": 0.3143361806869507, + "learning_rate": 9.983515858115928e-05, + "loss": 1.3557, + "step": 1368 + }, + { + "epoch": 0.37072114734136113, + "grad_norm": 0.30783936381340027, + "learning_rate": 9.983259442710429e-05, + "loss": 1.3498, + "step": 1370 + }, + { + "epoch": 0.37126234609660397, + "grad_norm": 0.297091543674469, + "learning_rate": 9.983001051704211e-05, + "loss": 1.3308, + "step": 1372 + }, + { + "epoch": 0.37180354485184686, + "grad_norm": 0.3118893504142761, + "learning_rate": 9.982740685199712e-05, + "loss": 1.3372, + "step": 1374 + }, + { + "epoch": 0.3723447436070897, + "grad_norm": 0.2826865017414093, + "learning_rate": 9.982478343300155e-05, + "loss": 1.3488, + "step": 1376 + }, + { + "epoch": 0.3728859423623326, + "grad_norm": 0.2829175889492035, + "learning_rate": 9.982214026109544e-05, + "loss": 1.3693, + "step": 1378 + }, + { + "epoch": 0.3734271411175754, + "grad_norm": 0.3026389479637146, + "learning_rate": 9.981947733732668e-05, + "loss": 1.3276, + "step": 1380 + }, + { + "epoch": 0.3739683398728183, + "grad_norm": 0.30112889409065247, + "learning_rate": 9.981679466275096e-05, + "loss": 1.3441, + "step": 1382 + }, + { + "epoch": 0.37450953862806113, + "grad_norm": 0.27241262793540955, + "learning_rate": 9.981409223843183e-05, + "loss": 1.3373, + "step": 1384 + }, + { + "epoch": 0.375050737383304, + "grad_norm": 0.2804114520549774, + "learning_rate": 9.981137006544066e-05, + "loss": 1.344, + "step": 1386 + }, + { + "epoch": 0.37559193613854686, + "grad_norm": 0.27698764204978943, + "learning_rate": 9.980862814485665e-05, + "loss": 1.3543, + "step": 1388 + }, + { + "epoch": 0.37613313489378974, + "grad_norm": 0.29283177852630615, + "learning_rate": 9.980586647776681e-05, + "loss": 1.3332, + "step": 1390 + }, + { + "epoch": 0.37667433364903263, + "grad_norm": 0.2896028459072113, + "learning_rate": 9.980308506526604e-05, + "loss": 1.3392, + "step": 1392 + }, + { + "epoch": 0.37721553240427547, + "grad_norm": 0.27882838249206543, + "learning_rate": 9.980028390845697e-05, + "loss": 1.336, + "step": 1394 + }, + { + "epoch": 0.37775673115951836, + "grad_norm": 0.2886262834072113, + "learning_rate": 9.979746300845015e-05, + "loss": 1.3331, + "step": 1396 + }, + { + "epoch": 0.3782979299147612, + "grad_norm": 0.3085189163684845, + "learning_rate": 9.97946223663639e-05, + "loss": 1.3296, + "step": 1398 + }, + { + "epoch": 0.3788391286700041, + "grad_norm": 0.3342386484146118, + "learning_rate": 9.97917619833244e-05, + "loss": 1.351, + "step": 1400 + }, + { + "epoch": 0.3793803274252469, + "grad_norm": 0.3263756036758423, + "learning_rate": 9.978888186046562e-05, + "loss": 1.3526, + "step": 1402 + }, + { + "epoch": 0.3799215261804898, + "grad_norm": 0.292346715927124, + "learning_rate": 9.97859819989294e-05, + "loss": 1.3498, + "step": 1404 + }, + { + "epoch": 0.38046272493573263, + "grad_norm": 0.29072263836860657, + "learning_rate": 9.978306239986536e-05, + "loss": 1.3423, + "step": 1406 + }, + { + "epoch": 0.3810039236909755, + "grad_norm": 0.3350834548473358, + "learning_rate": 9.978012306443101e-05, + "loss": 1.3559, + "step": 1408 + }, + { + "epoch": 0.38154512244621835, + "grad_norm": 0.28721559047698975, + "learning_rate": 9.977716399379157e-05, + "loss": 1.3294, + "step": 1410 + }, + { + "epoch": 0.38208632120146124, + "grad_norm": 0.3062276244163513, + "learning_rate": 9.977418518912023e-05, + "loss": 1.3457, + "step": 1412 + }, + { + "epoch": 0.3826275199567041, + "grad_norm": 0.30255332589149475, + "learning_rate": 9.977118665159791e-05, + "loss": 1.3371, + "step": 1414 + }, + { + "epoch": 0.38316871871194697, + "grad_norm": 0.2800199091434479, + "learning_rate": 9.976816838241334e-05, + "loss": 1.3439, + "step": 1416 + }, + { + "epoch": 0.3837099174671898, + "grad_norm": 0.2754746675491333, + "learning_rate": 9.976513038276312e-05, + "loss": 1.3303, + "step": 1418 + }, + { + "epoch": 0.3842511162224327, + "grad_norm": 0.29933616518974304, + "learning_rate": 9.976207265385168e-05, + "loss": 1.3365, + "step": 1420 + }, + { + "epoch": 0.3847923149776756, + "grad_norm": 0.3023386001586914, + "learning_rate": 9.975899519689122e-05, + "loss": 1.3164, + "step": 1422 + }, + { + "epoch": 0.3853335137329184, + "grad_norm": 0.2901383936405182, + "learning_rate": 9.975589801310181e-05, + "loss": 1.3209, + "step": 1424 + }, + { + "epoch": 0.3858747124881613, + "grad_norm": 0.28566035628318787, + "learning_rate": 9.975278110371131e-05, + "loss": 1.3301, + "step": 1426 + }, + { + "epoch": 0.38641591124340413, + "grad_norm": 0.3010505735874176, + "learning_rate": 9.974964446995543e-05, + "loss": 1.319, + "step": 1428 + }, + { + "epoch": 0.386957109998647, + "grad_norm": 0.2977135479450226, + "learning_rate": 9.974648811307766e-05, + "loss": 1.3311, + "step": 1430 + }, + { + "epoch": 0.38749830875388985, + "grad_norm": 0.28914034366607666, + "learning_rate": 9.974331203432932e-05, + "loss": 1.343, + "step": 1432 + }, + { + "epoch": 0.38803950750913274, + "grad_norm": 0.2842980623245239, + "learning_rate": 9.974011623496958e-05, + "loss": 1.3162, + "step": 1434 + }, + { + "epoch": 0.3885807062643756, + "grad_norm": 0.3048929274082184, + "learning_rate": 9.97369007162654e-05, + "loss": 1.3166, + "step": 1436 + }, + { + "epoch": 0.38912190501961846, + "grad_norm": 0.3024531304836273, + "learning_rate": 9.973366547949157e-05, + "loss": 1.3156, + "step": 1438 + }, + { + "epoch": 0.3896631037748613, + "grad_norm": 0.2911103367805481, + "learning_rate": 9.973041052593068e-05, + "loss": 1.3314, + "step": 1440 + }, + { + "epoch": 0.3902043025301042, + "grad_norm": 0.30932334065437317, + "learning_rate": 9.972713585687317e-05, + "loss": 1.3144, + "step": 1442 + }, + { + "epoch": 0.390745501285347, + "grad_norm": 0.302971750497818, + "learning_rate": 9.972384147361725e-05, + "loss": 1.3431, + "step": 1444 + }, + { + "epoch": 0.3912867000405899, + "grad_norm": 0.32412296533584595, + "learning_rate": 9.972052737746898e-05, + "loss": 1.3167, + "step": 1446 + }, + { + "epoch": 0.3918278987958328, + "grad_norm": 0.4637945890426636, + "learning_rate": 9.97171935697422e-05, + "loss": 1.3433, + "step": 1448 + }, + { + "epoch": 0.39236909755107563, + "grad_norm": 0.32690081000328064, + "learning_rate": 9.971384005175864e-05, + "loss": 1.3327, + "step": 1450 + }, + { + "epoch": 0.3929102963063185, + "grad_norm": 0.3049994111061096, + "learning_rate": 9.971046682484776e-05, + "loss": 1.3401, + "step": 1452 + }, + { + "epoch": 0.39345149506156135, + "grad_norm": 0.306095689535141, + "learning_rate": 9.970707389034688e-05, + "loss": 1.3205, + "step": 1454 + }, + { + "epoch": 0.39399269381680424, + "grad_norm": 0.3375592529773712, + "learning_rate": 9.970366124960111e-05, + "loss": 1.3243, + "step": 1456 + }, + { + "epoch": 0.3945338925720471, + "grad_norm": 0.30508387088775635, + "learning_rate": 9.970022890396338e-05, + "loss": 1.3342, + "step": 1458 + }, + { + "epoch": 0.39507509132728996, + "grad_norm": 0.2996918261051178, + "learning_rate": 9.969677685479444e-05, + "loss": 1.3457, + "step": 1460 + }, + { + "epoch": 0.3956162900825328, + "grad_norm": 0.29500269889831543, + "learning_rate": 9.969330510346286e-05, + "loss": 1.3306, + "step": 1462 + }, + { + "epoch": 0.3961574888377757, + "grad_norm": 0.28392598032951355, + "learning_rate": 9.9689813651345e-05, + "loss": 1.3347, + "step": 1464 + }, + { + "epoch": 0.3966986875930185, + "grad_norm": 0.2859434485435486, + "learning_rate": 9.968630249982503e-05, + "loss": 1.3342, + "step": 1466 + }, + { + "epoch": 0.3972398863482614, + "grad_norm": 0.3038876950740814, + "learning_rate": 9.968277165029494e-05, + "loss": 1.3248, + "step": 1468 + }, + { + "epoch": 0.39778108510350424, + "grad_norm": 0.3060581088066101, + "learning_rate": 9.967922110415454e-05, + "loss": 1.3403, + "step": 1470 + }, + { + "epoch": 0.39832228385874713, + "grad_norm": 0.30475133657455444, + "learning_rate": 9.96756508628114e-05, + "loss": 1.3338, + "step": 1472 + }, + { + "epoch": 0.39886348261398996, + "grad_norm": 0.33263343572616577, + "learning_rate": 9.967206092768095e-05, + "loss": 1.3209, + "step": 1474 + }, + { + "epoch": 0.39940468136923285, + "grad_norm": 0.2895435094833374, + "learning_rate": 9.966845130018645e-05, + "loss": 1.3352, + "step": 1476 + }, + { + "epoch": 0.39994588012447574, + "grad_norm": 0.27237775921821594, + "learning_rate": 9.966482198175886e-05, + "loss": 1.3239, + "step": 1478 + }, + { + "epoch": 0.40048707887971857, + "grad_norm": 0.2740168571472168, + "learning_rate": 9.966117297383707e-05, + "loss": 1.3371, + "step": 1480 + }, + { + "epoch": 0.40102827763496146, + "grad_norm": 0.30601269006729126, + "learning_rate": 9.965750427786768e-05, + "loss": 1.343, + "step": 1482 + }, + { + "epoch": 0.4015694763902043, + "grad_norm": 0.28768840432167053, + "learning_rate": 9.965381589530518e-05, + "loss": 1.3442, + "step": 1484 + }, + { + "epoch": 0.4021106751454472, + "grad_norm": 0.28244882822036743, + "learning_rate": 9.965010782761177e-05, + "loss": 1.3336, + "step": 1486 + }, + { + "epoch": 0.40265187390069, + "grad_norm": 0.2694818079471588, + "learning_rate": 9.964638007625754e-05, + "loss": 1.3448, + "step": 1488 + }, + { + "epoch": 0.4031930726559329, + "grad_norm": 0.29507288336753845, + "learning_rate": 9.964263264272033e-05, + "loss": 1.327, + "step": 1490 + }, + { + "epoch": 0.40373427141117574, + "grad_norm": 0.3036315143108368, + "learning_rate": 9.963886552848581e-05, + "loss": 1.3289, + "step": 1492 + }, + { + "epoch": 0.4042754701664186, + "grad_norm": 0.2737107574939728, + "learning_rate": 9.963507873504744e-05, + "loss": 1.3281, + "step": 1494 + }, + { + "epoch": 0.40481666892166146, + "grad_norm": 0.29833105206489563, + "learning_rate": 9.963127226390647e-05, + "loss": 1.3378, + "step": 1496 + }, + { + "epoch": 0.40535786767690435, + "grad_norm": 0.32203689217567444, + "learning_rate": 9.9627446116572e-05, + "loss": 1.3158, + "step": 1498 + }, + { + "epoch": 0.4058990664321472, + "grad_norm": 0.27837038040161133, + "learning_rate": 9.962360029456086e-05, + "loss": 1.3051, + "step": 1500 + }, + { + "epoch": 0.40644026518739007, + "grad_norm": 0.2688932418823242, + "learning_rate": 9.961973479939774e-05, + "loss": 1.339, + "step": 1502 + }, + { + "epoch": 0.40698146394263296, + "grad_norm": 0.2779388725757599, + "learning_rate": 9.96158496326151e-05, + "loss": 1.3264, + "step": 1504 + }, + { + "epoch": 0.4075226626978758, + "grad_norm": 0.27401190996170044, + "learning_rate": 9.961194479575321e-05, + "loss": 1.3139, + "step": 1506 + }, + { + "epoch": 0.4080638614531187, + "grad_norm": 0.270448237657547, + "learning_rate": 9.960802029036012e-05, + "loss": 1.3253, + "step": 1508 + }, + { + "epoch": 0.4086050602083615, + "grad_norm": 0.29150158166885376, + "learning_rate": 9.96040761179917e-05, + "loss": 1.3324, + "step": 1510 + }, + { + "epoch": 0.4091462589636044, + "grad_norm": 0.2666511833667755, + "learning_rate": 9.960011228021159e-05, + "loss": 1.325, + "step": 1512 + }, + { + "epoch": 0.40968745771884724, + "grad_norm": 0.2782241106033325, + "learning_rate": 9.959612877859125e-05, + "loss": 1.3162, + "step": 1514 + }, + { + "epoch": 0.4102286564740901, + "grad_norm": 0.2845720946788788, + "learning_rate": 9.959212561470996e-05, + "loss": 1.3316, + "step": 1516 + }, + { + "epoch": 0.41076985522933296, + "grad_norm": 0.27991780638694763, + "learning_rate": 9.958810279015473e-05, + "loss": 1.3121, + "step": 1518 + }, + { + "epoch": 0.41131105398457585, + "grad_norm": 0.2804965674877167, + "learning_rate": 9.958406030652043e-05, + "loss": 1.3246, + "step": 1520 + }, + { + "epoch": 0.4118522527398187, + "grad_norm": 0.2732795178890228, + "learning_rate": 9.957999816540965e-05, + "loss": 1.3217, + "step": 1522 + }, + { + "epoch": 0.41239345149506157, + "grad_norm": 0.28181079030036926, + "learning_rate": 9.957591636843284e-05, + "loss": 1.3374, + "step": 1524 + }, + { + "epoch": 0.4129346502503044, + "grad_norm": 0.3096240162849426, + "learning_rate": 9.957181491720822e-05, + "loss": 1.3324, + "step": 1526 + }, + { + "epoch": 0.4134758490055473, + "grad_norm": 0.2709742486476898, + "learning_rate": 9.95676938133618e-05, + "loss": 1.3055, + "step": 1528 + }, + { + "epoch": 0.4140170477607901, + "grad_norm": 0.27309080958366394, + "learning_rate": 9.956355305852736e-05, + "loss": 1.313, + "step": 1530 + }, + { + "epoch": 0.414558246516033, + "grad_norm": 0.29801151156425476, + "learning_rate": 9.955939265434652e-05, + "loss": 1.3185, + "step": 1532 + }, + { + "epoch": 0.4150994452712759, + "grad_norm": 0.28698021173477173, + "learning_rate": 9.955521260246865e-05, + "loss": 1.3214, + "step": 1534 + }, + { + "epoch": 0.41564064402651874, + "grad_norm": 0.2641914188861847, + "learning_rate": 9.955101290455093e-05, + "loss": 1.317, + "step": 1536 + }, + { + "epoch": 0.4161818427817616, + "grad_norm": 0.26065558195114136, + "learning_rate": 9.954679356225832e-05, + "loss": 1.3253, + "step": 1538 + }, + { + "epoch": 0.41672304153700446, + "grad_norm": 0.27157294750213623, + "learning_rate": 9.954255457726354e-05, + "loss": 1.3218, + "step": 1540 + }, + { + "epoch": 0.41726424029224735, + "grad_norm": 0.2833496630191803, + "learning_rate": 9.953829595124715e-05, + "loss": 1.32, + "step": 1542 + }, + { + "epoch": 0.4178054390474902, + "grad_norm": 0.2757824659347534, + "learning_rate": 9.953401768589745e-05, + "loss": 1.3165, + "step": 1544 + }, + { + "epoch": 0.41834663780273307, + "grad_norm": 0.2609362304210663, + "learning_rate": 9.952971978291059e-05, + "loss": 1.3229, + "step": 1546 + }, + { + "epoch": 0.4188878365579759, + "grad_norm": 0.2863214313983917, + "learning_rate": 9.952540224399043e-05, + "loss": 1.3217, + "step": 1548 + }, + { + "epoch": 0.4194290353132188, + "grad_norm": 0.27573657035827637, + "learning_rate": 9.952106507084864e-05, + "loss": 1.3151, + "step": 1550 + }, + { + "epoch": 0.4199702340684616, + "grad_norm": 0.26843398809432983, + "learning_rate": 9.95167082652047e-05, + "loss": 1.3185, + "step": 1552 + }, + { + "epoch": 0.4205114328237045, + "grad_norm": 0.25903749465942383, + "learning_rate": 9.951233182878585e-05, + "loss": 1.3142, + "step": 1554 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.27221450209617615, + "learning_rate": 9.950793576332713e-05, + "loss": 1.3119, + "step": 1556 + }, + { + "epoch": 0.42159383033419023, + "grad_norm": 0.2897038161754608, + "learning_rate": 9.950352007057134e-05, + "loss": 1.3217, + "step": 1558 + }, + { + "epoch": 0.42213502908943307, + "grad_norm": 0.2515231668949127, + "learning_rate": 9.949908475226905e-05, + "loss": 1.3263, + "step": 1560 + }, + { + "epoch": 0.42267622784467596, + "grad_norm": 0.26686710119247437, + "learning_rate": 9.949462981017865e-05, + "loss": 1.3269, + "step": 1562 + }, + { + "epoch": 0.42321742659991884, + "grad_norm": 0.2747204899787903, + "learning_rate": 9.949015524606629e-05, + "loss": 1.3208, + "step": 1564 + }, + { + "epoch": 0.4237586253551617, + "grad_norm": 0.25866395235061646, + "learning_rate": 9.948566106170589e-05, + "loss": 1.3273, + "step": 1566 + }, + { + "epoch": 0.42429982411040457, + "grad_norm": 0.2659189999103546, + "learning_rate": 9.948114725887918e-05, + "loss": 1.2955, + "step": 1568 + }, + { + "epoch": 0.4248410228656474, + "grad_norm": 0.25262853503227234, + "learning_rate": 9.947661383937563e-05, + "loss": 1.284, + "step": 1570 + }, + { + "epoch": 0.4253822216208903, + "grad_norm": 0.24780422449111938, + "learning_rate": 9.94720608049925e-05, + "loss": 1.3168, + "step": 1572 + }, + { + "epoch": 0.4259234203761331, + "grad_norm": 0.2663845121860504, + "learning_rate": 9.946748815753484e-05, + "loss": 1.313, + "step": 1574 + }, + { + "epoch": 0.426464619131376, + "grad_norm": 0.2906511425971985, + "learning_rate": 9.946289589881545e-05, + "loss": 1.3197, + "step": 1576 + }, + { + "epoch": 0.42700581788661884, + "grad_norm": 0.28401264548301697, + "learning_rate": 9.945828403065493e-05, + "loss": 1.3254, + "step": 1578 + }, + { + "epoch": 0.42754701664186173, + "grad_norm": 0.27820122241973877, + "learning_rate": 9.945365255488164e-05, + "loss": 1.3153, + "step": 1580 + }, + { + "epoch": 0.42808821539710457, + "grad_norm": 0.2573559880256653, + "learning_rate": 9.944900147333173e-05, + "loss": 1.3144, + "step": 1582 + }, + { + "epoch": 0.42862941415234745, + "grad_norm": 0.2536357343196869, + "learning_rate": 9.944433078784909e-05, + "loss": 1.3172, + "step": 1584 + }, + { + "epoch": 0.4291706129075903, + "grad_norm": 0.2745160758495331, + "learning_rate": 9.94396405002854e-05, + "loss": 1.3023, + "step": 1586 + }, + { + "epoch": 0.4297118116628332, + "grad_norm": 0.290393203496933, + "learning_rate": 9.943493061250013e-05, + "loss": 1.3095, + "step": 1588 + }, + { + "epoch": 0.43025301041807607, + "grad_norm": 0.29357218742370605, + "learning_rate": 9.94302011263605e-05, + "loss": 1.3232, + "step": 1590 + }, + { + "epoch": 0.4307942091733189, + "grad_norm": 0.2756180167198181, + "learning_rate": 9.94254520437415e-05, + "loss": 1.3179, + "step": 1592 + }, + { + "epoch": 0.4313354079285618, + "grad_norm": 0.30225417017936707, + "learning_rate": 9.942068336652589e-05, + "loss": 1.3353, + "step": 1594 + }, + { + "epoch": 0.4318766066838046, + "grad_norm": 0.26694637537002563, + "learning_rate": 9.94158950966042e-05, + "loss": 1.318, + "step": 1596 + }, + { + "epoch": 0.4324178054390475, + "grad_norm": 0.2528863549232483, + "learning_rate": 9.941108723587471e-05, + "loss": 1.3282, + "step": 1598 + }, + { + "epoch": 0.43295900419429034, + "grad_norm": 0.25261232256889343, + "learning_rate": 9.940625978624353e-05, + "loss": 1.3178, + "step": 1600 + }, + { + "epoch": 0.43350020294953323, + "grad_norm": 0.2624775767326355, + "learning_rate": 9.940141274962444e-05, + "loss": 1.31, + "step": 1602 + }, + { + "epoch": 0.43404140170477606, + "grad_norm": 0.260810524225235, + "learning_rate": 9.939654612793908e-05, + "loss": 1.3162, + "step": 1604 + }, + { + "epoch": 0.43458260046001895, + "grad_norm": 0.2815745174884796, + "learning_rate": 9.939165992311676e-05, + "loss": 1.3112, + "step": 1606 + }, + { + "epoch": 0.4351237992152618, + "grad_norm": 0.2773973345756531, + "learning_rate": 9.938675413709466e-05, + "loss": 1.3, + "step": 1608 + }, + { + "epoch": 0.4356649979705047, + "grad_norm": 0.26486915349960327, + "learning_rate": 9.938182877181763e-05, + "loss": 1.3193, + "step": 1610 + }, + { + "epoch": 0.4362061967257475, + "grad_norm": 0.26103830337524414, + "learning_rate": 9.937688382923832e-05, + "loss": 1.3244, + "step": 1612 + }, + { + "epoch": 0.4367473954809904, + "grad_norm": 0.2556493878364563, + "learning_rate": 9.937191931131716e-05, + "loss": 1.3087, + "step": 1614 + }, + { + "epoch": 0.43728859423623323, + "grad_norm": 0.2739090919494629, + "learning_rate": 9.93669352200223e-05, + "loss": 1.3009, + "step": 1616 + }, + { + "epoch": 0.4378297929914761, + "grad_norm": 0.26297444105148315, + "learning_rate": 9.936193155732967e-05, + "loss": 1.2971, + "step": 1618 + }, + { + "epoch": 0.438370991746719, + "grad_norm": 0.2587411403656006, + "learning_rate": 9.935690832522297e-05, + "loss": 1.3259, + "step": 1620 + }, + { + "epoch": 0.43891219050196184, + "grad_norm": 0.2419731616973877, + "learning_rate": 9.935186552569366e-05, + "loss": 1.3123, + "step": 1622 + }, + { + "epoch": 0.43945338925720473, + "grad_norm": 0.27424389123916626, + "learning_rate": 9.934680316074092e-05, + "loss": 1.3196, + "step": 1624 + }, + { + "epoch": 0.43999458801244756, + "grad_norm": 0.258242666721344, + "learning_rate": 9.934172123237173e-05, + "loss": 1.3044, + "step": 1626 + }, + { + "epoch": 0.44053578676769045, + "grad_norm": 0.2621035575866699, + "learning_rate": 9.933661974260078e-05, + "loss": 1.3111, + "step": 1628 + }, + { + "epoch": 0.4410769855229333, + "grad_norm": 0.25349390506744385, + "learning_rate": 9.93314986934506e-05, + "loss": 1.3025, + "step": 1630 + }, + { + "epoch": 0.4416181842781762, + "grad_norm": 0.2615620195865631, + "learning_rate": 9.932635808695136e-05, + "loss": 1.3291, + "step": 1632 + }, + { + "epoch": 0.442159383033419, + "grad_norm": 0.2933880686759949, + "learning_rate": 9.932119792514105e-05, + "loss": 1.3327, + "step": 1634 + }, + { + "epoch": 0.4427005817886619, + "grad_norm": 0.2584700286388397, + "learning_rate": 9.931601821006544e-05, + "loss": 1.3031, + "step": 1636 + }, + { + "epoch": 0.44324178054390473, + "grad_norm": 0.2718084156513214, + "learning_rate": 9.931081894377797e-05, + "loss": 1.3053, + "step": 1638 + }, + { + "epoch": 0.4437829792991476, + "grad_norm": 0.27105703949928284, + "learning_rate": 9.93056001283399e-05, + "loss": 1.3012, + "step": 1640 + }, + { + "epoch": 0.44432417805439045, + "grad_norm": 0.27265292406082153, + "learning_rate": 9.930036176582021e-05, + "loss": 1.2957, + "step": 1642 + }, + { + "epoch": 0.44486537680963334, + "grad_norm": 0.26121169328689575, + "learning_rate": 9.929510385829564e-05, + "loss": 1.3062, + "step": 1644 + }, + { + "epoch": 0.44540657556487623, + "grad_norm": 0.26841971278190613, + "learning_rate": 9.928982640785067e-05, + "loss": 1.3192, + "step": 1646 + }, + { + "epoch": 0.44594777432011906, + "grad_norm": 0.27634862065315247, + "learning_rate": 9.928452941657755e-05, + "loss": 1.3005, + "step": 1648 + }, + { + "epoch": 0.44648897307536195, + "grad_norm": 0.25527122616767883, + "learning_rate": 9.927921288657623e-05, + "loss": 1.3121, + "step": 1650 + }, + { + "epoch": 0.4470301718306048, + "grad_norm": 0.2733294665813446, + "learning_rate": 9.927387681995443e-05, + "loss": 1.3051, + "step": 1652 + }, + { + "epoch": 0.44757137058584767, + "grad_norm": 0.2783257067203522, + "learning_rate": 9.926852121882766e-05, + "loss": 1.2947, + "step": 1654 + }, + { + "epoch": 0.4481125693410905, + "grad_norm": 0.2672583758831024, + "learning_rate": 9.926314608531911e-05, + "loss": 1.3272, + "step": 1656 + }, + { + "epoch": 0.4486537680963334, + "grad_norm": 0.2568219304084778, + "learning_rate": 9.925775142155974e-05, + "loss": 1.3025, + "step": 1658 + }, + { + "epoch": 0.4491949668515762, + "grad_norm": 0.2576539218425751, + "learning_rate": 9.925233722968826e-05, + "loss": 1.2715, + "step": 1660 + }, + { + "epoch": 0.4497361656068191, + "grad_norm": 0.25898897647857666, + "learning_rate": 9.924690351185109e-05, + "loss": 1.3039, + "step": 1662 + }, + { + "epoch": 0.45027736436206195, + "grad_norm": 0.25795668363571167, + "learning_rate": 9.924145027020242e-05, + "loss": 1.3115, + "step": 1664 + }, + { + "epoch": 0.45081856311730484, + "grad_norm": 0.2781166136264801, + "learning_rate": 9.92359775069042e-05, + "loss": 1.3017, + "step": 1666 + }, + { + "epoch": 0.45135976187254767, + "grad_norm": 0.2871512770652771, + "learning_rate": 9.923048522412608e-05, + "loss": 1.3206, + "step": 1668 + }, + { + "epoch": 0.45190096062779056, + "grad_norm": 0.27760595083236694, + "learning_rate": 9.922497342404544e-05, + "loss": 1.3214, + "step": 1670 + }, + { + "epoch": 0.4524421593830334, + "grad_norm": 0.26959067583084106, + "learning_rate": 9.921944210884746e-05, + "loss": 1.3144, + "step": 1672 + }, + { + "epoch": 0.4529833581382763, + "grad_norm": 0.2662011384963989, + "learning_rate": 9.921389128072498e-05, + "loss": 1.3022, + "step": 1674 + }, + { + "epoch": 0.45352455689351917, + "grad_norm": 0.28014811873435974, + "learning_rate": 9.920832094187861e-05, + "loss": 1.3104, + "step": 1676 + }, + { + "epoch": 0.454065755648762, + "grad_norm": 0.2560974955558777, + "learning_rate": 9.920273109451673e-05, + "loss": 1.3113, + "step": 1678 + }, + { + "epoch": 0.4546069544040049, + "grad_norm": 0.285339891910553, + "learning_rate": 9.91971217408554e-05, + "loss": 1.3126, + "step": 1680 + }, + { + "epoch": 0.4551481531592477, + "grad_norm": 0.29105204343795776, + "learning_rate": 9.919149288311843e-05, + "loss": 1.3248, + "step": 1682 + }, + { + "epoch": 0.4556893519144906, + "grad_norm": 0.2868146002292633, + "learning_rate": 9.918584452353739e-05, + "loss": 1.3217, + "step": 1684 + }, + { + "epoch": 0.45623055066973345, + "grad_norm": 0.26717278361320496, + "learning_rate": 9.918017666435152e-05, + "loss": 1.2991, + "step": 1686 + }, + { + "epoch": 0.45677174942497634, + "grad_norm": 0.2560403048992157, + "learning_rate": 9.917448930780786e-05, + "loss": 1.3091, + "step": 1688 + }, + { + "epoch": 0.45731294818021917, + "grad_norm": 0.2610042989253998, + "learning_rate": 9.916878245616114e-05, + "loss": 1.2948, + "step": 1690 + }, + { + "epoch": 0.45785414693546206, + "grad_norm": 0.27322304248809814, + "learning_rate": 9.916305611167382e-05, + "loss": 1.3121, + "step": 1692 + }, + { + "epoch": 0.4583953456907049, + "grad_norm": 0.26559844613075256, + "learning_rate": 9.91573102766161e-05, + "loss": 1.307, + "step": 1694 + }, + { + "epoch": 0.4589365444459478, + "grad_norm": 0.2677384316921234, + "learning_rate": 9.91515449532659e-05, + "loss": 1.2925, + "step": 1696 + }, + { + "epoch": 0.4594777432011906, + "grad_norm": 0.2670448422431946, + "learning_rate": 9.914576014390888e-05, + "loss": 1.3051, + "step": 1698 + }, + { + "epoch": 0.4600189419564335, + "grad_norm": 0.2537919878959656, + "learning_rate": 9.91399558508384e-05, + "loss": 1.3047, + "step": 1700 + }, + { + "epoch": 0.46056014071167634, + "grad_norm": 0.2712916433811188, + "learning_rate": 9.913413207635555e-05, + "loss": 1.2949, + "step": 1702 + }, + { + "epoch": 0.4611013394669192, + "grad_norm": 0.27910125255584717, + "learning_rate": 9.912828882276917e-05, + "loss": 1.336, + "step": 1704 + }, + { + "epoch": 0.4616425382221621, + "grad_norm": 0.25917065143585205, + "learning_rate": 9.91224260923958e-05, + "loss": 1.2938, + "step": 1706 + }, + { + "epoch": 0.46218373697740495, + "grad_norm": 0.265024334192276, + "learning_rate": 9.91165438875597e-05, + "loss": 1.2876, + "step": 1708 + }, + { + "epoch": 0.46272493573264784, + "grad_norm": 0.2637651860713959, + "learning_rate": 9.911064221059286e-05, + "loss": 1.3128, + "step": 1710 + }, + { + "epoch": 0.46326613448789067, + "grad_norm": 0.25448864698410034, + "learning_rate": 9.910472106383495e-05, + "loss": 1.3289, + "step": 1712 + }, + { + "epoch": 0.46380733324313356, + "grad_norm": 0.24903124570846558, + "learning_rate": 9.909878044963346e-05, + "loss": 1.3013, + "step": 1714 + }, + { + "epoch": 0.4643485319983764, + "grad_norm": 0.258848637342453, + "learning_rate": 9.909282037034347e-05, + "loss": 1.3052, + "step": 1716 + }, + { + "epoch": 0.4648897307536193, + "grad_norm": 0.25806304812431335, + "learning_rate": 9.908684082832787e-05, + "loss": 1.286, + "step": 1718 + }, + { + "epoch": 0.4654309295088621, + "grad_norm": 0.26794132590293884, + "learning_rate": 9.908084182595723e-05, + "loss": 1.3069, + "step": 1720 + }, + { + "epoch": 0.465972128264105, + "grad_norm": 0.26079118251800537, + "learning_rate": 9.907482336560983e-05, + "loss": 1.3145, + "step": 1722 + }, + { + "epoch": 0.46651332701934783, + "grad_norm": 0.25958481431007385, + "learning_rate": 9.906878544967169e-05, + "loss": 1.3098, + "step": 1724 + }, + { + "epoch": 0.4670545257745907, + "grad_norm": 0.2390812784433365, + "learning_rate": 9.906272808053652e-05, + "loss": 1.3085, + "step": 1726 + }, + { + "epoch": 0.46759572452983356, + "grad_norm": 0.263637900352478, + "learning_rate": 9.905665126060574e-05, + "loss": 1.2933, + "step": 1728 + }, + { + "epoch": 0.46813692328507645, + "grad_norm": 0.2462746798992157, + "learning_rate": 9.90505549922885e-05, + "loss": 1.2877, + "step": 1730 + }, + { + "epoch": 0.46867812204031933, + "grad_norm": 0.244845911860466, + "learning_rate": 9.904443927800164e-05, + "loss": 1.325, + "step": 1732 + }, + { + "epoch": 0.46921932079556217, + "grad_norm": 0.28249332308769226, + "learning_rate": 9.903830412016974e-05, + "loss": 1.313, + "step": 1734 + }, + { + "epoch": 0.46976051955080506, + "grad_norm": 0.29556336998939514, + "learning_rate": 9.903214952122504e-05, + "loss": 1.3142, + "step": 1736 + }, + { + "epoch": 0.4703017183060479, + "grad_norm": 0.2746431827545166, + "learning_rate": 9.902597548360754e-05, + "loss": 1.3096, + "step": 1738 + }, + { + "epoch": 0.4708429170612908, + "grad_norm": 0.2979538142681122, + "learning_rate": 9.901978200976492e-05, + "loss": 1.2849, + "step": 1740 + }, + { + "epoch": 0.4713841158165336, + "grad_norm": 0.2766527235507965, + "learning_rate": 9.901356910215255e-05, + "loss": 1.3089, + "step": 1742 + }, + { + "epoch": 0.4719253145717765, + "grad_norm": 0.25000783801078796, + "learning_rate": 9.900733676323353e-05, + "loss": 1.308, + "step": 1744 + }, + { + "epoch": 0.47246651332701933, + "grad_norm": 0.26226234436035156, + "learning_rate": 9.900108499547864e-05, + "loss": 1.3041, + "step": 1746 + }, + { + "epoch": 0.4730077120822622, + "grad_norm": 0.2794544994831085, + "learning_rate": 9.899481380136642e-05, + "loss": 1.3312, + "step": 1748 + }, + { + "epoch": 0.47354891083750505, + "grad_norm": 0.24771127104759216, + "learning_rate": 9.898852318338303e-05, + "loss": 1.2853, + "step": 1750 + }, + { + "epoch": 0.47409010959274794, + "grad_norm": 0.2811632752418518, + "learning_rate": 9.898221314402238e-05, + "loss": 1.3019, + "step": 1752 + }, + { + "epoch": 0.4746313083479908, + "grad_norm": 0.2812533378601074, + "learning_rate": 9.897588368578608e-05, + "loss": 1.3298, + "step": 1754 + }, + { + "epoch": 0.47517250710323367, + "grad_norm": 0.25955653190612793, + "learning_rate": 9.896953481118341e-05, + "loss": 1.3093, + "step": 1756 + }, + { + "epoch": 0.4757137058584765, + "grad_norm": 0.2653108537197113, + "learning_rate": 9.896316652273136e-05, + "loss": 1.2898, + "step": 1758 + }, + { + "epoch": 0.4762549046137194, + "grad_norm": 0.27985796332359314, + "learning_rate": 9.895677882295466e-05, + "loss": 1.2928, + "step": 1760 + }, + { + "epoch": 0.4767961033689623, + "grad_norm": 0.2889133393764496, + "learning_rate": 9.895037171438568e-05, + "loss": 1.3088, + "step": 1762 + }, + { + "epoch": 0.4773373021242051, + "grad_norm": 0.2615009546279907, + "learning_rate": 9.894394519956448e-05, + "loss": 1.3212, + "step": 1764 + }, + { + "epoch": 0.477878500879448, + "grad_norm": 0.24938960373401642, + "learning_rate": 9.893749928103885e-05, + "loss": 1.2982, + "step": 1766 + }, + { + "epoch": 0.47841969963469083, + "grad_norm": 0.27132853865623474, + "learning_rate": 9.893103396136427e-05, + "loss": 1.294, + "step": 1768 + }, + { + "epoch": 0.4789608983899337, + "grad_norm": 0.2632822096347809, + "learning_rate": 9.89245492431039e-05, + "loss": 1.2852, + "step": 1770 + }, + { + "epoch": 0.47950209714517655, + "grad_norm": 0.27269670367240906, + "learning_rate": 9.891804512882856e-05, + "loss": 1.2934, + "step": 1772 + }, + { + "epoch": 0.48004329590041944, + "grad_norm": 0.2572595179080963, + "learning_rate": 9.891152162111683e-05, + "loss": 1.2719, + "step": 1774 + }, + { + "epoch": 0.4805844946556623, + "grad_norm": 0.2708267867565155, + "learning_rate": 9.890497872255489e-05, + "loss": 1.2907, + "step": 1776 + }, + { + "epoch": 0.48112569341090516, + "grad_norm": 0.28407028317451477, + "learning_rate": 9.889841643573671e-05, + "loss": 1.2977, + "step": 1778 + }, + { + "epoch": 0.481666892166148, + "grad_norm": 0.26248103380203247, + "learning_rate": 9.889183476326386e-05, + "loss": 1.2993, + "step": 1780 + }, + { + "epoch": 0.4822080909213909, + "grad_norm": 0.26148512959480286, + "learning_rate": 9.888523370774563e-05, + "loss": 1.2893, + "step": 1782 + }, + { + "epoch": 0.4827492896766337, + "grad_norm": 0.2815425395965576, + "learning_rate": 9.8878613271799e-05, + "loss": 1.3015, + "step": 1784 + }, + { + "epoch": 0.4832904884318766, + "grad_norm": 0.26061713695526123, + "learning_rate": 9.887197345804862e-05, + "loss": 1.2781, + "step": 1786 + }, + { + "epoch": 0.4838316871871195, + "grad_norm": 0.2641533613204956, + "learning_rate": 9.886531426912683e-05, + "loss": 1.2993, + "step": 1788 + }, + { + "epoch": 0.48437288594236233, + "grad_norm": 0.25920137763023376, + "learning_rate": 9.885863570767364e-05, + "loss": 1.2955, + "step": 1790 + }, + { + "epoch": 0.4849140846976052, + "grad_norm": 0.24002158641815186, + "learning_rate": 9.885193777633676e-05, + "loss": 1.2932, + "step": 1792 + }, + { + "epoch": 0.48545528345284805, + "grad_norm": 0.2643393576145172, + "learning_rate": 9.884522047777157e-05, + "loss": 1.2963, + "step": 1794 + }, + { + "epoch": 0.48599648220809094, + "grad_norm": 0.2522197663784027, + "learning_rate": 9.883848381464112e-05, + "loss": 1.2947, + "step": 1796 + }, + { + "epoch": 0.4865376809633338, + "grad_norm": 0.2431286871433258, + "learning_rate": 9.883172778961613e-05, + "loss": 1.3112, + "step": 1798 + }, + { + "epoch": 0.48707887971857666, + "grad_norm": 0.26892608404159546, + "learning_rate": 9.882495240537505e-05, + "loss": 1.2904, + "step": 1800 + }, + { + "epoch": 0.4876200784738195, + "grad_norm": 0.2528528571128845, + "learning_rate": 9.881815766460392e-05, + "loss": 1.2949, + "step": 1802 + }, + { + "epoch": 0.4881612772290624, + "grad_norm": 0.2614927291870117, + "learning_rate": 9.881134356999652e-05, + "loss": 1.288, + "step": 1804 + }, + { + "epoch": 0.4887024759843052, + "grad_norm": 0.2523605227470398, + "learning_rate": 9.880451012425426e-05, + "loss": 1.3029, + "step": 1806 + }, + { + "epoch": 0.4892436747395481, + "grad_norm": 0.24303248524665833, + "learning_rate": 9.879765733008627e-05, + "loss": 1.3107, + "step": 1808 + }, + { + "epoch": 0.48978487349479094, + "grad_norm": 0.2470557987689972, + "learning_rate": 9.879078519020933e-05, + "loss": 1.2856, + "step": 1810 + }, + { + "epoch": 0.49032607225003383, + "grad_norm": 0.2526317536830902, + "learning_rate": 9.878389370734784e-05, + "loss": 1.2965, + "step": 1812 + }, + { + "epoch": 0.49086727100527666, + "grad_norm": 0.2483314871788025, + "learning_rate": 9.877698288423394e-05, + "loss": 1.3016, + "step": 1814 + }, + { + "epoch": 0.49140846976051955, + "grad_norm": 0.24746839702129364, + "learning_rate": 9.877005272360741e-05, + "loss": 1.2944, + "step": 1816 + }, + { + "epoch": 0.49194966851576244, + "grad_norm": 0.24739988148212433, + "learning_rate": 9.876310322821568e-05, + "loss": 1.3037, + "step": 1818 + }, + { + "epoch": 0.4924908672710053, + "grad_norm": 0.2740204632282257, + "learning_rate": 9.875613440081387e-05, + "loss": 1.3116, + "step": 1820 + }, + { + "epoch": 0.49303206602624816, + "grad_norm": 0.27116379141807556, + "learning_rate": 9.874914624416475e-05, + "loss": 1.288, + "step": 1822 + }, + { + "epoch": 0.493573264781491, + "grad_norm": 0.24231554567813873, + "learning_rate": 9.874213876103878e-05, + "loss": 1.2975, + "step": 1824 + }, + { + "epoch": 0.4941144635367339, + "grad_norm": 0.2590995728969574, + "learning_rate": 9.873511195421402e-05, + "loss": 1.2678, + "step": 1826 + }, + { + "epoch": 0.4946556622919767, + "grad_norm": 0.25694531202316284, + "learning_rate": 9.872806582647625e-05, + "loss": 1.28, + "step": 1828 + }, + { + "epoch": 0.4951968610472196, + "grad_norm": 0.25455620884895325, + "learning_rate": 9.87210003806189e-05, + "loss": 1.2942, + "step": 1830 + }, + { + "epoch": 0.49573805980246244, + "grad_norm": 0.2639889121055603, + "learning_rate": 9.871391561944302e-05, + "loss": 1.3161, + "step": 1832 + }, + { + "epoch": 0.4962792585577053, + "grad_norm": 0.271282821893692, + "learning_rate": 9.870681154575737e-05, + "loss": 1.3071, + "step": 1834 + }, + { + "epoch": 0.49682045731294816, + "grad_norm": 0.26479372382164, + "learning_rate": 9.869968816237833e-05, + "loss": 1.2841, + "step": 1836 + }, + { + "epoch": 0.49736165606819105, + "grad_norm": 0.26040130853652954, + "learning_rate": 9.869254547212997e-05, + "loss": 1.2989, + "step": 1838 + }, + { + "epoch": 0.4979028548234339, + "grad_norm": 0.26563623547554016, + "learning_rate": 9.868538347784396e-05, + "loss": 1.2965, + "step": 1840 + }, + { + "epoch": 0.49844405357867677, + "grad_norm": 0.26089224219322205, + "learning_rate": 9.867820218235969e-05, + "loss": 1.3071, + "step": 1842 + }, + { + "epoch": 0.4989852523339196, + "grad_norm": 0.27151811122894287, + "learning_rate": 9.867100158852412e-05, + "loss": 1.287, + "step": 1844 + }, + { + "epoch": 0.4995264510891625, + "grad_norm": 0.2477792203426361, + "learning_rate": 9.866378169919192e-05, + "loss": 1.2894, + "step": 1846 + }, + { + "epoch": 0.5000676498444053, + "grad_norm": 0.24871942400932312, + "learning_rate": 9.865654251722545e-05, + "loss": 1.3024, + "step": 1848 + }, + { + "epoch": 0.5006088485996483, + "grad_norm": 0.26377877593040466, + "learning_rate": 9.86492840454946e-05, + "loss": 1.2939, + "step": 1850 + }, + { + "epoch": 0.5011500473548911, + "grad_norm": 0.258228063583374, + "learning_rate": 9.8642006286877e-05, + "loss": 1.291, + "step": 1852 + }, + { + "epoch": 0.5016912461101339, + "grad_norm": 0.26982301473617554, + "learning_rate": 9.86347092442579e-05, + "loss": 1.2845, + "step": 1854 + }, + { + "epoch": 0.5022324448653768, + "grad_norm": 0.24094600975513458, + "learning_rate": 9.862739292053021e-05, + "loss": 1.2744, + "step": 1856 + }, + { + "epoch": 0.5027736436206197, + "grad_norm": 0.25840380787849426, + "learning_rate": 9.862005731859442e-05, + "loss": 1.2966, + "step": 1858 + }, + { + "epoch": 0.5033148423758625, + "grad_norm": 0.26734429597854614, + "learning_rate": 9.861270244135877e-05, + "loss": 1.2856, + "step": 1860 + }, + { + "epoch": 0.5038560411311054, + "grad_norm": 0.24431397020816803, + "learning_rate": 9.860532829173903e-05, + "loss": 1.2871, + "step": 1862 + }, + { + "epoch": 0.5043972398863482, + "grad_norm": 0.25425857305526733, + "learning_rate": 9.859793487265869e-05, + "loss": 1.2822, + "step": 1864 + }, + { + "epoch": 0.5049384386415912, + "grad_norm": 0.25332111120224, + "learning_rate": 9.859052218704885e-05, + "loss": 1.2723, + "step": 1866 + }, + { + "epoch": 0.505479637396834, + "grad_norm": 0.24775418639183044, + "learning_rate": 9.858309023784826e-05, + "loss": 1.2934, + "step": 1868 + }, + { + "epoch": 0.5060208361520768, + "grad_norm": 0.24880458414554596, + "learning_rate": 9.857563902800328e-05, + "loss": 1.3041, + "step": 1870 + }, + { + "epoch": 0.5065620349073197, + "grad_norm": 0.2574135959148407, + "learning_rate": 9.856816856046793e-05, + "loss": 1.2855, + "step": 1872 + }, + { + "epoch": 0.5071032336625626, + "grad_norm": 0.26873350143432617, + "learning_rate": 9.856067883820386e-05, + "loss": 1.3055, + "step": 1874 + }, + { + "epoch": 0.5076444324178054, + "grad_norm": 0.23742420971393585, + "learning_rate": 9.855316986418036e-05, + "loss": 1.3029, + "step": 1876 + }, + { + "epoch": 0.5081856311730483, + "grad_norm": 0.2398921549320221, + "learning_rate": 9.854564164137432e-05, + "loss": 1.2849, + "step": 1878 + }, + { + "epoch": 0.5087268299282912, + "grad_norm": 0.25182288885116577, + "learning_rate": 9.85380941727703e-05, + "loss": 1.2981, + "step": 1880 + }, + { + "epoch": 0.509268028683534, + "grad_norm": 0.23373378813266754, + "learning_rate": 9.853052746136048e-05, + "loss": 1.2772, + "step": 1882 + }, + { + "epoch": 0.5098092274387769, + "grad_norm": 0.2581213712692261, + "learning_rate": 9.852294151014466e-05, + "loss": 1.3147, + "step": 1884 + }, + { + "epoch": 0.5103504261940197, + "grad_norm": 0.26642751693725586, + "learning_rate": 9.851533632213028e-05, + "loss": 1.2885, + "step": 1886 + }, + { + "epoch": 0.5108916249492627, + "grad_norm": 0.24029181897640228, + "learning_rate": 9.850771190033237e-05, + "loss": 1.297, + "step": 1888 + }, + { + "epoch": 0.5114328237045055, + "grad_norm": 0.2555221915245056, + "learning_rate": 9.850006824777364e-05, + "loss": 1.284, + "step": 1890 + }, + { + "epoch": 0.5119740224597483, + "grad_norm": 0.2723660171031952, + "learning_rate": 9.849240536748439e-05, + "loss": 1.2821, + "step": 1892 + }, + { + "epoch": 0.5125152212149912, + "grad_norm": 0.24772705137729645, + "learning_rate": 9.848472326250253e-05, + "loss": 1.2743, + "step": 1894 + }, + { + "epoch": 0.5130564199702341, + "grad_norm": 0.2344834804534912, + "learning_rate": 9.847702193587365e-05, + "loss": 1.286, + "step": 1896 + }, + { + "epoch": 0.5135976187254769, + "grad_norm": 0.23948362469673157, + "learning_rate": 9.846930139065088e-05, + "loss": 1.2673, + "step": 1898 + }, + { + "epoch": 0.5141388174807198, + "grad_norm": 0.27207908034324646, + "learning_rate": 9.846156162989503e-05, + "loss": 1.3041, + "step": 1900 + }, + { + "epoch": 0.5146800162359627, + "grad_norm": 0.2407965511083603, + "learning_rate": 9.845380265667454e-05, + "loss": 1.2875, + "step": 1902 + }, + { + "epoch": 0.5152212149912055, + "grad_norm": 0.2517203688621521, + "learning_rate": 9.844602447406538e-05, + "loss": 1.2855, + "step": 1904 + }, + { + "epoch": 0.5157624137464484, + "grad_norm": 0.24267178773880005, + "learning_rate": 9.843822708515123e-05, + "loss": 1.2711, + "step": 1906 + }, + { + "epoch": 0.5163036125016912, + "grad_norm": 0.23933006823062897, + "learning_rate": 9.843041049302331e-05, + "loss": 1.3094, + "step": 1908 + }, + { + "epoch": 0.5168448112569342, + "grad_norm": 0.21948301792144775, + "learning_rate": 9.842257470078054e-05, + "loss": 1.2686, + "step": 1910 + }, + { + "epoch": 0.517386010012177, + "grad_norm": 0.239594966173172, + "learning_rate": 9.841471971152933e-05, + "loss": 1.2959, + "step": 1912 + }, + { + "epoch": 0.5179272087674198, + "grad_norm": 0.26850634813308716, + "learning_rate": 9.840684552838385e-05, + "loss": 1.2969, + "step": 1914 + }, + { + "epoch": 0.5184684075226627, + "grad_norm": 0.26066869497299194, + "learning_rate": 9.839895215446573e-05, + "loss": 1.2935, + "step": 1916 + }, + { + "epoch": 0.5190096062779056, + "grad_norm": 0.25288596749305725, + "learning_rate": 9.839103959290433e-05, + "loss": 1.2922, + "step": 1918 + }, + { + "epoch": 0.5195508050331484, + "grad_norm": 0.24453966319561005, + "learning_rate": 9.838310784683655e-05, + "loss": 1.3058, + "step": 1920 + }, + { + "epoch": 0.5200920037883913, + "grad_norm": 0.25353509187698364, + "learning_rate": 9.837515691940689e-05, + "loss": 1.3161, + "step": 1922 + }, + { + "epoch": 0.5206332025436341, + "grad_norm": 0.24898375570774078, + "learning_rate": 9.836718681376749e-05, + "loss": 1.2925, + "step": 1924 + }, + { + "epoch": 0.521174401298877, + "grad_norm": 0.2576977014541626, + "learning_rate": 9.835919753307807e-05, + "loss": 1.2916, + "step": 1926 + }, + { + "epoch": 0.5217156000541199, + "grad_norm": 0.25432518124580383, + "learning_rate": 9.8351189080506e-05, + "loss": 1.2866, + "step": 1928 + }, + { + "epoch": 0.5222567988093627, + "grad_norm": 0.2504200339317322, + "learning_rate": 9.834316145922615e-05, + "loss": 1.2728, + "step": 1930 + }, + { + "epoch": 0.5227979975646057, + "grad_norm": 0.2627692222595215, + "learning_rate": 9.83351146724211e-05, + "loss": 1.2853, + "step": 1932 + }, + { + "epoch": 0.5233391963198485, + "grad_norm": 0.2776716351509094, + "learning_rate": 9.832704872328094e-05, + "loss": 1.2881, + "step": 1934 + }, + { + "epoch": 0.5238803950750913, + "grad_norm": 0.24669450521469116, + "learning_rate": 9.831896361500344e-05, + "loss": 1.2681, + "step": 1936 + }, + { + "epoch": 0.5244215938303342, + "grad_norm": 0.24949464201927185, + "learning_rate": 9.831085935079387e-05, + "loss": 1.2851, + "step": 1938 + }, + { + "epoch": 0.5249627925855771, + "grad_norm": 0.2585392892360687, + "learning_rate": 9.830273593386518e-05, + "loss": 1.2796, + "step": 1940 + }, + { + "epoch": 0.5255039913408199, + "grad_norm": 0.26086801290512085, + "learning_rate": 9.829459336743787e-05, + "loss": 1.293, + "step": 1942 + }, + { + "epoch": 0.5260451900960628, + "grad_norm": 0.25490057468414307, + "learning_rate": 9.828643165474006e-05, + "loss": 1.2824, + "step": 1944 + }, + { + "epoch": 0.5265863888513056, + "grad_norm": 0.24865177273750305, + "learning_rate": 9.827825079900739e-05, + "loss": 1.2835, + "step": 1946 + }, + { + "epoch": 0.5271275876065485, + "grad_norm": 0.25498902797698975, + "learning_rate": 9.827005080348317e-05, + "loss": 1.2931, + "step": 1948 + }, + { + "epoch": 0.5276687863617914, + "grad_norm": 0.2585375905036926, + "learning_rate": 9.826183167141828e-05, + "loss": 1.2659, + "step": 1950 + }, + { + "epoch": 0.5282099851170342, + "grad_norm": 0.2300305813550949, + "learning_rate": 9.825359340607116e-05, + "loss": 1.3019, + "step": 1952 + }, + { + "epoch": 0.528751183872277, + "grad_norm": 0.24674038589000702, + "learning_rate": 9.824533601070784e-05, + "loss": 1.2784, + "step": 1954 + }, + { + "epoch": 0.52929238262752, + "grad_norm": 0.23458759486675262, + "learning_rate": 9.823705948860195e-05, + "loss": 1.2779, + "step": 1956 + }, + { + "epoch": 0.5298335813827628, + "grad_norm": 0.24736309051513672, + "learning_rate": 9.822876384303472e-05, + "loss": 1.3083, + "step": 1958 + }, + { + "epoch": 0.5303747801380057, + "grad_norm": 0.25108450651168823, + "learning_rate": 9.82204490772949e-05, + "loss": 1.3044, + "step": 1960 + }, + { + "epoch": 0.5309159788932486, + "grad_norm": 0.23308375477790833, + "learning_rate": 9.82121151946789e-05, + "loss": 1.2694, + "step": 1962 + }, + { + "epoch": 0.5314571776484914, + "grad_norm": 0.2283206284046173, + "learning_rate": 9.820376219849064e-05, + "loss": 1.2735, + "step": 1964 + }, + { + "epoch": 0.5319983764037343, + "grad_norm": 0.24121573567390442, + "learning_rate": 9.819539009204164e-05, + "loss": 1.2799, + "step": 1966 + }, + { + "epoch": 0.5325395751589771, + "grad_norm": 0.24135661125183105, + "learning_rate": 9.8186998878651e-05, + "loss": 1.295, + "step": 1968 + }, + { + "epoch": 0.53308077391422, + "grad_norm": 0.24390241503715515, + "learning_rate": 9.817858856164542e-05, + "loss": 1.2812, + "step": 1970 + }, + { + "epoch": 0.5336219726694629, + "grad_norm": 0.24739502370357513, + "learning_rate": 9.817015914435913e-05, + "loss": 1.2872, + "step": 1972 + }, + { + "epoch": 0.5341631714247057, + "grad_norm": 0.25517916679382324, + "learning_rate": 9.816171063013395e-05, + "loss": 1.2718, + "step": 1974 + }, + { + "epoch": 0.5347043701799485, + "grad_norm": 0.25479528307914734, + "learning_rate": 9.815324302231928e-05, + "loss": 1.2952, + "step": 1976 + }, + { + "epoch": 0.5352455689351915, + "grad_norm": 0.24998174607753754, + "learning_rate": 9.814475632427206e-05, + "loss": 1.2914, + "step": 1978 + }, + { + "epoch": 0.5357867676904343, + "grad_norm": 0.2341603934764862, + "learning_rate": 9.813625053935686e-05, + "loss": 1.2793, + "step": 1980 + }, + { + "epoch": 0.5363279664456772, + "grad_norm": 0.23716285824775696, + "learning_rate": 9.812772567094574e-05, + "loss": 1.2872, + "step": 1982 + }, + { + "epoch": 0.53686916520092, + "grad_norm": 0.2324230819940567, + "learning_rate": 9.81191817224184e-05, + "loss": 1.2604, + "step": 1984 + }, + { + "epoch": 0.5374103639561629, + "grad_norm": 0.24399405717849731, + "learning_rate": 9.811061869716205e-05, + "loss": 1.2972, + "step": 1986 + }, + { + "epoch": 0.5379515627114058, + "grad_norm": 0.24572497606277466, + "learning_rate": 9.810203659857145e-05, + "loss": 1.2784, + "step": 1988 + }, + { + "epoch": 0.5384927614666486, + "grad_norm": 0.22993844747543335, + "learning_rate": 9.8093435430049e-05, + "loss": 1.2886, + "step": 1990 + }, + { + "epoch": 0.5390339602218915, + "grad_norm": 0.24518661201000214, + "learning_rate": 9.808481519500458e-05, + "loss": 1.2622, + "step": 1992 + }, + { + "epoch": 0.5395751589771344, + "grad_norm": 0.2601888179779053, + "learning_rate": 9.807617589685568e-05, + "loss": 1.2739, + "step": 1994 + }, + { + "epoch": 0.5401163577323772, + "grad_norm": 0.24736261367797852, + "learning_rate": 9.80675175390273e-05, + "loss": 1.2748, + "step": 1996 + }, + { + "epoch": 0.54065755648762, + "grad_norm": 0.2332574725151062, + "learning_rate": 9.805884012495203e-05, + "loss": 1.2639, + "step": 1998 + }, + { + "epoch": 0.541198755242863, + "grad_norm": 0.2662294805049896, + "learning_rate": 9.805014365807004e-05, + "loss": 1.2914, + "step": 2000 + }, + { + "epoch": 0.5417399539981058, + "grad_norm": 0.28600943088531494, + "learning_rate": 9.804142814182902e-05, + "loss": 1.2657, + "step": 2002 + }, + { + "epoch": 0.5422811527533486, + "grad_norm": 0.2814892530441284, + "learning_rate": 9.803269357968416e-05, + "loss": 1.2839, + "step": 2004 + }, + { + "epoch": 0.5428223515085915, + "grad_norm": 0.24939605593681335, + "learning_rate": 9.802393997509833e-05, + "loss": 1.2692, + "step": 2006 + }, + { + "epoch": 0.5433635502638344, + "grad_norm": 0.2562806308269501, + "learning_rate": 9.801516733154181e-05, + "loss": 1.291, + "step": 2008 + }, + { + "epoch": 0.5439047490190773, + "grad_norm": 0.2617442011833191, + "learning_rate": 9.800637565249255e-05, + "loss": 1.2808, + "step": 2010 + }, + { + "epoch": 0.5444459477743201, + "grad_norm": 0.2421412616968155, + "learning_rate": 9.799756494143593e-05, + "loss": 1.2733, + "step": 2012 + }, + { + "epoch": 0.5449871465295629, + "grad_norm": 0.25231024622917175, + "learning_rate": 9.798873520186497e-05, + "loss": 1.2695, + "step": 2014 + }, + { + "epoch": 0.5455283452848059, + "grad_norm": 0.25108659267425537, + "learning_rate": 9.79798864372802e-05, + "loss": 1.298, + "step": 2016 + }, + { + "epoch": 0.5460695440400487, + "grad_norm": 0.24615678191184998, + "learning_rate": 9.79710186511897e-05, + "loss": 1.3127, + "step": 2018 + }, + { + "epoch": 0.5466107427952915, + "grad_norm": 0.23436503112316132, + "learning_rate": 9.796213184710904e-05, + "loss": 1.2896, + "step": 2020 + }, + { + "epoch": 0.5471519415505345, + "grad_norm": 0.23453901708126068, + "learning_rate": 9.79532260285614e-05, + "loss": 1.2761, + "step": 2022 + }, + { + "epoch": 0.5476931403057773, + "grad_norm": 0.2413233071565628, + "learning_rate": 9.794430119907748e-05, + "loss": 1.2744, + "step": 2024 + }, + { + "epoch": 0.5482343390610201, + "grad_norm": 0.2426893562078476, + "learning_rate": 9.793535736219546e-05, + "loss": 1.2615, + "step": 2026 + }, + { + "epoch": 0.548775537816263, + "grad_norm": 0.23853014409542084, + "learning_rate": 9.792639452146115e-05, + "loss": 1.2897, + "step": 2028 + }, + { + "epoch": 0.5493167365715059, + "grad_norm": 0.24866445362567902, + "learning_rate": 9.791741268042784e-05, + "loss": 1.2957, + "step": 2030 + }, + { + "epoch": 0.5498579353267488, + "grad_norm": 0.24467822909355164, + "learning_rate": 9.790841184265633e-05, + "loss": 1.2867, + "step": 2032 + }, + { + "epoch": 0.5503991340819916, + "grad_norm": 0.2393324077129364, + "learning_rate": 9.7899392011715e-05, + "loss": 1.3061, + "step": 2034 + }, + { + "epoch": 0.5509403328372344, + "grad_norm": 0.23834531009197235, + "learning_rate": 9.789035319117974e-05, + "loss": 1.2957, + "step": 2036 + }, + { + "epoch": 0.5514815315924774, + "grad_norm": 0.2603852450847626, + "learning_rate": 9.788129538463397e-05, + "loss": 1.2897, + "step": 2038 + }, + { + "epoch": 0.5520227303477202, + "grad_norm": 0.26540425419807434, + "learning_rate": 9.787221859566861e-05, + "loss": 1.2829, + "step": 2040 + }, + { + "epoch": 0.552563929102963, + "grad_norm": 0.25125250220298767, + "learning_rate": 9.786312282788216e-05, + "loss": 1.2708, + "step": 2042 + }, + { + "epoch": 0.553105127858206, + "grad_norm": 0.23911471664905548, + "learning_rate": 9.785400808488061e-05, + "loss": 1.2949, + "step": 2044 + }, + { + "epoch": 0.5536463266134488, + "grad_norm": 0.23871150612831116, + "learning_rate": 9.784487437027746e-05, + "loss": 1.2863, + "step": 2046 + }, + { + "epoch": 0.5541875253686916, + "grad_norm": 0.25253376364707947, + "learning_rate": 9.783572168769376e-05, + "loss": 1.2797, + "step": 2048 + }, + { + "epoch": 0.5547287241239345, + "grad_norm": 0.25140559673309326, + "learning_rate": 9.782655004075807e-05, + "loss": 1.2666, + "step": 2050 + }, + { + "epoch": 0.5552699228791774, + "grad_norm": 0.25297242403030396, + "learning_rate": 9.781735943310646e-05, + "loss": 1.2935, + "step": 2052 + }, + { + "epoch": 0.5558111216344203, + "grad_norm": 0.28536322712898254, + "learning_rate": 9.780814986838252e-05, + "loss": 1.2891, + "step": 2054 + }, + { + "epoch": 0.5563523203896631, + "grad_norm": 0.28267911076545715, + "learning_rate": 9.779892135023738e-05, + "loss": 1.2846, + "step": 2056 + }, + { + "epoch": 0.5568935191449059, + "grad_norm": 0.24850498139858246, + "learning_rate": 9.778967388232964e-05, + "loss": 1.2823, + "step": 2058 + }, + { + "epoch": 0.5574347179001489, + "grad_norm": 0.4929364025592804, + "learning_rate": 9.778040746832544e-05, + "loss": 1.2681, + "step": 2060 + }, + { + "epoch": 0.5579759166553917, + "grad_norm": 0.25423306226730347, + "learning_rate": 9.777112211189843e-05, + "loss": 1.2765, + "step": 2062 + }, + { + "epoch": 0.5585171154106345, + "grad_norm": 0.23608753085136414, + "learning_rate": 9.776181781672977e-05, + "loss": 1.2756, + "step": 2064 + }, + { + "epoch": 0.5590583141658774, + "grad_norm": 0.3117451071739197, + "learning_rate": 9.775249458650812e-05, + "loss": 1.2731, + "step": 2066 + }, + { + "epoch": 0.5595995129211203, + "grad_norm": 0.2454603612422943, + "learning_rate": 9.774315242492965e-05, + "loss": 1.2821, + "step": 2068 + }, + { + "epoch": 0.5601407116763631, + "grad_norm": 0.3214171528816223, + "learning_rate": 9.773379133569804e-05, + "loss": 1.2964, + "step": 2070 + }, + { + "epoch": 0.560681910431606, + "grad_norm": 0.23589906096458435, + "learning_rate": 9.772441132252448e-05, + "loss": 1.2794, + "step": 2072 + }, + { + "epoch": 0.5612231091868489, + "grad_norm": 0.23020370304584503, + "learning_rate": 9.771501238912763e-05, + "loss": 1.2753, + "step": 2074 + }, + { + "epoch": 0.5617643079420918, + "grad_norm": 0.2368050515651703, + "learning_rate": 9.77055945392337e-05, + "loss": 1.3048, + "step": 2076 + }, + { + "epoch": 0.5623055066973346, + "grad_norm": 0.2581866383552551, + "learning_rate": 9.769615777657633e-05, + "loss": 1.2765, + "step": 2078 + }, + { + "epoch": 0.5628467054525774, + "grad_norm": 0.2481439858675003, + "learning_rate": 9.768670210489675e-05, + "loss": 1.2957, + "step": 2080 + }, + { + "epoch": 0.5633879042078204, + "grad_norm": 0.2861919701099396, + "learning_rate": 9.767722752794361e-05, + "loss": 1.2647, + "step": 2082 + }, + { + "epoch": 0.5639291029630632, + "grad_norm": 0.2552880346775055, + "learning_rate": 9.766773404947309e-05, + "loss": 1.2675, + "step": 2084 + }, + { + "epoch": 0.564470301718306, + "grad_norm": 0.251891165971756, + "learning_rate": 9.765822167324885e-05, + "loss": 1.2799, + "step": 2086 + }, + { + "epoch": 0.5650115004735489, + "grad_norm": 0.25395113229751587, + "learning_rate": 9.764869040304205e-05, + "loss": 1.2916, + "step": 2088 + }, + { + "epoch": 0.5655526992287918, + "grad_norm": 0.2496347427368164, + "learning_rate": 9.763914024263136e-05, + "loss": 1.2722, + "step": 2090 + }, + { + "epoch": 0.5660938979840346, + "grad_norm": 0.24722573161125183, + "learning_rate": 9.762957119580287e-05, + "loss": 1.2722, + "step": 2092 + }, + { + "epoch": 0.5666350967392775, + "grad_norm": 0.23567502200603485, + "learning_rate": 9.761998326635026e-05, + "loss": 1.2681, + "step": 2094 + }, + { + "epoch": 0.5671762954945203, + "grad_norm": 0.2396802455186844, + "learning_rate": 9.76103764580746e-05, + "loss": 1.2509, + "step": 2096 + }, + { + "epoch": 0.5677174942497633, + "grad_norm": 0.24394263327121735, + "learning_rate": 9.76007507747845e-05, + "loss": 1.2863, + "step": 2098 + }, + { + "epoch": 0.5682586930050061, + "grad_norm": 0.23184406757354736, + "learning_rate": 9.759110622029604e-05, + "loss": 1.2827, + "step": 2100 + }, + { + "epoch": 0.5687998917602489, + "grad_norm": 0.24522008001804352, + "learning_rate": 9.758144279843275e-05, + "loss": 1.2716, + "step": 2102 + }, + { + "epoch": 0.5693410905154919, + "grad_norm": 0.2323451191186905, + "learning_rate": 9.757176051302573e-05, + "loss": 1.2747, + "step": 2104 + }, + { + "epoch": 0.5698822892707347, + "grad_norm": 0.26049986481666565, + "learning_rate": 9.756205936791344e-05, + "loss": 1.2976, + "step": 2106 + }, + { + "epoch": 0.5704234880259775, + "grad_norm": 0.24207298457622528, + "learning_rate": 9.75523393669419e-05, + "loss": 1.2637, + "step": 2108 + }, + { + "epoch": 0.5709646867812204, + "grad_norm": 0.23590189218521118, + "learning_rate": 9.754260051396459e-05, + "loss": 1.2725, + "step": 2110 + }, + { + "epoch": 0.5715058855364633, + "grad_norm": 0.25714561343193054, + "learning_rate": 9.753284281284243e-05, + "loss": 1.2751, + "step": 2112 + }, + { + "epoch": 0.5720470842917061, + "grad_norm": 0.2563743591308594, + "learning_rate": 9.752306626744385e-05, + "loss": 1.2552, + "step": 2114 + }, + { + "epoch": 0.572588283046949, + "grad_norm": 0.23137059807777405, + "learning_rate": 9.751327088164474e-05, + "loss": 1.2826, + "step": 2116 + }, + { + "epoch": 0.5731294818021918, + "grad_norm": 0.23491452634334564, + "learning_rate": 9.750345665932844e-05, + "loss": 1.2909, + "step": 2118 + }, + { + "epoch": 0.5736706805574348, + "grad_norm": 0.23278982937335968, + "learning_rate": 9.749362360438579e-05, + "loss": 1.2904, + "step": 2120 + }, + { + "epoch": 0.5742118793126776, + "grad_norm": 0.22500935196876526, + "learning_rate": 9.748377172071508e-05, + "loss": 1.2822, + "step": 2122 + }, + { + "epoch": 0.5747530780679204, + "grad_norm": 0.23745082318782806, + "learning_rate": 9.747390101222205e-05, + "loss": 1.279, + "step": 2124 + }, + { + "epoch": 0.5752942768231633, + "grad_norm": 0.24000118672847748, + "learning_rate": 9.746401148281993e-05, + "loss": 1.2806, + "step": 2126 + }, + { + "epoch": 0.5758354755784062, + "grad_norm": 0.24468575417995453, + "learning_rate": 9.74541031364294e-05, + "loss": 1.2867, + "step": 2128 + }, + { + "epoch": 0.576376674333649, + "grad_norm": 0.23120936751365662, + "learning_rate": 9.744417597697859e-05, + "loss": 1.2666, + "step": 2130 + }, + { + "epoch": 0.5769178730888919, + "grad_norm": 0.25929006934165955, + "learning_rate": 9.743423000840309e-05, + "loss": 1.2672, + "step": 2132 + }, + { + "epoch": 0.5774590718441348, + "grad_norm": 0.2530214786529541, + "learning_rate": 9.742426523464598e-05, + "loss": 1.2769, + "step": 2134 + }, + { + "epoch": 0.5780002705993776, + "grad_norm": 0.2752387225627899, + "learning_rate": 9.741428165965775e-05, + "loss": 1.2562, + "step": 2136 + }, + { + "epoch": 0.5785414693546205, + "grad_norm": 0.25394052267074585, + "learning_rate": 9.740427928739638e-05, + "loss": 1.2758, + "step": 2138 + }, + { + "epoch": 0.5790826681098633, + "grad_norm": 0.25311702489852905, + "learning_rate": 9.739425812182728e-05, + "loss": 1.2603, + "step": 2140 + }, + { + "epoch": 0.5796238668651063, + "grad_norm": 0.26108497381210327, + "learning_rate": 9.738421816692329e-05, + "loss": 1.2627, + "step": 2142 + }, + { + "epoch": 0.5801650656203491, + "grad_norm": 0.2541772425174713, + "learning_rate": 9.737415942666476e-05, + "loss": 1.2752, + "step": 2144 + }, + { + "epoch": 0.5807062643755919, + "grad_norm": 0.24984823167324066, + "learning_rate": 9.736408190503943e-05, + "loss": 1.2673, + "step": 2146 + }, + { + "epoch": 0.5812474631308348, + "grad_norm": 0.2763904333114624, + "learning_rate": 9.735398560604251e-05, + "loss": 1.2936, + "step": 2148 + }, + { + "epoch": 0.5817886618860777, + "grad_norm": 0.26247066259384155, + "learning_rate": 9.734387053367669e-05, + "loss": 1.2636, + "step": 2150 + }, + { + "epoch": 0.5823298606413205, + "grad_norm": 0.27749454975128174, + "learning_rate": 9.7333736691952e-05, + "loss": 1.2857, + "step": 2152 + }, + { + "epoch": 0.5828710593965634, + "grad_norm": 0.39380860328674316, + "learning_rate": 9.732358408488602e-05, + "loss": 1.2916, + "step": 2154 + }, + { + "epoch": 0.5834122581518062, + "grad_norm": 0.2505074441432953, + "learning_rate": 9.731341271650372e-05, + "loss": 1.2548, + "step": 2156 + }, + { + "epoch": 0.5839534569070491, + "grad_norm": 0.2549828588962555, + "learning_rate": 9.730322259083751e-05, + "loss": 1.2884, + "step": 2158 + }, + { + "epoch": 0.584494655662292, + "grad_norm": 0.24714533984661102, + "learning_rate": 9.729301371192724e-05, + "loss": 1.2823, + "step": 2160 + }, + { + "epoch": 0.5850358544175348, + "grad_norm": 0.24945247173309326, + "learning_rate": 9.728278608382018e-05, + "loss": 1.2976, + "step": 2162 + }, + { + "epoch": 0.5855770531727778, + "grad_norm": 0.2512315511703491, + "learning_rate": 9.727253971057109e-05, + "loss": 1.2883, + "step": 2164 + }, + { + "epoch": 0.5861182519280206, + "grad_norm": 0.2401745468378067, + "learning_rate": 9.726227459624207e-05, + "loss": 1.2637, + "step": 2166 + }, + { + "epoch": 0.5866594506832634, + "grad_norm": 0.260251522064209, + "learning_rate": 9.725199074490271e-05, + "loss": 1.2618, + "step": 2168 + }, + { + "epoch": 0.5872006494385063, + "grad_norm": 0.2533782124519348, + "learning_rate": 9.724168816063004e-05, + "loss": 1.2825, + "step": 2170 + }, + { + "epoch": 0.5877418481937492, + "grad_norm": 0.2545458972454071, + "learning_rate": 9.723136684750847e-05, + "loss": 1.2784, + "step": 2172 + }, + { + "epoch": 0.588283046948992, + "grad_norm": 0.24370916187763214, + "learning_rate": 9.722102680962988e-05, + "loss": 1.2601, + "step": 2174 + }, + { + "epoch": 0.5888242457042349, + "grad_norm": 0.23707440495491028, + "learning_rate": 9.721066805109353e-05, + "loss": 1.2818, + "step": 2176 + }, + { + "epoch": 0.5893654444594777, + "grad_norm": 0.22903890907764435, + "learning_rate": 9.720029057600615e-05, + "loss": 1.2686, + "step": 2178 + }, + { + "epoch": 0.5899066432147206, + "grad_norm": 0.22820548713207245, + "learning_rate": 9.718989438848182e-05, + "loss": 1.2749, + "step": 2180 + }, + { + "epoch": 0.5904478419699635, + "grad_norm": 0.2249859720468521, + "learning_rate": 9.717947949264214e-05, + "loss": 1.2649, + "step": 2182 + }, + { + "epoch": 0.5909890407252063, + "grad_norm": 0.23568090796470642, + "learning_rate": 9.716904589261602e-05, + "loss": 1.2764, + "step": 2184 + }, + { + "epoch": 0.5915302394804492, + "grad_norm": 0.24089080095291138, + "learning_rate": 9.715859359253987e-05, + "loss": 1.2801, + "step": 2186 + }, + { + "epoch": 0.5920714382356921, + "grad_norm": 0.2259254902601242, + "learning_rate": 9.714812259655746e-05, + "loss": 1.2805, + "step": 2188 + }, + { + "epoch": 0.5926126369909349, + "grad_norm": 0.23276519775390625, + "learning_rate": 9.713763290881999e-05, + "loss": 1.2635, + "step": 2190 + }, + { + "epoch": 0.5931538357461777, + "grad_norm": 0.24884091317653656, + "learning_rate": 9.712712453348607e-05, + "loss": 1.2984, + "step": 2192 + }, + { + "epoch": 0.5936950345014207, + "grad_norm": 0.23471422493457794, + "learning_rate": 9.711659747472171e-05, + "loss": 1.2742, + "step": 2194 + }, + { + "epoch": 0.5942362332566635, + "grad_norm": 0.25790145993232727, + "learning_rate": 9.710605173670037e-05, + "loss": 1.2865, + "step": 2196 + }, + { + "epoch": 0.5947774320119064, + "grad_norm": 0.24584504961967468, + "learning_rate": 9.709548732360285e-05, + "loss": 1.2826, + "step": 2198 + }, + { + "epoch": 0.5953186307671492, + "grad_norm": 0.23682548105716705, + "learning_rate": 9.708490423961741e-05, + "loss": 1.2499, + "step": 2200 + }, + { + "epoch": 0.5958598295223921, + "grad_norm": 0.24267072975635529, + "learning_rate": 9.707430248893964e-05, + "loss": 1.2514, + "step": 2202 + }, + { + "epoch": 0.596401028277635, + "grad_norm": 0.2546815276145935, + "learning_rate": 9.706368207577264e-05, + "loss": 1.2755, + "step": 2204 + }, + { + "epoch": 0.5969422270328778, + "grad_norm": 0.24322691559791565, + "learning_rate": 9.70530430043268e-05, + "loss": 1.2817, + "step": 2206 + }, + { + "epoch": 0.5974834257881206, + "grad_norm": 0.22995691001415253, + "learning_rate": 9.704238527882e-05, + "loss": 1.2487, + "step": 2208 + }, + { + "epoch": 0.5980246245433636, + "grad_norm": 0.25768396258354187, + "learning_rate": 9.70317089034774e-05, + "loss": 1.2956, + "step": 2210 + }, + { + "epoch": 0.5985658232986064, + "grad_norm": 0.2691928744316101, + "learning_rate": 9.702101388253167e-05, + "loss": 1.2704, + "step": 2212 + }, + { + "epoch": 0.5991070220538492, + "grad_norm": 0.2356652021408081, + "learning_rate": 9.701030022022282e-05, + "loss": 1.2548, + "step": 2214 + }, + { + "epoch": 0.5996482208090922, + "grad_norm": 0.24094751477241516, + "learning_rate": 9.699956792079825e-05, + "loss": 1.2616, + "step": 2216 + }, + { + "epoch": 0.600189419564335, + "grad_norm": 0.2429209202528, + "learning_rate": 9.698881698851274e-05, + "loss": 1.2603, + "step": 2218 + }, + { + "epoch": 0.6007306183195779, + "grad_norm": 0.24263691902160645, + "learning_rate": 9.69780474276285e-05, + "loss": 1.2788, + "step": 2220 + }, + { + "epoch": 0.6012718170748207, + "grad_norm": 0.25438740849494934, + "learning_rate": 9.696725924241506e-05, + "loss": 1.2823, + "step": 2222 + }, + { + "epoch": 0.6018130158300636, + "grad_norm": 0.258472204208374, + "learning_rate": 9.695645243714939e-05, + "loss": 1.2673, + "step": 2224 + }, + { + "epoch": 0.6023542145853065, + "grad_norm": 0.2571878433227539, + "learning_rate": 9.694562701611583e-05, + "loss": 1.295, + "step": 2226 + }, + { + "epoch": 0.6028954133405493, + "grad_norm": 0.2430989295244217, + "learning_rate": 9.693478298360607e-05, + "loss": 1.2595, + "step": 2228 + }, + { + "epoch": 0.6034366120957921, + "grad_norm": 0.23489908874034882, + "learning_rate": 9.692392034391922e-05, + "loss": 1.2773, + "step": 2230 + }, + { + "epoch": 0.6039778108510351, + "grad_norm": 0.2507382035255432, + "learning_rate": 9.691303910136171e-05, + "loss": 1.2782, + "step": 2232 + }, + { + "epoch": 0.6045190096062779, + "grad_norm": 0.23355726897716522, + "learning_rate": 9.690213926024743e-05, + "loss": 1.263, + "step": 2234 + }, + { + "epoch": 0.6050602083615207, + "grad_norm": 0.2275291532278061, + "learning_rate": 9.689122082489754e-05, + "loss": 1.2677, + "step": 2236 + }, + { + "epoch": 0.6056014071167636, + "grad_norm": 0.2314850389957428, + "learning_rate": 9.688028379964068e-05, + "loss": 1.2646, + "step": 2238 + }, + { + "epoch": 0.6061426058720065, + "grad_norm": 0.24879969656467438, + "learning_rate": 9.686932818881278e-05, + "loss": 1.2704, + "step": 2240 + }, + { + "epoch": 0.6066838046272494, + "grad_norm": 0.23156161606311798, + "learning_rate": 9.685835399675717e-05, + "loss": 1.2795, + "step": 2242 + }, + { + "epoch": 0.6072250033824922, + "grad_norm": 0.22952421009540558, + "learning_rate": 9.684736122782454e-05, + "loss": 1.2597, + "step": 2244 + }, + { + "epoch": 0.6077662021377351, + "grad_norm": 1.0009911060333252, + "learning_rate": 9.683634988637293e-05, + "loss": 1.2504, + "step": 2246 + }, + { + "epoch": 0.608307400892978, + "grad_norm": 0.2649003267288208, + "learning_rate": 9.682531997676777e-05, + "loss": 1.2376, + "step": 2248 + }, + { + "epoch": 0.6088485996482208, + "grad_norm": 0.31321823596954346, + "learning_rate": 9.681427150338187e-05, + "loss": 1.2607, + "step": 2250 + }, + { + "epoch": 0.6093897984034636, + "grad_norm": 0.3142634332180023, + "learning_rate": 9.680320447059532e-05, + "loss": 1.261, + "step": 2252 + }, + { + "epoch": 0.6099309971587066, + "grad_norm": 0.31247085332870483, + "learning_rate": 9.679211888279565e-05, + "loss": 1.2685, + "step": 2254 + }, + { + "epoch": 0.6104721959139494, + "grad_norm": 0.25763556361198425, + "learning_rate": 9.67810147443777e-05, + "loss": 1.2542, + "step": 2256 + }, + { + "epoch": 0.6110133946691922, + "grad_norm": 0.2788141667842865, + "learning_rate": 9.676989205974367e-05, + "loss": 1.2747, + "step": 2258 + }, + { + "epoch": 0.6115545934244351, + "grad_norm": 0.26279813051223755, + "learning_rate": 9.675875083330315e-05, + "loss": 1.261, + "step": 2260 + }, + { + "epoch": 0.612095792179678, + "grad_norm": 0.24764376878738403, + "learning_rate": 9.674759106947302e-05, + "loss": 1.2632, + "step": 2262 + }, + { + "epoch": 0.6126369909349209, + "grad_norm": 0.2378121018409729, + "learning_rate": 9.673641277267756e-05, + "loss": 1.2569, + "step": 2264 + }, + { + "epoch": 0.6131781896901637, + "grad_norm": 0.25457054376602173, + "learning_rate": 9.672521594734838e-05, + "loss": 1.2667, + "step": 2266 + }, + { + "epoch": 0.6137193884454065, + "grad_norm": 0.2589806616306305, + "learning_rate": 9.67140005979244e-05, + "loss": 1.2515, + "step": 2268 + }, + { + "epoch": 0.6142605872006495, + "grad_norm": 0.23375307023525238, + "learning_rate": 9.670276672885195e-05, + "loss": 1.2608, + "step": 2270 + }, + { + "epoch": 0.6148017859558923, + "grad_norm": 0.22978229820728302, + "learning_rate": 9.669151434458468e-05, + "loss": 1.2516, + "step": 2272 + }, + { + "epoch": 0.6153429847111351, + "grad_norm": 0.22958585619926453, + "learning_rate": 9.668024344958353e-05, + "loss": 1.2617, + "step": 2274 + }, + { + "epoch": 0.6158841834663781, + "grad_norm": 0.22783328592777252, + "learning_rate": 9.666895404831685e-05, + "loss": 1.2732, + "step": 2276 + }, + { + "epoch": 0.6164253822216209, + "grad_norm": 0.2413301318883896, + "learning_rate": 9.665764614526027e-05, + "loss": 1.2501, + "step": 2278 + }, + { + "epoch": 0.6169665809768637, + "grad_norm": 0.2591399550437927, + "learning_rate": 9.66463197448968e-05, + "loss": 1.2657, + "step": 2280 + }, + { + "epoch": 0.6175077797321066, + "grad_norm": 0.23001301288604736, + "learning_rate": 9.663497485171675e-05, + "loss": 1.2698, + "step": 2282 + }, + { + "epoch": 0.6180489784873495, + "grad_norm": 0.21373826265335083, + "learning_rate": 9.662361147021779e-05, + "loss": 1.2651, + "step": 2284 + }, + { + "epoch": 0.6185901772425924, + "grad_norm": 0.2302403599023819, + "learning_rate": 9.66122296049049e-05, + "loss": 1.2786, + "step": 2286 + }, + { + "epoch": 0.6191313759978352, + "grad_norm": 0.24121953547000885, + "learning_rate": 9.660082926029038e-05, + "loss": 1.2639, + "step": 2288 + }, + { + "epoch": 0.619672574753078, + "grad_norm": 0.22190925478935242, + "learning_rate": 9.658941044089387e-05, + "loss": 1.2472, + "step": 2290 + }, + { + "epoch": 0.620213773508321, + "grad_norm": 0.22907890379428864, + "learning_rate": 9.657797315124234e-05, + "loss": 1.2475, + "step": 2292 + }, + { + "epoch": 0.6207549722635638, + "grad_norm": 0.23821642994880676, + "learning_rate": 9.656651739587008e-05, + "loss": 1.2689, + "step": 2294 + }, + { + "epoch": 0.6212961710188066, + "grad_norm": 0.25953301787376404, + "learning_rate": 9.655504317931867e-05, + "loss": 1.2587, + "step": 2296 + }, + { + "epoch": 0.6218373697740495, + "grad_norm": 0.24054677784442902, + "learning_rate": 9.654355050613705e-05, + "loss": 1.2541, + "step": 2298 + }, + { + "epoch": 0.6223785685292924, + "grad_norm": 0.23474815487861633, + "learning_rate": 9.65320393808815e-05, + "loss": 1.2605, + "step": 2300 + }, + { + "epoch": 0.6229197672845352, + "grad_norm": 0.23981337249279022, + "learning_rate": 9.652050980811551e-05, + "loss": 1.267, + "step": 2302 + }, + { + "epoch": 0.6234609660397781, + "grad_norm": 0.21820946037769318, + "learning_rate": 9.650896179240997e-05, + "loss": 1.2555, + "step": 2304 + }, + { + "epoch": 0.624002164795021, + "grad_norm": 0.2165161520242691, + "learning_rate": 9.64973953383431e-05, + "loss": 1.271, + "step": 2306 + }, + { + "epoch": 0.6245433635502639, + "grad_norm": 0.22105515003204346, + "learning_rate": 9.648581045050035e-05, + "loss": 1.2663, + "step": 2308 + }, + { + "epoch": 0.6250845623055067, + "grad_norm": 0.22626088559627533, + "learning_rate": 9.647420713347454e-05, + "loss": 1.2645, + "step": 2310 + }, + { + "epoch": 0.6256257610607495, + "grad_norm": 0.2347354292869568, + "learning_rate": 9.646258539186577e-05, + "loss": 1.2372, + "step": 2312 + }, + { + "epoch": 0.6261669598159925, + "grad_norm": 0.2388308346271515, + "learning_rate": 9.645094523028144e-05, + "loss": 1.2652, + "step": 2314 + }, + { + "epoch": 0.6267081585712353, + "grad_norm": 0.2252940982580185, + "learning_rate": 9.643928665333628e-05, + "loss": 1.2595, + "step": 2316 + }, + { + "epoch": 0.6272493573264781, + "grad_norm": 0.24020199477672577, + "learning_rate": 9.64276096656523e-05, + "loss": 1.3079, + "step": 2318 + }, + { + "epoch": 0.627790556081721, + "grad_norm": 0.23432402312755585, + "learning_rate": 9.64159142718588e-05, + "loss": 1.2718, + "step": 2320 + }, + { + "epoch": 0.6283317548369639, + "grad_norm": 0.22962002456188202, + "learning_rate": 9.640420047659239e-05, + "loss": 1.2606, + "step": 2322 + }, + { + "epoch": 0.6288729535922067, + "grad_norm": 0.21251855790615082, + "learning_rate": 9.6392468284497e-05, + "loss": 1.2568, + "step": 2324 + }, + { + "epoch": 0.6294141523474496, + "grad_norm": 0.2140374481678009, + "learning_rate": 9.63807177002238e-05, + "loss": 1.276, + "step": 2326 + }, + { + "epoch": 0.6299553511026925, + "grad_norm": 0.21366523206233978, + "learning_rate": 9.636894872843132e-05, + "loss": 1.2521, + "step": 2328 + }, + { + "epoch": 0.6304965498579354, + "grad_norm": 0.22407646477222443, + "learning_rate": 9.635716137378528e-05, + "loss": 1.2692, + "step": 2330 + }, + { + "epoch": 0.6310377486131782, + "grad_norm": 0.24414391815662384, + "learning_rate": 9.63453556409588e-05, + "loss": 1.2554, + "step": 2332 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.23787756264209747, + "learning_rate": 9.633353153463219e-05, + "loss": 1.2471, + "step": 2334 + }, + { + "epoch": 0.632120146123664, + "grad_norm": 0.2248927801847458, + "learning_rate": 9.63216890594931e-05, + "loss": 1.2586, + "step": 2336 + }, + { + "epoch": 0.6326613448789068, + "grad_norm": 0.2224208265542984, + "learning_rate": 9.630982822023648e-05, + "loss": 1.293, + "step": 2338 + }, + { + "epoch": 0.6332025436341496, + "grad_norm": 0.2312854379415512, + "learning_rate": 9.62979490215645e-05, + "loss": 1.2604, + "step": 2340 + }, + { + "epoch": 0.6337437423893925, + "grad_norm": 0.22154025733470917, + "learning_rate": 9.628605146818665e-05, + "loss": 1.2645, + "step": 2342 + }, + { + "epoch": 0.6342849411446354, + "grad_norm": 0.2136823982000351, + "learning_rate": 9.627413556481968e-05, + "loss": 1.2375, + "step": 2344 + }, + { + "epoch": 0.6348261398998782, + "grad_norm": 0.21541404724121094, + "learning_rate": 9.626220131618763e-05, + "loss": 1.2771, + "step": 2346 + }, + { + "epoch": 0.6353673386551211, + "grad_norm": 0.22025029361248016, + "learning_rate": 9.625024872702178e-05, + "loss": 1.261, + "step": 2348 + }, + { + "epoch": 0.6359085374103639, + "grad_norm": 0.2375534474849701, + "learning_rate": 9.623827780206073e-05, + "loss": 1.2808, + "step": 2350 + }, + { + "epoch": 0.6364497361656068, + "grad_norm": 0.23297767341136932, + "learning_rate": 9.62262885460503e-05, + "loss": 1.2697, + "step": 2352 + }, + { + "epoch": 0.6369909349208497, + "grad_norm": 0.24082797765731812, + "learning_rate": 9.621428096374363e-05, + "loss": 1.2347, + "step": 2354 + }, + { + "epoch": 0.6375321336760925, + "grad_norm": 0.22009813785552979, + "learning_rate": 9.620225505990105e-05, + "loss": 1.2631, + "step": 2356 + }, + { + "epoch": 0.6380733324313355, + "grad_norm": 0.22501374781131744, + "learning_rate": 9.619021083929025e-05, + "loss": 1.2563, + "step": 2358 + }, + { + "epoch": 0.6386145311865783, + "grad_norm": 0.22494594752788544, + "learning_rate": 9.61781483066861e-05, + "loss": 1.2532, + "step": 2360 + }, + { + "epoch": 0.6391557299418211, + "grad_norm": 0.3569008409976959, + "learning_rate": 9.616606746687078e-05, + "loss": 1.2684, + "step": 2362 + }, + { + "epoch": 0.639696928697064, + "grad_norm": 207.0965576171875, + "learning_rate": 9.61539683246337e-05, + "loss": 1.3637, + "step": 2364 + }, + { + "epoch": 0.6402381274523069, + "grad_norm": 0.4599202573299408, + "learning_rate": 9.614185088477152e-05, + "loss": 1.292, + "step": 2366 + }, + { + "epoch": 0.6407793262075497, + "grad_norm": 0.3244802951812744, + "learning_rate": 9.61297151520882e-05, + "loss": 1.2585, + "step": 2368 + }, + { + "epoch": 0.6413205249627926, + "grad_norm": 0.30332016944885254, + "learning_rate": 9.611756113139488e-05, + "loss": 1.2619, + "step": 2370 + }, + { + "epoch": 0.6418617237180354, + "grad_norm": 0.2982909083366394, + "learning_rate": 9.610538882751001e-05, + "loss": 1.2637, + "step": 2372 + }, + { + "epoch": 0.6424029224732783, + "grad_norm": 5.417288303375244, + "learning_rate": 9.609319824525928e-05, + "loss": 1.2713, + "step": 2374 + }, + { + "epoch": 0.6429441212285212, + "grad_norm": 0.4198252260684967, + "learning_rate": 9.608098938947562e-05, + "loss": 1.2541, + "step": 2376 + }, + { + "epoch": 0.643485319983764, + "grad_norm": 0.8178582191467285, + "learning_rate": 9.606876226499918e-05, + "loss": 1.2884, + "step": 2378 + }, + { + "epoch": 0.6440265187390068, + "grad_norm": 0.33514025807380676, + "learning_rate": 9.60565168766774e-05, + "loss": 1.2719, + "step": 2380 + }, + { + "epoch": 0.6445677174942498, + "grad_norm": 0.2973354756832123, + "learning_rate": 9.60442532293649e-05, + "loss": 1.2515, + "step": 2382 + }, + { + "epoch": 0.6451089162494926, + "grad_norm": 0.4670213758945465, + "learning_rate": 9.603197132792359e-05, + "loss": 1.2665, + "step": 2384 + }, + { + "epoch": 0.6456501150047355, + "grad_norm": 0.3197322189807892, + "learning_rate": 9.60196711772226e-05, + "loss": 1.2574, + "step": 2386 + }, + { + "epoch": 0.6461913137599784, + "grad_norm": 1.1344069242477417, + "learning_rate": 9.600735278213828e-05, + "loss": 1.2689, + "step": 2388 + }, + { + "epoch": 0.6467325125152212, + "grad_norm": 0.5379347801208496, + "learning_rate": 9.599501614755425e-05, + "loss": 1.249, + "step": 2390 + }, + { + "epoch": 0.6472737112704641, + "grad_norm": 0.33201339840888977, + "learning_rate": 9.598266127836131e-05, + "loss": 1.2729, + "step": 2392 + }, + { + "epoch": 0.6478149100257069, + "grad_norm": 8.969808578491211, + "learning_rate": 9.597028817945753e-05, + "loss": 1.2768, + "step": 2394 + }, + { + "epoch": 0.6483561087809498, + "grad_norm": 0.3650411069393158, + "learning_rate": 9.595789685574821e-05, + "loss": 1.2511, + "step": 2396 + }, + { + "epoch": 0.6488973075361927, + "grad_norm": 0.8414996862411499, + "learning_rate": 9.594548731214583e-05, + "loss": 1.2707, + "step": 2398 + }, + { + "epoch": 0.6494385062914355, + "grad_norm": 0.5362874269485474, + "learning_rate": 9.593305955357016e-05, + "loss": 1.2453, + "step": 2400 + }, + { + "epoch": 0.6499797050466783, + "grad_norm": 0.40546804666519165, + "learning_rate": 9.592061358494813e-05, + "loss": 1.2665, + "step": 2402 + }, + { + "epoch": 0.6505209038019213, + "grad_norm": 0.29758453369140625, + "learning_rate": 9.590814941121389e-05, + "loss": 1.2538, + "step": 2404 + }, + { + "epoch": 0.6510621025571641, + "grad_norm": 0.2636415660381317, + "learning_rate": 9.589566703730888e-05, + "loss": 1.2457, + "step": 2406 + }, + { + "epoch": 0.651603301312407, + "grad_norm": 0.2844487130641937, + "learning_rate": 9.588316646818168e-05, + "loss": 1.257, + "step": 2408 + }, + { + "epoch": 0.6521445000676498, + "grad_norm": 0.2777060568332672, + "learning_rate": 9.587064770878808e-05, + "loss": 1.2506, + "step": 2410 + }, + { + "epoch": 0.6526856988228927, + "grad_norm": 0.2585492730140686, + "learning_rate": 9.585811076409117e-05, + "loss": 1.2472, + "step": 2412 + }, + { + "epoch": 0.6532268975781356, + "grad_norm": 0.24312525987625122, + "learning_rate": 9.584555563906116e-05, + "loss": 1.2703, + "step": 2414 + }, + { + "epoch": 0.6537680963333784, + "grad_norm": 0.2286798357963562, + "learning_rate": 9.583298233867549e-05, + "loss": 1.2582, + "step": 2416 + }, + { + "epoch": 0.6543092950886213, + "grad_norm": 0.22804994881153107, + "learning_rate": 9.582039086791883e-05, + "loss": 1.2538, + "step": 2418 + }, + { + "epoch": 0.6548504938438642, + "grad_norm": 0.2244635969400406, + "learning_rate": 9.580778123178303e-05, + "loss": 1.2481, + "step": 2420 + }, + { + "epoch": 0.655391692599107, + "grad_norm": 0.22303158044815063, + "learning_rate": 9.579515343526714e-05, + "loss": 1.2574, + "step": 2422 + }, + { + "epoch": 0.6559328913543498, + "grad_norm": 0.2208811491727829, + "learning_rate": 9.578250748337742e-05, + "loss": 1.2579, + "step": 2424 + }, + { + "epoch": 0.6564740901095928, + "grad_norm": 0.20853403210639954, + "learning_rate": 9.576984338112736e-05, + "loss": 1.2619, + "step": 2426 + }, + { + "epoch": 0.6570152888648356, + "grad_norm": 0.20974035561084747, + "learning_rate": 9.575716113353757e-05, + "loss": 1.2605, + "step": 2428 + }, + { + "epoch": 0.6575564876200785, + "grad_norm": 0.22891463339328766, + "learning_rate": 9.57444607456359e-05, + "loss": 1.2586, + "step": 2430 + }, + { + "epoch": 0.6580976863753213, + "grad_norm": 0.21693287789821625, + "learning_rate": 9.57317422224574e-05, + "loss": 1.2505, + "step": 2432 + }, + { + "epoch": 0.6586388851305642, + "grad_norm": 0.21806494891643524, + "learning_rate": 9.57190055690443e-05, + "loss": 1.261, + "step": 2434 + }, + { + "epoch": 0.6591800838858071, + "grad_norm": 0.24015147984027863, + "learning_rate": 9.570625079044601e-05, + "loss": 1.2564, + "step": 2436 + }, + { + "epoch": 0.6597212826410499, + "grad_norm": 0.26577669382095337, + "learning_rate": 9.569347789171912e-05, + "loss": 1.2716, + "step": 2438 + }, + { + "epoch": 0.6602624813962927, + "grad_norm": 0.2382255643606186, + "learning_rate": 9.568068687792741e-05, + "loss": 1.2465, + "step": 2440 + }, + { + "epoch": 0.6608036801515357, + "grad_norm": 0.22770415246486664, + "learning_rate": 9.566787775414188e-05, + "loss": 1.2229, + "step": 2442 + }, + { + "epoch": 0.6613448789067785, + "grad_norm": 0.23449081182479858, + "learning_rate": 9.565505052544065e-05, + "loss": 1.2582, + "step": 2444 + }, + { + "epoch": 0.6618860776620213, + "grad_norm": 0.22105945646762848, + "learning_rate": 9.564220519690903e-05, + "loss": 1.2505, + "step": 2446 + }, + { + "epoch": 0.6624272764172643, + "grad_norm": 0.22349369525909424, + "learning_rate": 9.562934177363953e-05, + "loss": 1.2578, + "step": 2448 + }, + { + "epoch": 0.6629684751725071, + "grad_norm": 0.23770608007907867, + "learning_rate": 9.561646026073184e-05, + "loss": 1.2399, + "step": 2450 + }, + { + "epoch": 0.66350967392775, + "grad_norm": 0.2204604148864746, + "learning_rate": 9.56035606632928e-05, + "loss": 1.2512, + "step": 2452 + }, + { + "epoch": 0.6640508726829928, + "grad_norm": 0.2204030454158783, + "learning_rate": 9.559064298643638e-05, + "loss": 1.2821, + "step": 2454 + }, + { + "epoch": 0.6645920714382357, + "grad_norm": 0.2169465720653534, + "learning_rate": 9.55777072352838e-05, + "loss": 1.2529, + "step": 2456 + }, + { + "epoch": 0.6651332701934786, + "grad_norm": 0.2273695021867752, + "learning_rate": 9.55647534149634e-05, + "loss": 1.2497, + "step": 2458 + }, + { + "epoch": 0.6656744689487214, + "grad_norm": 0.22077496349811554, + "learning_rate": 9.555178153061069e-05, + "loss": 1.2433, + "step": 2460 + }, + { + "epoch": 0.6662156677039642, + "grad_norm": 0.2203417718410492, + "learning_rate": 9.553879158736833e-05, + "loss": 1.2464, + "step": 2462 + }, + { + "epoch": 0.6667568664592072, + "grad_norm": 0.22205059230327606, + "learning_rate": 9.552578359038617e-05, + "loss": 1.2611, + "step": 2464 + }, + { + "epoch": 0.66729806521445, + "grad_norm": 0.2206515222787857, + "learning_rate": 9.551275754482119e-05, + "loss": 1.2624, + "step": 2466 + }, + { + "epoch": 0.6678392639696928, + "grad_norm": 0.21758343279361725, + "learning_rate": 9.549971345583753e-05, + "loss": 1.2406, + "step": 2468 + }, + { + "epoch": 0.6683804627249358, + "grad_norm": 0.21517138183116913, + "learning_rate": 9.548665132860647e-05, + "loss": 1.2538, + "step": 2470 + }, + { + "epoch": 0.6689216614801786, + "grad_norm": 0.21490350365638733, + "learning_rate": 9.547357116830648e-05, + "loss": 1.2534, + "step": 2472 + }, + { + "epoch": 0.6694628602354215, + "grad_norm": 0.2156359702348709, + "learning_rate": 9.546047298012315e-05, + "loss": 1.2459, + "step": 2474 + }, + { + "epoch": 0.6700040589906643, + "grad_norm": 0.2196791172027588, + "learning_rate": 9.544735676924923e-05, + "loss": 1.2534, + "step": 2476 + }, + { + "epoch": 0.6705452577459072, + "grad_norm": 0.22666549682617188, + "learning_rate": 9.54342225408846e-05, + "loss": 1.252, + "step": 2478 + }, + { + "epoch": 0.6710864565011501, + "grad_norm": 0.2314993143081665, + "learning_rate": 9.54210703002363e-05, + "loss": 1.2478, + "step": 2480 + }, + { + "epoch": 0.6716276552563929, + "grad_norm": 0.2225077599287033, + "learning_rate": 9.54079000525185e-05, + "loss": 1.2465, + "step": 2482 + }, + { + "epoch": 0.6721688540116357, + "grad_norm": 0.22268906235694885, + "learning_rate": 9.539471180295249e-05, + "loss": 1.2453, + "step": 2484 + }, + { + "epoch": 0.6727100527668787, + "grad_norm": 0.30744513869285583, + "learning_rate": 9.538150555676677e-05, + "loss": 1.2874, + "step": 2486 + }, + { + "epoch": 0.6732512515221215, + "grad_norm": 0.27435171604156494, + "learning_rate": 9.536828131919686e-05, + "loss": 1.2533, + "step": 2488 + }, + { + "epoch": 0.6737924502773643, + "grad_norm": 0.5657795667648315, + "learning_rate": 9.535503909548553e-05, + "loss": 1.2567, + "step": 2490 + }, + { + "epoch": 0.6743336490326072, + "grad_norm": 0.4795803129673004, + "learning_rate": 9.53417788908826e-05, + "loss": 1.2563, + "step": 2492 + }, + { + "epoch": 0.6748748477878501, + "grad_norm": 0.3125123977661133, + "learning_rate": 9.532850071064503e-05, + "loss": 1.251, + "step": 2494 + }, + { + "epoch": 0.675416046543093, + "grad_norm": 0.2949443459510803, + "learning_rate": 9.531520456003696e-05, + "loss": 1.2491, + "step": 2496 + }, + { + "epoch": 0.6759572452983358, + "grad_norm": 0.289389967918396, + "learning_rate": 9.530189044432959e-05, + "loss": 1.2571, + "step": 2498 + }, + { + "epoch": 0.6764984440535787, + "grad_norm": 0.24411126971244812, + "learning_rate": 9.528855836880127e-05, + "loss": 1.2528, + "step": 2500 + }, + { + "epoch": 0.6770396428088216, + "grad_norm": 0.38176965713500977, + "learning_rate": 9.527520833873748e-05, + "loss": 1.2462, + "step": 2502 + }, + { + "epoch": 0.6775808415640644, + "grad_norm": 0.25295090675354004, + "learning_rate": 9.52618403594308e-05, + "loss": 1.2601, + "step": 2504 + }, + { + "epoch": 0.6781220403193072, + "grad_norm": 0.24630951881408691, + "learning_rate": 9.524845443618091e-05, + "loss": 1.2398, + "step": 2506 + }, + { + "epoch": 0.6786632390745502, + "grad_norm": 0.25156068801879883, + "learning_rate": 9.523505057429466e-05, + "loss": 1.2429, + "step": 2508 + }, + { + "epoch": 0.679204437829793, + "grad_norm": 0.23003700375556946, + "learning_rate": 9.522162877908596e-05, + "loss": 1.2569, + "step": 2510 + }, + { + "epoch": 0.6797456365850358, + "grad_norm": 0.2248392552137375, + "learning_rate": 9.520818905587585e-05, + "loss": 1.2506, + "step": 2512 + }, + { + "epoch": 0.6802868353402787, + "grad_norm": 0.22383219003677368, + "learning_rate": 9.519473140999246e-05, + "loss": 1.2294, + "step": 2514 + }, + { + "epoch": 0.6808280340955216, + "grad_norm": 0.22723117470741272, + "learning_rate": 9.518125584677106e-05, + "loss": 1.2658, + "step": 2516 + }, + { + "epoch": 0.6813692328507645, + "grad_norm": 0.24425800144672394, + "learning_rate": 9.516776237155402e-05, + "loss": 1.233, + "step": 2518 + }, + { + "epoch": 0.6819104316060073, + "grad_norm": 0.22345170378684998, + "learning_rate": 9.515425098969075e-05, + "loss": 1.248, + "step": 2520 + }, + { + "epoch": 0.6824516303612501, + "grad_norm": 0.21297229826450348, + "learning_rate": 9.514072170653782e-05, + "loss": 1.2453, + "step": 2522 + }, + { + "epoch": 0.6829928291164931, + "grad_norm": 0.21216444671154022, + "learning_rate": 9.51271745274589e-05, + "loss": 1.2473, + "step": 2524 + }, + { + "epoch": 0.6835340278717359, + "grad_norm": 0.2091735154390335, + "learning_rate": 9.511360945782472e-05, + "loss": 1.2451, + "step": 2526 + }, + { + "epoch": 0.6840752266269787, + "grad_norm": 0.21291106939315796, + "learning_rate": 9.510002650301313e-05, + "loss": 1.2772, + "step": 2528 + }, + { + "epoch": 0.6846164253822217, + "grad_norm": 0.21953986585140228, + "learning_rate": 9.508642566840901e-05, + "loss": 1.2533, + "step": 2530 + }, + { + "epoch": 0.6851576241374645, + "grad_norm": 0.21948380768299103, + "learning_rate": 9.507280695940446e-05, + "loss": 1.2797, + "step": 2532 + }, + { + "epoch": 0.6856988228927073, + "grad_norm": 0.21971148252487183, + "learning_rate": 9.505917038139851e-05, + "loss": 1.2609, + "step": 2534 + }, + { + "epoch": 0.6862400216479502, + "grad_norm": 0.21478046476840973, + "learning_rate": 9.504551593979738e-05, + "loss": 1.2625, + "step": 2536 + }, + { + "epoch": 0.6867812204031931, + "grad_norm": 0.21927322447299957, + "learning_rate": 9.503184364001431e-05, + "loss": 1.2415, + "step": 2538 + }, + { + "epoch": 0.687322419158436, + "grad_norm": 0.2084941267967224, + "learning_rate": 9.501815348746971e-05, + "loss": 1.2455, + "step": 2540 + }, + { + "epoch": 0.6878636179136788, + "grad_norm": 0.20336540043354034, + "learning_rate": 9.500444548759095e-05, + "loss": 1.2505, + "step": 2542 + }, + { + "epoch": 0.6884048166689216, + "grad_norm": 0.21661430597305298, + "learning_rate": 9.499071964581256e-05, + "loss": 1.235, + "step": 2544 + }, + { + "epoch": 0.6889460154241646, + "grad_norm": 0.2240605354309082, + "learning_rate": 9.497697596757609e-05, + "loss": 1.2546, + "step": 2546 + }, + { + "epoch": 0.6894872141794074, + "grad_norm": 0.2289547622203827, + "learning_rate": 9.496321445833022e-05, + "loss": 1.2387, + "step": 2548 + }, + { + "epoch": 0.6900284129346502, + "grad_norm": 0.22886811196804047, + "learning_rate": 9.494943512353063e-05, + "loss": 1.2531, + "step": 2550 + }, + { + "epoch": 0.6905696116898931, + "grad_norm": 0.2151922732591629, + "learning_rate": 9.493563796864014e-05, + "loss": 1.2447, + "step": 2552 + }, + { + "epoch": 0.691110810445136, + "grad_norm": 0.2263440489768982, + "learning_rate": 9.492182299912857e-05, + "loss": 1.245, + "step": 2554 + }, + { + "epoch": 0.6916520092003788, + "grad_norm": 0.23101641237735748, + "learning_rate": 9.490799022047286e-05, + "loss": 1.2253, + "step": 2556 + }, + { + "epoch": 0.6921932079556217, + "grad_norm": 0.2258201241493225, + "learning_rate": 9.489413963815694e-05, + "loss": 1.2477, + "step": 2558 + }, + { + "epoch": 0.6927344067108646, + "grad_norm": 0.2227460741996765, + "learning_rate": 9.488027125767187e-05, + "loss": 1.2215, + "step": 2560 + }, + { + "epoch": 0.6932756054661074, + "grad_norm": 0.2213139533996582, + "learning_rate": 9.48663850845157e-05, + "loss": 1.2308, + "step": 2562 + }, + { + "epoch": 0.6938168042213503, + "grad_norm": 0.22192241251468658, + "learning_rate": 9.485248112419363e-05, + "loss": 1.2487, + "step": 2564 + }, + { + "epoch": 0.6943580029765931, + "grad_norm": 0.21532469987869263, + "learning_rate": 9.483855938221777e-05, + "loss": 1.2498, + "step": 2566 + }, + { + "epoch": 0.6948992017318361, + "grad_norm": 0.21143551170825958, + "learning_rate": 9.482461986410743e-05, + "loss": 1.2453, + "step": 2568 + }, + { + "epoch": 0.6954404004870789, + "grad_norm": 0.21282954514026642, + "learning_rate": 9.481066257538886e-05, + "loss": 1.2499, + "step": 2570 + }, + { + "epoch": 0.6959815992423217, + "grad_norm": 0.219988152384758, + "learning_rate": 9.47966875215954e-05, + "loss": 1.2478, + "step": 2572 + }, + { + "epoch": 0.6965227979975646, + "grad_norm": 0.21327020227909088, + "learning_rate": 9.478269470826744e-05, + "loss": 1.2364, + "step": 2574 + }, + { + "epoch": 0.6970639967528075, + "grad_norm": 0.2091750204563141, + "learning_rate": 9.476868414095237e-05, + "loss": 1.2494, + "step": 2576 + }, + { + "epoch": 0.6976051955080503, + "grad_norm": 0.2145649939775467, + "learning_rate": 9.475465582520466e-05, + "loss": 1.254, + "step": 2578 + }, + { + "epoch": 0.6981463942632932, + "grad_norm": 0.21477670967578888, + "learning_rate": 9.474060976658578e-05, + "loss": 1.2678, + "step": 2580 + }, + { + "epoch": 0.6986875930185361, + "grad_norm": 0.21862445771694183, + "learning_rate": 9.472654597066431e-05, + "loss": 1.2512, + "step": 2582 + }, + { + "epoch": 0.699228791773779, + "grad_norm": 0.21111270785331726, + "learning_rate": 9.471246444301574e-05, + "loss": 1.2587, + "step": 2584 + }, + { + "epoch": 0.6997699905290218, + "grad_norm": 0.21332062780857086, + "learning_rate": 9.469836518922269e-05, + "loss": 1.2569, + "step": 2586 + }, + { + "epoch": 0.7003111892842646, + "grad_norm": 0.21386279165744781, + "learning_rate": 9.468424821487476e-05, + "loss": 1.2308, + "step": 2588 + }, + { + "epoch": 0.7008523880395076, + "grad_norm": 0.20638014376163483, + "learning_rate": 9.46701135255686e-05, + "loss": 1.2453, + "step": 2590 + }, + { + "epoch": 0.7013935867947504, + "grad_norm": 0.2437312752008438, + "learning_rate": 9.465596112690787e-05, + "loss": 1.2523, + "step": 2592 + }, + { + "epoch": 0.7019347855499932, + "grad_norm": 0.22395059466362, + "learning_rate": 9.464179102450325e-05, + "loss": 1.2535, + "step": 2594 + }, + { + "epoch": 0.7024759843052361, + "grad_norm": 0.22118812799453735, + "learning_rate": 9.462760322397246e-05, + "loss": 1.2488, + "step": 2596 + }, + { + "epoch": 0.703017183060479, + "grad_norm": 0.22880488634109497, + "learning_rate": 9.461339773094021e-05, + "loss": 1.2407, + "step": 2598 + }, + { + "epoch": 0.7035583818157218, + "grad_norm": 0.21199798583984375, + "learning_rate": 9.45991745510382e-05, + "loss": 1.2476, + "step": 2600 + }, + { + "epoch": 0.7040995805709647, + "grad_norm": 0.20646455883979797, + "learning_rate": 9.458493368990519e-05, + "loss": 1.2556, + "step": 2602 + }, + { + "epoch": 0.7046407793262075, + "grad_norm": 0.2136593908071518, + "learning_rate": 9.457067515318698e-05, + "loss": 1.2567, + "step": 2604 + }, + { + "epoch": 0.7051819780814504, + "grad_norm": 0.214664489030838, + "learning_rate": 9.455639894653627e-05, + "loss": 1.266, + "step": 2606 + }, + { + "epoch": 0.7057231768366933, + "grad_norm": 0.2101629078388214, + "learning_rate": 9.454210507561285e-05, + "loss": 1.2499, + "step": 2608 + }, + { + "epoch": 0.7062643755919361, + "grad_norm": 0.2157791256904602, + "learning_rate": 9.452779354608348e-05, + "loss": 1.2421, + "step": 2610 + }, + { + "epoch": 0.706805574347179, + "grad_norm": 0.20827960968017578, + "learning_rate": 9.451346436362196e-05, + "loss": 1.2566, + "step": 2612 + }, + { + "epoch": 0.7073467731024219, + "grad_norm": 0.21283753216266632, + "learning_rate": 9.449911753390901e-05, + "loss": 1.2561, + "step": 2614 + }, + { + "epoch": 0.7078879718576647, + "grad_norm": 0.22358572483062744, + "learning_rate": 9.448475306263245e-05, + "loss": 1.2418, + "step": 2616 + }, + { + "epoch": 0.7084291706129076, + "grad_norm": 0.21198727190494537, + "learning_rate": 9.4470370955487e-05, + "loss": 1.2511, + "step": 2618 + }, + { + "epoch": 0.7089703693681505, + "grad_norm": 0.21495653688907623, + "learning_rate": 9.445597121817442e-05, + "loss": 1.2294, + "step": 2620 + }, + { + "epoch": 0.7095115681233933, + "grad_norm": 0.21378777921199799, + "learning_rate": 9.444155385640345e-05, + "loss": 1.2375, + "step": 2622 + }, + { + "epoch": 0.7100527668786362, + "grad_norm": 0.21197205781936646, + "learning_rate": 9.442711887588981e-05, + "loss": 1.251, + "step": 2624 + }, + { + "epoch": 0.710593965633879, + "grad_norm": 0.21979504823684692, + "learning_rate": 9.441266628235624e-05, + "loss": 1.2467, + "step": 2626 + }, + { + "epoch": 0.7111351643891219, + "grad_norm": 0.21565599739551544, + "learning_rate": 9.43981960815324e-05, + "loss": 1.22, + "step": 2628 + }, + { + "epoch": 0.7116763631443648, + "grad_norm": 0.19891119003295898, + "learning_rate": 9.438370827915499e-05, + "loss": 1.215, + "step": 2630 + }, + { + "epoch": 0.7122175618996076, + "grad_norm": 0.21079830825328827, + "learning_rate": 9.436920288096764e-05, + "loss": 1.2407, + "step": 2632 + }, + { + "epoch": 0.7127587606548504, + "grad_norm": 0.21531549096107483, + "learning_rate": 9.435467989272099e-05, + "loss": 1.2348, + "step": 2634 + }, + { + "epoch": 0.7132999594100934, + "grad_norm": 0.22583681344985962, + "learning_rate": 9.434013932017265e-05, + "loss": 1.2567, + "step": 2636 + }, + { + "epoch": 0.7138411581653362, + "grad_norm": 0.24707137048244476, + "learning_rate": 9.432558116908718e-05, + "loss": 1.244, + "step": 2638 + }, + { + "epoch": 0.714382356920579, + "grad_norm": 0.23890820145606995, + "learning_rate": 9.431100544523614e-05, + "loss": 1.2361, + "step": 2640 + }, + { + "epoch": 0.714923555675822, + "grad_norm": 0.2275097668170929, + "learning_rate": 9.429641215439802e-05, + "loss": 1.2337, + "step": 2642 + }, + { + "epoch": 0.7154647544310648, + "grad_norm": 0.22068314254283905, + "learning_rate": 9.42818013023583e-05, + "loss": 1.246, + "step": 2644 + }, + { + "epoch": 0.7160059531863077, + "grad_norm": 0.22214053571224213, + "learning_rate": 9.426717289490943e-05, + "loss": 1.2507, + "step": 2646 + }, + { + "epoch": 0.7165471519415505, + "grad_norm": 0.21483547985553741, + "learning_rate": 9.425252693785078e-05, + "loss": 1.2223, + "step": 2648 + }, + { + "epoch": 0.7170883506967934, + "grad_norm": 0.21457841992378235, + "learning_rate": 9.423786343698872e-05, + "loss": 1.2494, + "step": 2650 + }, + { + "epoch": 0.7176295494520363, + "grad_norm": 0.20471327006816864, + "learning_rate": 9.422318239813656e-05, + "loss": 1.2426, + "step": 2652 + }, + { + "epoch": 0.7181707482072791, + "grad_norm": 0.20799721777439117, + "learning_rate": 9.420848382711455e-05, + "loss": 1.2409, + "step": 2654 + }, + { + "epoch": 0.7187119469625219, + "grad_norm": 0.2095753401517868, + "learning_rate": 9.41937677297499e-05, + "loss": 1.2349, + "step": 2656 + }, + { + "epoch": 0.7192531457177649, + "grad_norm": 0.2103864848613739, + "learning_rate": 9.417903411187678e-05, + "loss": 1.2432, + "step": 2658 + }, + { + "epoch": 0.7197943444730077, + "grad_norm": 0.20874999463558197, + "learning_rate": 9.416428297933631e-05, + "loss": 1.24, + "step": 2660 + }, + { + "epoch": 0.7203355432282506, + "grad_norm": 0.21667924523353577, + "learning_rate": 9.41495143379765e-05, + "loss": 1.254, + "step": 2662 + }, + { + "epoch": 0.7208767419834934, + "grad_norm": 0.20849965512752533, + "learning_rate": 9.413472819365237e-05, + "loss": 1.2494, + "step": 2664 + }, + { + "epoch": 0.7214179407387363, + "grad_norm": 0.2131972759962082, + "learning_rate": 9.411992455222585e-05, + "loss": 1.2233, + "step": 2666 + }, + { + "epoch": 0.7219591394939792, + "grad_norm": 0.21590593457221985, + "learning_rate": 9.410510341956579e-05, + "loss": 1.2428, + "step": 2668 + }, + { + "epoch": 0.722500338249222, + "grad_norm": 0.21747298538684845, + "learning_rate": 9.409026480154801e-05, + "loss": 1.2495, + "step": 2670 + }, + { + "epoch": 0.7230415370044649, + "grad_norm": 0.21579551696777344, + "learning_rate": 9.407540870405523e-05, + "loss": 1.2513, + "step": 2672 + }, + { + "epoch": 0.7235827357597078, + "grad_norm": 0.20697540044784546, + "learning_rate": 9.40605351329771e-05, + "loss": 1.2364, + "step": 2674 + }, + { + "epoch": 0.7241239345149506, + "grad_norm": 0.215818852186203, + "learning_rate": 9.404564409421024e-05, + "loss": 1.2242, + "step": 2676 + }, + { + "epoch": 0.7246651332701934, + "grad_norm": 0.21552613377571106, + "learning_rate": 9.403073559365816e-05, + "loss": 1.2378, + "step": 2678 + }, + { + "epoch": 0.7252063320254364, + "grad_norm": 0.20463980734348297, + "learning_rate": 9.401580963723127e-05, + "loss": 1.2144, + "step": 2680 + }, + { + "epoch": 0.7257475307806792, + "grad_norm": 0.20748072862625122, + "learning_rate": 9.400086623084696e-05, + "loss": 1.2422, + "step": 2682 + }, + { + "epoch": 0.726288729535922, + "grad_norm": 0.21622253954410553, + "learning_rate": 9.398590538042948e-05, + "loss": 1.2466, + "step": 2684 + }, + { + "epoch": 0.7268299282911649, + "grad_norm": 0.21229557693004608, + "learning_rate": 9.397092709191005e-05, + "loss": 1.2533, + "step": 2686 + }, + { + "epoch": 0.7273711270464078, + "grad_norm": 0.2206655591726303, + "learning_rate": 9.395593137122676e-05, + "loss": 1.2368, + "step": 2688 + }, + { + "epoch": 0.7279123258016507, + "grad_norm": 0.22106198966503143, + "learning_rate": 9.39409182243246e-05, + "loss": 1.2523, + "step": 2690 + }, + { + "epoch": 0.7284535245568935, + "grad_norm": 0.21155452728271484, + "learning_rate": 9.392588765715554e-05, + "loss": 1.2558, + "step": 2692 + }, + { + "epoch": 0.7289947233121363, + "grad_norm": 0.2205546647310257, + "learning_rate": 9.39108396756784e-05, + "loss": 1.2409, + "step": 2694 + }, + { + "epoch": 0.7295359220673793, + "grad_norm": 0.2159835547208786, + "learning_rate": 9.389577428585888e-05, + "loss": 1.248, + "step": 2696 + }, + { + "epoch": 0.7300771208226221, + "grad_norm": 0.20885945856571198, + "learning_rate": 9.388069149366966e-05, + "loss": 1.2388, + "step": 2698 + }, + { + "epoch": 0.7306183195778649, + "grad_norm": 0.2038174420595169, + "learning_rate": 9.386559130509026e-05, + "loss": 1.2213, + "step": 2700 + }, + { + "epoch": 0.7311595183331079, + "grad_norm": 0.21526674926280975, + "learning_rate": 9.385047372610709e-05, + "loss": 1.2369, + "step": 2702 + }, + { + "epoch": 0.7317007170883507, + "grad_norm": 0.21164047718048096, + "learning_rate": 9.383533876271349e-05, + "loss": 1.2414, + "step": 2704 + }, + { + "epoch": 0.7322419158435935, + "grad_norm": 0.20897522568702698, + "learning_rate": 9.38201864209097e-05, + "loss": 1.2396, + "step": 2706 + }, + { + "epoch": 0.7327831145988364, + "grad_norm": 0.20399637520313263, + "learning_rate": 9.38050167067028e-05, + "loss": 1.2254, + "step": 2708 + }, + { + "epoch": 0.7333243133540793, + "grad_norm": 0.21298326551914215, + "learning_rate": 9.37898296261068e-05, + "loss": 1.2465, + "step": 2710 + }, + { + "epoch": 0.7338655121093222, + "grad_norm": 0.2132457196712494, + "learning_rate": 9.377462518514257e-05, + "loss": 1.2309, + "step": 2712 + }, + { + "epoch": 0.734406710864565, + "grad_norm": 0.22349213063716888, + "learning_rate": 9.375940338983789e-05, + "loss": 1.2446, + "step": 2714 + }, + { + "epoch": 0.7349479096198078, + "grad_norm": 0.2120126485824585, + "learning_rate": 9.374416424622738e-05, + "loss": 1.2468, + "step": 2716 + }, + { + "epoch": 0.7354891083750508, + "grad_norm": 0.21380288898944855, + "learning_rate": 9.372890776035259e-05, + "loss": 1.2379, + "step": 2718 + }, + { + "epoch": 0.7360303071302936, + "grad_norm": 0.20808126032352448, + "learning_rate": 9.371363393826187e-05, + "loss": 1.2507, + "step": 2720 + }, + { + "epoch": 0.7365715058855364, + "grad_norm": 0.21934252977371216, + "learning_rate": 9.369834278601052e-05, + "loss": 1.2572, + "step": 2722 + }, + { + "epoch": 0.7371127046407794, + "grad_norm": 0.20850279927253723, + "learning_rate": 9.36830343096607e-05, + "loss": 1.2538, + "step": 2724 + }, + { + "epoch": 0.7376539033960222, + "grad_norm": 0.216210275888443, + "learning_rate": 9.366770851528137e-05, + "loss": 1.2307, + "step": 2726 + }, + { + "epoch": 0.738195102151265, + "grad_norm": 0.20865590870380402, + "learning_rate": 9.365236540894842e-05, + "loss": 1.2293, + "step": 2728 + }, + { + "epoch": 0.7387363009065079, + "grad_norm": 0.20254139602184296, + "learning_rate": 9.363700499674462e-05, + "loss": 1.2543, + "step": 2730 + }, + { + "epoch": 0.7392774996617508, + "grad_norm": 0.21307919919490814, + "learning_rate": 9.36216272847595e-05, + "loss": 1.2353, + "step": 2732 + }, + { + "epoch": 0.7398186984169937, + "grad_norm": 0.21729370951652527, + "learning_rate": 9.360623227908957e-05, + "loss": 1.2384, + "step": 2734 + }, + { + "epoch": 0.7403598971722365, + "grad_norm": 0.2232731133699417, + "learning_rate": 9.359081998583812e-05, + "loss": 1.2237, + "step": 2736 + }, + { + "epoch": 0.7409010959274793, + "grad_norm": 0.2216210663318634, + "learning_rate": 9.357539041111531e-05, + "loss": 1.2338, + "step": 2738 + }, + { + "epoch": 0.7414422946827223, + "grad_norm": 0.22623343765735626, + "learning_rate": 9.355994356103818e-05, + "loss": 1.2307, + "step": 2740 + }, + { + "epoch": 0.7419834934379651, + "grad_norm": 0.23438121378421783, + "learning_rate": 9.354447944173059e-05, + "loss": 1.2363, + "step": 2742 + }, + { + "epoch": 0.7425246921932079, + "grad_norm": 0.22417870163917542, + "learning_rate": 9.352899805932322e-05, + "loss": 1.2658, + "step": 2744 + }, + { + "epoch": 0.7430658909484508, + "grad_norm": 0.23071123659610748, + "learning_rate": 9.351349941995366e-05, + "loss": 1.2414, + "step": 2746 + }, + { + "epoch": 0.7436070897036937, + "grad_norm": 0.2217751145362854, + "learning_rate": 9.349798352976629e-05, + "loss": 1.2392, + "step": 2748 + }, + { + "epoch": 0.7441482884589365, + "grad_norm": 0.23684772849082947, + "learning_rate": 9.348245039491235e-05, + "loss": 1.2503, + "step": 2750 + }, + { + "epoch": 0.7446894872141794, + "grad_norm": 0.22171282768249512, + "learning_rate": 9.34669000215499e-05, + "loss": 1.2287, + "step": 2752 + }, + { + "epoch": 0.7452306859694223, + "grad_norm": 0.22197501361370087, + "learning_rate": 9.345133241584387e-05, + "loss": 1.1991, + "step": 2754 + }, + { + "epoch": 0.7457718847246652, + "grad_norm": 0.2621997594833374, + "learning_rate": 9.343574758396598e-05, + "loss": 1.2346, + "step": 2756 + }, + { + "epoch": 0.746313083479908, + "grad_norm": 0.21338815987110138, + "learning_rate": 9.342014553209482e-05, + "loss": 1.2437, + "step": 2758 + }, + { + "epoch": 0.7468542822351508, + "grad_norm": 0.21545028686523438, + "learning_rate": 9.340452626641574e-05, + "loss": 1.2558, + "step": 2760 + }, + { + "epoch": 0.7473954809903938, + "grad_norm": 0.20994403958320618, + "learning_rate": 9.338888979312101e-05, + "loss": 1.2272, + "step": 2762 + }, + { + "epoch": 0.7479366797456366, + "grad_norm": 0.21458999812602997, + "learning_rate": 9.337323611840964e-05, + "loss": 1.2522, + "step": 2764 + }, + { + "epoch": 0.7484778785008794, + "grad_norm": 0.21569861471652985, + "learning_rate": 9.335756524848751e-05, + "loss": 1.2348, + "step": 2766 + }, + { + "epoch": 0.7490190772561223, + "grad_norm": 0.21545200049877167, + "learning_rate": 9.334187718956727e-05, + "loss": 1.2351, + "step": 2768 + }, + { + "epoch": 0.7495602760113652, + "grad_norm": 0.20550045371055603, + "learning_rate": 9.332617194786844e-05, + "loss": 1.2333, + "step": 2770 + }, + { + "epoch": 0.750101474766608, + "grad_norm": 0.21121762692928314, + "learning_rate": 9.331044952961729e-05, + "loss": 1.2347, + "step": 2772 + }, + { + "epoch": 0.7506426735218509, + "grad_norm": 0.2130371332168579, + "learning_rate": 9.329470994104697e-05, + "loss": 1.2384, + "step": 2774 + }, + { + "epoch": 0.7511838722770937, + "grad_norm": 0.2100599855184555, + "learning_rate": 9.327895318839739e-05, + "loss": 1.2572, + "step": 2776 + }, + { + "epoch": 0.7517250710323367, + "grad_norm": 0.2141609936952591, + "learning_rate": 9.326317927791526e-05, + "loss": 1.2493, + "step": 2778 + }, + { + "epoch": 0.7522662697875795, + "grad_norm": 0.20215147733688354, + "learning_rate": 9.32473882158541e-05, + "loss": 1.2328, + "step": 2780 + }, + { + "epoch": 0.7528074685428223, + "grad_norm": 0.20483511686325073, + "learning_rate": 9.323158000847428e-05, + "loss": 1.2467, + "step": 2782 + }, + { + "epoch": 0.7533486672980653, + "grad_norm": 0.21057730913162231, + "learning_rate": 9.32157546620429e-05, + "loss": 1.226, + "step": 2784 + }, + { + "epoch": 0.7538898660533081, + "grad_norm": 0.21694819629192352, + "learning_rate": 9.319991218283385e-05, + "loss": 1.2269, + "step": 2786 + }, + { + "epoch": 0.7544310648085509, + "grad_norm": 0.22040502727031708, + "learning_rate": 9.318405257712788e-05, + "loss": 1.2336, + "step": 2788 + }, + { + "epoch": 0.7549722635637938, + "grad_norm": 0.21631376445293427, + "learning_rate": 9.31681758512125e-05, + "loss": 1.2426, + "step": 2790 + }, + { + "epoch": 0.7555134623190367, + "grad_norm": 0.2051629275083542, + "learning_rate": 9.315228201138194e-05, + "loss": 1.2528, + "step": 2792 + }, + { + "epoch": 0.7560546610742795, + "grad_norm": 0.2192334532737732, + "learning_rate": 9.313637106393733e-05, + "loss": 1.2262, + "step": 2794 + }, + { + "epoch": 0.7565958598295224, + "grad_norm": 0.21075467765331268, + "learning_rate": 9.31204430151865e-05, + "loss": 1.2464, + "step": 2796 + }, + { + "epoch": 0.7571370585847652, + "grad_norm": 0.20599377155303955, + "learning_rate": 9.31044978714441e-05, + "loss": 1.2284, + "step": 2798 + }, + { + "epoch": 0.7576782573400082, + "grad_norm": 0.20556782186031342, + "learning_rate": 9.308853563903153e-05, + "loss": 1.2337, + "step": 2800 + }, + { + "epoch": 0.758219456095251, + "grad_norm": 0.2129114270210266, + "learning_rate": 9.307255632427698e-05, + "loss": 1.2351, + "step": 2802 + }, + { + "epoch": 0.7587606548504938, + "grad_norm": 0.22170618176460266, + "learning_rate": 9.305655993351539e-05, + "loss": 1.2509, + "step": 2804 + }, + { + "epoch": 0.7593018536057367, + "grad_norm": 0.2149934470653534, + "learning_rate": 9.304054647308853e-05, + "loss": 1.2506, + "step": 2806 + }, + { + "epoch": 0.7598430523609796, + "grad_norm": 0.21681110560894012, + "learning_rate": 9.302451594934488e-05, + "loss": 1.2446, + "step": 2808 + }, + { + "epoch": 0.7603842511162224, + "grad_norm": 0.2138003557920456, + "learning_rate": 9.300846836863966e-05, + "loss": 1.2315, + "step": 2810 + }, + { + "epoch": 0.7609254498714653, + "grad_norm": 0.21714389324188232, + "learning_rate": 9.299240373733495e-05, + "loss": 1.2237, + "step": 2812 + }, + { + "epoch": 0.7614666486267082, + "grad_norm": 0.20781435072422028, + "learning_rate": 9.297632206179951e-05, + "loss": 1.235, + "step": 2814 + }, + { + "epoch": 0.762007847381951, + "grad_norm": 0.21106529235839844, + "learning_rate": 9.296022334840889e-05, + "loss": 1.241, + "step": 2816 + }, + { + "epoch": 0.7625490461371939, + "grad_norm": 0.20268838107585907, + "learning_rate": 9.294410760354537e-05, + "loss": 1.2582, + "step": 2818 + }, + { + "epoch": 0.7630902448924367, + "grad_norm": 0.20030425488948822, + "learning_rate": 9.292797483359801e-05, + "loss": 1.2428, + "step": 2820 + }, + { + "epoch": 0.7636314436476797, + "grad_norm": 0.2100449800491333, + "learning_rate": 9.291182504496258e-05, + "loss": 1.2367, + "step": 2822 + }, + { + "epoch": 0.7641726424029225, + "grad_norm": 0.21454234421253204, + "learning_rate": 9.289565824404165e-05, + "loss": 1.2261, + "step": 2824 + }, + { + "epoch": 0.7647138411581653, + "grad_norm": 0.21005463600158691, + "learning_rate": 9.28794744372445e-05, + "loss": 1.237, + "step": 2826 + }, + { + "epoch": 0.7652550399134082, + "grad_norm": 0.2123933732509613, + "learning_rate": 9.286327363098717e-05, + "loss": 1.2115, + "step": 2828 + }, + { + "epoch": 0.7657962386686511, + "grad_norm": 0.2171681821346283, + "learning_rate": 9.284705583169239e-05, + "loss": 1.2415, + "step": 2830 + }, + { + "epoch": 0.7663374374238939, + "grad_norm": 0.21714134514331818, + "learning_rate": 9.283082104578972e-05, + "loss": 1.237, + "step": 2832 + }, + { + "epoch": 0.7668786361791368, + "grad_norm": 0.20111635327339172, + "learning_rate": 9.281456927971536e-05, + "loss": 1.2237, + "step": 2834 + }, + { + "epoch": 0.7674198349343796, + "grad_norm": 0.20329216122627258, + "learning_rate": 9.279830053991232e-05, + "loss": 1.2338, + "step": 2836 + }, + { + "epoch": 0.7679610336896225, + "grad_norm": 0.21309934556484222, + "learning_rate": 9.278201483283026e-05, + "loss": 1.2314, + "step": 2838 + }, + { + "epoch": 0.7685022324448654, + "grad_norm": 0.2031441330909729, + "learning_rate": 9.276571216492562e-05, + "loss": 1.2016, + "step": 2840 + }, + { + "epoch": 0.7690434312001082, + "grad_norm": 0.20808055996894836, + "learning_rate": 9.274939254266157e-05, + "loss": 1.2367, + "step": 2842 + }, + { + "epoch": 0.7695846299553512, + "grad_norm": 0.2009919136762619, + "learning_rate": 9.273305597250797e-05, + "loss": 1.2488, + "step": 2844 + }, + { + "epoch": 0.770125828710594, + "grad_norm": 0.2107824683189392, + "learning_rate": 9.27167024609414e-05, + "loss": 1.2268, + "step": 2846 + }, + { + "epoch": 0.7706670274658368, + "grad_norm": 0.21597540378570557, + "learning_rate": 9.270033201444517e-05, + "loss": 1.251, + "step": 2848 + }, + { + "epoch": 0.7712082262210797, + "grad_norm": 0.21570149064064026, + "learning_rate": 9.268394463950934e-05, + "loss": 1.2291, + "step": 2850 + }, + { + "epoch": 0.7717494249763226, + "grad_norm": 0.2073807716369629, + "learning_rate": 9.266754034263061e-05, + "loss": 1.2229, + "step": 2852 + }, + { + "epoch": 0.7722906237315654, + "grad_norm": 0.21396131813526154, + "learning_rate": 9.265111913031243e-05, + "loss": 1.2354, + "step": 2854 + }, + { + "epoch": 0.7728318224868083, + "grad_norm": 0.22582246363162994, + "learning_rate": 9.263468100906494e-05, + "loss": 1.2304, + "step": 2856 + }, + { + "epoch": 0.7733730212420511, + "grad_norm": 0.20621982216835022, + "learning_rate": 9.2618225985405e-05, + "loss": 1.2178, + "step": 2858 + }, + { + "epoch": 0.773914219997294, + "grad_norm": 0.20848476886749268, + "learning_rate": 9.260175406585619e-05, + "loss": 1.2256, + "step": 2860 + }, + { + "epoch": 0.7744554187525369, + "grad_norm": 0.20401284098625183, + "learning_rate": 9.258526525694871e-05, + "loss": 1.2201, + "step": 2862 + }, + { + "epoch": 0.7749966175077797, + "grad_norm": 0.20163412392139435, + "learning_rate": 9.256875956521953e-05, + "loss": 1.2537, + "step": 2864 + }, + { + "epoch": 0.7755378162630227, + "grad_norm": 0.21364827454090118, + "learning_rate": 9.255223699721229e-05, + "loss": 1.2473, + "step": 2866 + }, + { + "epoch": 0.7760790150182655, + "grad_norm": 0.20431366562843323, + "learning_rate": 9.253569755947732e-05, + "loss": 1.2261, + "step": 2868 + }, + { + "epoch": 0.7766202137735083, + "grad_norm": 0.20286224782466888, + "learning_rate": 9.251914125857167e-05, + "loss": 1.2227, + "step": 2870 + }, + { + "epoch": 0.7771614125287511, + "grad_norm": 0.20155613124370575, + "learning_rate": 9.2502568101059e-05, + "loss": 1.223, + "step": 2872 + }, + { + "epoch": 0.7777026112839941, + "grad_norm": 0.20083202421665192, + "learning_rate": 9.24859780935097e-05, + "loss": 1.2361, + "step": 2874 + }, + { + "epoch": 0.7782438100392369, + "grad_norm": 0.20565393567085266, + "learning_rate": 9.246937124250086e-05, + "loss": 1.2454, + "step": 2876 + }, + { + "epoch": 0.7787850087944798, + "grad_norm": 0.20953059196472168, + "learning_rate": 9.24527475546162e-05, + "loss": 1.2402, + "step": 2878 + }, + { + "epoch": 0.7793262075497226, + "grad_norm": 0.20281900465488434, + "learning_rate": 9.243610703644616e-05, + "loss": 1.243, + "step": 2880 + }, + { + "epoch": 0.7798674063049655, + "grad_norm": 0.2107364684343338, + "learning_rate": 9.241944969458784e-05, + "loss": 1.2531, + "step": 2882 + }, + { + "epoch": 0.7804086050602084, + "grad_norm": 0.2223702222108841, + "learning_rate": 9.240277553564495e-05, + "loss": 1.2358, + "step": 2884 + }, + { + "epoch": 0.7809498038154512, + "grad_norm": 0.21632793545722961, + "learning_rate": 9.2386084566228e-05, + "loss": 1.2228, + "step": 2886 + }, + { + "epoch": 0.781491002570694, + "grad_norm": 0.20548765361309052, + "learning_rate": 9.2369376792954e-05, + "loss": 1.2211, + "step": 2888 + }, + { + "epoch": 0.782032201325937, + "grad_norm": 0.2088259905576706, + "learning_rate": 9.235265222244676e-05, + "loss": 1.2279, + "step": 2890 + }, + { + "epoch": 0.7825734000811798, + "grad_norm": 0.20582431554794312, + "learning_rate": 9.233591086133666e-05, + "loss": 1.2302, + "step": 2892 + }, + { + "epoch": 0.7831145988364226, + "grad_norm": 0.20779459178447723, + "learning_rate": 9.23191527162608e-05, + "loss": 1.2463, + "step": 2894 + }, + { + "epoch": 0.7836557975916656, + "grad_norm": 0.20974069833755493, + "learning_rate": 9.23023777938629e-05, + "loss": 1.2299, + "step": 2896 + }, + { + "epoch": 0.7841969963469084, + "grad_norm": 0.21260978281497955, + "learning_rate": 9.228558610079331e-05, + "loss": 1.2288, + "step": 2898 + }, + { + "epoch": 0.7847381951021513, + "grad_norm": 0.21376334130764008, + "learning_rate": 9.226877764370908e-05, + "loss": 1.2365, + "step": 2900 + }, + { + "epoch": 0.7852793938573941, + "grad_norm": 0.2044704556465149, + "learning_rate": 9.225195242927387e-05, + "loss": 1.2308, + "step": 2902 + }, + { + "epoch": 0.785820592612637, + "grad_norm": 0.20474430918693542, + "learning_rate": 9.2235110464158e-05, + "loss": 1.2247, + "step": 2904 + }, + { + "epoch": 0.7863617913678799, + "grad_norm": 0.20992141962051392, + "learning_rate": 9.221825175503842e-05, + "loss": 1.2319, + "step": 2906 + }, + { + "epoch": 0.7869029901231227, + "grad_norm": 0.21773672103881836, + "learning_rate": 9.220137630859874e-05, + "loss": 1.2248, + "step": 2908 + }, + { + "epoch": 0.7874441888783655, + "grad_norm": 0.21675674617290497, + "learning_rate": 9.218448413152913e-05, + "loss": 1.235, + "step": 2910 + }, + { + "epoch": 0.7879853876336085, + "grad_norm": 0.2061057984828949, + "learning_rate": 9.216757523052653e-05, + "loss": 1.2381, + "step": 2912 + }, + { + "epoch": 0.7885265863888513, + "grad_norm": 0.20627647638320923, + "learning_rate": 9.215064961229438e-05, + "loss": 1.2299, + "step": 2914 + }, + { + "epoch": 0.7890677851440941, + "grad_norm": 0.616995096206665, + "learning_rate": 9.213370728354283e-05, + "loss": 1.2687, + "step": 2916 + }, + { + "epoch": 0.789608983899337, + "grad_norm": 0.23795656859874725, + "learning_rate": 9.21167482509886e-05, + "loss": 1.2376, + "step": 2918 + }, + { + "epoch": 0.7901501826545799, + "grad_norm": 0.2422102987766266, + "learning_rate": 9.209977252135506e-05, + "loss": 1.2429, + "step": 2920 + }, + { + "epoch": 0.7906913814098228, + "grad_norm": 0.2670714259147644, + "learning_rate": 9.208278010137222e-05, + "loss": 1.2101, + "step": 2922 + }, + { + "epoch": 0.7912325801650656, + "grad_norm": 0.294065922498703, + "learning_rate": 9.206577099777664e-05, + "loss": 1.2465, + "step": 2924 + }, + { + "epoch": 0.7917737789203085, + "grad_norm": 0.2536512315273285, + "learning_rate": 9.204874521731158e-05, + "loss": 1.224, + "step": 2926 + }, + { + "epoch": 0.7923149776755514, + "grad_norm": 0.23533768951892853, + "learning_rate": 9.203170276672681e-05, + "loss": 1.2375, + "step": 2928 + }, + { + "epoch": 0.7928561764307942, + "grad_norm": 0.29083576798439026, + "learning_rate": 9.201464365277883e-05, + "loss": 1.2247, + "step": 2930 + }, + { + "epoch": 0.793397375186037, + "grad_norm": 0.278577595949173, + "learning_rate": 9.199756788223067e-05, + "loss": 1.2459, + "step": 2932 + }, + { + "epoch": 0.79393857394128, + "grad_norm": 0.28439784049987793, + "learning_rate": 9.198047546185193e-05, + "loss": 1.224, + "step": 2934 + }, + { + "epoch": 0.7944797726965228, + "grad_norm": 0.45653703808784485, + "learning_rate": 9.196336639841892e-05, + "loss": 1.2389, + "step": 2936 + }, + { + "epoch": 0.7950209714517656, + "grad_norm": 0.43514567613601685, + "learning_rate": 9.194624069871442e-05, + "loss": 1.2365, + "step": 2938 + }, + { + "epoch": 0.7955621702070085, + "grad_norm": 0.39155837893486023, + "learning_rate": 9.192909836952794e-05, + "loss": 1.2364, + "step": 2940 + }, + { + "epoch": 0.7961033689622514, + "grad_norm": 0.2874111235141754, + "learning_rate": 9.191193941765546e-05, + "loss": 1.2255, + "step": 2942 + }, + { + "epoch": 0.7966445677174943, + "grad_norm": 0.2747213840484619, + "learning_rate": 9.189476384989963e-05, + "loss": 1.2283, + "step": 2944 + }, + { + "epoch": 0.7971857664727371, + "grad_norm": 0.2161729484796524, + "learning_rate": 9.187757167306966e-05, + "loss": 1.2346, + "step": 2946 + }, + { + "epoch": 0.7977269652279799, + "grad_norm": 0.22049780189990997, + "learning_rate": 9.186036289398134e-05, + "loss": 1.2422, + "step": 2948 + }, + { + "epoch": 0.7982681639832229, + "grad_norm": 0.22543483972549438, + "learning_rate": 9.184313751945704e-05, + "loss": 1.2366, + "step": 2950 + }, + { + "epoch": 0.7988093627384657, + "grad_norm": 0.227324977517128, + "learning_rate": 9.182589555632572e-05, + "loss": 1.2251, + "step": 2952 + }, + { + "epoch": 0.7993505614937085, + "grad_norm": 0.21022869646549225, + "learning_rate": 9.180863701142293e-05, + "loss": 1.2337, + "step": 2954 + }, + { + "epoch": 0.7998917602489515, + "grad_norm": 0.21206127107143402, + "learning_rate": 9.179136189159074e-05, + "loss": 1.2277, + "step": 2956 + }, + { + "epoch": 0.8004329590041943, + "grad_norm": 0.22186043858528137, + "learning_rate": 9.177407020367788e-05, + "loss": 1.2471, + "step": 2958 + }, + { + "epoch": 0.8009741577594371, + "grad_norm": 0.24853624403476715, + "learning_rate": 9.175676195453955e-05, + "loss": 1.245, + "step": 2960 + }, + { + "epoch": 0.80151535651468, + "grad_norm": 0.24665629863739014, + "learning_rate": 9.173943715103757e-05, + "loss": 1.2357, + "step": 2962 + }, + { + "epoch": 0.8020565552699229, + "grad_norm": 0.25784316658973694, + "learning_rate": 9.172209580004035e-05, + "loss": 1.2382, + "step": 2964 + }, + { + "epoch": 0.8025977540251658, + "grad_norm": 0.22399091720581055, + "learning_rate": 9.170473790842278e-05, + "loss": 1.2208, + "step": 2966 + }, + { + "epoch": 0.8031389527804086, + "grad_norm": 0.2071622610092163, + "learning_rate": 9.168736348306638e-05, + "loss": 1.217, + "step": 2968 + }, + { + "epoch": 0.8036801515356514, + "grad_norm": 0.20199859142303467, + "learning_rate": 9.166997253085918e-05, + "loss": 1.2489, + "step": 2970 + }, + { + "epoch": 0.8042213502908944, + "grad_norm": 0.22274377942085266, + "learning_rate": 9.165256505869581e-05, + "loss": 1.2417, + "step": 2972 + }, + { + "epoch": 0.8047625490461372, + "grad_norm": 0.2210993617773056, + "learning_rate": 9.163514107347738e-05, + "loss": 1.2395, + "step": 2974 + }, + { + "epoch": 0.80530374780138, + "grad_norm": 0.22720351815223694, + "learning_rate": 9.161770058211161e-05, + "loss": 1.2454, + "step": 2976 + }, + { + "epoch": 0.8058449465566229, + "grad_norm": 0.22297875583171844, + "learning_rate": 9.160024359151274e-05, + "loss": 1.2279, + "step": 2978 + }, + { + "epoch": 0.8063861453118658, + "grad_norm": 0.2187831699848175, + "learning_rate": 9.158277010860153e-05, + "loss": 1.2481, + "step": 2980 + }, + { + "epoch": 0.8069273440671086, + "grad_norm": 0.2193155139684677, + "learning_rate": 9.15652801403053e-05, + "loss": 1.2349, + "step": 2982 + }, + { + "epoch": 0.8074685428223515, + "grad_norm": 0.21247895061969757, + "learning_rate": 9.154777369355793e-05, + "loss": 1.2109, + "step": 2984 + }, + { + "epoch": 0.8080097415775944, + "grad_norm": 0.2187221348285675, + "learning_rate": 9.15302507752998e-05, + "loss": 1.2453, + "step": 2986 + }, + { + "epoch": 0.8085509403328373, + "grad_norm": 0.2421714961528778, + "learning_rate": 9.151271139247782e-05, + "loss": 1.2325, + "step": 2988 + }, + { + "epoch": 0.8090921390880801, + "grad_norm": 0.25165337324142456, + "learning_rate": 9.149515555204542e-05, + "loss": 1.2345, + "step": 2990 + }, + { + "epoch": 0.8096333378433229, + "grad_norm": 0.6235466003417969, + "learning_rate": 9.147758326096259e-05, + "loss": 1.2307, + "step": 2992 + }, + { + "epoch": 0.8101745365985659, + "grad_norm": 1.8468120098114014, + "learning_rate": 9.14599945261958e-05, + "loss": 1.2226, + "step": 2994 + }, + { + "epoch": 0.8107157353538087, + "grad_norm": 0.6419652104377747, + "learning_rate": 9.144238935471809e-05, + "loss": 1.237, + "step": 2996 + }, + { + "epoch": 0.8112569341090515, + "grad_norm": 9.949187278747559, + "learning_rate": 9.142476775350895e-05, + "loss": 1.2359, + "step": 2998 + }, + { + "epoch": 0.8117981328642944, + "grad_norm": 51.645015716552734, + "learning_rate": 9.140712972955445e-05, + "loss": 3.9273, + "step": 3000 + }, + { + "epoch": 0.8123393316195373, + "grad_norm": 162.77838134765625, + "learning_rate": 9.138947528984714e-05, + "loss": 7.3207, + "step": 3002 + }, + { + "epoch": 0.8128805303747801, + "grad_norm": 143.2057342529297, + "learning_rate": 9.137180444138604e-05, + "loss": 6.7469, + "step": 3004 + }, + { + "epoch": 0.813421729130023, + "grad_norm": 24.596132278442383, + "learning_rate": 9.135411719117677e-05, + "loss": 6.7275, + "step": 3006 + }, + { + "epoch": 0.8139629278852659, + "grad_norm": 113.07401275634766, + "learning_rate": 9.133641354623135e-05, + "loss": 6.8831, + "step": 3008 + }, + { + "epoch": 0.8145041266405088, + "grad_norm": 54.99746322631836, + "learning_rate": 9.131869351356836e-05, + "loss": 6.957, + "step": 3010 + }, + { + "epoch": 0.8150453253957516, + "grad_norm": 41.836971282958984, + "learning_rate": 9.130095710021287e-05, + "loss": 6.9197, + "step": 3012 + }, + { + "epoch": 0.8155865241509944, + "grad_norm": 16.159334182739258, + "learning_rate": 9.128320431319643e-05, + "loss": 6.6745, + "step": 3014 + }, + { + "epoch": 0.8161277229062374, + "grad_norm": 35.35638427734375, + "learning_rate": 9.12654351595571e-05, + "loss": 6.5526, + "step": 3016 + }, + { + "epoch": 0.8166689216614802, + "grad_norm": 13.418272018432617, + "learning_rate": 9.124764964633941e-05, + "loss": 6.5421, + "step": 3018 + }, + { + "epoch": 0.817210120416723, + "grad_norm": 76.91167449951172, + "learning_rate": 9.122984778059436e-05, + "loss": 6.598, + "step": 3020 + }, + { + "epoch": 0.8177513191719659, + "grad_norm": 29.271677017211914, + "learning_rate": 9.121202956937949e-05, + "loss": 6.746, + "step": 3022 + }, + { + "epoch": 0.8182925179272088, + "grad_norm": 6.679831027984619, + "learning_rate": 9.119419501975876e-05, + "loss": 6.6051, + "step": 3024 + }, + { + "epoch": 0.8188337166824516, + "grad_norm": 2.620072364807129, + "learning_rate": 9.117634413880264e-05, + "loss": 6.4967, + "step": 3026 + }, + { + "epoch": 0.8193749154376945, + "grad_norm": 35.06947708129883, + "learning_rate": 9.115847693358808e-05, + "loss": 6.5821, + "step": 3028 + }, + { + "epoch": 0.8199161141929373, + "grad_norm": 7.402040004730225, + "learning_rate": 9.114059341119846e-05, + "loss": 6.6581, + "step": 3030 + }, + { + "epoch": 0.8204573129481803, + "grad_norm": 9.003362655639648, + "learning_rate": 9.112269357872367e-05, + "loss": 6.5743, + "step": 3032 + }, + { + "epoch": 0.8209985117034231, + "grad_norm": 19.55501937866211, + "learning_rate": 9.110477744326008e-05, + "loss": 6.5286, + "step": 3034 + }, + { + "epoch": 0.8215397104586659, + "grad_norm": 16.157516479492188, + "learning_rate": 9.108684501191048e-05, + "loss": 6.5647, + "step": 3036 + }, + { + "epoch": 0.8220809092139089, + "grad_norm": 11.481574058532715, + "learning_rate": 9.10688962917841e-05, + "loss": 6.5452, + "step": 3038 + }, + { + "epoch": 0.8226221079691517, + "grad_norm": 12.504718780517578, + "learning_rate": 9.105093128999672e-05, + "loss": 6.4514, + "step": 3040 + }, + { + "epoch": 0.8231633067243945, + "grad_norm": 12.287551879882812, + "learning_rate": 9.103295001367049e-05, + "loss": 6.4944, + "step": 3042 + }, + { + "epoch": 0.8237045054796374, + "grad_norm": 6.438191890716553, + "learning_rate": 9.101495246993405e-05, + "loss": 6.5133, + "step": 3044 + }, + { + "epoch": 0.8242457042348803, + "grad_norm": 10.490718841552734, + "learning_rate": 9.099693866592249e-05, + "loss": 6.6264, + "step": 3046 + }, + { + "epoch": 0.8247869029901231, + "grad_norm": 3.485543966293335, + "learning_rate": 9.097890860877732e-05, + "loss": 6.6551, + "step": 3048 + }, + { + "epoch": 0.825328101745366, + "grad_norm": 7.037774562835693, + "learning_rate": 9.096086230564653e-05, + "loss": 6.5162, + "step": 3050 + }, + { + "epoch": 0.8258693005006088, + "grad_norm": 7.147155284881592, + "learning_rate": 9.094279976368452e-05, + "loss": 6.4698, + "step": 3052 + }, + { + "epoch": 0.8264104992558517, + "grad_norm": 6.157515048980713, + "learning_rate": 9.092472099005212e-05, + "loss": 6.452, + "step": 3054 + }, + { + "epoch": 0.8269516980110946, + "grad_norm": 24.150165557861328, + "learning_rate": 9.090662599191666e-05, + "loss": 6.4559, + "step": 3056 + }, + { + "epoch": 0.8274928967663374, + "grad_norm": 2.645434617996216, + "learning_rate": 9.088851477645181e-05, + "loss": 6.4264, + "step": 3058 + }, + { + "epoch": 0.8280340955215802, + "grad_norm": 26.285385131835938, + "learning_rate": 9.087038735083775e-05, + "loss": 6.4145, + "step": 3060 + }, + { + "epoch": 0.8285752942768232, + "grad_norm": 37.51917266845703, + "learning_rate": 9.085224372226105e-05, + "loss": 6.6315, + "step": 3062 + }, + { + "epoch": 0.829116493032066, + "grad_norm": 4.452447891235352, + "learning_rate": 9.083408389791468e-05, + "loss": 6.5029, + "step": 3064 + }, + { + "epoch": 0.8296576917873089, + "grad_norm": 2.6523871421813965, + "learning_rate": 9.081590788499807e-05, + "loss": 6.4376, + "step": 3066 + }, + { + "epoch": 0.8301988905425518, + "grad_norm": 8.193984031677246, + "learning_rate": 9.079771569071706e-05, + "loss": 6.4238, + "step": 3068 + }, + { + "epoch": 0.8307400892977946, + "grad_norm": 6.554834365844727, + "learning_rate": 9.07795073222839e-05, + "loss": 6.4618, + "step": 3070 + }, + { + "epoch": 0.8312812880530375, + "grad_norm": 9.348752975463867, + "learning_rate": 9.076128278691726e-05, + "loss": 6.4341, + "step": 3072 + }, + { + "epoch": 0.8318224868082803, + "grad_norm": 5.486053466796875, + "learning_rate": 9.07430420918422e-05, + "loss": 6.4018, + "step": 3074 + }, + { + "epoch": 0.8323636855635232, + "grad_norm": 6.757177352905273, + "learning_rate": 9.07247852442902e-05, + "loss": 6.4055, + "step": 3076 + }, + { + "epoch": 0.8329048843187661, + "grad_norm": 2.9033281803131104, + "learning_rate": 9.070651225149913e-05, + "loss": 6.3955, + "step": 3078 + }, + { + "epoch": 0.8334460830740089, + "grad_norm": 9.029162406921387, + "learning_rate": 9.068822312071328e-05, + "loss": 6.3819, + "step": 3080 + }, + { + "epoch": 0.8339872818292517, + "grad_norm": 1.991795539855957, + "learning_rate": 9.066991785918333e-05, + "loss": 6.4096, + "step": 3082 + }, + { + "epoch": 0.8345284805844947, + "grad_norm": 1.6264301538467407, + "learning_rate": 9.065159647416637e-05, + "loss": 6.3804, + "step": 3084 + }, + { + "epoch": 0.8350696793397375, + "grad_norm": 2.102339506149292, + "learning_rate": 9.063325897292587e-05, + "loss": 6.3781, + "step": 3086 + }, + { + "epoch": 0.8356108780949804, + "grad_norm": 1.4373193979263306, + "learning_rate": 9.061490536273164e-05, + "loss": 6.3785, + "step": 3088 + }, + { + "epoch": 0.8361520768502232, + "grad_norm": 1.3095070123672485, + "learning_rate": 9.059653565085997e-05, + "loss": 6.3726, + "step": 3090 + }, + { + "epoch": 0.8366932756054661, + "grad_norm": 1.7201792001724243, + "learning_rate": 9.057814984459347e-05, + "loss": 6.3754, + "step": 3092 + }, + { + "epoch": 0.837234474360709, + "grad_norm": 2.334815740585327, + "learning_rate": 9.055974795122113e-05, + "loss": 6.3692, + "step": 3094 + }, + { + "epoch": 0.8377756731159518, + "grad_norm": 2.3444583415985107, + "learning_rate": 9.054132997803837e-05, + "loss": 6.353, + "step": 3096 + }, + { + "epoch": 0.8383168718711947, + "grad_norm": 4.413025379180908, + "learning_rate": 9.052289593234693e-05, + "loss": 6.345, + "step": 3098 + }, + { + "epoch": 0.8388580706264376, + "grad_norm": 6.012264251708984, + "learning_rate": 9.050444582145495e-05, + "loss": 6.3102, + "step": 3100 + }, + { + "epoch": 0.8393992693816804, + "grad_norm": 16.267066955566406, + "learning_rate": 9.04859796526769e-05, + "loss": 6.2907, + "step": 3102 + }, + { + "epoch": 0.8399404681369232, + "grad_norm": 22.159154891967773, + "learning_rate": 9.046749743333369e-05, + "loss": 6.2638, + "step": 3104 + }, + { + "epoch": 0.8404816668921662, + "grad_norm": 12.406789779663086, + "learning_rate": 9.044899917075251e-05, + "loss": 6.2353, + "step": 3106 + }, + { + "epoch": 0.841022865647409, + "grad_norm": 13.376330375671387, + "learning_rate": 9.043048487226697e-05, + "loss": 6.2521, + "step": 3108 + }, + { + "epoch": 0.8415640644026519, + "grad_norm": 56.80801773071289, + "learning_rate": 9.041195454521702e-05, + "loss": 6.525, + "step": 3110 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 25.448945999145508, + "learning_rate": 9.039340819694897e-05, + "loss": 6.2921, + "step": 3112 + }, + { + "epoch": 0.8426464619131376, + "grad_norm": 14.529874801635742, + "learning_rate": 9.037484583481544e-05, + "loss": 6.3741, + "step": 3114 + }, + { + "epoch": 0.8431876606683805, + "grad_norm": 60.734962463378906, + "learning_rate": 9.035626746617547e-05, + "loss": 6.4179, + "step": 3116 + }, + { + "epoch": 0.8437288594236233, + "grad_norm": 65.94615936279297, + "learning_rate": 9.033767309839438e-05, + "loss": 7.1273, + "step": 3118 + }, + { + "epoch": 0.8442700581788661, + "grad_norm": 18.26249122619629, + "learning_rate": 9.031906273884388e-05, + "loss": 6.5407, + "step": 3120 + }, + { + "epoch": 0.8448112569341091, + "grad_norm": 35.836326599121094, + "learning_rate": 9.030043639490197e-05, + "loss": 6.3452, + "step": 3122 + }, + { + "epoch": 0.8453524556893519, + "grad_norm": 31.0974178314209, + "learning_rate": 9.028179407395305e-05, + "loss": 6.7146, + "step": 3124 + }, + { + "epoch": 0.8458936544445947, + "grad_norm": 20.31138801574707, + "learning_rate": 9.026313578338782e-05, + "loss": 6.3789, + "step": 3126 + }, + { + "epoch": 0.8464348531998377, + "grad_norm": 22.042652130126953, + "learning_rate": 9.024446153060328e-05, + "loss": 6.2103, + "step": 3128 + }, + { + "epoch": 0.8469760519550805, + "grad_norm": 24.26772117614746, + "learning_rate": 9.022577132300283e-05, + "loss": 6.2928, + "step": 3130 + }, + { + "epoch": 0.8475172507103234, + "grad_norm": 23.121070861816406, + "learning_rate": 9.020706516799615e-05, + "loss": 6.3883, + "step": 3132 + }, + { + "epoch": 0.8480584494655662, + "grad_norm": 22.5421199798584, + "learning_rate": 9.018834307299922e-05, + "loss": 6.2152, + "step": 3134 + }, + { + "epoch": 0.8485996482208091, + "grad_norm": 15.424832344055176, + "learning_rate": 9.016960504543439e-05, + "loss": 6.2134, + "step": 3136 + }, + { + "epoch": 0.849140846976052, + "grad_norm": 4.891734600067139, + "learning_rate": 9.015085109273029e-05, + "loss": 6.137, + "step": 3138 + }, + { + "epoch": 0.8496820457312948, + "grad_norm": 3.1807522773742676, + "learning_rate": 9.01320812223219e-05, + "loss": 6.075, + "step": 3140 + }, + { + "epoch": 0.8502232444865376, + "grad_norm": 3.0970818996429443, + "learning_rate": 9.011329544165047e-05, + "loss": 6.0443, + "step": 3142 + }, + { + "epoch": 0.8507644432417806, + "grad_norm": 1.8074407577514648, + "learning_rate": 9.009449375816358e-05, + "loss": 6.0025, + "step": 3144 + }, + { + "epoch": 0.8513056419970234, + "grad_norm": 2.570284843444824, + "learning_rate": 9.007567617931512e-05, + "loss": 5.9863, + "step": 3146 + }, + { + "epoch": 0.8518468407522662, + "grad_norm": 4.418685436248779, + "learning_rate": 9.005684271256525e-05, + "loss": 5.9694, + "step": 3148 + }, + { + "epoch": 0.8523880395075092, + "grad_norm": 6.861987113952637, + "learning_rate": 9.003799336538046e-05, + "loss": 5.9542, + "step": 3150 + }, + { + "epoch": 0.852929238262752, + "grad_norm": 4.844227313995361, + "learning_rate": 9.001912814523353e-05, + "loss": 5.9202, + "step": 3152 + }, + { + "epoch": 0.8534704370179949, + "grad_norm": 3.8856005668640137, + "learning_rate": 9.000024705960352e-05, + "loss": 5.9001, + "step": 3154 + }, + { + "epoch": 0.8540116357732377, + "grad_norm": 7.827765464782715, + "learning_rate": 8.998135011597583e-05, + "loss": 5.9147, + "step": 3156 + }, + { + "epoch": 0.8545528345284806, + "grad_norm": 8.146918296813965, + "learning_rate": 8.996243732184206e-05, + "loss": 5.8791, + "step": 3158 + }, + { + "epoch": 0.8550940332837235, + "grad_norm": 2.4434125423431396, + "learning_rate": 8.994350868470015e-05, + "loss": 5.8594, + "step": 3160 + }, + { + "epoch": 0.8556352320389663, + "grad_norm": 13.466486930847168, + "learning_rate": 8.99245642120543e-05, + "loss": 5.9065, + "step": 3162 + }, + { + "epoch": 0.8561764307942091, + "grad_norm": 15.81242561340332, + "learning_rate": 8.990560391141503e-05, + "loss": 5.8803, + "step": 3164 + }, + { + "epoch": 0.8567176295494521, + "grad_norm": 9.728450775146484, + "learning_rate": 8.988662779029909e-05, + "loss": 5.9393, + "step": 3166 + }, + { + "epoch": 0.8572588283046949, + "grad_norm": 15.22099494934082, + "learning_rate": 8.98676358562295e-05, + "loss": 5.9002, + "step": 3168 + }, + { + "epoch": 0.8578000270599377, + "grad_norm": 6.861898422241211, + "learning_rate": 8.98486281167356e-05, + "loss": 5.8433, + "step": 3170 + }, + { + "epoch": 0.8583412258151806, + "grad_norm": 1.44621741771698, + "learning_rate": 8.982960457935293e-05, + "loss": 5.8085, + "step": 3172 + }, + { + "epoch": 0.8588824245704235, + "grad_norm": 6.946808815002441, + "learning_rate": 8.981056525162332e-05, + "loss": 5.8282, + "step": 3174 + }, + { + "epoch": 0.8594236233256664, + "grad_norm": 7.4296555519104, + "learning_rate": 8.979151014109488e-05, + "loss": 5.8066, + "step": 3176 + }, + { + "epoch": 0.8599648220809092, + "grad_norm": 1.931650161743164, + "learning_rate": 8.977243925532196e-05, + "loss": 5.7569, + "step": 3178 + }, + { + "epoch": 0.8605060208361521, + "grad_norm": 1.9481853246688843, + "learning_rate": 8.975335260186515e-05, + "loss": 5.7733, + "step": 3180 + }, + { + "epoch": 0.861047219591395, + "grad_norm": 5.015133380889893, + "learning_rate": 8.973425018829134e-05, + "loss": 5.7617, + "step": 3182 + }, + { + "epoch": 0.8615884183466378, + "grad_norm": 9.222882270812988, + "learning_rate": 8.971513202217359e-05, + "loss": 5.7758, + "step": 3184 + }, + { + "epoch": 0.8621296171018806, + "grad_norm": 9.576149940490723, + "learning_rate": 8.969599811109128e-05, + "loss": 5.7407, + "step": 3186 + }, + { + "epoch": 0.8626708158571236, + "grad_norm": 10.73227310180664, + "learning_rate": 8.967684846262997e-05, + "loss": 5.746, + "step": 3188 + }, + { + "epoch": 0.8632120146123664, + "grad_norm": 3.917820692062378, + "learning_rate": 8.965768308438155e-05, + "loss": 5.7495, + "step": 3190 + }, + { + "epoch": 0.8637532133676092, + "grad_norm": 16.54001808166504, + "learning_rate": 8.963850198394402e-05, + "loss": 5.8388, + "step": 3192 + }, + { + "epoch": 0.8642944121228521, + "grad_norm": 7.986750602722168, + "learning_rate": 8.961930516892172e-05, + "loss": 5.7186, + "step": 3194 + }, + { + "epoch": 0.864835610878095, + "grad_norm": 6.636990547180176, + "learning_rate": 8.960009264692518e-05, + "loss": 5.7177, + "step": 3196 + }, + { + "epoch": 0.8653768096333379, + "grad_norm": 10.408087730407715, + "learning_rate": 8.958086442557111e-05, + "loss": 5.7057, + "step": 3198 + }, + { + "epoch": 0.8659180083885807, + "grad_norm": 7.17811918258667, + "learning_rate": 8.956162051248253e-05, + "loss": 5.6626, + "step": 3200 + }, + { + "epoch": 0.8664592071438235, + "grad_norm": 3.971498489379883, + "learning_rate": 8.954236091528865e-05, + "loss": 5.6487, + "step": 3202 + }, + { + "epoch": 0.8670004058990665, + "grad_norm": 12.803196907043457, + "learning_rate": 8.952308564162486e-05, + "loss": 5.6778, + "step": 3204 + }, + { + "epoch": 0.8675416046543093, + "grad_norm": 6.128145694732666, + "learning_rate": 8.950379469913281e-05, + "loss": 5.6023, + "step": 3206 + }, + { + "epoch": 0.8680828034095521, + "grad_norm": 2.572782278060913, + "learning_rate": 8.948448809546033e-05, + "loss": 5.5705, + "step": 3208 + }, + { + "epoch": 0.8686240021647951, + "grad_norm": 7.732709884643555, + "learning_rate": 8.94651658382615e-05, + "loss": 5.5648, + "step": 3210 + }, + { + "epoch": 0.8691652009200379, + "grad_norm": 5.356006145477295, + "learning_rate": 8.944582793519657e-05, + "loss": 5.544, + "step": 3212 + }, + { + "epoch": 0.8697063996752807, + "grad_norm": 6.853109359741211, + "learning_rate": 8.9426474393932e-05, + "loss": 5.5661, + "step": 3214 + }, + { + "epoch": 0.8702475984305236, + "grad_norm": 6.834400177001953, + "learning_rate": 8.940710522214044e-05, + "loss": 5.4848, + "step": 3216 + }, + { + "epoch": 0.8707887971857665, + "grad_norm": 3.9893531799316406, + "learning_rate": 8.938772042750078e-05, + "loss": 5.4885, + "step": 3218 + }, + { + "epoch": 0.8713299959410093, + "grad_norm": 5.868941307067871, + "learning_rate": 8.936832001769805e-05, + "loss": 5.4513, + "step": 3220 + }, + { + "epoch": 0.8718711946962522, + "grad_norm": 7.860538482666016, + "learning_rate": 8.934890400042351e-05, + "loss": 5.3947, + "step": 3222 + }, + { + "epoch": 0.872412393451495, + "grad_norm": 7.617331027984619, + "learning_rate": 8.932947238337456e-05, + "loss": 5.3621, + "step": 3224 + }, + { + "epoch": 0.872953592206738, + "grad_norm": 13.739005088806152, + "learning_rate": 8.931002517425484e-05, + "loss": 5.3659, + "step": 3226 + }, + { + "epoch": 0.8734947909619808, + "grad_norm": 32.63043212890625, + "learning_rate": 8.929056238077416e-05, + "loss": 5.4167, + "step": 3228 + }, + { + "epoch": 0.8740359897172236, + "grad_norm": 10.652158737182617, + "learning_rate": 8.927108401064847e-05, + "loss": 5.44, + "step": 3230 + }, + { + "epoch": 0.8745771884724665, + "grad_norm": 8.121881484985352, + "learning_rate": 8.925159007159994e-05, + "loss": 5.321, + "step": 3232 + }, + { + "epoch": 0.8751183872277094, + "grad_norm": 14.56945514678955, + "learning_rate": 8.923208057135688e-05, + "loss": 5.3423, + "step": 3234 + }, + { + "epoch": 0.8756595859829522, + "grad_norm": 19.6383113861084, + "learning_rate": 8.92125555176538e-05, + "loss": 5.3221, + "step": 3236 + }, + { + "epoch": 0.8762007847381951, + "grad_norm": 6.179553985595703, + "learning_rate": 8.919301491823133e-05, + "loss": 5.2839, + "step": 3238 + }, + { + "epoch": 0.876741983493438, + "grad_norm": 14.963215827941895, + "learning_rate": 8.917345878083631e-05, + "loss": 5.2851, + "step": 3240 + }, + { + "epoch": 0.8772831822486808, + "grad_norm": 9.411989212036133, + "learning_rate": 8.915388711322173e-05, + "loss": 5.2005, + "step": 3242 + }, + { + "epoch": 0.8778243810039237, + "grad_norm": 4.158975601196289, + "learning_rate": 8.91342999231467e-05, + "loss": 5.1834, + "step": 3244 + }, + { + "epoch": 0.8783655797591665, + "grad_norm": 6.446174621582031, + "learning_rate": 8.911469721837655e-05, + "loss": 5.1028, + "step": 3246 + }, + { + "epoch": 0.8789067785144095, + "grad_norm": 7.448984622955322, + "learning_rate": 8.909507900668269e-05, + "loss": 5.0743, + "step": 3248 + }, + { + "epoch": 0.8794479772696523, + "grad_norm": 13.622466087341309, + "learning_rate": 8.907544529584273e-05, + "loss": 5.0598, + "step": 3250 + }, + { + "epoch": 0.8799891760248951, + "grad_norm": 7.748263835906982, + "learning_rate": 8.905579609364041e-05, + "loss": 5.009, + "step": 3252 + }, + { + "epoch": 0.880530374780138, + "grad_norm": 7.795956611633301, + "learning_rate": 8.903613140786558e-05, + "loss": 4.9863, + "step": 3254 + }, + { + "epoch": 0.8810715735353809, + "grad_norm": 4.537596702575684, + "learning_rate": 8.901645124631428e-05, + "loss": 4.9221, + "step": 3256 + }, + { + "epoch": 0.8816127722906237, + "grad_norm": 9.186697006225586, + "learning_rate": 8.899675561678863e-05, + "loss": 4.9103, + "step": 3258 + }, + { + "epoch": 0.8821539710458666, + "grad_norm": 6.000752925872803, + "learning_rate": 8.897704452709697e-05, + "loss": 4.8237, + "step": 3260 + }, + { + "epoch": 0.8826951698011094, + "grad_norm": 3.594982862472534, + "learning_rate": 8.895731798505366e-05, + "loss": 4.8564, + "step": 3262 + }, + { + "epoch": 0.8832363685563523, + "grad_norm": 6.02200174331665, + "learning_rate": 8.893757599847927e-05, + "loss": 4.8013, + "step": 3264 + }, + { + "epoch": 0.8837775673115952, + "grad_norm": 4.270401477813721, + "learning_rate": 8.891781857520044e-05, + "loss": 4.7153, + "step": 3266 + }, + { + "epoch": 0.884318766066838, + "grad_norm": 3.288161516189575, + "learning_rate": 8.889804572304995e-05, + "loss": 4.6645, + "step": 3268 + }, + { + "epoch": 0.884859964822081, + "grad_norm": 3.725242853164673, + "learning_rate": 8.887825744986674e-05, + "loss": 4.6046, + "step": 3270 + }, + { + "epoch": 0.8854011635773238, + "grad_norm": 5.191972732543945, + "learning_rate": 8.885845376349574e-05, + "loss": 4.5773, + "step": 3272 + }, + { + "epoch": 0.8859423623325666, + "grad_norm": 4.172395706176758, + "learning_rate": 8.883863467178814e-05, + "loss": 4.5419, + "step": 3274 + }, + { + "epoch": 0.8864835610878095, + "grad_norm": 3.393796920776367, + "learning_rate": 8.881880018260116e-05, + "loss": 4.4957, + "step": 3276 + }, + { + "epoch": 0.8870247598430524, + "grad_norm": 4.771553993225098, + "learning_rate": 8.87989503037981e-05, + "loss": 4.4532, + "step": 3278 + }, + { + "epoch": 0.8875659585982952, + "grad_norm": 3.170055389404297, + "learning_rate": 8.877908504324843e-05, + "loss": 4.3909, + "step": 3280 + }, + { + "epoch": 0.8881071573535381, + "grad_norm": 2.902820110321045, + "learning_rate": 8.875920440882767e-05, + "loss": 4.3511, + "step": 3282 + }, + { + "epoch": 0.8886483561087809, + "grad_norm": 3.18314528465271, + "learning_rate": 8.873930840841745e-05, + "loss": 4.3131, + "step": 3284 + }, + { + "epoch": 0.8891895548640238, + "grad_norm": 2.8853447437286377, + "learning_rate": 8.871939704990548e-05, + "loss": 4.2587, + "step": 3286 + }, + { + "epoch": 0.8897307536192667, + "grad_norm": 3.534005880355835, + "learning_rate": 8.869947034118557e-05, + "loss": 4.2324, + "step": 3288 + }, + { + "epoch": 0.8902719523745095, + "grad_norm": 3.527754068374634, + "learning_rate": 8.867952829015761e-05, + "loss": 4.2085, + "step": 3290 + }, + { + "epoch": 0.8908131511297525, + "grad_norm": 3.0101592540740967, + "learning_rate": 8.86595709047276e-05, + "loss": 4.1537, + "step": 3292 + }, + { + "epoch": 0.8913543498849953, + "grad_norm": 2.5968217849731445, + "learning_rate": 8.863959819280759e-05, + "loss": 4.1401, + "step": 3294 + }, + { + "epoch": 0.8918955486402381, + "grad_norm": 2.631981134414673, + "learning_rate": 8.861961016231569e-05, + "loss": 4.0802, + "step": 3296 + }, + { + "epoch": 0.892436747395481, + "grad_norm": 2.6680941581726074, + "learning_rate": 8.859960682117612e-05, + "loss": 4.085, + "step": 3298 + }, + { + "epoch": 0.8929779461507239, + "grad_norm": 3.104212999343872, + "learning_rate": 8.857958817731915e-05, + "loss": 4.0593, + "step": 3300 + }, + { + "epoch": 0.8935191449059667, + "grad_norm": 3.0496253967285156, + "learning_rate": 8.855955423868112e-05, + "loss": 4.0423, + "step": 3302 + }, + { + "epoch": 0.8940603436612096, + "grad_norm": 2.760368824005127, + "learning_rate": 8.853950501320443e-05, + "loss": 4.0191, + "step": 3304 + }, + { + "epoch": 0.8946015424164524, + "grad_norm": 3.808211326599121, + "learning_rate": 8.851944050883756e-05, + "loss": 3.9884, + "step": 3306 + }, + { + "epoch": 0.8951427411716953, + "grad_norm": 3.354142427444458, + "learning_rate": 8.849936073353502e-05, + "loss": 3.9332, + "step": 3308 + }, + { + "epoch": 0.8956839399269382, + "grad_norm": 4.658494472503662, + "learning_rate": 8.84792656952574e-05, + "loss": 3.9316, + "step": 3310 + }, + { + "epoch": 0.896225138682181, + "grad_norm": 3.3359267711639404, + "learning_rate": 8.845915540197132e-05, + "loss": 3.9064, + "step": 3312 + }, + { + "epoch": 0.8967663374374238, + "grad_norm": 4.346761703491211, + "learning_rate": 8.843902986164943e-05, + "loss": 3.8806, + "step": 3314 + }, + { + "epoch": 0.8973075361926668, + "grad_norm": 1.9214677810668945, + "learning_rate": 8.84188890822705e-05, + "loss": 3.8454, + "step": 3316 + }, + { + "epoch": 0.8978487349479096, + "grad_norm": 3.275961399078369, + "learning_rate": 8.839873307181925e-05, + "loss": 3.8613, + "step": 3318 + }, + { + "epoch": 0.8983899337031525, + "grad_norm": 2.879671573638916, + "learning_rate": 8.83785618382865e-05, + "loss": 3.8142, + "step": 3320 + }, + { + "epoch": 0.8989311324583954, + "grad_norm": 2.393937587738037, + "learning_rate": 8.83583753896691e-05, + "loss": 3.7892, + "step": 3322 + }, + { + "epoch": 0.8994723312136382, + "grad_norm": 2.331608772277832, + "learning_rate": 8.833817373396986e-05, + "loss": 3.7633, + "step": 3324 + }, + { + "epoch": 0.9000135299688811, + "grad_norm": 3.3159894943237305, + "learning_rate": 8.831795687919775e-05, + "loss": 3.7549, + "step": 3326 + }, + { + "epoch": 0.9005547287241239, + "grad_norm": 1.6604549884796143, + "learning_rate": 8.829772483336763e-05, + "loss": 3.7204, + "step": 3328 + }, + { + "epoch": 0.9010959274793668, + "grad_norm": 3.669424057006836, + "learning_rate": 8.827747760450047e-05, + "loss": 3.6946, + "step": 3330 + }, + { + "epoch": 0.9016371262346097, + "grad_norm": 5.51679801940918, + "learning_rate": 8.825721520062325e-05, + "loss": 3.6713, + "step": 3332 + }, + { + "epoch": 0.9021783249898525, + "grad_norm": 3.6048996448516846, + "learning_rate": 8.823693762976891e-05, + "loss": 3.6855, + "step": 3334 + }, + { + "epoch": 0.9027195237450953, + "grad_norm": 3.728710174560547, + "learning_rate": 8.821664489997648e-05, + "loss": 3.6089, + "step": 3336 + }, + { + "epoch": 0.9032607225003383, + "grad_norm": 2.6480770111083984, + "learning_rate": 8.819633701929093e-05, + "loss": 3.609, + "step": 3338 + }, + { + "epoch": 0.9038019212555811, + "grad_norm": 2.4537196159362793, + "learning_rate": 8.817601399576329e-05, + "loss": 3.5668, + "step": 3340 + }, + { + "epoch": 0.904343120010824, + "grad_norm": 8.629188537597656, + "learning_rate": 8.815567583745056e-05, + "loss": 3.6679, + "step": 3342 + }, + { + "epoch": 0.9048843187660668, + "grad_norm": 11.455950736999512, + "learning_rate": 8.813532255241576e-05, + "loss": 3.6838, + "step": 3344 + }, + { + "epoch": 0.9054255175213097, + "grad_norm": 9.66180419921875, + "learning_rate": 8.81149541487279e-05, + "loss": 3.5955, + "step": 3346 + }, + { + "epoch": 0.9059667162765526, + "grad_norm": 5.880641937255859, + "learning_rate": 8.809457063446198e-05, + "loss": 3.4443, + "step": 3348 + }, + { + "epoch": 0.9065079150317954, + "grad_norm": 4.776944637298584, + "learning_rate": 8.807417201769899e-05, + "loss": 3.3301, + "step": 3350 + }, + { + "epoch": 0.9070491137870383, + "grad_norm": 19.446060180664062, + "learning_rate": 8.805375830652591e-05, + "loss": 3.3896, + "step": 3352 + }, + { + "epoch": 0.9075903125422812, + "grad_norm": 7.380261421203613, + "learning_rate": 8.80333295090357e-05, + "loss": 3.4393, + "step": 3354 + }, + { + "epoch": 0.908131511297524, + "grad_norm": 4.7472734451293945, + "learning_rate": 8.801288563332732e-05, + "loss": 3.223, + "step": 3356 + }, + { + "epoch": 0.9086727100527668, + "grad_norm": 4.367702960968018, + "learning_rate": 8.799242668750567e-05, + "loss": 2.9019, + "step": 3358 + }, + { + "epoch": 0.9092139088080098, + "grad_norm": 12.423567771911621, + "learning_rate": 8.797195267968169e-05, + "loss": 2.6458, + "step": 3360 + }, + { + "epoch": 0.9097551075632526, + "grad_norm": 57.905941009521484, + "learning_rate": 8.795146361797219e-05, + "loss": 2.5072, + "step": 3362 + }, + { + "epoch": 0.9102963063184955, + "grad_norm": 29.289100646972656, + "learning_rate": 8.793095951050007e-05, + "loss": 2.077, + "step": 3364 + }, + { + "epoch": 0.9108375050737383, + "grad_norm": 4.703507423400879, + "learning_rate": 8.79104403653941e-05, + "loss": 1.6952, + "step": 3366 + }, + { + "epoch": 0.9113787038289812, + "grad_norm": 3.7362613677978516, + "learning_rate": 8.788990619078903e-05, + "loss": 1.5028, + "step": 3368 + }, + { + "epoch": 0.9119199025842241, + "grad_norm": 1.9051291942596436, + "learning_rate": 8.78693569948256e-05, + "loss": 1.4128, + "step": 3370 + }, + { + "epoch": 0.9124611013394669, + "grad_norm": 4.28729772567749, + "learning_rate": 8.784879278565049e-05, + "loss": 1.3943, + "step": 3372 + }, + { + "epoch": 0.9130023000947097, + "grad_norm": 1.3635573387145996, + "learning_rate": 8.782821357141633e-05, + "loss": 1.3544, + "step": 3374 + }, + { + "epoch": 0.9135434988499527, + "grad_norm": 1.0148718357086182, + "learning_rate": 8.780761936028168e-05, + "loss": 1.3325, + "step": 3376 + }, + { + "epoch": 0.9140846976051955, + "grad_norm": 1.0702952146530151, + "learning_rate": 8.778701016041108e-05, + "loss": 1.2865, + "step": 3378 + }, + { + "epoch": 0.9146258963604383, + "grad_norm": 0.7008850574493408, + "learning_rate": 8.776638597997498e-05, + "loss": 1.2967, + "step": 3380 + }, + { + "epoch": 0.9151670951156813, + "grad_norm": 0.6870774626731873, + "learning_rate": 8.77457468271498e-05, + "loss": 1.2745, + "step": 3382 + }, + { + "epoch": 0.9157082938709241, + "grad_norm": 0.42447954416275024, + "learning_rate": 8.772509271011788e-05, + "loss": 1.2755, + "step": 3384 + }, + { + "epoch": 0.916249492626167, + "grad_norm": 0.37188005447387695, + "learning_rate": 8.77044236370675e-05, + "loss": 1.2747, + "step": 3386 + }, + { + "epoch": 0.9167906913814098, + "grad_norm": 0.29276731610298157, + "learning_rate": 8.768373961619283e-05, + "loss": 1.2385, + "step": 3388 + }, + { + "epoch": 0.9173318901366527, + "grad_norm": 0.29821205139160156, + "learning_rate": 8.766304065569404e-05, + "loss": 1.2333, + "step": 3390 + }, + { + "epoch": 0.9178730888918956, + "grad_norm": 0.3106642961502075, + "learning_rate": 8.764232676377715e-05, + "loss": 1.2528, + "step": 3392 + }, + { + "epoch": 0.9184142876471384, + "grad_norm": 0.2433456927537918, + "learning_rate": 8.762159794865414e-05, + "loss": 1.2644, + "step": 3394 + }, + { + "epoch": 0.9189554864023812, + "grad_norm": 0.2584933042526245, + "learning_rate": 8.76008542185429e-05, + "loss": 1.2563, + "step": 3396 + }, + { + "epoch": 0.9194966851576242, + "grad_norm": 0.2554602324962616, + "learning_rate": 8.758009558166723e-05, + "loss": 1.2529, + "step": 3398 + }, + { + "epoch": 0.920037883912867, + "grad_norm": 0.2501952052116394, + "learning_rate": 8.755932204625682e-05, + "loss": 1.2461, + "step": 3400 + }, + { + "epoch": 0.9205790826681098, + "grad_norm": 0.2322869598865509, + "learning_rate": 8.753853362054731e-05, + "loss": 1.2455, + "step": 3402 + }, + { + "epoch": 0.9211202814233527, + "grad_norm": 0.26621463894844055, + "learning_rate": 8.751773031278022e-05, + "loss": 1.2414, + "step": 3404 + }, + { + "epoch": 0.9216614801785956, + "grad_norm": 0.2485044300556183, + "learning_rate": 8.749691213120297e-05, + "loss": 1.2543, + "step": 3406 + }, + { + "epoch": 0.9222026789338384, + "grad_norm": 0.22953349351882935, + "learning_rate": 8.747607908406886e-05, + "loss": 1.2455, + "step": 3408 + }, + { + "epoch": 0.9227438776890813, + "grad_norm": 0.2378481775522232, + "learning_rate": 8.74552311796371e-05, + "loss": 1.2324, + "step": 3410 + }, + { + "epoch": 0.9232850764443242, + "grad_norm": 0.2420920431613922, + "learning_rate": 8.743436842617279e-05, + "loss": 1.2472, + "step": 3412 + }, + { + "epoch": 0.9238262751995671, + "grad_norm": 0.240481436252594, + "learning_rate": 8.741349083194694e-05, + "loss": 1.2512, + "step": 3414 + }, + { + "epoch": 0.9243674739548099, + "grad_norm": 0.21981249749660492, + "learning_rate": 8.73925984052364e-05, + "loss": 1.2234, + "step": 3416 + }, + { + "epoch": 0.9249086727100527, + "grad_norm": 0.21758708357810974, + "learning_rate": 8.73716911543239e-05, + "loss": 1.2423, + "step": 3418 + }, + { + "epoch": 0.9254498714652957, + "grad_norm": 0.2222822904586792, + "learning_rate": 8.735076908749811e-05, + "loss": 1.2475, + "step": 3420 + }, + { + "epoch": 0.9259910702205385, + "grad_norm": 0.2375306487083435, + "learning_rate": 8.73298322130535e-05, + "loss": 1.2511, + "step": 3422 + }, + { + "epoch": 0.9265322689757813, + "grad_norm": 0.20608864724636078, + "learning_rate": 8.730888053929047e-05, + "loss": 1.2334, + "step": 3424 + }, + { + "epoch": 0.9270734677310242, + "grad_norm": 0.21576453745365143, + "learning_rate": 8.728791407451524e-05, + "loss": 1.2389, + "step": 3426 + }, + { + "epoch": 0.9276146664862671, + "grad_norm": 0.21437953412532806, + "learning_rate": 8.726693282703991e-05, + "loss": 1.2383, + "step": 3428 + }, + { + "epoch": 0.92815586524151, + "grad_norm": 0.23263736069202423, + "learning_rate": 8.724593680518243e-05, + "loss": 1.2585, + "step": 3430 + }, + { + "epoch": 0.9286970639967528, + "grad_norm": 0.2303396314382553, + "learning_rate": 8.722492601726665e-05, + "loss": 1.2426, + "step": 3432 + }, + { + "epoch": 0.9292382627519957, + "grad_norm": 0.21872375905513763, + "learning_rate": 8.720390047162223e-05, + "loss": 1.2487, + "step": 3434 + }, + { + "epoch": 0.9297794615072386, + "grad_norm": 1.3575583696365356, + "learning_rate": 8.71828601765847e-05, + "loss": 1.2599, + "step": 3436 + }, + { + "epoch": 0.9303206602624814, + "grad_norm": 0.2798626720905304, + "learning_rate": 8.716180514049543e-05, + "loss": 1.2413, + "step": 3438 + }, + { + "epoch": 0.9308618590177242, + "grad_norm": 0.3947781026363373, + "learning_rate": 8.714073537170162e-05, + "loss": 1.2391, + "step": 3440 + }, + { + "epoch": 0.9314030577729672, + "grad_norm": 0.30611512064933777, + "learning_rate": 8.711965087855635e-05, + "loss": 1.2285, + "step": 3442 + }, + { + "epoch": 0.93194425652821, + "grad_norm": 0.2964535355567932, + "learning_rate": 8.709855166941849e-05, + "loss": 1.2332, + "step": 3444 + }, + { + "epoch": 0.9324854552834528, + "grad_norm": 0.30566874146461487, + "learning_rate": 8.70774377526528e-05, + "loss": 1.254, + "step": 3446 + }, + { + "epoch": 0.9330266540386957, + "grad_norm": 0.25171148777008057, + "learning_rate": 8.705630913662983e-05, + "loss": 1.2445, + "step": 3448 + }, + { + "epoch": 0.9335678527939386, + "grad_norm": 0.2296695113182068, + "learning_rate": 8.703516582972595e-05, + "loss": 1.2306, + "step": 3450 + }, + { + "epoch": 0.9341090515491814, + "grad_norm": 0.2543555498123169, + "learning_rate": 8.701400784032339e-05, + "loss": 1.2623, + "step": 3452 + }, + { + "epoch": 0.9346502503044243, + "grad_norm": 0.22564424574375153, + "learning_rate": 8.699283517681017e-05, + "loss": 1.2413, + "step": 3454 + }, + { + "epoch": 0.9351914490596671, + "grad_norm": 0.22345957159996033, + "learning_rate": 8.697164784758014e-05, + "loss": 1.2244, + "step": 3456 + }, + { + "epoch": 0.9357326478149101, + "grad_norm": 0.23204098641872406, + "learning_rate": 8.695044586103296e-05, + "loss": 1.2322, + "step": 3458 + }, + { + "epoch": 0.9362738465701529, + "grad_norm": 0.36212965846061707, + "learning_rate": 8.692922922557412e-05, + "loss": 1.2371, + "step": 3460 + }, + { + "epoch": 0.9368150453253957, + "grad_norm": 0.22496424615383148, + "learning_rate": 8.690799794961489e-05, + "loss": 1.2255, + "step": 3462 + }, + { + "epoch": 0.9373562440806387, + "grad_norm": 0.23747360706329346, + "learning_rate": 8.688675204157236e-05, + "loss": 1.2515, + "step": 3464 + }, + { + "epoch": 0.9378974428358815, + "grad_norm": 0.20290309190750122, + "learning_rate": 8.686549150986943e-05, + "loss": 1.2445, + "step": 3466 + }, + { + "epoch": 0.9384386415911243, + "grad_norm": 0.22112032771110535, + "learning_rate": 8.684421636293474e-05, + "loss": 1.2467, + "step": 3468 + }, + { + "epoch": 0.9389798403463672, + "grad_norm": 0.19764398038387299, + "learning_rate": 8.682292660920281e-05, + "loss": 1.2528, + "step": 3470 + }, + { + "epoch": 0.9395210391016101, + "grad_norm": 0.22094492614269257, + "learning_rate": 8.680162225711392e-05, + "loss": 1.2208, + "step": 3472 + }, + { + "epoch": 0.940062237856853, + "grad_norm": 0.20873764157295227, + "learning_rate": 8.678030331511409e-05, + "loss": 1.2376, + "step": 3474 + }, + { + "epoch": 0.9406034366120958, + "grad_norm": 0.21512436866760254, + "learning_rate": 8.675896979165517e-05, + "loss": 1.2481, + "step": 3476 + }, + { + "epoch": 0.9411446353673386, + "grad_norm": 0.20858894288539886, + "learning_rate": 8.673762169519479e-05, + "loss": 1.2354, + "step": 3478 + }, + { + "epoch": 0.9416858341225816, + "grad_norm": 0.21578043699264526, + "learning_rate": 8.671625903419636e-05, + "loss": 1.231, + "step": 3480 + }, + { + "epoch": 0.9422270328778244, + "grad_norm": 0.214979350566864, + "learning_rate": 8.669488181712904e-05, + "loss": 1.246, + "step": 3482 + }, + { + "epoch": 0.9427682316330672, + "grad_norm": 0.21148213744163513, + "learning_rate": 8.667349005246776e-05, + "loss": 1.2501, + "step": 3484 + }, + { + "epoch": 0.94330943038831, + "grad_norm": 0.21602432429790497, + "learning_rate": 8.665208374869327e-05, + "loss": 1.2312, + "step": 3486 + }, + { + "epoch": 0.943850629143553, + "grad_norm": 0.2216794639825821, + "learning_rate": 8.6630662914292e-05, + "loss": 1.2405, + "step": 3488 + }, + { + "epoch": 0.9443918278987958, + "grad_norm": 0.20914500951766968, + "learning_rate": 8.660922755775622e-05, + "loss": 1.2429, + "step": 3490 + }, + { + "epoch": 0.9449330266540387, + "grad_norm": 0.22172749042510986, + "learning_rate": 8.658777768758393e-05, + "loss": 1.2467, + "step": 3492 + }, + { + "epoch": 0.9454742254092816, + "grad_norm": 0.20988859236240387, + "learning_rate": 8.656631331227883e-05, + "loss": 1.2299, + "step": 3494 + }, + { + "epoch": 0.9460154241645244, + "grad_norm": 0.2031324952840805, + "learning_rate": 8.654483444035047e-05, + "loss": 1.2186, + "step": 3496 + }, + { + "epoch": 0.9465566229197673, + "grad_norm": 0.21033494174480438, + "learning_rate": 8.652334108031406e-05, + "loss": 1.2293, + "step": 3498 + }, + { + "epoch": 0.9470978216750101, + "grad_norm": 0.2091301679611206, + "learning_rate": 8.650183324069059e-05, + "loss": 1.2181, + "step": 3500 + }, + { + "epoch": 0.947639020430253, + "grad_norm": 0.2000979781150818, + "learning_rate": 8.648031093000681e-05, + "loss": 1.2375, + "step": 3502 + }, + { + "epoch": 0.9481802191854959, + "grad_norm": 0.20794962346553802, + "learning_rate": 8.645877415679519e-05, + "loss": 1.2363, + "step": 3504 + }, + { + "epoch": 0.9487214179407387, + "grad_norm": 0.2179345339536667, + "learning_rate": 8.64372229295939e-05, + "loss": 1.2251, + "step": 3506 + }, + { + "epoch": 0.9492626166959816, + "grad_norm": 0.21006426215171814, + "learning_rate": 8.64156572569469e-05, + "loss": 1.2252, + "step": 3508 + }, + { + "epoch": 0.9498038154512245, + "grad_norm": 0.21278280019760132, + "learning_rate": 8.639407714740382e-05, + "loss": 1.2507, + "step": 3510 + }, + { + "epoch": 0.9503450142064673, + "grad_norm": 0.19911755621433258, + "learning_rate": 8.637248260952006e-05, + "loss": 1.224, + "step": 3512 + }, + { + "epoch": 0.9508862129617102, + "grad_norm": 0.19069471955299377, + "learning_rate": 8.63508736518567e-05, + "loss": 1.2134, + "step": 3514 + }, + { + "epoch": 0.951427411716953, + "grad_norm": 0.20871645212173462, + "learning_rate": 8.632925028298059e-05, + "loss": 1.2223, + "step": 3516 + }, + { + "epoch": 0.9519686104721959, + "grad_norm": 0.2007288932800293, + "learning_rate": 8.630761251146424e-05, + "loss": 1.2283, + "step": 3518 + }, + { + "epoch": 0.9525098092274388, + "grad_norm": 0.21398819983005524, + "learning_rate": 8.628596034588588e-05, + "loss": 1.2378, + "step": 3520 + }, + { + "epoch": 0.9530510079826816, + "grad_norm": 0.2010694146156311, + "learning_rate": 8.626429379482946e-05, + "loss": 1.229, + "step": 3522 + }, + { + "epoch": 0.9535922067379246, + "grad_norm": 0.20102912187576294, + "learning_rate": 8.624261286688466e-05, + "loss": 1.2346, + "step": 3524 + }, + { + "epoch": 0.9541334054931674, + "grad_norm": 0.20535993576049805, + "learning_rate": 8.62209175706468e-05, + "loss": 1.2358, + "step": 3526 + }, + { + "epoch": 0.9546746042484102, + "grad_norm": 0.22004395723342896, + "learning_rate": 8.619920791471693e-05, + "loss": 1.2261, + "step": 3528 + }, + { + "epoch": 0.955215803003653, + "grad_norm": 0.19609929621219635, + "learning_rate": 8.617748390770179e-05, + "loss": 1.2432, + "step": 3530 + }, + { + "epoch": 0.955757001758896, + "grad_norm": 0.19269846379756927, + "learning_rate": 8.615574555821382e-05, + "loss": 1.2244, + "step": 3532 + }, + { + "epoch": 0.9562982005141388, + "grad_norm": 0.1955200880765915, + "learning_rate": 8.613399287487112e-05, + "loss": 1.2339, + "step": 3534 + }, + { + "epoch": 0.9568393992693817, + "grad_norm": 0.19891057908535004, + "learning_rate": 8.611222586629749e-05, + "loss": 1.2197, + "step": 3536 + }, + { + "epoch": 0.9573805980246245, + "grad_norm": 0.20394091308116913, + "learning_rate": 8.60904445411224e-05, + "loss": 1.2386, + "step": 3538 + }, + { + "epoch": 0.9579217967798674, + "grad_norm": 0.2031053602695465, + "learning_rate": 8.606864890798104e-05, + "loss": 1.2273, + "step": 3540 + }, + { + "epoch": 0.9584629955351103, + "grad_norm": 0.1948944330215454, + "learning_rate": 8.604683897551417e-05, + "loss": 1.2232, + "step": 3542 + }, + { + "epoch": 0.9590041942903531, + "grad_norm": 0.20535525679588318, + "learning_rate": 8.602501475236833e-05, + "loss": 1.2411, + "step": 3544 + }, + { + "epoch": 0.9595453930455959, + "grad_norm": 0.2302812784910202, + "learning_rate": 8.600317624719565e-05, + "loss": 1.2151, + "step": 3546 + }, + { + "epoch": 0.9600865918008389, + "grad_norm": 0.21900974214076996, + "learning_rate": 8.598132346865398e-05, + "loss": 1.2153, + "step": 3548 + }, + { + "epoch": 0.9606277905560817, + "grad_norm": 0.2264300286769867, + "learning_rate": 8.59594564254068e-05, + "loss": 1.2273, + "step": 3550 + }, + { + "epoch": 0.9611689893113246, + "grad_norm": 0.20842230319976807, + "learning_rate": 8.59375751261232e-05, + "loss": 1.2131, + "step": 3552 + }, + { + "epoch": 0.9617101880665675, + "grad_norm": 0.20842507481575012, + "learning_rate": 8.5915679579478e-05, + "loss": 1.2412, + "step": 3554 + }, + { + "epoch": 0.9622513868218103, + "grad_norm": 0.21368186175823212, + "learning_rate": 8.589376979415164e-05, + "loss": 1.2435, + "step": 3556 + }, + { + "epoch": 0.9627925855770532, + "grad_norm": 0.23424792289733887, + "learning_rate": 8.587184577883018e-05, + "loss": 1.2162, + "step": 3558 + }, + { + "epoch": 0.963333784332296, + "grad_norm": 0.2133053094148636, + "learning_rate": 8.584990754220536e-05, + "loss": 1.2279, + "step": 3560 + }, + { + "epoch": 0.9638749830875389, + "grad_norm": 0.2146514356136322, + "learning_rate": 8.582795509297453e-05, + "loss": 1.2622, + "step": 3562 + }, + { + "epoch": 0.9644161818427818, + "grad_norm": 0.20293276011943817, + "learning_rate": 8.580598843984069e-05, + "loss": 1.2154, + "step": 3564 + }, + { + "epoch": 0.9649573805980246, + "grad_norm": 0.2056434154510498, + "learning_rate": 8.578400759151244e-05, + "loss": 1.2203, + "step": 3566 + }, + { + "epoch": 0.9654985793532674, + "grad_norm": 0.20615997910499573, + "learning_rate": 8.576201255670406e-05, + "loss": 1.2452, + "step": 3568 + }, + { + "epoch": 0.9660397781085104, + "grad_norm": 0.2047366350889206, + "learning_rate": 8.574000334413541e-05, + "loss": 1.2406, + "step": 3570 + }, + { + "epoch": 0.9665809768637532, + "grad_norm": 0.2215116322040558, + "learning_rate": 8.571797996253201e-05, + "loss": 1.2117, + "step": 3572 + }, + { + "epoch": 0.967122175618996, + "grad_norm": 0.21521443128585815, + "learning_rate": 8.569594242062494e-05, + "loss": 1.2336, + "step": 3574 + }, + { + "epoch": 0.967663374374239, + "grad_norm": 0.20681914687156677, + "learning_rate": 8.567389072715095e-05, + "loss": 1.2217, + "step": 3576 + }, + { + "epoch": 0.9682045731294818, + "grad_norm": 0.21918296813964844, + "learning_rate": 8.56518248908524e-05, + "loss": 1.2301, + "step": 3578 + }, + { + "epoch": 0.9687457718847247, + "grad_norm": 0.21497774124145508, + "learning_rate": 8.562974492047717e-05, + "loss": 1.2128, + "step": 3580 + }, + { + "epoch": 0.9692869706399675, + "grad_norm": 0.23987126350402832, + "learning_rate": 8.560765082477887e-05, + "loss": 1.2442, + "step": 3582 + }, + { + "epoch": 0.9698281693952104, + "grad_norm": 0.25350961089134216, + "learning_rate": 8.558554261251663e-05, + "loss": 1.2145, + "step": 3584 + }, + { + "epoch": 0.9703693681504533, + "grad_norm": 0.21898071467876434, + "learning_rate": 8.556342029245518e-05, + "loss": 1.2319, + "step": 3586 + }, + { + "epoch": 0.9709105669056961, + "grad_norm": 0.20007579028606415, + "learning_rate": 8.554128387336489e-05, + "loss": 1.2261, + "step": 3588 + }, + { + "epoch": 0.9714517656609389, + "grad_norm": 0.2057974636554718, + "learning_rate": 8.551913336402167e-05, + "loss": 1.2354, + "step": 3590 + }, + { + "epoch": 0.9719929644161819, + "grad_norm": 0.22897422313690186, + "learning_rate": 8.549696877320701e-05, + "loss": 1.2264, + "step": 3592 + }, + { + "epoch": 0.9725341631714247, + "grad_norm": 0.21826642751693726, + "learning_rate": 8.547479010970805e-05, + "loss": 1.2464, + "step": 3594 + }, + { + "epoch": 0.9730753619266675, + "grad_norm": 0.24179087579250336, + "learning_rate": 8.545259738231744e-05, + "loss": 1.2296, + "step": 3596 + }, + { + "epoch": 0.9736165606819104, + "grad_norm": 0.21856723725795746, + "learning_rate": 8.543039059983344e-05, + "loss": 1.2291, + "step": 3598 + }, + { + "epoch": 0.9741577594371533, + "grad_norm": 0.22182489931583405, + "learning_rate": 8.540816977105986e-05, + "loss": 1.244, + "step": 3600 + }, + { + "epoch": 0.9746989581923962, + "grad_norm": 0.276157021522522, + "learning_rate": 8.538593490480612e-05, + "loss": 1.2137, + "step": 3602 + }, + { + "epoch": 0.975240156947639, + "grad_norm": 0.2525583505630493, + "learning_rate": 8.536368600988715e-05, + "loss": 1.2271, + "step": 3604 + }, + { + "epoch": 0.9757813557028819, + "grad_norm": 0.2494456022977829, + "learning_rate": 8.534142309512348e-05, + "loss": 1.2274, + "step": 3606 + }, + { + "epoch": 0.9763225544581248, + "grad_norm": 0.23215800523757935, + "learning_rate": 8.531914616934119e-05, + "loss": 1.2183, + "step": 3608 + }, + { + "epoch": 0.9768637532133676, + "grad_norm": 0.22265788912773132, + "learning_rate": 8.529685524137188e-05, + "loss": 1.2279, + "step": 3610 + }, + { + "epoch": 0.9774049519686104, + "grad_norm": 0.23256999254226685, + "learning_rate": 8.527455032005278e-05, + "loss": 1.2368, + "step": 3612 + }, + { + "epoch": 0.9779461507238534, + "grad_norm": 0.22369737923145294, + "learning_rate": 8.52522314142266e-05, + "loss": 1.2486, + "step": 3614 + }, + { + "epoch": 0.9784873494790962, + "grad_norm": 0.21790017187595367, + "learning_rate": 8.522989853274159e-05, + "loss": 1.2191, + "step": 3616 + }, + { + "epoch": 0.979028548234339, + "grad_norm": 0.20743697881698608, + "learning_rate": 8.520755168445162e-05, + "loss": 1.2325, + "step": 3618 + }, + { + "epoch": 0.9795697469895819, + "grad_norm": 0.20409737527370453, + "learning_rate": 8.518519087821599e-05, + "loss": 1.2286, + "step": 3620 + }, + { + "epoch": 0.9801109457448248, + "grad_norm": 0.21322283148765564, + "learning_rate": 8.51628161228996e-05, + "loss": 1.2238, + "step": 3622 + }, + { + "epoch": 0.9806521445000677, + "grad_norm": 0.2082875818014145, + "learning_rate": 8.514042742737289e-05, + "loss": 1.2297, + "step": 3624 + }, + { + "epoch": 0.9811933432553105, + "grad_norm": 0.205684632062912, + "learning_rate": 8.511802480051178e-05, + "loss": 1.2187, + "step": 3626 + }, + { + "epoch": 0.9817345420105533, + "grad_norm": 0.20009024441242218, + "learning_rate": 8.509560825119772e-05, + "loss": 1.2217, + "step": 3628 + }, + { + "epoch": 0.9822757407657963, + "grad_norm": 0.24374030530452728, + "learning_rate": 8.507317778831774e-05, + "loss": 1.2232, + "step": 3630 + }, + { + "epoch": 0.9828169395210391, + "grad_norm": 0.2268858402967453, + "learning_rate": 8.505073342076429e-05, + "loss": 1.2308, + "step": 3632 + }, + { + "epoch": 0.9833581382762819, + "grad_norm": 1.9946271181106567, + "learning_rate": 8.502827515743541e-05, + "loss": 1.2203, + "step": 3634 + }, + { + "epoch": 0.9838993370315249, + "grad_norm": 0.3267524838447571, + "learning_rate": 8.500580300723464e-05, + "loss": 1.2236, + "step": 3636 + }, + { + "epoch": 0.9844405357867677, + "grad_norm": 0.2528623640537262, + "learning_rate": 8.498331697907096e-05, + "loss": 1.2204, + "step": 3638 + }, + { + "epoch": 0.9849817345420105, + "grad_norm": 0.22566676139831543, + "learning_rate": 8.496081708185895e-05, + "loss": 1.2249, + "step": 3640 + }, + { + "epoch": 0.9855229332972534, + "grad_norm": 0.24136385321617126, + "learning_rate": 8.493830332451861e-05, + "loss": 1.2328, + "step": 3642 + }, + { + "epoch": 0.9860641320524963, + "grad_norm": 0.2164882868528366, + "learning_rate": 8.491577571597546e-05, + "loss": 1.2525, + "step": 3644 + }, + { + "epoch": 0.9866053308077392, + "grad_norm": 0.22188369929790497, + "learning_rate": 8.489323426516054e-05, + "loss": 1.2069, + "step": 3646 + }, + { + "epoch": 0.987146529562982, + "grad_norm": 0.23576316237449646, + "learning_rate": 8.487067898101031e-05, + "loss": 1.2429, + "step": 3648 + }, + { + "epoch": 0.9876877283182248, + "grad_norm": 0.2139056921005249, + "learning_rate": 8.484810987246678e-05, + "loss": 1.2196, + "step": 3650 + }, + { + "epoch": 0.9882289270734678, + "grad_norm": 0.22105053067207336, + "learning_rate": 8.482552694847744e-05, + "loss": 1.2389, + "step": 3652 + }, + { + "epoch": 0.9887701258287106, + "grad_norm": 0.22355502843856812, + "learning_rate": 8.480293021799518e-05, + "loss": 1.2162, + "step": 3654 + }, + { + "epoch": 0.9893113245839534, + "grad_norm": 0.23202285170555115, + "learning_rate": 8.478031968997845e-05, + "loss": 1.2233, + "step": 3656 + }, + { + "epoch": 0.9898525233391963, + "grad_norm": 0.209551602602005, + "learning_rate": 8.475769537339115e-05, + "loss": 1.2196, + "step": 3658 + }, + { + "epoch": 0.9903937220944392, + "grad_norm": 0.23315675556659698, + "learning_rate": 8.473505727720261e-05, + "loss": 1.2342, + "step": 3660 + }, + { + "epoch": 0.990934920849682, + "grad_norm": 0.22182142734527588, + "learning_rate": 8.471240541038765e-05, + "loss": 1.2239, + "step": 3662 + }, + { + "epoch": 0.9914761196049249, + "grad_norm": 0.2126477062702179, + "learning_rate": 8.468973978192654e-05, + "loss": 1.2344, + "step": 3664 + }, + { + "epoch": 0.9920173183601678, + "grad_norm": 0.19775603711605072, + "learning_rate": 8.466706040080504e-05, + "loss": 1.2101, + "step": 3666 + }, + { + "epoch": 0.9925585171154107, + "grad_norm": 0.21096171438694, + "learning_rate": 8.46443672760143e-05, + "loss": 1.2345, + "step": 3668 + }, + { + "epoch": 0.9930997158706535, + "grad_norm": 0.2033991813659668, + "learning_rate": 8.462166041655098e-05, + "loss": 1.2307, + "step": 3670 + }, + { + "epoch": 0.9936409146258963, + "grad_norm": 0.1952131688594818, + "learning_rate": 8.459893983141714e-05, + "loss": 1.2362, + "step": 3672 + }, + { + "epoch": 0.9941821133811393, + "grad_norm": 0.2031036764383316, + "learning_rate": 8.45762055296203e-05, + "loss": 1.2249, + "step": 3674 + }, + { + "epoch": 0.9947233121363821, + "grad_norm": 0.1928708553314209, + "learning_rate": 8.455345752017343e-05, + "loss": 1.2276, + "step": 3676 + }, + { + "epoch": 0.9952645108916249, + "grad_norm": 0.1981291025876999, + "learning_rate": 8.453069581209489e-05, + "loss": 1.2154, + "step": 3678 + }, + { + "epoch": 0.9958057096468678, + "grad_norm": 0.2076874077320099, + "learning_rate": 8.450792041440856e-05, + "loss": 1.2306, + "step": 3680 + }, + { + "epoch": 0.9963469084021107, + "grad_norm": 0.23212751746177673, + "learning_rate": 8.448513133614364e-05, + "loss": 1.2212, + "step": 3682 + }, + { + "epoch": 0.9968881071573535, + "grad_norm": 0.22317342460155487, + "learning_rate": 8.446232858633481e-05, + "loss": 1.2364, + "step": 3684 + }, + { + "epoch": 0.9974293059125964, + "grad_norm": 0.2007218599319458, + "learning_rate": 8.443951217402216e-05, + "loss": 1.223, + "step": 3686 + }, + { + "epoch": 0.9979705046678392, + "grad_norm": 0.2146870344877243, + "learning_rate": 8.441668210825122e-05, + "loss": 1.2248, + "step": 3688 + }, + { + "epoch": 0.9985117034230822, + "grad_norm": 0.18961116671562195, + "learning_rate": 8.43938383980729e-05, + "loss": 1.2217, + "step": 3690 + }, + { + "epoch": 0.999052902178325, + "grad_norm": 0.2126687914133072, + "learning_rate": 8.437098105254353e-05, + "loss": 1.1985, + "step": 3692 + }, + { + "epoch": 0.9995941009335678, + "grad_norm": 0.2034890204668045, + "learning_rate": 8.434811008072486e-05, + "loss": 1.2364, + "step": 3694 + }, + { + "epoch": 1.0, + "grad_norm": 0.25010621547698975, + "learning_rate": 8.432522549168402e-05, + "loss": 1.2423, + "step": 3696 + }, + { + "epoch": 1.0005411987552428, + "grad_norm": 0.3276897072792053, + "learning_rate": 8.430232729449353e-05, + "loss": 1.1664, + "step": 3698 + }, + { + "epoch": 1.0010823975104857, + "grad_norm": 0.25672757625579834, + "learning_rate": 8.427941549823134e-05, + "loss": 1.1803, + "step": 3700 + }, + { + "epoch": 1.0016235962657285, + "grad_norm": 0.24125026166439056, + "learning_rate": 8.42564901119808e-05, + "loss": 1.174, + "step": 3702 + }, + { + "epoch": 1.0021647950209716, + "grad_norm": 0.23144879937171936, + "learning_rate": 8.42335511448306e-05, + "loss": 1.1553, + "step": 3704 + }, + { + "epoch": 1.0027059937762144, + "grad_norm": 0.20953825116157532, + "learning_rate": 8.421059860587481e-05, + "loss": 1.153, + "step": 3706 + }, + { + "epoch": 1.0032471925314572, + "grad_norm": 0.22265411913394928, + "learning_rate": 8.418763250421293e-05, + "loss": 1.1571, + "step": 3708 + }, + { + "epoch": 1.0037883912867, + "grad_norm": 0.21143150329589844, + "learning_rate": 8.416465284894983e-05, + "loss": 1.1711, + "step": 3710 + }, + { + "epoch": 1.0043295900419429, + "grad_norm": 0.22948767244815826, + "learning_rate": 8.41416596491957e-05, + "loss": 1.1781, + "step": 3712 + }, + { + "epoch": 1.0048707887971857, + "grad_norm": 0.2108934074640274, + "learning_rate": 8.411865291406618e-05, + "loss": 1.1737, + "step": 3714 + }, + { + "epoch": 1.0054119875524286, + "grad_norm": 0.21712428331375122, + "learning_rate": 8.409563265268218e-05, + "loss": 1.1585, + "step": 3716 + }, + { + "epoch": 1.0059531863076714, + "grad_norm": 0.20881038904190063, + "learning_rate": 8.407259887417007e-05, + "loss": 1.1594, + "step": 3718 + }, + { + "epoch": 1.0064943850629144, + "grad_norm": 0.22524814307689667, + "learning_rate": 8.404955158766153e-05, + "loss": 1.1535, + "step": 3720 + }, + { + "epoch": 1.0070355838181573, + "grad_norm": 0.2055925875902176, + "learning_rate": 8.402649080229357e-05, + "loss": 1.1518, + "step": 3722 + }, + { + "epoch": 1.0075767825734, + "grad_norm": 0.22745977342128754, + "learning_rate": 8.40034165272086e-05, + "loss": 1.1666, + "step": 3724 + }, + { + "epoch": 1.008117981328643, + "grad_norm": 0.21135374903678894, + "learning_rate": 8.398032877155435e-05, + "loss": 1.1599, + "step": 3726 + }, + { + "epoch": 1.0086591800838858, + "grad_norm": 0.22054436802864075, + "learning_rate": 8.395722754448392e-05, + "loss": 1.1608, + "step": 3728 + }, + { + "epoch": 1.0092003788391286, + "grad_norm": 0.24490900337696075, + "learning_rate": 8.393411285515571e-05, + "loss": 1.1391, + "step": 3730 + }, + { + "epoch": 1.0097415775943714, + "grad_norm": 0.21600517630577087, + "learning_rate": 8.39109847127335e-05, + "loss": 1.186, + "step": 3732 + }, + { + "epoch": 1.0102827763496145, + "grad_norm": 0.21650472283363342, + "learning_rate": 8.388784312638638e-05, + "loss": 1.1682, + "step": 3734 + }, + { + "epoch": 1.0108239751048573, + "grad_norm": 0.2596750557422638, + "learning_rate": 8.386468810528875e-05, + "loss": 1.1753, + "step": 3736 + }, + { + "epoch": 1.0113651738601002, + "grad_norm": 0.20751477777957916, + "learning_rate": 8.38415196586204e-05, + "loss": 1.1714, + "step": 3738 + }, + { + "epoch": 1.011906372615343, + "grad_norm": 0.20294234156608582, + "learning_rate": 8.381833779556638e-05, + "loss": 1.1517, + "step": 3740 + }, + { + "epoch": 1.0124475713705858, + "grad_norm": 0.2066117227077484, + "learning_rate": 8.379514252531709e-05, + "loss": 1.141, + "step": 3742 + }, + { + "epoch": 1.0129887701258287, + "grad_norm": 0.21005451679229736, + "learning_rate": 8.377193385706823e-05, + "loss": 1.1613, + "step": 3744 + }, + { + "epoch": 1.0135299688810715, + "grad_norm": 0.21950848400592804, + "learning_rate": 8.374871180002082e-05, + "loss": 1.1692, + "step": 3746 + }, + { + "epoch": 1.0140711676363143, + "grad_norm": 0.2529325485229492, + "learning_rate": 8.372547636338117e-05, + "loss": 1.1721, + "step": 3748 + }, + { + "epoch": 1.0146123663915574, + "grad_norm": 0.24388043582439423, + "learning_rate": 8.370222755636094e-05, + "loss": 1.1608, + "step": 3750 + }, + { + "epoch": 1.0151535651468002, + "grad_norm": 0.22465898096561432, + "learning_rate": 8.367896538817704e-05, + "loss": 1.1759, + "step": 3752 + }, + { + "epoch": 1.015694763902043, + "grad_norm": 0.24493010342121124, + "learning_rate": 8.365568986805172e-05, + "loss": 1.1578, + "step": 3754 + }, + { + "epoch": 1.0162359626572859, + "grad_norm": 0.24231097102165222, + "learning_rate": 8.363240100521249e-05, + "loss": 1.1726, + "step": 3756 + }, + { + "epoch": 1.0167771614125287, + "grad_norm": 0.20873676240444183, + "learning_rate": 8.360909880889214e-05, + "loss": 1.1745, + "step": 3758 + }, + { + "epoch": 1.0173183601677716, + "grad_norm": 0.2147914171218872, + "learning_rate": 8.35857832883288e-05, + "loss": 1.1444, + "step": 3760 + }, + { + "epoch": 1.0178595589230144, + "grad_norm": 0.22383643686771393, + "learning_rate": 8.356245445276585e-05, + "loss": 1.1597, + "step": 3762 + }, + { + "epoch": 1.0184007576782574, + "grad_norm": 0.23194357752799988, + "learning_rate": 8.35391123114519e-05, + "loss": 1.152, + "step": 3764 + }, + { + "epoch": 1.0189419564335003, + "grad_norm": 0.23192541301250458, + "learning_rate": 8.351575687364095e-05, + "loss": 1.187, + "step": 3766 + }, + { + "epoch": 1.019483155188743, + "grad_norm": 0.24341407418251038, + "learning_rate": 8.349238814859217e-05, + "loss": 1.1569, + "step": 3768 + }, + { + "epoch": 1.020024353943986, + "grad_norm": 0.22612228989601135, + "learning_rate": 8.346900614557001e-05, + "loss": 1.1721, + "step": 3770 + }, + { + "epoch": 1.0205655526992288, + "grad_norm": 0.22228077054023743, + "learning_rate": 8.344561087384426e-05, + "loss": 1.1564, + "step": 3772 + }, + { + "epoch": 1.0211067514544716, + "grad_norm": 0.22433893382549286, + "learning_rate": 8.342220234268987e-05, + "loss": 1.14, + "step": 3774 + }, + { + "epoch": 1.0216479502097144, + "grad_norm": 0.22573336958885193, + "learning_rate": 8.339878056138712e-05, + "loss": 1.1738, + "step": 3776 + }, + { + "epoch": 1.0221891489649573, + "grad_norm": 0.21434976160526276, + "learning_rate": 8.337534553922151e-05, + "loss": 1.1667, + "step": 3778 + }, + { + "epoch": 1.0227303477202003, + "grad_norm": 0.20959998667240143, + "learning_rate": 8.335189728548381e-05, + "loss": 1.1811, + "step": 3780 + }, + { + "epoch": 1.0232715464754432, + "grad_norm": 0.21443971991539001, + "learning_rate": 8.332843580946999e-05, + "loss": 1.1715, + "step": 3782 + }, + { + "epoch": 1.023812745230686, + "grad_norm": 0.21527649462223053, + "learning_rate": 8.330496112048131e-05, + "loss": 1.1938, + "step": 3784 + }, + { + "epoch": 1.0243539439859288, + "grad_norm": 0.22661571204662323, + "learning_rate": 8.328147322782424e-05, + "loss": 1.1676, + "step": 3786 + }, + { + "epoch": 1.0248951427411717, + "grad_norm": 0.23696081340312958, + "learning_rate": 8.325797214081052e-05, + "loss": 1.1905, + "step": 3788 + }, + { + "epoch": 1.0254363414964145, + "grad_norm": 0.22171412408351898, + "learning_rate": 8.323445786875709e-05, + "loss": 1.1466, + "step": 3790 + }, + { + "epoch": 1.0259775402516573, + "grad_norm": 0.20770622789859772, + "learning_rate": 8.321093042098612e-05, + "loss": 1.1502, + "step": 3792 + }, + { + "epoch": 1.0265187390069004, + "grad_norm": 0.20591381192207336, + "learning_rate": 8.318738980682499e-05, + "loss": 1.159, + "step": 3794 + }, + { + "epoch": 1.0270599377621432, + "grad_norm": 0.22844432294368744, + "learning_rate": 8.316383603560633e-05, + "loss": 1.1805, + "step": 3796 + }, + { + "epoch": 1.027601136517386, + "grad_norm": 0.22971852123737335, + "learning_rate": 8.314026911666799e-05, + "loss": 1.1606, + "step": 3798 + }, + { + "epoch": 1.0281423352726289, + "grad_norm": 0.20094196498394012, + "learning_rate": 8.311668905935301e-05, + "loss": 1.1648, + "step": 3800 + }, + { + "epoch": 1.0286835340278717, + "grad_norm": 0.21529503166675568, + "learning_rate": 8.309309587300964e-05, + "loss": 1.1672, + "step": 3802 + }, + { + "epoch": 1.0292247327831145, + "grad_norm": 0.21473126113414764, + "learning_rate": 8.306948956699132e-05, + "loss": 1.1534, + "step": 3804 + }, + { + "epoch": 1.0297659315383574, + "grad_norm": 0.21291570365428925, + "learning_rate": 8.304587015065674e-05, + "loss": 1.1472, + "step": 3806 + }, + { + "epoch": 1.0303071302936004, + "grad_norm": 0.21232275664806366, + "learning_rate": 8.302223763336976e-05, + "loss": 1.1697, + "step": 3808 + }, + { + "epoch": 1.0308483290488433, + "grad_norm": 0.23822760581970215, + "learning_rate": 8.299859202449939e-05, + "loss": 1.1677, + "step": 3810 + }, + { + "epoch": 1.031389527804086, + "grad_norm": 0.24717482924461365, + "learning_rate": 8.297493333341992e-05, + "loss": 1.171, + "step": 3812 + }, + { + "epoch": 1.031930726559329, + "grad_norm": 0.2262924462556839, + "learning_rate": 8.295126156951076e-05, + "loss": 1.1654, + "step": 3814 + }, + { + "epoch": 1.0324719253145718, + "grad_norm": 0.22285550832748413, + "learning_rate": 8.292757674215652e-05, + "loss": 1.1583, + "step": 3816 + }, + { + "epoch": 1.0330131240698146, + "grad_norm": 0.22245456278324127, + "learning_rate": 8.290387886074698e-05, + "loss": 1.146, + "step": 3818 + }, + { + "epoch": 1.0335543228250574, + "grad_norm": 0.2081366628408432, + "learning_rate": 8.288016793467714e-05, + "loss": 1.1551, + "step": 3820 + }, + { + "epoch": 1.0340955215803003, + "grad_norm": 0.20822925865650177, + "learning_rate": 8.28564439733471e-05, + "loss": 1.1625, + "step": 3822 + }, + { + "epoch": 1.0346367203355433, + "grad_norm": 0.2068616896867752, + "learning_rate": 8.283270698616218e-05, + "loss": 1.1715, + "step": 3824 + }, + { + "epoch": 1.0351779190907862, + "grad_norm": 0.19563379883766174, + "learning_rate": 8.280895698253286e-05, + "loss": 1.1446, + "step": 3826 + }, + { + "epoch": 1.035719117846029, + "grad_norm": 0.214201420545578, + "learning_rate": 8.278519397187474e-05, + "loss": 1.1653, + "step": 3828 + }, + { + "epoch": 1.0362603166012718, + "grad_norm": 0.22149145603179932, + "learning_rate": 8.276141796360865e-05, + "loss": 1.1601, + "step": 3830 + }, + { + "epoch": 1.0368015153565147, + "grad_norm": 0.20305697619915009, + "learning_rate": 8.273762896716049e-05, + "loss": 1.1684, + "step": 3832 + }, + { + "epoch": 1.0373427141117575, + "grad_norm": 0.22398139536380768, + "learning_rate": 8.271382699196135e-05, + "loss": 1.1711, + "step": 3834 + }, + { + "epoch": 1.0378839128670003, + "grad_norm": 0.23009321093559265, + "learning_rate": 8.26900120474475e-05, + "loss": 1.1759, + "step": 3836 + }, + { + "epoch": 1.0384251116222432, + "grad_norm": 0.2264915555715561, + "learning_rate": 8.266618414306028e-05, + "loss": 1.1633, + "step": 3838 + }, + { + "epoch": 1.0389663103774862, + "grad_norm": 0.20079673826694489, + "learning_rate": 8.26423432882462e-05, + "loss": 1.1672, + "step": 3840 + }, + { + "epoch": 1.039507509132729, + "grad_norm": 0.20294247567653656, + "learning_rate": 8.261848949245694e-05, + "loss": 1.1736, + "step": 3842 + }, + { + "epoch": 1.0400487078879719, + "grad_norm": 0.21752537786960602, + "learning_rate": 8.259462276514924e-05, + "loss": 1.1472, + "step": 3844 + }, + { + "epoch": 1.0405899066432147, + "grad_norm": 0.21777775883674622, + "learning_rate": 8.257074311578504e-05, + "loss": 1.1658, + "step": 3846 + }, + { + "epoch": 1.0411311053984575, + "grad_norm": 0.2083350569009781, + "learning_rate": 8.254685055383135e-05, + "loss": 1.1561, + "step": 3848 + }, + { + "epoch": 1.0416723041537004, + "grad_norm": 0.22833497822284698, + "learning_rate": 8.252294508876031e-05, + "loss": 1.1512, + "step": 3850 + }, + { + "epoch": 1.0422135029089432, + "grad_norm": 0.22746221721172333, + "learning_rate": 8.249902673004917e-05, + "loss": 1.1617, + "step": 3852 + }, + { + "epoch": 1.0427547016641863, + "grad_norm": 0.23666687309741974, + "learning_rate": 8.247509548718035e-05, + "loss": 1.1588, + "step": 3854 + }, + { + "epoch": 1.043295900419429, + "grad_norm": 0.22944943606853485, + "learning_rate": 8.245115136964128e-05, + "loss": 1.184, + "step": 3856 + }, + { + "epoch": 1.043837099174672, + "grad_norm": 0.22142989933490753, + "learning_rate": 8.242719438692458e-05, + "loss": 1.1535, + "step": 3858 + }, + { + "epoch": 1.0443782979299148, + "grad_norm": 0.22694510221481323, + "learning_rate": 8.240322454852791e-05, + "loss": 1.16, + "step": 3860 + }, + { + "epoch": 1.0449194966851576, + "grad_norm": 0.2044788897037506, + "learning_rate": 8.237924186395408e-05, + "loss": 1.1324, + "step": 3862 + }, + { + "epoch": 1.0454606954404004, + "grad_norm": 0.1951093226671219, + "learning_rate": 8.235524634271094e-05, + "loss": 1.1572, + "step": 3864 + }, + { + "epoch": 1.0460018941956433, + "grad_norm": 0.2163376808166504, + "learning_rate": 8.233123799431148e-05, + "loss": 1.1703, + "step": 3866 + }, + { + "epoch": 1.0465430929508863, + "grad_norm": 0.2503316402435303, + "learning_rate": 8.230721682827372e-05, + "loss": 1.1462, + "step": 3868 + }, + { + "epoch": 1.0470842917061292, + "grad_norm": 0.30522701144218445, + "learning_rate": 8.228318285412081e-05, + "loss": 1.1584, + "step": 3870 + }, + { + "epoch": 1.047625490461372, + "grad_norm": 1.271936058998108, + "learning_rate": 8.225913608138095e-05, + "loss": 1.1548, + "step": 3872 + }, + { + "epoch": 1.0481666892166148, + "grad_norm": 0.40829160809516907, + "learning_rate": 8.223507651958743e-05, + "loss": 1.1716, + "step": 3874 + }, + { + "epoch": 1.0487078879718577, + "grad_norm": 0.4559307396411896, + "learning_rate": 8.22110041782786e-05, + "loss": 1.1752, + "step": 3876 + }, + { + "epoch": 1.0492490867271005, + "grad_norm": 0.30671998858451843, + "learning_rate": 8.21869190669979e-05, + "loss": 1.1629, + "step": 3878 + }, + { + "epoch": 1.0497902854823433, + "grad_norm": 0.2638482451438904, + "learning_rate": 8.216282119529378e-05, + "loss": 1.1504, + "step": 3880 + }, + { + "epoch": 1.0503314842375862, + "grad_norm": 0.24638111889362335, + "learning_rate": 8.213871057271978e-05, + "loss": 1.1551, + "step": 3882 + }, + { + "epoch": 1.0508726829928292, + "grad_norm": 0.25287169218063354, + "learning_rate": 8.21145872088345e-05, + "loss": 1.1707, + "step": 3884 + }, + { + "epoch": 1.051413881748072, + "grad_norm": 0.23718564212322235, + "learning_rate": 8.209045111320161e-05, + "loss": 1.1642, + "step": 3886 + }, + { + "epoch": 1.0519550805033149, + "grad_norm": 0.21913887560367584, + "learning_rate": 8.20663022953898e-05, + "loss": 1.175, + "step": 3888 + }, + { + "epoch": 1.0524962792585577, + "grad_norm": 0.24881695210933685, + "learning_rate": 8.204214076497278e-05, + "loss": 1.1722, + "step": 3890 + }, + { + "epoch": 1.0530374780138005, + "grad_norm": 0.24020962417125702, + "learning_rate": 8.201796653152936e-05, + "loss": 1.1563, + "step": 3892 + }, + { + "epoch": 1.0535786767690434, + "grad_norm": 0.20904816687107086, + "learning_rate": 8.199377960464333e-05, + "loss": 1.1779, + "step": 3894 + }, + { + "epoch": 1.0541198755242862, + "grad_norm": 0.2172718644142151, + "learning_rate": 8.196957999390356e-05, + "loss": 1.1471, + "step": 3896 + }, + { + "epoch": 1.054661074279529, + "grad_norm": 0.2129276990890503, + "learning_rate": 8.194536770890391e-05, + "loss": 1.1648, + "step": 3898 + }, + { + "epoch": 1.055202273034772, + "grad_norm": 0.22173850238323212, + "learning_rate": 8.192114275924327e-05, + "loss": 1.1558, + "step": 3900 + }, + { + "epoch": 1.055743471790015, + "grad_norm": 0.20564569532871246, + "learning_rate": 8.18969051545256e-05, + "loss": 1.1654, + "step": 3902 + }, + { + "epoch": 1.0562846705452578, + "grad_norm": 0.20729513466358185, + "learning_rate": 8.187265490435978e-05, + "loss": 1.164, + "step": 3904 + }, + { + "epoch": 1.0568258693005006, + "grad_norm": 0.23436667025089264, + "learning_rate": 8.184839201835981e-05, + "loss": 1.1603, + "step": 3906 + }, + { + "epoch": 1.0573670680557434, + "grad_norm": 0.2048719972372055, + "learning_rate": 8.182411650614464e-05, + "loss": 1.1805, + "step": 3908 + }, + { + "epoch": 1.0579082668109863, + "grad_norm": 0.21470339596271515, + "learning_rate": 8.17998283773382e-05, + "loss": 1.1507, + "step": 3910 + }, + { + "epoch": 1.058449465566229, + "grad_norm": 0.2106187790632248, + "learning_rate": 8.177552764156951e-05, + "loss": 1.1654, + "step": 3912 + }, + { + "epoch": 1.0589906643214722, + "grad_norm": 0.2404891699552536, + "learning_rate": 8.17512143084725e-05, + "loss": 1.1374, + "step": 3914 + }, + { + "epoch": 1.059531863076715, + "grad_norm": 0.20607243478298187, + "learning_rate": 8.172688838768614e-05, + "loss": 1.1777, + "step": 3916 + }, + { + "epoch": 1.0600730618319578, + "grad_norm": 0.2091047763824463, + "learning_rate": 8.170254988885438e-05, + "loss": 1.1609, + "step": 3918 + }, + { + "epoch": 1.0606142605872007, + "grad_norm": 0.21691367030143738, + "learning_rate": 8.167819882162617e-05, + "loss": 1.1554, + "step": 3920 + }, + { + "epoch": 1.0611554593424435, + "grad_norm": 0.221197247505188, + "learning_rate": 8.16538351956554e-05, + "loss": 1.1579, + "step": 3922 + }, + { + "epoch": 1.0616966580976863, + "grad_norm": 0.21998266875743866, + "learning_rate": 8.162945902060099e-05, + "loss": 1.1558, + "step": 3924 + }, + { + "epoch": 1.0622378568529292, + "grad_norm": 0.21962489187717438, + "learning_rate": 8.160507030612684e-05, + "loss": 1.1618, + "step": 3926 + }, + { + "epoch": 1.0627790556081722, + "grad_norm": 0.20770075917243958, + "learning_rate": 8.158066906190174e-05, + "loss": 1.1664, + "step": 3928 + }, + { + "epoch": 1.063320254363415, + "grad_norm": 0.20497769117355347, + "learning_rate": 8.155625529759951e-05, + "loss": 1.1577, + "step": 3930 + }, + { + "epoch": 1.0638614531186579, + "grad_norm": 0.21967573463916779, + "learning_rate": 8.153182902289897e-05, + "loss": 1.163, + "step": 3932 + }, + { + "epoch": 1.0644026518739007, + "grad_norm": 0.21568840742111206, + "learning_rate": 8.150739024748383e-05, + "loss": 1.1735, + "step": 3934 + }, + { + "epoch": 1.0649438506291435, + "grad_norm": 0.20917922258377075, + "learning_rate": 8.148293898104277e-05, + "loss": 1.1491, + "step": 3936 + }, + { + "epoch": 1.0654850493843864, + "grad_norm": 0.21231794357299805, + "learning_rate": 8.145847523326944e-05, + "loss": 1.1523, + "step": 3938 + }, + { + "epoch": 1.0660262481396292, + "grad_norm": 0.2191847562789917, + "learning_rate": 8.143399901386244e-05, + "loss": 1.154, + "step": 3940 + }, + { + "epoch": 1.066567446894872, + "grad_norm": 0.23816804587841034, + "learning_rate": 8.140951033252528e-05, + "loss": 1.1512, + "step": 3942 + }, + { + "epoch": 1.067108645650115, + "grad_norm": 0.20847490429878235, + "learning_rate": 8.138500919896649e-05, + "loss": 1.1584, + "step": 3944 + }, + { + "epoch": 1.067649844405358, + "grad_norm": 0.2102094441652298, + "learning_rate": 8.136049562289943e-05, + "loss": 1.1688, + "step": 3946 + }, + { + "epoch": 1.0681910431606008, + "grad_norm": 0.2000993937253952, + "learning_rate": 8.133596961404246e-05, + "loss": 1.1703, + "step": 3948 + }, + { + "epoch": 1.0687322419158436, + "grad_norm": 0.20474949479103088, + "learning_rate": 8.131143118211888e-05, + "loss": 1.1433, + "step": 3950 + }, + { + "epoch": 1.0692734406710864, + "grad_norm": 0.2065473198890686, + "learning_rate": 8.128688033685685e-05, + "loss": 1.1583, + "step": 3952 + }, + { + "epoch": 1.0698146394263293, + "grad_norm": 0.22109980881214142, + "learning_rate": 8.126231708798953e-05, + "loss": 1.1619, + "step": 3954 + }, + { + "epoch": 1.070355838181572, + "grad_norm": 0.2283199280500412, + "learning_rate": 8.123774144525492e-05, + "loss": 1.1649, + "step": 3956 + }, + { + "epoch": 1.070897036936815, + "grad_norm": 0.21421821415424347, + "learning_rate": 8.1213153418396e-05, + "loss": 1.1548, + "step": 3958 + }, + { + "epoch": 1.071438235692058, + "grad_norm": 0.20380717515945435, + "learning_rate": 8.118855301716061e-05, + "loss": 1.1502, + "step": 3960 + }, + { + "epoch": 1.0719794344473008, + "grad_norm": 0.2934326231479645, + "learning_rate": 8.116394025130156e-05, + "loss": 1.1655, + "step": 3962 + }, + { + "epoch": 1.0725206332025436, + "grad_norm": 0.637283444404602, + "learning_rate": 8.113931513057647e-05, + "loss": 1.163, + "step": 3964 + }, + { + "epoch": 1.0730618319577865, + "grad_norm": 0.3647371828556061, + "learning_rate": 8.111467766474793e-05, + "loss": 1.1736, + "step": 3966 + }, + { + "epoch": 1.0736030307130293, + "grad_norm": 0.8815774917602539, + "learning_rate": 8.10900278635834e-05, + "loss": 1.1622, + "step": 3968 + }, + { + "epoch": 1.0741442294682721, + "grad_norm": 1.764570713043213, + "learning_rate": 8.106536573685523e-05, + "loss": 1.1594, + "step": 3970 + }, + { + "epoch": 1.074685428223515, + "grad_norm": 0.295211523771286, + "learning_rate": 8.104069129434067e-05, + "loss": 1.1623, + "step": 3972 + }, + { + "epoch": 1.075226626978758, + "grad_norm": 0.5825856328010559, + "learning_rate": 8.10160045458218e-05, + "loss": 1.1773, + "step": 3974 + }, + { + "epoch": 1.0757678257340009, + "grad_norm": 0.40720388293266296, + "learning_rate": 8.099130550108566e-05, + "loss": 1.1581, + "step": 3976 + }, + { + "epoch": 1.0763090244892437, + "grad_norm": 0.3680490553379059, + "learning_rate": 8.096659416992414e-05, + "loss": 1.1747, + "step": 3978 + }, + { + "epoch": 1.0768502232444865, + "grad_norm": 0.2958071529865265, + "learning_rate": 8.094187056213393e-05, + "loss": 1.1517, + "step": 3980 + }, + { + "epoch": 1.0773914219997294, + "grad_norm": 0.28484034538269043, + "learning_rate": 8.09171346875167e-05, + "loss": 1.145, + "step": 3982 + }, + { + "epoch": 1.0779326207549722, + "grad_norm": 0.3037899434566498, + "learning_rate": 8.089238655587887e-05, + "loss": 1.1532, + "step": 3984 + }, + { + "epoch": 1.078473819510215, + "grad_norm": 0.27155500650405884, + "learning_rate": 8.086762617703181e-05, + "loss": 1.1713, + "step": 3986 + }, + { + "epoch": 1.079015018265458, + "grad_norm": 0.2606882154941559, + "learning_rate": 8.08428535607917e-05, + "loss": 1.1773, + "step": 3988 + }, + { + "epoch": 1.079556217020701, + "grad_norm": 0.22694651782512665, + "learning_rate": 8.081806871697959e-05, + "loss": 1.1613, + "step": 3990 + }, + { + "epoch": 1.0800974157759438, + "grad_norm": 0.23277917504310608, + "learning_rate": 8.079327165542135e-05, + "loss": 1.1573, + "step": 3992 + }, + { + "epoch": 1.0806386145311866, + "grad_norm": 0.24309536814689636, + "learning_rate": 8.076846238594774e-05, + "loss": 1.1787, + "step": 3994 + }, + { + "epoch": 1.0811798132864294, + "grad_norm": 0.23015187680721283, + "learning_rate": 8.074364091839432e-05, + "loss": 1.161, + "step": 3996 + }, + { + "epoch": 1.0817210120416723, + "grad_norm": 0.22507344186306, + "learning_rate": 8.071880726260149e-05, + "loss": 1.1693, + "step": 3998 + }, + { + "epoch": 1.082262210796915, + "grad_norm": 0.23293937742710114, + "learning_rate": 8.06939614284145e-05, + "loss": 1.1496, + "step": 4000 + }, + { + "epoch": 1.0828034095521581, + "grad_norm": 0.25221702456474304, + "learning_rate": 8.06691034256834e-05, + "loss": 1.1638, + "step": 4002 + }, + { + "epoch": 1.083344608307401, + "grad_norm": 0.4965110123157501, + "learning_rate": 8.064423326426313e-05, + "loss": 1.1511, + "step": 4004 + }, + { + "epoch": 1.0838858070626438, + "grad_norm": 0.23343190550804138, + "learning_rate": 8.061935095401336e-05, + "loss": 1.1619, + "step": 4006 + }, + { + "epoch": 1.0844270058178866, + "grad_norm": 0.24027478694915771, + "learning_rate": 8.059445650479862e-05, + "loss": 1.1642, + "step": 4008 + }, + { + "epoch": 1.0849682045731295, + "grad_norm": 0.2377437800168991, + "learning_rate": 8.056954992648828e-05, + "loss": 1.1653, + "step": 4010 + }, + { + "epoch": 1.0855094033283723, + "grad_norm": 0.21449270844459534, + "learning_rate": 8.054463122895645e-05, + "loss": 1.1623, + "step": 4012 + }, + { + "epoch": 1.0860506020836151, + "grad_norm": 0.22771266102790833, + "learning_rate": 8.051970042208214e-05, + "loss": 1.1848, + "step": 4014 + }, + { + "epoch": 1.086591800838858, + "grad_norm": 0.22908978164196014, + "learning_rate": 8.049475751574907e-05, + "loss": 1.1716, + "step": 4016 + }, + { + "epoch": 1.087132999594101, + "grad_norm": 0.21853183209896088, + "learning_rate": 8.046980251984579e-05, + "loss": 1.1488, + "step": 4018 + }, + { + "epoch": 1.0876741983493439, + "grad_norm": 0.2258382886648178, + "learning_rate": 8.044483544426565e-05, + "loss": 1.1516, + "step": 4020 + }, + { + "epoch": 1.0882153971045867, + "grad_norm": 0.22110265493392944, + "learning_rate": 8.04198562989068e-05, + "loss": 1.1651, + "step": 4022 + }, + { + "epoch": 1.0887565958598295, + "grad_norm": 0.2222428172826767, + "learning_rate": 8.039486509367213e-05, + "loss": 1.1689, + "step": 4024 + }, + { + "epoch": 1.0892977946150724, + "grad_norm": 0.26905685663223267, + "learning_rate": 8.036986183846937e-05, + "loss": 1.1664, + "step": 4026 + }, + { + "epoch": 1.0898389933703152, + "grad_norm": 0.2191791832447052, + "learning_rate": 8.0344846543211e-05, + "loss": 1.1662, + "step": 4028 + }, + { + "epoch": 1.090380192125558, + "grad_norm": 0.22855934500694275, + "learning_rate": 8.031981921781425e-05, + "loss": 1.1686, + "step": 4030 + }, + { + "epoch": 1.0909213908808009, + "grad_norm": 0.22431407868862152, + "learning_rate": 8.029477987220113e-05, + "loss": 1.1699, + "step": 4032 + }, + { + "epoch": 1.091462589636044, + "grad_norm": 0.21042400598526, + "learning_rate": 8.026972851629846e-05, + "loss": 1.1574, + "step": 4034 + }, + { + "epoch": 1.0920037883912868, + "grad_norm": 0.22836509346961975, + "learning_rate": 8.024466516003777e-05, + "loss": 1.1717, + "step": 4036 + }, + { + "epoch": 1.0925449871465296, + "grad_norm": 0.21852664649486542, + "learning_rate": 8.021958981335535e-05, + "loss": 1.148, + "step": 4038 + }, + { + "epoch": 1.0930861859017724, + "grad_norm": 0.23865893483161926, + "learning_rate": 8.01945024861923e-05, + "loss": 1.1716, + "step": 4040 + }, + { + "epoch": 1.0936273846570153, + "grad_norm": 0.21468840539455414, + "learning_rate": 8.01694031884944e-05, + "loss": 1.1609, + "step": 4042 + }, + { + "epoch": 1.094168583412258, + "grad_norm": 0.219735786318779, + "learning_rate": 8.014429193021219e-05, + "loss": 1.183, + "step": 4044 + }, + { + "epoch": 1.094709782167501, + "grad_norm": 0.23305675387382507, + "learning_rate": 8.011916872130099e-05, + "loss": 1.1545, + "step": 4046 + }, + { + "epoch": 1.095250980922744, + "grad_norm": 0.2143971025943756, + "learning_rate": 8.009403357172083e-05, + "loss": 1.155, + "step": 4048 + }, + { + "epoch": 1.0957921796779868, + "grad_norm": 0.20684604346752167, + "learning_rate": 8.006888649143646e-05, + "loss": 1.1501, + "step": 4050 + }, + { + "epoch": 1.0963333784332296, + "grad_norm": 0.22377672791481018, + "learning_rate": 8.00437274904174e-05, + "loss": 1.1559, + "step": 4052 + }, + { + "epoch": 1.0968745771884725, + "grad_norm": 0.20589472353458405, + "learning_rate": 8.001855657863787e-05, + "loss": 1.1682, + "step": 4054 + }, + { + "epoch": 1.0974157759437153, + "grad_norm": 0.20585069060325623, + "learning_rate": 7.999337376607677e-05, + "loss": 1.1539, + "step": 4056 + }, + { + "epoch": 1.0979569746989581, + "grad_norm": 0.2118963897228241, + "learning_rate": 7.996817906271782e-05, + "loss": 1.1888, + "step": 4058 + }, + { + "epoch": 1.098498173454201, + "grad_norm": 0.2089332491159439, + "learning_rate": 7.994297247854939e-05, + "loss": 1.1705, + "step": 4060 + }, + { + "epoch": 1.099039372209444, + "grad_norm": 0.2157619744539261, + "learning_rate": 7.991775402356453e-05, + "loss": 1.1468, + "step": 4062 + }, + { + "epoch": 1.0995805709646869, + "grad_norm": 0.21696984767913818, + "learning_rate": 7.989252370776108e-05, + "loss": 1.1585, + "step": 4064 + }, + { + "epoch": 1.1001217697199297, + "grad_norm": 0.2212984263896942, + "learning_rate": 7.98672815411415e-05, + "loss": 1.1624, + "step": 4066 + }, + { + "epoch": 1.1006629684751725, + "grad_norm": 0.2152889221906662, + "learning_rate": 7.984202753371299e-05, + "loss": 1.1794, + "step": 4068 + }, + { + "epoch": 1.1012041672304154, + "grad_norm": 0.22469989955425262, + "learning_rate": 7.981676169548747e-05, + "loss": 1.1651, + "step": 4070 + }, + { + "epoch": 1.1017453659856582, + "grad_norm": 0.2330954372882843, + "learning_rate": 7.979148403648146e-05, + "loss": 1.1536, + "step": 4072 + }, + { + "epoch": 1.102286564740901, + "grad_norm": 0.22769203782081604, + "learning_rate": 7.976619456671628e-05, + "loss": 1.1484, + "step": 4074 + }, + { + "epoch": 1.1028277634961439, + "grad_norm": 0.21332180500030518, + "learning_rate": 7.974089329621786e-05, + "loss": 1.1462, + "step": 4076 + }, + { + "epoch": 1.103368962251387, + "grad_norm": 0.20643813908100128, + "learning_rate": 7.971558023501679e-05, + "loss": 1.1538, + "step": 4078 + }, + { + "epoch": 1.1039101610066298, + "grad_norm": 0.2165161520242691, + "learning_rate": 7.96902553931484e-05, + "loss": 1.1447, + "step": 4080 + }, + { + "epoch": 1.1044513597618726, + "grad_norm": 0.2074541449546814, + "learning_rate": 7.96649187806527e-05, + "loss": 1.1592, + "step": 4082 + }, + { + "epoch": 1.1049925585171154, + "grad_norm": 0.2030552178621292, + "learning_rate": 7.963957040757424e-05, + "loss": 1.1648, + "step": 4084 + }, + { + "epoch": 1.1055337572723583, + "grad_norm": 0.8349897861480713, + "learning_rate": 7.961421028396239e-05, + "loss": 1.1549, + "step": 4086 + }, + { + "epoch": 1.106074956027601, + "grad_norm": 0.22756335139274597, + "learning_rate": 7.958883841987108e-05, + "loss": 1.1577, + "step": 4088 + }, + { + "epoch": 1.106616154782844, + "grad_norm": 0.21515905857086182, + "learning_rate": 7.956345482535892e-05, + "loss": 1.1522, + "step": 4090 + }, + { + "epoch": 1.1071573535380868, + "grad_norm": 0.211012601852417, + "learning_rate": 7.953805951048916e-05, + "loss": 1.1654, + "step": 4092 + }, + { + "epoch": 1.1076985522933298, + "grad_norm": 0.2122959941625595, + "learning_rate": 7.951265248532976e-05, + "loss": 1.1376, + "step": 4094 + }, + { + "epoch": 1.1082397510485726, + "grad_norm": 0.21735821664333344, + "learning_rate": 7.94872337599532e-05, + "loss": 1.1609, + "step": 4096 + }, + { + "epoch": 1.1087809498038155, + "grad_norm": 0.2157265990972519, + "learning_rate": 7.946180334443673e-05, + "loss": 1.1413, + "step": 4098 + }, + { + "epoch": 1.1093221485590583, + "grad_norm": 0.23656488955020905, + "learning_rate": 7.943636124886214e-05, + "loss": 1.1706, + "step": 4100 + }, + { + "epoch": 1.1098633473143011, + "grad_norm": 0.44128191471099854, + "learning_rate": 7.94109074833159e-05, + "loss": 1.1422, + "step": 4102 + }, + { + "epoch": 1.110404546069544, + "grad_norm": 0.30563896894454956, + "learning_rate": 7.938544205788909e-05, + "loss": 1.1624, + "step": 4104 + }, + { + "epoch": 1.1109457448247868, + "grad_norm": 0.3123810291290283, + "learning_rate": 7.935996498267742e-05, + "loss": 1.1615, + "step": 4106 + }, + { + "epoch": 1.1114869435800299, + "grad_norm": 0.2737880051136017, + "learning_rate": 7.933447626778119e-05, + "loss": 1.1816, + "step": 4108 + }, + { + "epoch": 1.1120281423352727, + "grad_norm": 0.22107571363449097, + "learning_rate": 7.930897592330535e-05, + "loss": 1.1561, + "step": 4110 + }, + { + "epoch": 1.1125693410905155, + "grad_norm": 0.2256803661584854, + "learning_rate": 7.928346395935945e-05, + "loss": 1.1749, + "step": 4112 + }, + { + "epoch": 1.1131105398457584, + "grad_norm": 0.2229342758655548, + "learning_rate": 7.925794038605766e-05, + "loss": 1.1587, + "step": 4114 + }, + { + "epoch": 1.1136517386010012, + "grad_norm": 0.211014524102211, + "learning_rate": 7.923240521351871e-05, + "loss": 1.1754, + "step": 4116 + }, + { + "epoch": 1.114192937356244, + "grad_norm": 0.2330767959356308, + "learning_rate": 7.920685845186595e-05, + "loss": 1.1499, + "step": 4118 + }, + { + "epoch": 1.1147341361114869, + "grad_norm": 0.21222126483917236, + "learning_rate": 7.918130011122737e-05, + "loss": 1.1561, + "step": 4120 + }, + { + "epoch": 1.11527533486673, + "grad_norm": 0.6723094582557678, + "learning_rate": 7.915573020173547e-05, + "loss": 1.1633, + "step": 4122 + }, + { + "epoch": 1.1158165336219728, + "grad_norm": 0.2816324532032013, + "learning_rate": 7.91301487335274e-05, + "loss": 1.1713, + "step": 4124 + }, + { + "epoch": 1.1163577323772156, + "grad_norm": 0.25782155990600586, + "learning_rate": 7.910455571674486e-05, + "loss": 1.1626, + "step": 4126 + }, + { + "epoch": 1.1168989311324584, + "grad_norm": 0.22253093123435974, + "learning_rate": 7.907895116153413e-05, + "loss": 1.1676, + "step": 4128 + }, + { + "epoch": 1.1174401298877013, + "grad_norm": 0.24996554851531982, + "learning_rate": 7.905333507804608e-05, + "loss": 1.1613, + "step": 4130 + }, + { + "epoch": 1.117981328642944, + "grad_norm": 0.20716790854930878, + "learning_rate": 7.902770747643615e-05, + "loss": 1.1557, + "step": 4132 + }, + { + "epoch": 1.118522527398187, + "grad_norm": 0.21587280929088593, + "learning_rate": 7.900206836686432e-05, + "loss": 1.1508, + "step": 4134 + }, + { + "epoch": 1.1190637261534297, + "grad_norm": 0.22769996523857117, + "learning_rate": 7.897641775949518e-05, + "loss": 1.1477, + "step": 4136 + }, + { + "epoch": 1.1196049249086728, + "grad_norm": 0.2523115277290344, + "learning_rate": 7.895075566449781e-05, + "loss": 1.1744, + "step": 4138 + }, + { + "epoch": 1.1201461236639156, + "grad_norm": 0.2227836698293686, + "learning_rate": 7.892508209204592e-05, + "loss": 1.1511, + "step": 4140 + }, + { + "epoch": 1.1206873224191585, + "grad_norm": 0.23955698311328888, + "learning_rate": 7.88993970523177e-05, + "loss": 1.1604, + "step": 4142 + }, + { + "epoch": 1.1212285211744013, + "grad_norm": 0.217301145195961, + "learning_rate": 7.887370055549594e-05, + "loss": 1.1441, + "step": 4144 + }, + { + "epoch": 1.1217697199296441, + "grad_norm": 0.22721989452838898, + "learning_rate": 7.884799261176795e-05, + "loss": 1.131, + "step": 4146 + }, + { + "epoch": 1.122310918684887, + "grad_norm": 0.23621070384979248, + "learning_rate": 7.882227323132558e-05, + "loss": 1.1413, + "step": 4148 + }, + { + "epoch": 1.1228521174401298, + "grad_norm": 0.20073488354682922, + "learning_rate": 7.879654242436523e-05, + "loss": 1.1474, + "step": 4150 + }, + { + "epoch": 1.1233933161953726, + "grad_norm": 0.20614007115364075, + "learning_rate": 7.877080020108776e-05, + "loss": 1.171, + "step": 4152 + }, + { + "epoch": 1.1239345149506157, + "grad_norm": 0.21867454051971436, + "learning_rate": 7.874504657169868e-05, + "loss": 1.1486, + "step": 4154 + }, + { + "epoch": 1.1244757137058585, + "grad_norm": 0.22386595606803894, + "learning_rate": 7.871928154640788e-05, + "loss": 1.1462, + "step": 4156 + }, + { + "epoch": 1.1250169124611014, + "grad_norm": 0.20961107313632965, + "learning_rate": 7.86935051354299e-05, + "loss": 1.1833, + "step": 4158 + }, + { + "epoch": 1.1255581112163442, + "grad_norm": 0.205885648727417, + "learning_rate": 7.866771734898373e-05, + "loss": 1.1513, + "step": 4160 + }, + { + "epoch": 1.126099309971587, + "grad_norm": 0.20518140494823456, + "learning_rate": 7.864191819729282e-05, + "loss": 1.1696, + "step": 4162 + }, + { + "epoch": 1.1266405087268299, + "grad_norm": 0.205685555934906, + "learning_rate": 7.861610769058523e-05, + "loss": 1.1437, + "step": 4164 + }, + { + "epoch": 1.1271817074820727, + "grad_norm": 0.2135431170463562, + "learning_rate": 7.859028583909345e-05, + "loss": 1.1604, + "step": 4166 + }, + { + "epoch": 1.1277229062373157, + "grad_norm": 0.21558403968811035, + "learning_rate": 7.85644526530545e-05, + "loss": 1.1771, + "step": 4168 + }, + { + "epoch": 1.1282641049925586, + "grad_norm": 0.220762237906456, + "learning_rate": 7.853860814270985e-05, + "loss": 1.1391, + "step": 4170 + }, + { + "epoch": 1.1288053037478014, + "grad_norm": 0.21308547258377075, + "learning_rate": 7.851275231830555e-05, + "loss": 1.1463, + "step": 4172 + }, + { + "epoch": 1.1293465025030442, + "grad_norm": 0.2130470871925354, + "learning_rate": 7.848688519009202e-05, + "loss": 1.1331, + "step": 4174 + }, + { + "epoch": 1.129887701258287, + "grad_norm": 0.20271500945091248, + "learning_rate": 7.846100676832424e-05, + "loss": 1.1561, + "step": 4176 + }, + { + "epoch": 1.13042890001353, + "grad_norm": 0.21627353131771088, + "learning_rate": 7.843511706326165e-05, + "loss": 1.1573, + "step": 4178 + }, + { + "epoch": 1.1309700987687727, + "grad_norm": 0.22262731194496155, + "learning_rate": 7.840921608516815e-05, + "loss": 1.1461, + "step": 4180 + }, + { + "epoch": 1.1315112975240158, + "grad_norm": 0.1991957724094391, + "learning_rate": 7.838330384431214e-05, + "loss": 1.1438, + "step": 4182 + }, + { + "epoch": 1.1320524962792586, + "grad_norm": 0.20617301762104034, + "learning_rate": 7.835738035096643e-05, + "loss": 1.1505, + "step": 4184 + }, + { + "epoch": 1.1325936950345015, + "grad_norm": 0.22232504189014435, + "learning_rate": 7.833144561540835e-05, + "loss": 1.1501, + "step": 4186 + }, + { + "epoch": 1.1331348937897443, + "grad_norm": 0.1919006109237671, + "learning_rate": 7.830549964791964e-05, + "loss": 1.1465, + "step": 4188 + }, + { + "epoch": 1.1336760925449871, + "grad_norm": 0.20378994941711426, + "learning_rate": 7.827954245878654e-05, + "loss": 1.1598, + "step": 4190 + }, + { + "epoch": 1.13421729130023, + "grad_norm": 0.21754945814609528, + "learning_rate": 7.825357405829967e-05, + "loss": 1.1612, + "step": 4192 + }, + { + "epoch": 1.1347584900554728, + "grad_norm": 0.20612432062625885, + "learning_rate": 7.822759445675419e-05, + "loss": 1.1518, + "step": 4194 + }, + { + "epoch": 1.1352996888107159, + "grad_norm": 0.21937181055545807, + "learning_rate": 7.820160366444961e-05, + "loss": 1.1567, + "step": 4196 + }, + { + "epoch": 1.1358408875659587, + "grad_norm": 0.2110893726348877, + "learning_rate": 7.817560169168991e-05, + "loss": 1.1582, + "step": 4198 + }, + { + "epoch": 1.1363820863212015, + "grad_norm": 0.2051175981760025, + "learning_rate": 7.814958854878356e-05, + "loss": 1.1777, + "step": 4200 + }, + { + "epoch": 1.1369232850764444, + "grad_norm": 0.21144556999206543, + "learning_rate": 7.812356424604335e-05, + "loss": 1.1568, + "step": 4202 + }, + { + "epoch": 1.1374644838316872, + "grad_norm": 0.22127197682857513, + "learning_rate": 7.809752879378656e-05, + "loss": 1.145, + "step": 4204 + }, + { + "epoch": 1.13800568258693, + "grad_norm": 0.20766502618789673, + "learning_rate": 7.807148220233488e-05, + "loss": 1.1506, + "step": 4206 + }, + { + "epoch": 1.1385468813421729, + "grad_norm": 0.20828627049922943, + "learning_rate": 7.804542448201447e-05, + "loss": 1.177, + "step": 4208 + }, + { + "epoch": 1.1390880800974157, + "grad_norm": 0.19857019186019897, + "learning_rate": 7.801935564315574e-05, + "loss": 1.1514, + "step": 4210 + }, + { + "epoch": 1.1396292788526585, + "grad_norm": 0.1956413984298706, + "learning_rate": 7.799327569609373e-05, + "loss": 1.1578, + "step": 4212 + }, + { + "epoch": 1.1401704776079016, + "grad_norm": 0.20685957372188568, + "learning_rate": 7.79671846511677e-05, + "loss": 1.1484, + "step": 4214 + }, + { + "epoch": 1.1407116763631444, + "grad_norm": 0.21370911598205566, + "learning_rate": 7.794108251872138e-05, + "loss": 1.1445, + "step": 4216 + }, + { + "epoch": 1.1412528751183872, + "grad_norm": 0.21143577992916107, + "learning_rate": 7.791496930910293e-05, + "loss": 1.1488, + "step": 4218 + }, + { + "epoch": 1.14179407387363, + "grad_norm": 0.20833255350589752, + "learning_rate": 7.788884503266485e-05, + "loss": 1.1553, + "step": 4220 + }, + { + "epoch": 1.142335272628873, + "grad_norm": 0.21482530236244202, + "learning_rate": 7.786270969976403e-05, + "loss": 1.1521, + "step": 4222 + }, + { + "epoch": 1.1428764713841157, + "grad_norm": 0.21887533366680145, + "learning_rate": 7.783656332076177e-05, + "loss": 1.1381, + "step": 4224 + }, + { + "epoch": 1.1434176701393586, + "grad_norm": 0.22636641561985016, + "learning_rate": 7.781040590602373e-05, + "loss": 1.1572, + "step": 4226 + }, + { + "epoch": 1.1439588688946016, + "grad_norm": 0.2340206354856491, + "learning_rate": 7.778423746591999e-05, + "loss": 1.1493, + "step": 4228 + }, + { + "epoch": 1.1445000676498445, + "grad_norm": 0.23177602887153625, + "learning_rate": 7.77580580108249e-05, + "loss": 1.1586, + "step": 4230 + }, + { + "epoch": 1.1450412664050873, + "grad_norm": 0.19879846274852753, + "learning_rate": 7.773186755111728e-05, + "loss": 1.171, + "step": 4232 + }, + { + "epoch": 1.1455824651603301, + "grad_norm": 0.2035079002380371, + "learning_rate": 7.770566609718026e-05, + "loss": 1.1593, + "step": 4234 + }, + { + "epoch": 1.146123663915573, + "grad_norm": 0.2164110541343689, + "learning_rate": 7.767945365940134e-05, + "loss": 1.1546, + "step": 4236 + }, + { + "epoch": 1.1466648626708158, + "grad_norm": 0.22445808351039886, + "learning_rate": 7.765323024817237e-05, + "loss": 1.1536, + "step": 4238 + }, + { + "epoch": 1.1472060614260586, + "grad_norm": 0.2171921283006668, + "learning_rate": 7.762699587388957e-05, + "loss": 1.166, + "step": 4240 + }, + { + "epoch": 1.1477472601813017, + "grad_norm": 0.20324265956878662, + "learning_rate": 7.76007505469535e-05, + "loss": 1.1514, + "step": 4242 + }, + { + "epoch": 1.1482884589365445, + "grad_norm": 0.20602954924106598, + "learning_rate": 7.757449427776902e-05, + "loss": 1.1526, + "step": 4244 + }, + { + "epoch": 1.1488296576917874, + "grad_norm": 0.19193702936172485, + "learning_rate": 7.754822707674538e-05, + "loss": 1.1461, + "step": 4246 + }, + { + "epoch": 1.1493708564470302, + "grad_norm": 0.2005046159029007, + "learning_rate": 7.752194895429617e-05, + "loss": 1.1422, + "step": 4248 + }, + { + "epoch": 1.149912055202273, + "grad_norm": 0.20979368686676025, + "learning_rate": 7.749565992083925e-05, + "loss": 1.1508, + "step": 4250 + }, + { + "epoch": 1.1504532539575159, + "grad_norm": 0.20569835603237152, + "learning_rate": 7.746935998679685e-05, + "loss": 1.1587, + "step": 4252 + }, + { + "epoch": 1.1509944527127587, + "grad_norm": 0.21431037783622742, + "learning_rate": 7.744304916259553e-05, + "loss": 1.1743, + "step": 4254 + }, + { + "epoch": 1.1515356514680017, + "grad_norm": 0.23459142446517944, + "learning_rate": 7.741672745866612e-05, + "loss": 1.1511, + "step": 4256 + }, + { + "epoch": 1.1520768502232446, + "grad_norm": 0.20567552745342255, + "learning_rate": 7.739039488544382e-05, + "loss": 1.1432, + "step": 4258 + }, + { + "epoch": 1.1526180489784874, + "grad_norm": 0.21821950376033783, + "learning_rate": 7.73640514533681e-05, + "loss": 1.1565, + "step": 4260 + }, + { + "epoch": 1.1531592477337302, + "grad_norm": 0.21483366191387177, + "learning_rate": 7.733769717288275e-05, + "loss": 1.1753, + "step": 4262 + }, + { + "epoch": 1.153700446488973, + "grad_norm": 0.2083878517150879, + "learning_rate": 7.731133205443587e-05, + "loss": 1.1564, + "step": 4264 + }, + { + "epoch": 1.154241645244216, + "grad_norm": 0.22914838790893555, + "learning_rate": 7.728495610847984e-05, + "loss": 1.1463, + "step": 4266 + }, + { + "epoch": 1.1547828439994587, + "grad_norm": 0.21248088777065277, + "learning_rate": 7.72585693454713e-05, + "loss": 1.167, + "step": 4268 + }, + { + "epoch": 1.1553240427547016, + "grad_norm": 0.21737425029277802, + "learning_rate": 7.723217177587129e-05, + "loss": 1.1391, + "step": 4270 + }, + { + "epoch": 1.1558652415099444, + "grad_norm": 0.20481689274311066, + "learning_rate": 7.7205763410145e-05, + "loss": 1.1659, + "step": 4272 + }, + { + "epoch": 1.1564064402651875, + "grad_norm": 0.2236364781856537, + "learning_rate": 7.717934425876199e-05, + "loss": 1.156, + "step": 4274 + }, + { + "epoch": 1.1569476390204303, + "grad_norm": 0.20564323663711548, + "learning_rate": 7.715291433219605e-05, + "loss": 1.1251, + "step": 4276 + }, + { + "epoch": 1.1574888377756731, + "grad_norm": 0.21151994168758392, + "learning_rate": 7.712647364092525e-05, + "loss": 1.1438, + "step": 4278 + }, + { + "epoch": 1.158030036530916, + "grad_norm": 0.2575263977050781, + "learning_rate": 7.710002219543198e-05, + "loss": 1.1655, + "step": 4280 + }, + { + "epoch": 1.1585712352861588, + "grad_norm": 0.2303873598575592, + "learning_rate": 7.707356000620279e-05, + "loss": 1.1703, + "step": 4282 + }, + { + "epoch": 1.1591124340414016, + "grad_norm": 0.2279600352048874, + "learning_rate": 7.704708708372858e-05, + "loss": 1.1732, + "step": 4284 + }, + { + "epoch": 1.1596536327966445, + "grad_norm": 0.21688218414783478, + "learning_rate": 7.702060343850449e-05, + "loss": 1.1598, + "step": 4286 + }, + { + "epoch": 1.1601948315518875, + "grad_norm": 0.20281021296977997, + "learning_rate": 7.699410908102987e-05, + "loss": 1.1689, + "step": 4288 + }, + { + "epoch": 1.1607360303071304, + "grad_norm": 0.2117750495672226, + "learning_rate": 7.696760402180834e-05, + "loss": 1.1557, + "step": 4290 + }, + { + "epoch": 1.1612772290623732, + "grad_norm": 0.21930202841758728, + "learning_rate": 7.694108827134779e-05, + "loss": 1.1408, + "step": 4292 + }, + { + "epoch": 1.161818427817616, + "grad_norm": 0.22055776417255402, + "learning_rate": 7.691456184016031e-05, + "loss": 1.1635, + "step": 4294 + }, + { + "epoch": 1.1623596265728589, + "grad_norm": 0.24794168770313263, + "learning_rate": 7.68880247387622e-05, + "loss": 1.1491, + "step": 4296 + }, + { + "epoch": 1.1629008253281017, + "grad_norm": 0.23536056280136108, + "learning_rate": 7.686147697767407e-05, + "loss": 1.1591, + "step": 4298 + }, + { + "epoch": 1.1634420240833445, + "grad_norm": 0.2640872001647949, + "learning_rate": 7.683491856742071e-05, + "loss": 1.1616, + "step": 4300 + }, + { + "epoch": 1.1639832228385876, + "grad_norm": 0.22453339397907257, + "learning_rate": 7.680834951853113e-05, + "loss": 1.135, + "step": 4302 + }, + { + "epoch": 1.1645244215938304, + "grad_norm": 0.23823754489421844, + "learning_rate": 7.678176984153855e-05, + "loss": 1.1487, + "step": 4304 + }, + { + "epoch": 1.1650656203490732, + "grad_norm": 0.20748983323574066, + "learning_rate": 7.675517954698044e-05, + "loss": 1.1523, + "step": 4306 + }, + { + "epoch": 1.165606819104316, + "grad_norm": 0.21772535145282745, + "learning_rate": 7.672857864539844e-05, + "loss": 1.1461, + "step": 4308 + }, + { + "epoch": 1.166148017859559, + "grad_norm": 0.22087602317333221, + "learning_rate": 7.670196714733842e-05, + "loss": 1.1619, + "step": 4310 + }, + { + "epoch": 1.1666892166148017, + "grad_norm": 0.22839230298995972, + "learning_rate": 7.667534506335043e-05, + "loss": 1.1533, + "step": 4312 + }, + { + "epoch": 1.1672304153700446, + "grad_norm": 0.21815411746501923, + "learning_rate": 7.664871240398875e-05, + "loss": 1.1403, + "step": 4314 + }, + { + "epoch": 1.1677716141252876, + "grad_norm": 0.19821114838123322, + "learning_rate": 7.66220691798118e-05, + "loss": 1.1472, + "step": 4316 + }, + { + "epoch": 1.1683128128805305, + "grad_norm": 0.19013382494449615, + "learning_rate": 7.659541540138222e-05, + "loss": 1.1535, + "step": 4318 + }, + { + "epoch": 1.1688540116357733, + "grad_norm": 0.1978456676006317, + "learning_rate": 7.656875107926687e-05, + "loss": 1.146, + "step": 4320 + }, + { + "epoch": 1.1693952103910161, + "grad_norm": 0.20645636320114136, + "learning_rate": 7.654207622403673e-05, + "loss": 1.1576, + "step": 4322 + }, + { + "epoch": 1.169936409146259, + "grad_norm": 0.2145734280347824, + "learning_rate": 7.651539084626698e-05, + "loss": 1.1563, + "step": 4324 + }, + { + "epoch": 1.1704776079015018, + "grad_norm": 0.20490793883800507, + "learning_rate": 7.648869495653697e-05, + "loss": 1.1827, + "step": 4326 + }, + { + "epoch": 1.1710188066567446, + "grad_norm": 0.20274941623210907, + "learning_rate": 7.646198856543021e-05, + "loss": 1.1498, + "step": 4328 + }, + { + "epoch": 1.1715600054119875, + "grad_norm": 0.2002313733100891, + "learning_rate": 7.643527168353439e-05, + "loss": 1.1296, + "step": 4330 + }, + { + "epoch": 1.1721012041672303, + "grad_norm": 0.21061885356903076, + "learning_rate": 7.640854432144137e-05, + "loss": 1.186, + "step": 4332 + }, + { + "epoch": 1.1726424029224733, + "grad_norm": 0.20808294415473938, + "learning_rate": 7.638180648974715e-05, + "loss": 1.1428, + "step": 4334 + }, + { + "epoch": 1.1731836016777162, + "grad_norm": 0.2033407986164093, + "learning_rate": 7.635505819905182e-05, + "loss": 1.1533, + "step": 4336 + }, + { + "epoch": 1.173724800432959, + "grad_norm": 0.21454395353794098, + "learning_rate": 7.632829945995974e-05, + "loss": 1.1726, + "step": 4338 + }, + { + "epoch": 1.1742659991882018, + "grad_norm": 0.20768827199935913, + "learning_rate": 7.630153028307929e-05, + "loss": 1.1801, + "step": 4340 + }, + { + "epoch": 1.1748071979434447, + "grad_norm": 0.21167118847370148, + "learning_rate": 7.627475067902307e-05, + "loss": 1.1561, + "step": 4342 + }, + { + "epoch": 1.1753483966986875, + "grad_norm": 0.2219628542661667, + "learning_rate": 7.62479606584078e-05, + "loss": 1.1455, + "step": 4344 + }, + { + "epoch": 1.1758895954539303, + "grad_norm": 0.22176925837993622, + "learning_rate": 7.622116023185429e-05, + "loss": 1.1583, + "step": 4346 + }, + { + "epoch": 1.1764307942091734, + "grad_norm": 0.21282261610031128, + "learning_rate": 7.619434940998751e-05, + "loss": 1.1415, + "step": 4348 + }, + { + "epoch": 1.1769719929644162, + "grad_norm": 0.20743747055530548, + "learning_rate": 7.616752820343655e-05, + "loss": 1.1552, + "step": 4350 + }, + { + "epoch": 1.177513191719659, + "grad_norm": 0.21067087352275848, + "learning_rate": 7.61406966228346e-05, + "loss": 1.1677, + "step": 4352 + }, + { + "epoch": 1.178054390474902, + "grad_norm": 0.22039294242858887, + "learning_rate": 7.611385467881898e-05, + "loss": 1.1495, + "step": 4354 + }, + { + "epoch": 1.1785955892301447, + "grad_norm": 0.1977587342262268, + "learning_rate": 7.60870023820311e-05, + "loss": 1.1562, + "step": 4356 + }, + { + "epoch": 1.1791367879853876, + "grad_norm": 0.2049712985754013, + "learning_rate": 7.60601397431165e-05, + "loss": 1.1526, + "step": 4358 + }, + { + "epoch": 1.1796779867406304, + "grad_norm": 0.22354736924171448, + "learning_rate": 7.603326677272482e-05, + "loss": 1.1611, + "step": 4360 + }, + { + "epoch": 1.1802191854958735, + "grad_norm": 0.3315167725086212, + "learning_rate": 7.600638348150978e-05, + "loss": 1.1621, + "step": 4362 + }, + { + "epoch": 1.1807603842511163, + "grad_norm": 0.26512739062309265, + "learning_rate": 7.597948988012912e-05, + "loss": 1.1573, + "step": 4364 + }, + { + "epoch": 1.1813015830063591, + "grad_norm": 0.2502736449241638, + "learning_rate": 7.595258597924484e-05, + "loss": 1.1786, + "step": 4366 + }, + { + "epoch": 1.181842781761602, + "grad_norm": 0.2125827521085739, + "learning_rate": 7.592567178952288e-05, + "loss": 1.1357, + "step": 4368 + }, + { + "epoch": 1.1823839805168448, + "grad_norm": 0.21820925176143646, + "learning_rate": 7.589874732163328e-05, + "loss": 1.1459, + "step": 4370 + }, + { + "epoch": 1.1829251792720876, + "grad_norm": 0.23182585835456848, + "learning_rate": 7.587181258625022e-05, + "loss": 1.1397, + "step": 4372 + }, + { + "epoch": 1.1834663780273305, + "grad_norm": 0.2198820859193802, + "learning_rate": 7.58448675940519e-05, + "loss": 1.1438, + "step": 4374 + }, + { + "epoch": 1.1840075767825735, + "grad_norm": 0.21208171546459198, + "learning_rate": 7.581791235572058e-05, + "loss": 1.1307, + "step": 4376 + }, + { + "epoch": 1.1845487755378163, + "grad_norm": 0.19944527745246887, + "learning_rate": 7.57909468819426e-05, + "loss": 1.1548, + "step": 4378 + }, + { + "epoch": 1.1850899742930592, + "grad_norm": 0.20075197517871857, + "learning_rate": 7.576397118340834e-05, + "loss": 1.1445, + "step": 4380 + }, + { + "epoch": 1.185631173048302, + "grad_norm": 0.2044142484664917, + "learning_rate": 7.573698527081228e-05, + "loss": 1.1474, + "step": 4382 + }, + { + "epoch": 1.1861723718035448, + "grad_norm": 0.2024833858013153, + "learning_rate": 7.57099891548529e-05, + "loss": 1.1648, + "step": 4384 + }, + { + "epoch": 1.1867135705587877, + "grad_norm": 0.20577891170978546, + "learning_rate": 7.568298284623274e-05, + "loss": 1.1693, + "step": 4386 + }, + { + "epoch": 1.1872547693140305, + "grad_norm": 0.21271593868732452, + "learning_rate": 7.565596635565841e-05, + "loss": 1.1331, + "step": 4388 + }, + { + "epoch": 1.1877959680692733, + "grad_norm": 0.20553138852119446, + "learning_rate": 7.562893969384051e-05, + "loss": 1.1618, + "step": 4390 + }, + { + "epoch": 1.1883371668245164, + "grad_norm": 0.21582964062690735, + "learning_rate": 7.560190287149367e-05, + "loss": 1.1616, + "step": 4392 + }, + { + "epoch": 1.1888783655797592, + "grad_norm": 0.2435728907585144, + "learning_rate": 7.55748558993366e-05, + "loss": 1.1741, + "step": 4394 + }, + { + "epoch": 1.189419564335002, + "grad_norm": 0.2033778578042984, + "learning_rate": 7.5547798788092e-05, + "loss": 1.1519, + "step": 4396 + }, + { + "epoch": 1.189960763090245, + "grad_norm": 0.20006676018238068, + "learning_rate": 7.552073154848656e-05, + "loss": 1.1649, + "step": 4398 + }, + { + "epoch": 1.1905019618454877, + "grad_norm": 0.19640152156352997, + "learning_rate": 7.549365419125109e-05, + "loss": 1.1381, + "step": 4400 + }, + { + "epoch": 1.1910431606007306, + "grad_norm": 0.21240241825580597, + "learning_rate": 7.546656672712027e-05, + "loss": 1.1589, + "step": 4402 + }, + { + "epoch": 1.1915843593559734, + "grad_norm": 0.22936300933361053, + "learning_rate": 7.54394691668329e-05, + "loss": 1.1346, + "step": 4404 + }, + { + "epoch": 1.1921255581112162, + "grad_norm": 0.20601502060890198, + "learning_rate": 7.541236152113172e-05, + "loss": 1.1672, + "step": 4406 + }, + { + "epoch": 1.1926667568664593, + "grad_norm": 0.22351358830928802, + "learning_rate": 7.538524380076351e-05, + "loss": 1.1545, + "step": 4408 + }, + { + "epoch": 1.1932079556217021, + "grad_norm": 0.23404501378536224, + "learning_rate": 7.535811601647897e-05, + "loss": 1.1484, + "step": 4410 + }, + { + "epoch": 1.193749154376945, + "grad_norm": 0.20127235352993011, + "learning_rate": 7.533097817903292e-05, + "loss": 1.1559, + "step": 4412 + }, + { + "epoch": 1.1942903531321878, + "grad_norm": 0.20970293879508972, + "learning_rate": 7.530383029918404e-05, + "loss": 1.1478, + "step": 4414 + }, + { + "epoch": 1.1948315518874306, + "grad_norm": 0.2203601449728012, + "learning_rate": 7.527667238769503e-05, + "loss": 1.1399, + "step": 4416 + }, + { + "epoch": 1.1953727506426735, + "grad_norm": 0.21253615617752075, + "learning_rate": 7.524950445533259e-05, + "loss": 1.1687, + "step": 4418 + }, + { + "epoch": 1.1959139493979163, + "grad_norm": 0.20153376460075378, + "learning_rate": 7.522232651286741e-05, + "loss": 1.1491, + "step": 4420 + }, + { + "epoch": 1.1964551481531593, + "grad_norm": 0.20877231657505035, + "learning_rate": 7.519513857107405e-05, + "loss": 1.1559, + "step": 4422 + }, + { + "epoch": 1.1969963469084022, + "grad_norm": 0.2033756971359253, + "learning_rate": 7.516794064073117e-05, + "loss": 1.1526, + "step": 4424 + }, + { + "epoch": 1.197537545663645, + "grad_norm": 0.19848887622356415, + "learning_rate": 7.514073273262126e-05, + "loss": 1.147, + "step": 4426 + }, + { + "epoch": 1.1980787444188878, + "grad_norm": 0.25644296407699585, + "learning_rate": 7.511351485753089e-05, + "loss": 1.1269, + "step": 4428 + }, + { + "epoch": 1.1986199431741307, + "grad_norm": 0.23441651463508606, + "learning_rate": 7.508628702625044e-05, + "loss": 1.1587, + "step": 4430 + }, + { + "epoch": 1.1991611419293735, + "grad_norm": 0.2278803586959839, + "learning_rate": 7.50590492495744e-05, + "loss": 1.1536, + "step": 4432 + }, + { + "epoch": 1.1997023406846163, + "grad_norm": 0.21613961458206177, + "learning_rate": 7.503180153830107e-05, + "loss": 1.1339, + "step": 4434 + }, + { + "epoch": 1.2002435394398594, + "grad_norm": 0.2126021385192871, + "learning_rate": 7.500454390323274e-05, + "loss": 1.1446, + "step": 4436 + }, + { + "epoch": 1.2007847381951022, + "grad_norm": 0.21502047777175903, + "learning_rate": 7.497727635517564e-05, + "loss": 1.1458, + "step": 4438 + }, + { + "epoch": 1.201325936950345, + "grad_norm": 0.20378261804580688, + "learning_rate": 7.494999890493993e-05, + "loss": 1.1531, + "step": 4440 + }, + { + "epoch": 1.201867135705588, + "grad_norm": 0.20229879021644592, + "learning_rate": 7.492271156333968e-05, + "loss": 1.1401, + "step": 4442 + }, + { + "epoch": 1.2024083344608307, + "grad_norm": 0.21491043269634247, + "learning_rate": 7.489541434119286e-05, + "loss": 1.1537, + "step": 4444 + }, + { + "epoch": 1.2029495332160736, + "grad_norm": 0.2054162323474884, + "learning_rate": 7.486810724932142e-05, + "loss": 1.1618, + "step": 4446 + }, + { + "epoch": 1.2034907319713164, + "grad_norm": 0.21886977553367615, + "learning_rate": 7.484079029855118e-05, + "loss": 1.1356, + "step": 4448 + }, + { + "epoch": 1.2040319307265595, + "grad_norm": 0.2097829133272171, + "learning_rate": 7.481346349971187e-05, + "loss": 1.1374, + "step": 4450 + }, + { + "epoch": 1.2045731294818023, + "grad_norm": 0.200583815574646, + "learning_rate": 7.478612686363713e-05, + "loss": 1.1758, + "step": 4452 + }, + { + "epoch": 1.2051143282370451, + "grad_norm": 0.21000835299491882, + "learning_rate": 7.475878040116451e-05, + "loss": 1.1608, + "step": 4454 + }, + { + "epoch": 1.205655526992288, + "grad_norm": 0.21297107636928558, + "learning_rate": 7.473142412313543e-05, + "loss": 1.1434, + "step": 4456 + }, + { + "epoch": 1.2061967257475308, + "grad_norm": 0.2080170065164566, + "learning_rate": 7.470405804039524e-05, + "loss": 1.1663, + "step": 4458 + }, + { + "epoch": 1.2067379245027736, + "grad_norm": 0.19678303599357605, + "learning_rate": 7.467668216379316e-05, + "loss": 1.1476, + "step": 4460 + }, + { + "epoch": 1.2072791232580165, + "grad_norm": 0.2111286073923111, + "learning_rate": 7.464929650418225e-05, + "loss": 1.1465, + "step": 4462 + }, + { + "epoch": 1.2078203220132593, + "grad_norm": 0.22287851572036743, + "learning_rate": 7.462190107241952e-05, + "loss": 1.1675, + "step": 4464 + }, + { + "epoch": 1.2083615207685021, + "grad_norm": 0.21192297339439392, + "learning_rate": 7.45944958793658e-05, + "loss": 1.1533, + "step": 4466 + }, + { + "epoch": 1.2089027195237452, + "grad_norm": 0.21232372522354126, + "learning_rate": 7.456708093588582e-05, + "loss": 1.1379, + "step": 4468 + }, + { + "epoch": 1.209443918278988, + "grad_norm": 0.2120969444513321, + "learning_rate": 7.453965625284818e-05, + "loss": 1.1505, + "step": 4470 + }, + { + "epoch": 1.2099851170342308, + "grad_norm": 0.21147684752941132, + "learning_rate": 7.45122218411253e-05, + "loss": 1.142, + "step": 4472 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.208118736743927, + "learning_rate": 7.44847777115935e-05, + "loss": 1.145, + "step": 4474 + }, + { + "epoch": 1.2110675145447165, + "grad_norm": 0.21549181640148163, + "learning_rate": 7.445732387513293e-05, + "loss": 1.1442, + "step": 4476 + }, + { + "epoch": 1.2116087132999593, + "grad_norm": 0.22884537279605865, + "learning_rate": 7.442986034262757e-05, + "loss": 1.1266, + "step": 4478 + }, + { + "epoch": 1.2121499120552022, + "grad_norm": 0.22194430232048035, + "learning_rate": 7.440238712496533e-05, + "loss": 1.1374, + "step": 4480 + }, + { + "epoch": 1.2126911108104452, + "grad_norm": 0.2395986169576645, + "learning_rate": 7.437490423303786e-05, + "loss": 1.149, + "step": 4482 + }, + { + "epoch": 1.213232309565688, + "grad_norm": 0.2180141806602478, + "learning_rate": 7.434741167774067e-05, + "loss": 1.1577, + "step": 4484 + }, + { + "epoch": 1.213773508320931, + "grad_norm": 0.21561622619628906, + "learning_rate": 7.431990946997313e-05, + "loss": 1.1446, + "step": 4486 + }, + { + "epoch": 1.2143147070761737, + "grad_norm": 0.21005584299564362, + "learning_rate": 7.429239762063844e-05, + "loss": 1.1385, + "step": 4488 + }, + { + "epoch": 1.2148559058314166, + "grad_norm": 0.21435660123825073, + "learning_rate": 7.426487614064358e-05, + "loss": 1.1719, + "step": 4490 + }, + { + "epoch": 1.2153971045866594, + "grad_norm": 0.21810956299304962, + "learning_rate": 7.423734504089939e-05, + "loss": 1.1459, + "step": 4492 + }, + { + "epoch": 1.2159383033419022, + "grad_norm": 0.20335710048675537, + "learning_rate": 7.420980433232048e-05, + "loss": 1.1223, + "step": 4494 + }, + { + "epoch": 1.2164795020971453, + "grad_norm": 0.20494681596755981, + "learning_rate": 7.41822540258253e-05, + "loss": 1.1547, + "step": 4496 + }, + { + "epoch": 1.2170207008523881, + "grad_norm": 0.19216883182525635, + "learning_rate": 7.415469413233612e-05, + "loss": 1.1338, + "step": 4498 + }, + { + "epoch": 1.217561899607631, + "grad_norm": 0.20295827090740204, + "learning_rate": 7.412712466277898e-05, + "loss": 1.159, + "step": 4500 + }, + { + "epoch": 1.2181030983628738, + "grad_norm": 0.20249909162521362, + "learning_rate": 7.409954562808373e-05, + "loss": 1.142, + "step": 4502 + }, + { + "epoch": 1.2186442971181166, + "grad_norm": 0.21104544401168823, + "learning_rate": 7.407195703918399e-05, + "loss": 1.1451, + "step": 4504 + }, + { + "epoch": 1.2191854958733594, + "grad_norm": 0.22997575998306274, + "learning_rate": 7.404435890701722e-05, + "loss": 1.1469, + "step": 4506 + }, + { + "epoch": 1.2197266946286023, + "grad_norm": 0.21492913365364075, + "learning_rate": 7.40167512425246e-05, + "loss": 1.1562, + "step": 4508 + }, + { + "epoch": 1.2202678933838453, + "grad_norm": 0.2012024223804474, + "learning_rate": 7.398913405665114e-05, + "loss": 1.1507, + "step": 4510 + }, + { + "epoch": 1.2208090921390882, + "grad_norm": 0.20585717260837555, + "learning_rate": 7.39615073603456e-05, + "loss": 1.1417, + "step": 4512 + }, + { + "epoch": 1.221350290894331, + "grad_norm": 0.20011591911315918, + "learning_rate": 7.393387116456049e-05, + "loss": 1.1236, + "step": 4514 + }, + { + "epoch": 1.2218914896495738, + "grad_norm": 0.19506965577602386, + "learning_rate": 7.390622548025217e-05, + "loss": 1.1482, + "step": 4516 + }, + { + "epoch": 1.2224326884048167, + "grad_norm": 0.20323827862739563, + "learning_rate": 7.387857031838063e-05, + "loss": 1.1576, + "step": 4518 + }, + { + "epoch": 1.2229738871600595, + "grad_norm": 0.2006043940782547, + "learning_rate": 7.385090568990974e-05, + "loss": 1.1468, + "step": 4520 + }, + { + "epoch": 1.2235150859153023, + "grad_norm": 0.19386060535907745, + "learning_rate": 7.382323160580706e-05, + "loss": 1.1407, + "step": 4522 + }, + { + "epoch": 1.2240562846705452, + "grad_norm": 0.19958168268203735, + "learning_rate": 7.37955480770439e-05, + "loss": 1.1386, + "step": 4524 + }, + { + "epoch": 1.224597483425788, + "grad_norm": 0.20519964396953583, + "learning_rate": 7.376785511459533e-05, + "loss": 1.1366, + "step": 4526 + }, + { + "epoch": 1.225138682181031, + "grad_norm": 0.19929784536361694, + "learning_rate": 7.374015272944015e-05, + "loss": 1.1518, + "step": 4528 + }, + { + "epoch": 1.225679880936274, + "grad_norm": 0.20659440755844116, + "learning_rate": 7.371244093256093e-05, + "loss": 1.1427, + "step": 4530 + }, + { + "epoch": 1.2262210796915167, + "grad_norm": 0.2114669233560562, + "learning_rate": 7.36847197349439e-05, + "loss": 1.1547, + "step": 4532 + }, + { + "epoch": 1.2267622784467596, + "grad_norm": 0.20865440368652344, + "learning_rate": 7.36569891475791e-05, + "loss": 1.1316, + "step": 4534 + }, + { + "epoch": 1.2273034772020024, + "grad_norm": 0.21056681871414185, + "learning_rate": 7.362924918146022e-05, + "loss": 1.1551, + "step": 4536 + }, + { + "epoch": 1.2278446759572452, + "grad_norm": 0.2034716010093689, + "learning_rate": 7.360149984758473e-05, + "loss": 1.134, + "step": 4538 + }, + { + "epoch": 1.228385874712488, + "grad_norm": 0.2167576402425766, + "learning_rate": 7.357374115695377e-05, + "loss": 1.1559, + "step": 4540 + }, + { + "epoch": 1.2289270734677311, + "grad_norm": 0.20762617886066437, + "learning_rate": 7.354597312057219e-05, + "loss": 1.1443, + "step": 4542 + }, + { + "epoch": 1.229468272222974, + "grad_norm": 0.2080763280391693, + "learning_rate": 7.351819574944856e-05, + "loss": 1.1495, + "step": 4544 + }, + { + "epoch": 1.2300094709782168, + "grad_norm": 0.2057662308216095, + "learning_rate": 7.349040905459517e-05, + "loss": 1.1378, + "step": 4546 + }, + { + "epoch": 1.2305506697334596, + "grad_norm": 0.22165626287460327, + "learning_rate": 7.346261304702797e-05, + "loss": 1.1525, + "step": 4548 + }, + { + "epoch": 1.2310918684887024, + "grad_norm": 0.21772271394729614, + "learning_rate": 7.343480773776664e-05, + "loss": 1.1523, + "step": 4550 + }, + { + "epoch": 1.2316330672439453, + "grad_norm": 0.20765674114227295, + "learning_rate": 7.340699313783448e-05, + "loss": 1.1308, + "step": 4552 + }, + { + "epoch": 1.2321742659991881, + "grad_norm": 0.2179335653781891, + "learning_rate": 7.337916925825855e-05, + "loss": 1.162, + "step": 4554 + }, + { + "epoch": 1.2327154647544312, + "grad_norm": 0.2079366147518158, + "learning_rate": 7.335133611006956e-05, + "loss": 1.1509, + "step": 4556 + }, + { + "epoch": 1.233256663509674, + "grad_norm": 0.21322664618492126, + "learning_rate": 7.332349370430188e-05, + "loss": 1.1411, + "step": 4558 + }, + { + "epoch": 1.2337978622649168, + "grad_norm": 0.19475920498371124, + "learning_rate": 7.329564205199356e-05, + "loss": 1.154, + "step": 4560 + }, + { + "epoch": 1.2343390610201597, + "grad_norm": 0.20483115315437317, + "learning_rate": 7.326778116418633e-05, + "loss": 1.1481, + "step": 4562 + }, + { + "epoch": 1.2348802597754025, + "grad_norm": 0.19957055151462555, + "learning_rate": 7.323991105192557e-05, + "loss": 1.1433, + "step": 4564 + }, + { + "epoch": 1.2354214585306453, + "grad_norm": 0.2019873857498169, + "learning_rate": 7.32120317262603e-05, + "loss": 1.1444, + "step": 4566 + }, + { + "epoch": 1.2359626572858882, + "grad_norm": 0.20566652715206146, + "learning_rate": 7.318414319824323e-05, + "loss": 1.1394, + "step": 4568 + }, + { + "epoch": 1.2365038560411312, + "grad_norm": 0.19783833622932434, + "learning_rate": 7.315624547893067e-05, + "loss": 1.1411, + "step": 4570 + }, + { + "epoch": 1.237045054796374, + "grad_norm": 0.20047402381896973, + "learning_rate": 7.312833857938264e-05, + "loss": 1.1288, + "step": 4572 + }, + { + "epoch": 1.2375862535516169, + "grad_norm": 0.20567147433757782, + "learning_rate": 7.310042251066272e-05, + "loss": 1.1474, + "step": 4574 + }, + { + "epoch": 1.2381274523068597, + "grad_norm": 0.2040497362613678, + "learning_rate": 7.307249728383817e-05, + "loss": 1.1216, + "step": 4576 + }, + { + "epoch": 1.2386686510621026, + "grad_norm": 0.19251488149166107, + "learning_rate": 7.304456290997991e-05, + "loss": 1.1425, + "step": 4578 + }, + { + "epoch": 1.2392098498173454, + "grad_norm": 0.2024182677268982, + "learning_rate": 7.30166194001624e-05, + "loss": 1.1337, + "step": 4580 + }, + { + "epoch": 1.2397510485725882, + "grad_norm": 0.21187058091163635, + "learning_rate": 7.298866676546383e-05, + "loss": 1.1373, + "step": 4582 + }, + { + "epoch": 1.240292247327831, + "grad_norm": 0.20792464911937714, + "learning_rate": 7.296070501696593e-05, + "loss": 1.1464, + "step": 4584 + }, + { + "epoch": 1.2408334460830739, + "grad_norm": 0.214219868183136, + "learning_rate": 7.293273416575405e-05, + "loss": 1.1431, + "step": 4586 + }, + { + "epoch": 1.241374644838317, + "grad_norm": 0.1944214552640915, + "learning_rate": 7.290475422291719e-05, + "loss": 1.1416, + "step": 4588 + }, + { + "epoch": 1.2419158435935598, + "grad_norm": 0.20861075818538666, + "learning_rate": 7.287676519954792e-05, + "loss": 1.1192, + "step": 4590 + }, + { + "epoch": 1.2424570423488026, + "grad_norm": 0.19681338965892792, + "learning_rate": 7.284876710674238e-05, + "loss": 1.125, + "step": 4592 + }, + { + "epoch": 1.2429982411040454, + "grad_norm": 0.1965487152338028, + "learning_rate": 7.28207599556004e-05, + "loss": 1.159, + "step": 4594 + }, + { + "epoch": 1.2435394398592883, + "grad_norm": 0.2032519429922104, + "learning_rate": 7.279274375722533e-05, + "loss": 1.144, + "step": 4596 + }, + { + "epoch": 1.244080638614531, + "grad_norm": 0.19022652506828308, + "learning_rate": 7.276471852272409e-05, + "loss": 1.1467, + "step": 4598 + }, + { + "epoch": 1.244621837369774, + "grad_norm": 0.21404559910297394, + "learning_rate": 7.273668426320724e-05, + "loss": 1.1426, + "step": 4600 + }, + { + "epoch": 1.245163036125017, + "grad_norm": 0.19883404672145844, + "learning_rate": 7.27086409897889e-05, + "loss": 1.1312, + "step": 4602 + }, + { + "epoch": 1.2457042348802598, + "grad_norm": 0.2046336680650711, + "learning_rate": 7.268058871358674e-05, + "loss": 1.1467, + "step": 4604 + }, + { + "epoch": 1.2462454336355027, + "grad_norm": 0.20716378092765808, + "learning_rate": 7.265252744572201e-05, + "loss": 1.1281, + "step": 4606 + }, + { + "epoch": 1.2467866323907455, + "grad_norm": 0.20886875689029694, + "learning_rate": 7.262445719731956e-05, + "loss": 1.1357, + "step": 4608 + }, + { + "epoch": 1.2473278311459883, + "grad_norm": 0.22756427526474, + "learning_rate": 7.259637797950771e-05, + "loss": 1.1405, + "step": 4610 + }, + { + "epoch": 1.2478690299012312, + "grad_norm": 0.229325532913208, + "learning_rate": 7.256828980341846e-05, + "loss": 1.1456, + "step": 4612 + }, + { + "epoch": 1.248410228656474, + "grad_norm": 0.20845824480056763, + "learning_rate": 7.254019268018728e-05, + "loss": 1.1507, + "step": 4614 + }, + { + "epoch": 1.248951427411717, + "grad_norm": 0.20090307295322418, + "learning_rate": 7.251208662095318e-05, + "loss": 1.1433, + "step": 4616 + }, + { + "epoch": 1.2494926261669599, + "grad_norm": 0.19882068037986755, + "learning_rate": 7.248397163685874e-05, + "loss": 1.1416, + "step": 4618 + }, + { + "epoch": 1.2500338249222027, + "grad_norm": 0.21998406946659088, + "learning_rate": 7.245584773905012e-05, + "loss": 1.1489, + "step": 4620 + }, + { + "epoch": 1.2505750236774456, + "grad_norm": 0.21411266922950745, + "learning_rate": 7.242771493867691e-05, + "loss": 1.148, + "step": 4622 + }, + { + "epoch": 1.2511162224326884, + "grad_norm": 0.20658662915229797, + "learning_rate": 7.239957324689232e-05, + "loss": 1.1259, + "step": 4624 + }, + { + "epoch": 1.2516574211879312, + "grad_norm": 0.20834538340568542, + "learning_rate": 7.237142267485305e-05, + "loss": 1.148, + "step": 4626 + }, + { + "epoch": 1.252198619943174, + "grad_norm": 0.1978132128715515, + "learning_rate": 7.234326323371931e-05, + "loss": 1.1529, + "step": 4628 + }, + { + "epoch": 1.252739818698417, + "grad_norm": 0.2106226533651352, + "learning_rate": 7.231509493465484e-05, + "loss": 1.1225, + "step": 4630 + }, + { + "epoch": 1.2532810174536597, + "grad_norm": 0.20942124724388123, + "learning_rate": 7.228691778882693e-05, + "loss": 1.145, + "step": 4632 + }, + { + "epoch": 1.2538222162089028, + "grad_norm": 0.2049248218536377, + "learning_rate": 7.225873180740627e-05, + "loss": 1.1555, + "step": 4634 + }, + { + "epoch": 1.2543634149641456, + "grad_norm": 0.20401233434677124, + "learning_rate": 7.22305370015672e-05, + "loss": 1.113, + "step": 4636 + }, + { + "epoch": 1.2549046137193884, + "grad_norm": 0.20931097865104675, + "learning_rate": 7.220233338248743e-05, + "loss": 1.1501, + "step": 4638 + }, + { + "epoch": 1.2554458124746313, + "grad_norm": 0.20634891092777252, + "learning_rate": 7.217412096134823e-05, + "loss": 1.1439, + "step": 4640 + }, + { + "epoch": 1.255987011229874, + "grad_norm": 0.21860553324222565, + "learning_rate": 7.214589974933434e-05, + "loss": 1.1515, + "step": 4642 + }, + { + "epoch": 1.2565282099851172, + "grad_norm": 0.19998759031295776, + "learning_rate": 7.211766975763397e-05, + "loss": 1.1339, + "step": 4644 + }, + { + "epoch": 1.2570694087403598, + "grad_norm": 0.21334104239940643, + "learning_rate": 7.208943099743888e-05, + "loss": 1.1504, + "step": 4646 + }, + { + "epoch": 1.2576106074956028, + "grad_norm": 0.2096724510192871, + "learning_rate": 7.206118347994421e-05, + "loss": 1.1493, + "step": 4648 + }, + { + "epoch": 1.2581518062508457, + "grad_norm": 0.20401859283447266, + "learning_rate": 7.203292721634863e-05, + "loss": 1.1447, + "step": 4650 + }, + { + "epoch": 1.2586930050060885, + "grad_norm": 0.20878173410892487, + "learning_rate": 7.200466221785427e-05, + "loss": 1.1426, + "step": 4652 + }, + { + "epoch": 1.2592342037613313, + "grad_norm": 0.2019282877445221, + "learning_rate": 7.19763884956667e-05, + "loss": 1.135, + "step": 4654 + }, + { + "epoch": 1.2597754025165742, + "grad_norm": 0.21409070491790771, + "learning_rate": 7.194810606099498e-05, + "loss": 1.1504, + "step": 4656 + }, + { + "epoch": 1.260316601271817, + "grad_norm": 0.1969073861837387, + "learning_rate": 7.191981492505163e-05, + "loss": 1.1646, + "step": 4658 + }, + { + "epoch": 1.2608578000270598, + "grad_norm": 0.21602770686149597, + "learning_rate": 7.189151509905257e-05, + "loss": 1.1428, + "step": 4660 + }, + { + "epoch": 1.2613989987823029, + "grad_norm": 0.20433968305587769, + "learning_rate": 7.186320659421721e-05, + "loss": 1.1438, + "step": 4662 + }, + { + "epoch": 1.2619401975375457, + "grad_norm": 0.18745940923690796, + "learning_rate": 7.183488942176838e-05, + "loss": 1.1233, + "step": 4664 + }, + { + "epoch": 1.2624813962927885, + "grad_norm": 0.2125159353017807, + "learning_rate": 7.180656359293236e-05, + "loss": 1.1365, + "step": 4666 + }, + { + "epoch": 1.2630225950480314, + "grad_norm": 0.2109309583902359, + "learning_rate": 7.177822911893883e-05, + "loss": 1.1189, + "step": 4668 + }, + { + "epoch": 1.2635637938032742, + "grad_norm": 0.20971255004405975, + "learning_rate": 7.174988601102096e-05, + "loss": 1.1547, + "step": 4670 + }, + { + "epoch": 1.264104992558517, + "grad_norm": 0.20887215435504913, + "learning_rate": 7.172153428041527e-05, + "loss": 1.1158, + "step": 4672 + }, + { + "epoch": 1.2646461913137599, + "grad_norm": 0.208766907453537, + "learning_rate": 7.169317393836175e-05, + "loss": 1.16, + "step": 4674 + }, + { + "epoch": 1.265187390069003, + "grad_norm": 0.2097134292125702, + "learning_rate": 7.166480499610379e-05, + "loss": 1.1458, + "step": 4676 + }, + { + "epoch": 1.2657285888242458, + "grad_norm": 0.20617030560970306, + "learning_rate": 7.163642746488817e-05, + "loss": 1.1327, + "step": 4678 + }, + { + "epoch": 1.2662697875794886, + "grad_norm": 0.19682592153549194, + "learning_rate": 7.160804135596509e-05, + "loss": 1.1516, + "step": 4680 + }, + { + "epoch": 1.2668109863347314, + "grad_norm": 0.21410858631134033, + "learning_rate": 7.157964668058818e-05, + "loss": 1.1456, + "step": 4682 + }, + { + "epoch": 1.2673521850899743, + "grad_norm": 0.20145538449287415, + "learning_rate": 7.15512434500144e-05, + "loss": 1.141, + "step": 4684 + }, + { + "epoch": 1.267893383845217, + "grad_norm": 0.22975938022136688, + "learning_rate": 7.152283167550416e-05, + "loss": 1.1372, + "step": 4686 + }, + { + "epoch": 1.26843458260046, + "grad_norm": 0.21135157346725464, + "learning_rate": 7.149441136832126e-05, + "loss": 1.133, + "step": 4688 + }, + { + "epoch": 1.268975781355703, + "grad_norm": 0.20664750039577484, + "learning_rate": 7.14659825397328e-05, + "loss": 1.1604, + "step": 4690 + }, + { + "epoch": 1.2695169801109458, + "grad_norm": 0.20334216952323914, + "learning_rate": 7.143754520100938e-05, + "loss": 1.1567, + "step": 4692 + }, + { + "epoch": 1.2700581788661887, + "grad_norm": 0.20391784608364105, + "learning_rate": 7.140909936342488e-05, + "loss": 1.1401, + "step": 4694 + }, + { + "epoch": 1.2705993776214315, + "grad_norm": 0.21056121587753296, + "learning_rate": 7.138064503825658e-05, + "loss": 1.141, + "step": 4696 + }, + { + "epoch": 1.2711405763766743, + "grad_norm": 0.2019626796245575, + "learning_rate": 7.135218223678514e-05, + "loss": 1.149, + "step": 4698 + }, + { + "epoch": 1.2716817751319172, + "grad_norm": 0.19824273884296417, + "learning_rate": 7.132371097029454e-05, + "loss": 1.143, + "step": 4700 + }, + { + "epoch": 1.27222297388716, + "grad_norm": 0.19521044194698334, + "learning_rate": 7.129523125007217e-05, + "loss": 1.1512, + "step": 4702 + }, + { + "epoch": 1.272764172642403, + "grad_norm": 0.18712207674980164, + "learning_rate": 7.126674308740874e-05, + "loss": 1.141, + "step": 4704 + }, + { + "epoch": 1.2733053713976457, + "grad_norm": 0.20281393826007843, + "learning_rate": 7.123824649359829e-05, + "loss": 1.135, + "step": 4706 + }, + { + "epoch": 1.2738465701528887, + "grad_norm": 0.18838536739349365, + "learning_rate": 7.120974147993826e-05, + "loss": 1.1516, + "step": 4708 + }, + { + "epoch": 1.2743877689081315, + "grad_norm": 0.1939174383878708, + "learning_rate": 7.118122805772934e-05, + "loss": 1.1513, + "step": 4710 + }, + { + "epoch": 1.2749289676633744, + "grad_norm": 0.19968074560165405, + "learning_rate": 7.115270623827565e-05, + "loss": 1.1444, + "step": 4712 + }, + { + "epoch": 1.2754701664186172, + "grad_norm": 0.19661501049995422, + "learning_rate": 7.112417603288458e-05, + "loss": 1.1416, + "step": 4714 + }, + { + "epoch": 1.27601136517386, + "grad_norm": 0.20567114651203156, + "learning_rate": 7.109563745286684e-05, + "loss": 1.1413, + "step": 4716 + }, + { + "epoch": 1.276552563929103, + "grad_norm": 0.19401569664478302, + "learning_rate": 7.10670905095365e-05, + "loss": 1.153, + "step": 4718 + }, + { + "epoch": 1.2770937626843457, + "grad_norm": 0.21700969338417053, + "learning_rate": 7.103853521421094e-05, + "loss": 1.1523, + "step": 4720 + }, + { + "epoch": 1.2776349614395888, + "grad_norm": 0.6526715159416199, + "learning_rate": 7.10099715782108e-05, + "loss": 1.1385, + "step": 4722 + }, + { + "epoch": 1.2781761601948316, + "grad_norm": 0.23611807823181152, + "learning_rate": 7.098139961286007e-05, + "loss": 1.1578, + "step": 4724 + }, + { + "epoch": 1.2787173589500744, + "grad_norm": 0.22932860255241394, + "learning_rate": 7.095281932948605e-05, + "loss": 1.1433, + "step": 4726 + }, + { + "epoch": 1.2792585577053173, + "grad_norm": 0.2409789115190506, + "learning_rate": 7.092423073941931e-05, + "loss": 1.1409, + "step": 4728 + }, + { + "epoch": 1.27979975646056, + "grad_norm": 0.2185688316822052, + "learning_rate": 7.089563385399371e-05, + "loss": 1.1486, + "step": 4730 + }, + { + "epoch": 1.280340955215803, + "grad_norm": 0.22509334981441498, + "learning_rate": 7.086702868454645e-05, + "loss": 1.1492, + "step": 4732 + }, + { + "epoch": 1.2808821539710458, + "grad_norm": 0.2434450089931488, + "learning_rate": 7.083841524241794e-05, + "loss": 1.1533, + "step": 4734 + }, + { + "epoch": 1.2814233527262888, + "grad_norm": 0.21914516389369965, + "learning_rate": 7.080979353895193e-05, + "loss": 1.1658, + "step": 4736 + }, + { + "epoch": 1.2819645514815317, + "grad_norm": 0.23442131280899048, + "learning_rate": 7.078116358549544e-05, + "loss": 1.1355, + "step": 4738 + }, + { + "epoch": 1.2825057502367745, + "grad_norm": 0.23039007186889648, + "learning_rate": 7.075252539339871e-05, + "loss": 1.1671, + "step": 4740 + }, + { + "epoch": 1.2830469489920173, + "grad_norm": 0.2244633287191391, + "learning_rate": 7.072387897401528e-05, + "loss": 1.1298, + "step": 4742 + }, + { + "epoch": 1.2835881477472602, + "grad_norm": 0.20873397588729858, + "learning_rate": 7.069522433870197e-05, + "loss": 1.1485, + "step": 4744 + }, + { + "epoch": 1.284129346502503, + "grad_norm": 0.20467349886894226, + "learning_rate": 7.066656149881881e-05, + "loss": 1.1349, + "step": 4746 + }, + { + "epoch": 1.2846705452577458, + "grad_norm": 0.21368630230426788, + "learning_rate": 7.063789046572916e-05, + "loss": 1.1396, + "step": 4748 + }, + { + "epoch": 1.2852117440129889, + "grad_norm": 0.21021267771720886, + "learning_rate": 7.060921125079954e-05, + "loss": 1.1241, + "step": 4750 + }, + { + "epoch": 1.2857529427682317, + "grad_norm": 0.20324338972568512, + "learning_rate": 7.058052386539975e-05, + "loss": 1.1451, + "step": 4752 + }, + { + "epoch": 1.2862941415234745, + "grad_norm": 0.20919355750083923, + "learning_rate": 7.055182832090287e-05, + "loss": 1.1474, + "step": 4754 + }, + { + "epoch": 1.2868353402787174, + "grad_norm": 0.1998199224472046, + "learning_rate": 7.052312462868514e-05, + "loss": 1.1181, + "step": 4756 + }, + { + "epoch": 1.2873765390339602, + "grad_norm": 0.209974467754364, + "learning_rate": 7.049441280012608e-05, + "loss": 1.1292, + "step": 4758 + }, + { + "epoch": 1.287917737789203, + "grad_norm": 0.19232650101184845, + "learning_rate": 7.046569284660841e-05, + "loss": 1.1363, + "step": 4760 + }, + { + "epoch": 1.2884589365444459, + "grad_norm": 0.19009630382061005, + "learning_rate": 7.043696477951812e-05, + "loss": 1.1195, + "step": 4762 + }, + { + "epoch": 1.289000135299689, + "grad_norm": 0.19685257971286774, + "learning_rate": 7.040822861024434e-05, + "loss": 1.1556, + "step": 4764 + }, + { + "epoch": 1.2895413340549315, + "grad_norm": 0.19877856969833374, + "learning_rate": 7.037948435017946e-05, + "loss": 1.1463, + "step": 4766 + }, + { + "epoch": 1.2900825328101746, + "grad_norm": 0.20576737821102142, + "learning_rate": 7.035073201071909e-05, + "loss": 1.1489, + "step": 4768 + }, + { + "epoch": 1.2906237315654174, + "grad_norm": 0.22239983081817627, + "learning_rate": 7.0321971603262e-05, + "loss": 1.1443, + "step": 4770 + }, + { + "epoch": 1.2911649303206603, + "grad_norm": 0.2060551643371582, + "learning_rate": 7.02932031392102e-05, + "loss": 1.1411, + "step": 4772 + }, + { + "epoch": 1.291706129075903, + "grad_norm": 0.20176757872104645, + "learning_rate": 7.026442662996888e-05, + "loss": 1.1542, + "step": 4774 + }, + { + "epoch": 1.292247327831146, + "grad_norm": 0.19776682555675507, + "learning_rate": 7.023564208694638e-05, + "loss": 1.1313, + "step": 4776 + }, + { + "epoch": 1.292788526586389, + "grad_norm": 0.19966423511505127, + "learning_rate": 7.020684952155428e-05, + "loss": 1.1169, + "step": 4778 + }, + { + "epoch": 1.2933297253416316, + "grad_norm": 0.18727391958236694, + "learning_rate": 7.017804894520735e-05, + "loss": 1.1263, + "step": 4780 + }, + { + "epoch": 1.2938709240968747, + "grad_norm": 0.19659611582756042, + "learning_rate": 7.014924036932345e-05, + "loss": 1.1227, + "step": 4782 + }, + { + "epoch": 1.2944121228521175, + "grad_norm": 0.20023833215236664, + "learning_rate": 7.01204238053237e-05, + "loss": 1.1196, + "step": 4784 + }, + { + "epoch": 1.2949533216073603, + "grad_norm": 0.20402821898460388, + "learning_rate": 7.009159926463237e-05, + "loss": 1.1413, + "step": 4786 + }, + { + "epoch": 1.2954945203626032, + "grad_norm": 0.21623113751411438, + "learning_rate": 7.006276675867685e-05, + "loss": 1.1611, + "step": 4788 + }, + { + "epoch": 1.296035719117846, + "grad_norm": 0.2232583463191986, + "learning_rate": 7.003392629888772e-05, + "loss": 1.1486, + "step": 4790 + }, + { + "epoch": 1.2965769178730888, + "grad_norm": 0.20442326366901398, + "learning_rate": 7.00050778966987e-05, + "loss": 1.141, + "step": 4792 + }, + { + "epoch": 1.2971181166283317, + "grad_norm": 0.20763815939426422, + "learning_rate": 6.99762215635467e-05, + "loss": 1.1333, + "step": 4794 + }, + { + "epoch": 1.2976593153835747, + "grad_norm": 0.1989380568265915, + "learning_rate": 6.994735731087171e-05, + "loss": 1.1338, + "step": 4796 + }, + { + "epoch": 1.2982005141388175, + "grad_norm": 0.20569868385791779, + "learning_rate": 6.991848515011689e-05, + "loss": 1.1211, + "step": 4798 + }, + { + "epoch": 1.2987417128940604, + "grad_norm": 0.3479338586330414, + "learning_rate": 6.988960509272855e-05, + "loss": 1.1212, + "step": 4800 + }, + { + "epoch": 1.2992829116493032, + "grad_norm": 0.2268143892288208, + "learning_rate": 6.986071715015611e-05, + "loss": 1.1217, + "step": 4802 + }, + { + "epoch": 1.299824110404546, + "grad_norm": 0.22618204355239868, + "learning_rate": 6.983182133385213e-05, + "loss": 1.1542, + "step": 4804 + }, + { + "epoch": 1.3003653091597889, + "grad_norm": 0.20148879289627075, + "learning_rate": 6.980291765527228e-05, + "loss": 1.129, + "step": 4806 + }, + { + "epoch": 1.3009065079150317, + "grad_norm": 0.2690919041633606, + "learning_rate": 6.977400612587535e-05, + "loss": 1.1441, + "step": 4808 + }, + { + "epoch": 1.3014477066702748, + "grad_norm": 0.19888824224472046, + "learning_rate": 6.974508675712323e-05, + "loss": 1.1387, + "step": 4810 + }, + { + "epoch": 1.3019889054255176, + "grad_norm": 0.21588613092899323, + "learning_rate": 6.971615956048094e-05, + "loss": 1.1558, + "step": 4812 + }, + { + "epoch": 1.3025301041807604, + "grad_norm": 0.2101002186536789, + "learning_rate": 6.968722454741662e-05, + "loss": 1.1429, + "step": 4814 + }, + { + "epoch": 1.3030713029360033, + "grad_norm": 0.20919887721538544, + "learning_rate": 6.965828172940145e-05, + "loss": 1.1504, + "step": 4816 + }, + { + "epoch": 1.303612501691246, + "grad_norm": 0.21550770103931427, + "learning_rate": 6.962933111790975e-05, + "loss": 1.1487, + "step": 4818 + }, + { + "epoch": 1.304153700446489, + "grad_norm": 0.20764297246932983, + "learning_rate": 6.96003727244189e-05, + "loss": 1.1305, + "step": 4820 + }, + { + "epoch": 1.3046948992017318, + "grad_norm": 0.2233119159936905, + "learning_rate": 6.957140656040942e-05, + "loss": 1.1314, + "step": 4822 + }, + { + "epoch": 1.3052360979569748, + "grad_norm": 0.22899368405342102, + "learning_rate": 6.954243263736486e-05, + "loss": 1.1515, + "step": 4824 + }, + { + "epoch": 1.3057772967122174, + "grad_norm": 0.26306775212287903, + "learning_rate": 6.951345096677183e-05, + "loss": 1.1531, + "step": 4826 + }, + { + "epoch": 1.3063184954674605, + "grad_norm": 0.23401707410812378, + "learning_rate": 6.948446156012007e-05, + "loss": 1.1498, + "step": 4828 + }, + { + "epoch": 1.3068596942227033, + "grad_norm": 0.24562807381153107, + "learning_rate": 6.945546442890236e-05, + "loss": 1.1173, + "step": 4830 + }, + { + "epoch": 1.3074008929779461, + "grad_norm": 0.2258317917585373, + "learning_rate": 6.942645958461451e-05, + "loss": 1.1403, + "step": 4832 + }, + { + "epoch": 1.307942091733189, + "grad_norm": 0.2683268189430237, + "learning_rate": 6.939744703875546e-05, + "loss": 1.1346, + "step": 4834 + }, + { + "epoch": 1.3084832904884318, + "grad_norm": 0.23453928530216217, + "learning_rate": 6.93684268028271e-05, + "loss": 1.1543, + "step": 4836 + }, + { + "epoch": 1.3090244892436749, + "grad_norm": 0.22976765036582947, + "learning_rate": 6.933939888833451e-05, + "loss": 1.1534, + "step": 4838 + }, + { + "epoch": 1.3095656879989175, + "grad_norm": 0.19898580014705658, + "learning_rate": 6.931036330678568e-05, + "loss": 1.1559, + "step": 4840 + }, + { + "epoch": 1.3101068867541605, + "grad_norm": 0.20745742321014404, + "learning_rate": 6.92813200696917e-05, + "loss": 1.1476, + "step": 4842 + }, + { + "epoch": 1.3106480855094034, + "grad_norm": 0.19812755286693573, + "learning_rate": 6.92522691885667e-05, + "loss": 1.1116, + "step": 4844 + }, + { + "epoch": 1.3111892842646462, + "grad_norm": 0.2005460262298584, + "learning_rate": 6.922321067492783e-05, + "loss": 1.1444, + "step": 4846 + }, + { + "epoch": 1.311730483019889, + "grad_norm": 0.22607260942459106, + "learning_rate": 6.919414454029525e-05, + "loss": 1.1302, + "step": 4848 + }, + { + "epoch": 1.3122716817751319, + "grad_norm": 0.21330516040325165, + "learning_rate": 6.916507079619217e-05, + "loss": 1.1336, + "step": 4850 + }, + { + "epoch": 1.3128128805303747, + "grad_norm": 0.2110857516527176, + "learning_rate": 6.913598945414479e-05, + "loss": 1.1563, + "step": 4852 + }, + { + "epoch": 1.3133540792856175, + "grad_norm": 0.21332454681396484, + "learning_rate": 6.910690052568236e-05, + "loss": 1.1303, + "step": 4854 + }, + { + "epoch": 1.3138952780408606, + "grad_norm": 0.21654580533504486, + "learning_rate": 6.907780402233706e-05, + "loss": 1.1323, + "step": 4856 + }, + { + "epoch": 1.3144364767961034, + "grad_norm": 0.22266875207424164, + "learning_rate": 6.904869995564419e-05, + "loss": 1.1477, + "step": 4858 + }, + { + "epoch": 1.3149776755513463, + "grad_norm": 0.21026547253131866, + "learning_rate": 6.901958833714196e-05, + "loss": 1.1545, + "step": 4860 + }, + { + "epoch": 1.315518874306589, + "grad_norm": 0.24251089990139008, + "learning_rate": 6.899046917837157e-05, + "loss": 1.1345, + "step": 4862 + }, + { + "epoch": 1.316060073061832, + "grad_norm": 0.2157732993364334, + "learning_rate": 6.896134249087727e-05, + "loss": 1.1591, + "step": 4864 + }, + { + "epoch": 1.3166012718170748, + "grad_norm": 0.21439550817012787, + "learning_rate": 6.893220828620626e-05, + "loss": 1.1239, + "step": 4866 + }, + { + "epoch": 1.3171424705723176, + "grad_norm": 0.20335568487644196, + "learning_rate": 6.890306657590871e-05, + "loss": 1.128, + "step": 4868 + }, + { + "epoch": 1.3176836693275606, + "grad_norm": 0.20837850868701935, + "learning_rate": 6.88739173715378e-05, + "loss": 1.1493, + "step": 4870 + }, + { + "epoch": 1.3182248680828035, + "grad_norm": 0.20061874389648438, + "learning_rate": 6.884476068464962e-05, + "loss": 1.1321, + "step": 4872 + }, + { + "epoch": 1.3187660668380463, + "grad_norm": 0.20998738706111908, + "learning_rate": 6.881559652680332e-05, + "loss": 1.1449, + "step": 4874 + }, + { + "epoch": 1.3193072655932891, + "grad_norm": 0.215812548995018, + "learning_rate": 6.878642490956091e-05, + "loss": 1.1387, + "step": 4876 + }, + { + "epoch": 1.319848464348532, + "grad_norm": 0.21340392529964447, + "learning_rate": 6.87572458444874e-05, + "loss": 1.1323, + "step": 4878 + }, + { + "epoch": 1.3203896631037748, + "grad_norm": 0.19966621696949005, + "learning_rate": 6.87280593431508e-05, + "loss": 1.1465, + "step": 4880 + }, + { + "epoch": 1.3209308618590176, + "grad_norm": 0.19357992708683014, + "learning_rate": 6.869886541712201e-05, + "loss": 1.1317, + "step": 4882 + }, + { + "epoch": 1.3214720606142607, + "grad_norm": 0.20211121439933777, + "learning_rate": 6.866966407797488e-05, + "loss": 1.136, + "step": 4884 + }, + { + "epoch": 1.3220132593695033, + "grad_norm": 0.20985981822013855, + "learning_rate": 6.864045533728618e-05, + "loss": 1.143, + "step": 4886 + }, + { + "epoch": 1.3225544581247464, + "grad_norm": 0.1939292699098587, + "learning_rate": 6.86112392066357e-05, + "loss": 1.1362, + "step": 4888 + }, + { + "epoch": 1.3230956568799892, + "grad_norm": 0.19840145111083984, + "learning_rate": 6.858201569760606e-05, + "loss": 1.135, + "step": 4890 + }, + { + "epoch": 1.323636855635232, + "grad_norm": 0.22339017689228058, + "learning_rate": 6.855278482178288e-05, + "loss": 1.1429, + "step": 4892 + }, + { + "epoch": 1.3241780543904749, + "grad_norm": 0.2093936651945114, + "learning_rate": 6.852354659075464e-05, + "loss": 1.1434, + "step": 4894 + }, + { + "epoch": 1.3247192531457177, + "grad_norm": 0.19721046090126038, + "learning_rate": 6.849430101611276e-05, + "loss": 1.135, + "step": 4896 + }, + { + "epoch": 1.3252604519009608, + "grad_norm": 0.20016610622406006, + "learning_rate": 6.84650481094516e-05, + "loss": 1.1211, + "step": 4898 + }, + { + "epoch": 1.3258016506562034, + "grad_norm": 0.20054888725280762, + "learning_rate": 6.843578788236837e-05, + "loss": 1.1325, + "step": 4900 + }, + { + "epoch": 1.3263428494114464, + "grad_norm": 0.20290598273277283, + "learning_rate": 6.840652034646325e-05, + "loss": 1.1403, + "step": 4902 + }, + { + "epoch": 1.3268840481666893, + "grad_norm": 0.19229461252689362, + "learning_rate": 6.837724551333926e-05, + "loss": 1.1283, + "step": 4904 + }, + { + "epoch": 1.327425246921932, + "grad_norm": 0.2016628235578537, + "learning_rate": 6.834796339460232e-05, + "loss": 1.1405, + "step": 4906 + }, + { + "epoch": 1.327966445677175, + "grad_norm": 0.19562238454818726, + "learning_rate": 6.83186740018613e-05, + "loss": 1.1397, + "step": 4908 + }, + { + "epoch": 1.3285076444324178, + "grad_norm": 0.19663389027118683, + "learning_rate": 6.828937734672785e-05, + "loss": 1.1182, + "step": 4910 + }, + { + "epoch": 1.3290488431876606, + "grad_norm": 0.18820057809352875, + "learning_rate": 6.826007344081658e-05, + "loss": 1.1551, + "step": 4912 + }, + { + "epoch": 1.3295900419429034, + "grad_norm": 0.19613684713840485, + "learning_rate": 6.823076229574496e-05, + "loss": 1.1525, + "step": 4914 + }, + { + "epoch": 1.3301312406981465, + "grad_norm": 0.1969241350889206, + "learning_rate": 6.820144392313333e-05, + "loss": 1.1255, + "step": 4916 + }, + { + "epoch": 1.3306724394533893, + "grad_norm": 0.20861990749835968, + "learning_rate": 6.817211833460483e-05, + "loss": 1.1375, + "step": 4918 + }, + { + "epoch": 1.3312136382086321, + "grad_norm": 0.2094329297542572, + "learning_rate": 6.814278554178558e-05, + "loss": 1.1308, + "step": 4920 + }, + { + "epoch": 1.331754836963875, + "grad_norm": 0.19369575381278992, + "learning_rate": 6.811344555630446e-05, + "loss": 1.1471, + "step": 4922 + }, + { + "epoch": 1.3322960357191178, + "grad_norm": 0.21166963875293732, + "learning_rate": 6.808409838979324e-05, + "loss": 1.1284, + "step": 4924 + }, + { + "epoch": 1.3328372344743606, + "grad_norm": 0.205407053232193, + "learning_rate": 6.805474405388652e-05, + "loss": 1.1391, + "step": 4926 + }, + { + "epoch": 1.3333784332296035, + "grad_norm": 0.19018761813640594, + "learning_rate": 6.802538256022177e-05, + "loss": 1.111, + "step": 4928 + }, + { + "epoch": 1.3339196319848465, + "grad_norm": 0.19279932975769043, + "learning_rate": 6.799601392043927e-05, + "loss": 1.1369, + "step": 4930 + }, + { + "epoch": 1.3344608307400894, + "grad_norm": 0.20698602497577667, + "learning_rate": 6.796663814618216e-05, + "loss": 1.1507, + "step": 4932 + }, + { + "epoch": 1.3350020294953322, + "grad_norm": 0.21521639823913574, + "learning_rate": 6.793725524909635e-05, + "loss": 1.1271, + "step": 4934 + }, + { + "epoch": 1.335543228250575, + "grad_norm": 0.200445294380188, + "learning_rate": 6.790786524083067e-05, + "loss": 1.1364, + "step": 4936 + }, + { + "epoch": 1.3360844270058179, + "grad_norm": 0.22544774413108826, + "learning_rate": 6.787846813303668e-05, + "loss": 1.1368, + "step": 4938 + }, + { + "epoch": 1.3366256257610607, + "grad_norm": 0.19603730738162994, + "learning_rate": 6.78490639373688e-05, + "loss": 1.123, + "step": 4940 + }, + { + "epoch": 1.3371668245163035, + "grad_norm": 0.21301992237567902, + "learning_rate": 6.781965266548425e-05, + "loss": 1.1448, + "step": 4942 + }, + { + "epoch": 1.3377080232715466, + "grad_norm": 0.2113674283027649, + "learning_rate": 6.779023432904305e-05, + "loss": 1.1511, + "step": 4944 + }, + { + "epoch": 1.3382492220267894, + "grad_norm": 0.21615102887153625, + "learning_rate": 6.776080893970803e-05, + "loss": 1.125, + "step": 4946 + }, + { + "epoch": 1.3387904207820323, + "grad_norm": 0.2178725004196167, + "learning_rate": 6.773137650914483e-05, + "loss": 1.1153, + "step": 4948 + }, + { + "epoch": 1.339331619537275, + "grad_norm": 0.19788886606693268, + "learning_rate": 6.770193704902184e-05, + "loss": 1.128, + "step": 4950 + }, + { + "epoch": 1.339872818292518, + "grad_norm": 0.20174042880535126, + "learning_rate": 6.767249057101025e-05, + "loss": 1.1316, + "step": 4952 + }, + { + "epoch": 1.3404140170477608, + "grad_norm": 0.20333023369312286, + "learning_rate": 6.764303708678406e-05, + "loss": 1.1357, + "step": 4954 + }, + { + "epoch": 1.3409552158030036, + "grad_norm": 0.20150478184223175, + "learning_rate": 6.761357660802003e-05, + "loss": 1.1434, + "step": 4956 + }, + { + "epoch": 1.3414964145582466, + "grad_norm": 0.20875629782676697, + "learning_rate": 6.75841091463977e-05, + "loss": 1.1429, + "step": 4958 + }, + { + "epoch": 1.3420376133134893, + "grad_norm": 0.23204512894153595, + "learning_rate": 6.755463471359936e-05, + "loss": 1.1299, + "step": 4960 + }, + { + "epoch": 1.3425788120687323, + "grad_norm": 0.20591384172439575, + "learning_rate": 6.752515332131006e-05, + "loss": 1.1358, + "step": 4962 + }, + { + "epoch": 1.3431200108239751, + "grad_norm": 0.22469563782215118, + "learning_rate": 6.749566498121765e-05, + "loss": 1.1347, + "step": 4964 + }, + { + "epoch": 1.343661209579218, + "grad_norm": 0.2194630652666092, + "learning_rate": 6.746616970501272e-05, + "loss": 1.1391, + "step": 4966 + }, + { + "epoch": 1.3442024083344608, + "grad_norm": 0.2115604132413864, + "learning_rate": 6.743666750438856e-05, + "loss": 1.1504, + "step": 4968 + }, + { + "epoch": 1.3447436070897036, + "grad_norm": 0.22515109181404114, + "learning_rate": 6.740715839104126e-05, + "loss": 1.1221, + "step": 4970 + }, + { + "epoch": 1.3452848058449465, + "grad_norm": 0.21035178005695343, + "learning_rate": 6.737764237666964e-05, + "loss": 1.1414, + "step": 4972 + }, + { + "epoch": 1.3458260046001893, + "grad_norm": 0.24386465549468994, + "learning_rate": 6.734811947297526e-05, + "loss": 1.1212, + "step": 4974 + }, + { + "epoch": 1.3463672033554324, + "grad_norm": 0.21872912347316742, + "learning_rate": 6.731858969166236e-05, + "loss": 1.1272, + "step": 4976 + }, + { + "epoch": 1.3469084021106752, + "grad_norm": 0.22619320452213287, + "learning_rate": 6.7289053044438e-05, + "loss": 1.1312, + "step": 4978 + }, + { + "epoch": 1.347449600865918, + "grad_norm": 0.23938359320163727, + "learning_rate": 6.725950954301186e-05, + "loss": 1.1321, + "step": 4980 + }, + { + "epoch": 1.3479907996211609, + "grad_norm": 0.22768397629261017, + "learning_rate": 6.722995919909643e-05, + "loss": 1.1291, + "step": 4982 + }, + { + "epoch": 1.3485319983764037, + "grad_norm": 0.2165715992450714, + "learning_rate": 6.720040202440684e-05, + "loss": 1.1657, + "step": 4984 + }, + { + "epoch": 1.3490731971316465, + "grad_norm": 0.2011752873659134, + "learning_rate": 6.717083803066096e-05, + "loss": 1.1198, + "step": 4986 + }, + { + "epoch": 1.3496143958868894, + "grad_norm": 0.2029523253440857, + "learning_rate": 6.714126722957938e-05, + "loss": 1.1299, + "step": 4988 + }, + { + "epoch": 1.3501555946421324, + "grad_norm": 0.21974807977676392, + "learning_rate": 6.711168963288537e-05, + "loss": 1.125, + "step": 4990 + }, + { + "epoch": 1.3506967933973753, + "grad_norm": 0.21973131597042084, + "learning_rate": 6.708210525230487e-05, + "loss": 1.1352, + "step": 4992 + }, + { + "epoch": 1.351237992152618, + "grad_norm": 0.22160713374614716, + "learning_rate": 6.705251409956657e-05, + "loss": 1.1398, + "step": 4994 + }, + { + "epoch": 1.351779190907861, + "grad_norm": 0.5308296084403992, + "learning_rate": 6.702291618640178e-05, + "loss": 1.1859, + "step": 4996 + }, + { + "epoch": 1.3523203896631038, + "grad_norm": 0.3140636086463928, + "learning_rate": 6.699331152454451e-05, + "loss": 1.1285, + "step": 4998 + }, + { + "epoch": 1.3528615884183466, + "grad_norm": 0.318144828081131, + "learning_rate": 6.696370012573148e-05, + "loss": 1.1123, + "step": 5000 + }, + { + "epoch": 1.3534027871735894, + "grad_norm": 0.2733546495437622, + "learning_rate": 6.693408200170205e-05, + "loss": 1.1354, + "step": 5002 + }, + { + "epoch": 1.3539439859288325, + "grad_norm": 0.22881552577018738, + "learning_rate": 6.690445716419822e-05, + "loss": 1.1446, + "step": 5004 + }, + { + "epoch": 1.3544851846840753, + "grad_norm": 0.22428303956985474, + "learning_rate": 6.687482562496473e-05, + "loss": 1.1469, + "step": 5006 + }, + { + "epoch": 1.3550263834393181, + "grad_norm": 0.23629316687583923, + "learning_rate": 6.68451873957489e-05, + "loss": 1.1412, + "step": 5008 + }, + { + "epoch": 1.355567582194561, + "grad_norm": 0.2209664285182953, + "learning_rate": 6.681554248830074e-05, + "loss": 1.1508, + "step": 5010 + }, + { + "epoch": 1.3561087809498038, + "grad_norm": 0.24471548199653625, + "learning_rate": 6.678589091437288e-05, + "loss": 1.142, + "step": 5012 + }, + { + "epoch": 1.3566499797050466, + "grad_norm": 0.23942361772060394, + "learning_rate": 6.675623268572066e-05, + "loss": 1.1451, + "step": 5014 + }, + { + "epoch": 1.3571911784602895, + "grad_norm": 0.25475817918777466, + "learning_rate": 6.672656781410196e-05, + "loss": 1.1508, + "step": 5016 + }, + { + "epoch": 1.3577323772155325, + "grad_norm": 0.23169760406017303, + "learning_rate": 6.669689631127738e-05, + "loss": 1.1231, + "step": 5018 + }, + { + "epoch": 1.3582735759707751, + "grad_norm": 0.221586674451828, + "learning_rate": 6.666721818901009e-05, + "loss": 1.12, + "step": 5020 + }, + { + "epoch": 1.3588147747260182, + "grad_norm": 0.21877366304397583, + "learning_rate": 6.663753345906591e-05, + "loss": 1.1266, + "step": 5022 + }, + { + "epoch": 1.359355973481261, + "grad_norm": 0.20271623134613037, + "learning_rate": 6.660784213321328e-05, + "loss": 1.1278, + "step": 5024 + }, + { + "epoch": 1.3598971722365039, + "grad_norm": 0.24258723855018616, + "learning_rate": 6.657814422322326e-05, + "loss": 1.1396, + "step": 5026 + }, + { + "epoch": 1.3604383709917467, + "grad_norm": 0.204366996884346, + "learning_rate": 6.65484397408695e-05, + "loss": 1.1221, + "step": 5028 + }, + { + "epoch": 1.3609795697469895, + "grad_norm": 0.2240559309720993, + "learning_rate": 6.65187286979283e-05, + "loss": 1.1552, + "step": 5030 + }, + { + "epoch": 1.3615207685022326, + "grad_norm": 0.21712256968021393, + "learning_rate": 6.648901110617846e-05, + "loss": 1.1402, + "step": 5032 + }, + { + "epoch": 1.3620619672574752, + "grad_norm": 0.23759770393371582, + "learning_rate": 6.64592869774015e-05, + "loss": 1.1289, + "step": 5034 + }, + { + "epoch": 1.3626031660127182, + "grad_norm": 0.20966365933418274, + "learning_rate": 6.642955632338148e-05, + "loss": 1.1275, + "step": 5036 + }, + { + "epoch": 1.363144364767961, + "grad_norm": 0.20241115987300873, + "learning_rate": 6.639981915590501e-05, + "loss": 1.1366, + "step": 5038 + }, + { + "epoch": 1.363685563523204, + "grad_norm": 0.1985287219285965, + "learning_rate": 6.637007548676132e-05, + "loss": 1.1201, + "step": 5040 + }, + { + "epoch": 1.3642267622784467, + "grad_norm": 0.23188084363937378, + "learning_rate": 6.634032532774224e-05, + "loss": 1.1256, + "step": 5042 + }, + { + "epoch": 1.3647679610336896, + "grad_norm": 1.0067830085754395, + "learning_rate": 6.631056869064211e-05, + "loss": 1.1488, + "step": 5044 + }, + { + "epoch": 1.3653091597889324, + "grad_norm": 0.26151588559150696, + "learning_rate": 6.628080558725794e-05, + "loss": 1.1368, + "step": 5046 + }, + { + "epoch": 1.3658503585441752, + "grad_norm": 0.2546170949935913, + "learning_rate": 6.625103602938916e-05, + "loss": 1.1213, + "step": 5048 + }, + { + "epoch": 1.3663915572994183, + "grad_norm": 0.24770961701869965, + "learning_rate": 6.622126002883786e-05, + "loss": 1.1517, + "step": 5050 + }, + { + "epoch": 1.3669327560546611, + "grad_norm": 0.20164626836776733, + "learning_rate": 6.619147759740869e-05, + "loss": 1.1274, + "step": 5052 + }, + { + "epoch": 1.367473954809904, + "grad_norm": 0.21095354855060577, + "learning_rate": 6.61616887469088e-05, + "loss": 1.1387, + "step": 5054 + }, + { + "epoch": 1.3680151535651468, + "grad_norm": 0.20875804126262665, + "learning_rate": 6.613189348914788e-05, + "loss": 1.1321, + "step": 5056 + }, + { + "epoch": 1.3685563523203896, + "grad_norm": 0.20944173634052277, + "learning_rate": 6.610209183593824e-05, + "loss": 1.1093, + "step": 5058 + }, + { + "epoch": 1.3690975510756325, + "grad_norm": 0.195838063955307, + "learning_rate": 6.607228379909463e-05, + "loss": 1.128, + "step": 5060 + }, + { + "epoch": 1.3696387498308753, + "grad_norm": 0.20506072044372559, + "learning_rate": 6.604246939043437e-05, + "loss": 1.1258, + "step": 5062 + }, + { + "epoch": 1.3701799485861184, + "grad_norm": 0.21078158915042877, + "learning_rate": 6.601264862177735e-05, + "loss": 1.1111, + "step": 5064 + }, + { + "epoch": 1.3707211473413612, + "grad_norm": 0.20859210193157196, + "learning_rate": 6.598282150494588e-05, + "loss": 1.1477, + "step": 5066 + }, + { + "epoch": 1.371262346096604, + "grad_norm": 0.26835790276527405, + "learning_rate": 6.595298805176488e-05, + "loss": 1.1459, + "step": 5068 + }, + { + "epoch": 1.3718035448518469, + "grad_norm": 0.28039342164993286, + "learning_rate": 6.592314827406177e-05, + "loss": 1.137, + "step": 5070 + }, + { + "epoch": 1.3723447436070897, + "grad_norm": 0.25044727325439453, + "learning_rate": 6.58933021836664e-05, + "loss": 1.1468, + "step": 5072 + }, + { + "epoch": 1.3728859423623325, + "grad_norm": 0.2226564884185791, + "learning_rate": 6.586344979241122e-05, + "loss": 1.1252, + "step": 5074 + }, + { + "epoch": 1.3734271411175754, + "grad_norm": 0.2516416907310486, + "learning_rate": 6.58335911121311e-05, + "loss": 1.1248, + "step": 5076 + }, + { + "epoch": 1.3739683398728184, + "grad_norm": 0.2427980750799179, + "learning_rate": 6.580372615466348e-05, + "loss": 1.1575, + "step": 5078 + }, + { + "epoch": 1.374509538628061, + "grad_norm": 0.22615928947925568, + "learning_rate": 6.577385493184822e-05, + "loss": 1.1181, + "step": 5080 + }, + { + "epoch": 1.375050737383304, + "grad_norm": 0.24130937457084656, + "learning_rate": 6.574397745552772e-05, + "loss": 1.1158, + "step": 5082 + }, + { + "epoch": 1.375591936138547, + "grad_norm": 0.22093050181865692, + "learning_rate": 6.571409373754678e-05, + "loss": 1.1598, + "step": 5084 + }, + { + "epoch": 1.3761331348937897, + "grad_norm": 20.267744064331055, + "learning_rate": 6.568420378975278e-05, + "loss": 1.1315, + "step": 5086 + }, + { + "epoch": 1.3766743336490326, + "grad_norm": 0.8621043562889099, + "learning_rate": 6.565430762399546e-05, + "loss": 1.1657, + "step": 5088 + }, + { + "epoch": 1.3772155324042754, + "grad_norm": 0.5339746475219727, + "learning_rate": 6.562440525212712e-05, + "loss": 1.1516, + "step": 5090 + }, + { + "epoch": 1.3777567311595185, + "grad_norm": 7.3805365562438965, + "learning_rate": 6.559449668600248e-05, + "loss": 1.2317, + "step": 5092 + }, + { + "epoch": 1.378297929914761, + "grad_norm": 0.4800806939601898, + "learning_rate": 6.556458193747871e-05, + "loss": 1.248, + "step": 5094 + }, + { + "epoch": 1.3788391286700041, + "grad_norm": 0.3002035617828369, + "learning_rate": 6.553466101841542e-05, + "loss": 1.2319, + "step": 5096 + }, + { + "epoch": 1.379380327425247, + "grad_norm": 0.3063008785247803, + "learning_rate": 6.550473394067472e-05, + "loss": 1.2244, + "step": 5098 + }, + { + "epoch": 1.3799215261804898, + "grad_norm": 0.2721727192401886, + "learning_rate": 6.547480071612107e-05, + "loss": 1.144, + "step": 5100 + }, + { + "epoch": 1.3804627249357326, + "grad_norm": 0.2548552453517914, + "learning_rate": 6.544486135662146e-05, + "loss": 1.1369, + "step": 5102 + }, + { + "epoch": 1.3810039236909755, + "grad_norm": 0.2472192496061325, + "learning_rate": 6.541491587404529e-05, + "loss": 1.1254, + "step": 5104 + }, + { + "epoch": 1.3815451224462183, + "grad_norm": 0.2365529090166092, + "learning_rate": 6.538496428026434e-05, + "loss": 1.1292, + "step": 5106 + }, + { + "epoch": 1.3820863212014611, + "grad_norm": 0.2324395775794983, + "learning_rate": 6.535500658715286e-05, + "loss": 1.1315, + "step": 5108 + }, + { + "epoch": 1.3826275199567042, + "grad_norm": 0.22274981439113617, + "learning_rate": 6.532504280658747e-05, + "loss": 1.1334, + "step": 5110 + }, + { + "epoch": 1.383168718711947, + "grad_norm": 0.22897140681743622, + "learning_rate": 6.529507295044728e-05, + "loss": 1.1357, + "step": 5112 + }, + { + "epoch": 1.3837099174671899, + "grad_norm": 0.21158845722675323, + "learning_rate": 6.526509703061375e-05, + "loss": 1.1315, + "step": 5114 + }, + { + "epoch": 1.3842511162224327, + "grad_norm": 0.25797906517982483, + "learning_rate": 6.523511505897074e-05, + "loss": 1.1264, + "step": 5116 + }, + { + "epoch": 1.3847923149776755, + "grad_norm": 0.20588941872119904, + "learning_rate": 6.520512704740455e-05, + "loss": 1.1293, + "step": 5118 + }, + { + "epoch": 1.3853335137329184, + "grad_norm": 0.20949113368988037, + "learning_rate": 6.517513300780385e-05, + "loss": 1.1399, + "step": 5120 + }, + { + "epoch": 1.3858747124881612, + "grad_norm": 0.5824947953224182, + "learning_rate": 6.51451329520597e-05, + "loss": 1.1393, + "step": 5122 + }, + { + "epoch": 1.3864159112434042, + "grad_norm": 0.2793792486190796, + "learning_rate": 6.511512689206552e-05, + "loss": 1.1285, + "step": 5124 + }, + { + "epoch": 1.386957109998647, + "grad_norm": 0.24267977476119995, + "learning_rate": 6.508511483971718e-05, + "loss": 1.1481, + "step": 5126 + }, + { + "epoch": 1.38749830875389, + "grad_norm": 0.23414160311222076, + "learning_rate": 6.505509680691285e-05, + "loss": 1.1364, + "step": 5128 + }, + { + "epoch": 1.3880395075091327, + "grad_norm": 0.22055640816688538, + "learning_rate": 6.502507280555313e-05, + "loss": 1.1362, + "step": 5130 + }, + { + "epoch": 1.3885807062643756, + "grad_norm": 0.22319395840168, + "learning_rate": 6.499504284754093e-05, + "loss": 1.1341, + "step": 5132 + }, + { + "epoch": 1.3891219050196184, + "grad_norm": 0.2219814509153366, + "learning_rate": 6.496500694478158e-05, + "loss": 1.1291, + "step": 5134 + }, + { + "epoch": 1.3896631037748612, + "grad_norm": 0.22638678550720215, + "learning_rate": 6.493496510918273e-05, + "loss": 1.1493, + "step": 5136 + }, + { + "epoch": 1.3902043025301043, + "grad_norm": 0.21132950484752655, + "learning_rate": 6.490491735265438e-05, + "loss": 1.1283, + "step": 5138 + }, + { + "epoch": 1.390745501285347, + "grad_norm": 0.23047256469726562, + "learning_rate": 6.48748636871089e-05, + "loss": 1.1402, + "step": 5140 + }, + { + "epoch": 1.39128670004059, + "grad_norm": 0.21426177024841309, + "learning_rate": 6.484480412446097e-05, + "loss": 1.1259, + "step": 5142 + }, + { + "epoch": 1.3918278987958328, + "grad_norm": 0.22578999400138855, + "learning_rate": 6.481473867662766e-05, + "loss": 1.1304, + "step": 5144 + }, + { + "epoch": 1.3923690975510756, + "grad_norm": 0.20824173092842102, + "learning_rate": 6.478466735552832e-05, + "loss": 1.1291, + "step": 5146 + }, + { + "epoch": 1.3929102963063185, + "grad_norm": 0.20857419073581696, + "learning_rate": 6.475459017308466e-05, + "loss": 1.1132, + "step": 5148 + }, + { + "epoch": 1.3934514950615613, + "grad_norm": 0.20939777791500092, + "learning_rate": 6.47245071412207e-05, + "loss": 1.1358, + "step": 5150 + }, + { + "epoch": 1.3939926938168044, + "grad_norm": 0.2175268530845642, + "learning_rate": 6.469441827186278e-05, + "loss": 1.1368, + "step": 5152 + }, + { + "epoch": 1.394533892572047, + "grad_norm": 0.9276557564735413, + "learning_rate": 6.466432357693955e-05, + "loss": 1.1468, + "step": 5154 + }, + { + "epoch": 1.39507509132729, + "grad_norm": 0.2582313120365143, + "learning_rate": 6.4634223068382e-05, + "loss": 1.143, + "step": 5156 + }, + { + "epoch": 1.3956162900825329, + "grad_norm": 0.2258533239364624, + "learning_rate": 6.460411675812337e-05, + "loss": 1.1324, + "step": 5158 + }, + { + "epoch": 1.3961574888377757, + "grad_norm": 0.23796269297599792, + "learning_rate": 6.457400465809925e-05, + "loss": 1.1181, + "step": 5160 + }, + { + "epoch": 1.3966986875930185, + "grad_norm": 0.2054450958967209, + "learning_rate": 6.454388678024752e-05, + "loss": 1.1342, + "step": 5162 + }, + { + "epoch": 1.3972398863482614, + "grad_norm": 0.30061593651771545, + "learning_rate": 6.45137631365083e-05, + "loss": 1.1404, + "step": 5164 + }, + { + "epoch": 1.3977810851035042, + "grad_norm": 0.2129613757133484, + "learning_rate": 6.448363373882405e-05, + "loss": 1.1331, + "step": 5166 + }, + { + "epoch": 1.398322283858747, + "grad_norm": 0.21098573505878448, + "learning_rate": 6.445349859913952e-05, + "loss": 1.1163, + "step": 5168 + }, + { + "epoch": 1.39886348261399, + "grad_norm": 0.7791925668716431, + "learning_rate": 6.442335772940167e-05, + "loss": 1.122, + "step": 5170 + }, + { + "epoch": 1.399404681369233, + "grad_norm": 0.27684348821640015, + "learning_rate": 6.439321114155981e-05, + "loss": 1.1117, + "step": 5172 + }, + { + "epoch": 1.3999458801244757, + "grad_norm": 0.27352577447891235, + "learning_rate": 6.436305884756543e-05, + "loss": 1.1255, + "step": 5174 + }, + { + "epoch": 1.4004870788797186, + "grad_norm": 0.22613908350467682, + "learning_rate": 6.433290085937239e-05, + "loss": 1.1343, + "step": 5176 + }, + { + "epoch": 1.4010282776349614, + "grad_norm": 0.2271454930305481, + "learning_rate": 6.430273718893671e-05, + "loss": 1.1367, + "step": 5178 + }, + { + "epoch": 1.4015694763902042, + "grad_norm": 0.21145206689834595, + "learning_rate": 6.427256784821671e-05, + "loss": 1.1392, + "step": 5180 + }, + { + "epoch": 1.402110675145447, + "grad_norm": 0.21040070056915283, + "learning_rate": 6.424239284917296e-05, + "loss": 1.1516, + "step": 5182 + }, + { + "epoch": 1.4026518739006901, + "grad_norm": 0.22618639469146729, + "learning_rate": 6.421221220376826e-05, + "loss": 1.1405, + "step": 5184 + }, + { + "epoch": 1.403193072655933, + "grad_norm": 0.20837995409965515, + "learning_rate": 6.418202592396762e-05, + "loss": 1.1209, + "step": 5186 + }, + { + "epoch": 1.4037342714111758, + "grad_norm": 0.1959199756383896, + "learning_rate": 6.415183402173837e-05, + "loss": 1.1421, + "step": 5188 + }, + { + "epoch": 1.4042754701664186, + "grad_norm": 0.21497224271297455, + "learning_rate": 6.412163650904997e-05, + "loss": 1.1275, + "step": 5190 + }, + { + "epoch": 1.4048166689216615, + "grad_norm": 0.20324070751667023, + "learning_rate": 6.409143339787416e-05, + "loss": 1.147, + "step": 5192 + }, + { + "epoch": 1.4053578676769043, + "grad_norm": 0.19893191754817963, + "learning_rate": 6.406122470018489e-05, + "loss": 1.1451, + "step": 5194 + }, + { + "epoch": 1.4058990664321471, + "grad_norm": 0.2447110414505005, + "learning_rate": 6.403101042795833e-05, + "loss": 1.1311, + "step": 5196 + }, + { + "epoch": 1.4064402651873902, + "grad_norm": 0.20944955945014954, + "learning_rate": 6.400079059317283e-05, + "loss": 1.1245, + "step": 5198 + }, + { + "epoch": 1.406981463942633, + "grad_norm": 0.19964562356472015, + "learning_rate": 6.397056520780901e-05, + "loss": 1.133, + "step": 5200 + }, + { + "epoch": 1.4075226626978758, + "grad_norm": 0.20749281346797943, + "learning_rate": 6.394033428384961e-05, + "loss": 1.1166, + "step": 5202 + }, + { + "epoch": 1.4080638614531187, + "grad_norm": 0.2048967033624649, + "learning_rate": 6.391009783327961e-05, + "loss": 1.1212, + "step": 5204 + }, + { + "epoch": 1.4086050602083615, + "grad_norm": 0.20158132910728455, + "learning_rate": 6.387985586808618e-05, + "loss": 1.1327, + "step": 5206 + }, + { + "epoch": 1.4091462589636043, + "grad_norm": 0.2008758783340454, + "learning_rate": 6.384960840025868e-05, + "loss": 1.1342, + "step": 5208 + }, + { + "epoch": 1.4096874577188472, + "grad_norm": 0.20249807834625244, + "learning_rate": 6.381935544178863e-05, + "loss": 1.1515, + "step": 5210 + }, + { + "epoch": 1.4102286564740902, + "grad_norm": 0.20463450253009796, + "learning_rate": 6.378909700466975e-05, + "loss": 1.1372, + "step": 5212 + }, + { + "epoch": 1.4107698552293328, + "grad_norm": 0.2040434032678604, + "learning_rate": 6.37588331008979e-05, + "loss": 1.1194, + "step": 5214 + }, + { + "epoch": 1.411311053984576, + "grad_norm": 0.2022273689508438, + "learning_rate": 6.372856374247116e-05, + "loss": 1.1308, + "step": 5216 + }, + { + "epoch": 1.4118522527398187, + "grad_norm": 0.19660891592502594, + "learning_rate": 6.369828894138972e-05, + "loss": 1.153, + "step": 5218 + }, + { + "epoch": 1.4123934514950616, + "grad_norm": 0.20182554423809052, + "learning_rate": 6.366800870965595e-05, + "loss": 1.1085, + "step": 5220 + }, + { + "epoch": 1.4129346502503044, + "grad_norm": 0.2155122309923172, + "learning_rate": 6.363772305927439e-05, + "loss": 1.1408, + "step": 5222 + }, + { + "epoch": 1.4134758490055472, + "grad_norm": 0.1944376677274704, + "learning_rate": 6.36074320022517e-05, + "loss": 1.134, + "step": 5224 + }, + { + "epoch": 1.41401704776079, + "grad_norm": 0.19909417629241943, + "learning_rate": 6.357713555059667e-05, + "loss": 1.1211, + "step": 5226 + }, + { + "epoch": 1.414558246516033, + "grad_norm": 0.2039753794670105, + "learning_rate": 6.354683371632028e-05, + "loss": 1.135, + "step": 5228 + }, + { + "epoch": 1.415099445271276, + "grad_norm": 0.2083454132080078, + "learning_rate": 6.351652651143563e-05, + "loss": 1.1153, + "step": 5230 + }, + { + "epoch": 1.4156406440265188, + "grad_norm": 0.19243714213371277, + "learning_rate": 6.34862139479579e-05, + "loss": 1.1396, + "step": 5232 + }, + { + "epoch": 1.4161818427817616, + "grad_norm": 0.19831132888793945, + "learning_rate": 6.345589603790445e-05, + "loss": 1.1144, + "step": 5234 + }, + { + "epoch": 1.4167230415370045, + "grad_norm": 0.20038144290447235, + "learning_rate": 6.342557279329473e-05, + "loss": 1.1217, + "step": 5236 + }, + { + "epoch": 1.4172642402922473, + "grad_norm": 0.1926330327987671, + "learning_rate": 6.33952442261503e-05, + "loss": 1.1121, + "step": 5238 + }, + { + "epoch": 1.4178054390474901, + "grad_norm": 0.1921953707933426, + "learning_rate": 6.33649103484949e-05, + "loss": 1.1044, + "step": 5240 + }, + { + "epoch": 1.418346637802733, + "grad_norm": 0.19673408567905426, + "learning_rate": 6.333457117235426e-05, + "loss": 1.1277, + "step": 5242 + }, + { + "epoch": 1.418887836557976, + "grad_norm": 0.21036703884601593, + "learning_rate": 6.330422670975629e-05, + "loss": 1.1299, + "step": 5244 + }, + { + "epoch": 1.4194290353132188, + "grad_norm": 0.20971907675266266, + "learning_rate": 6.3273876972731e-05, + "loss": 1.1374, + "step": 5246 + }, + { + "epoch": 1.4199702340684617, + "grad_norm": 0.19726496934890747, + "learning_rate": 6.324352197331043e-05, + "loss": 1.158, + "step": 5248 + }, + { + "epoch": 1.4205114328237045, + "grad_norm": 0.19105949997901917, + "learning_rate": 6.321316172352875e-05, + "loss": 1.1345, + "step": 5250 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.19923214614391327, + "learning_rate": 6.318279623542223e-05, + "loss": 1.12, + "step": 5252 + }, + { + "epoch": 1.4215938303341902, + "grad_norm": 0.2036782056093216, + "learning_rate": 6.315242552102919e-05, + "loss": 1.1147, + "step": 5254 + }, + { + "epoch": 1.422135029089433, + "grad_norm": 0.20262980461120605, + "learning_rate": 6.312204959238999e-05, + "loss": 1.1234, + "step": 5256 + }, + { + "epoch": 1.422676227844676, + "grad_norm": 0.2113923728466034, + "learning_rate": 6.309166846154712e-05, + "loss": 1.1301, + "step": 5258 + }, + { + "epoch": 1.423217426599919, + "grad_norm": 0.20867778360843658, + "learning_rate": 6.306128214054508e-05, + "loss": 1.1239, + "step": 5260 + }, + { + "epoch": 1.4237586253551617, + "grad_norm": 0.20628774166107178, + "learning_rate": 6.303089064143049e-05, + "loss": 1.1468, + "step": 5262 + }, + { + "epoch": 1.4242998241104046, + "grad_norm": 0.21228572726249695, + "learning_rate": 6.300049397625194e-05, + "loss": 1.1144, + "step": 5264 + }, + { + "epoch": 1.4248410228656474, + "grad_norm": 0.2117290496826172, + "learning_rate": 6.297009215706013e-05, + "loss": 1.1314, + "step": 5266 + }, + { + "epoch": 1.4253822216208902, + "grad_norm": 0.2099006623029709, + "learning_rate": 6.293968519590779e-05, + "loss": 1.1248, + "step": 5268 + }, + { + "epoch": 1.425923420376133, + "grad_norm": 0.20300981402397156, + "learning_rate": 6.290927310484969e-05, + "loss": 1.1097, + "step": 5270 + }, + { + "epoch": 1.4264646191313761, + "grad_norm": 0.22275353968143463, + "learning_rate": 6.287885589594258e-05, + "loss": 1.118, + "step": 5272 + }, + { + "epoch": 1.4270058178866187, + "grad_norm": 0.20226998627185822, + "learning_rate": 6.284843358124538e-05, + "loss": 1.1182, + "step": 5274 + }, + { + "epoch": 1.4275470166418618, + "grad_norm": 0.21334192156791687, + "learning_rate": 6.281800617281884e-05, + "loss": 1.1023, + "step": 5276 + }, + { + "epoch": 1.4280882153971046, + "grad_norm": 0.2120470404624939, + "learning_rate": 6.278757368272587e-05, + "loss": 1.1554, + "step": 5278 + }, + { + "epoch": 1.4286294141523475, + "grad_norm": 0.20143356919288635, + "learning_rate": 6.27571361230314e-05, + "loss": 1.1467, + "step": 5280 + }, + { + "epoch": 1.4291706129075903, + "grad_norm": 0.20335279405117035, + "learning_rate": 6.272669350580225e-05, + "loss": 1.1178, + "step": 5282 + }, + { + "epoch": 1.4297118116628331, + "grad_norm": 0.2324477583169937, + "learning_rate": 6.269624584310734e-05, + "loss": 1.1288, + "step": 5284 + }, + { + "epoch": 1.4302530104180762, + "grad_norm": 0.22252020239830017, + "learning_rate": 6.26657931470176e-05, + "loss": 1.1465, + "step": 5286 + }, + { + "epoch": 1.4307942091733188, + "grad_norm": 0.19176894426345825, + "learning_rate": 6.263533542960591e-05, + "loss": 1.131, + "step": 5288 + }, + { + "epoch": 1.4313354079285618, + "grad_norm": 0.20940960943698883, + "learning_rate": 6.260487270294714e-05, + "loss": 1.1133, + "step": 5290 + }, + { + "epoch": 1.4318766066838047, + "grad_norm": 0.2106446623802185, + "learning_rate": 6.257440497911817e-05, + "loss": 1.1438, + "step": 5292 + }, + { + "epoch": 1.4324178054390475, + "grad_norm": 0.20876814424991608, + "learning_rate": 6.254393227019786e-05, + "loss": 1.1312, + "step": 5294 + }, + { + "epoch": 1.4329590041942903, + "grad_norm": 0.2158835232257843, + "learning_rate": 6.251345458826703e-05, + "loss": 1.1284, + "step": 5296 + }, + { + "epoch": 1.4335002029495332, + "grad_norm": 0.19507522881031036, + "learning_rate": 6.248297194540849e-05, + "loss": 1.1194, + "step": 5298 + }, + { + "epoch": 1.434041401704776, + "grad_norm": 0.20181389153003693, + "learning_rate": 6.2452484353707e-05, + "loss": 1.1119, + "step": 5300 + }, + { + "epoch": 1.4345826004600188, + "grad_norm": 0.22877374291419983, + "learning_rate": 6.242199182524931e-05, + "loss": 1.1195, + "step": 5302 + }, + { + "epoch": 1.435123799215262, + "grad_norm": 0.2084018886089325, + "learning_rate": 6.239149437212407e-05, + "loss": 1.1255, + "step": 5304 + }, + { + "epoch": 1.4356649979705047, + "grad_norm": 0.20372161269187927, + "learning_rate": 6.236099200642193e-05, + "loss": 1.1286, + "step": 5306 + }, + { + "epoch": 1.4362061967257476, + "grad_norm": 0.19950535893440247, + "learning_rate": 6.233048474023551e-05, + "loss": 1.1145, + "step": 5308 + }, + { + "epoch": 1.4367473954809904, + "grad_norm": 0.20108552277088165, + "learning_rate": 6.229997258565929e-05, + "loss": 1.1341, + "step": 5310 + }, + { + "epoch": 1.4372885942362332, + "grad_norm": 0.2273787260055542, + "learning_rate": 6.226945555478977e-05, + "loss": 1.1273, + "step": 5312 + }, + { + "epoch": 1.437829792991476, + "grad_norm": 0.22451777756214142, + "learning_rate": 6.223893365972535e-05, + "loss": 1.1054, + "step": 5314 + }, + { + "epoch": 1.438370991746719, + "grad_norm": 0.21472510695457458, + "learning_rate": 6.220840691256633e-05, + "loss": 1.1257, + "step": 5316 + }, + { + "epoch": 1.438912190501962, + "grad_norm": 0.22151541709899902, + "learning_rate": 6.217787532541499e-05, + "loss": 1.137, + "step": 5318 + }, + { + "epoch": 1.4394533892572048, + "grad_norm": 0.2031947374343872, + "learning_rate": 6.21473389103755e-05, + "loss": 1.1253, + "step": 5320 + }, + { + "epoch": 1.4399945880124476, + "grad_norm": 0.20195578038692474, + "learning_rate": 6.211679767955393e-05, + "loss": 1.128, + "step": 5322 + }, + { + "epoch": 1.4405357867676905, + "grad_norm": 0.19635425508022308, + "learning_rate": 6.208625164505828e-05, + "loss": 1.1214, + "step": 5324 + }, + { + "epoch": 1.4410769855229333, + "grad_norm": 0.20612046122550964, + "learning_rate": 6.205570081899846e-05, + "loss": 1.128, + "step": 5326 + }, + { + "epoch": 1.4416181842781761, + "grad_norm": 0.21617263555526733, + "learning_rate": 6.202514521348627e-05, + "loss": 1.1528, + "step": 5328 + }, + { + "epoch": 1.442159383033419, + "grad_norm": 0.2163170725107193, + "learning_rate": 6.199458484063537e-05, + "loss": 1.1125, + "step": 5330 + }, + { + "epoch": 1.442700581788662, + "grad_norm": 0.23022876679897308, + "learning_rate": 6.196401971256138e-05, + "loss": 1.1316, + "step": 5332 + }, + { + "epoch": 1.4432417805439046, + "grad_norm": 0.2044323831796646, + "learning_rate": 6.193344984138176e-05, + "loss": 1.1188, + "step": 5334 + }, + { + "epoch": 1.4437829792991477, + "grad_norm": 0.2090006023645401, + "learning_rate": 6.190287523921585e-05, + "loss": 1.1118, + "step": 5336 + }, + { + "epoch": 1.4443241780543905, + "grad_norm": 0.21892942488193512, + "learning_rate": 6.187229591818487e-05, + "loss": 1.1139, + "step": 5338 + }, + { + "epoch": 1.4448653768096333, + "grad_norm": 0.22355982661247253, + "learning_rate": 6.184171189041194e-05, + "loss": 1.1013, + "step": 5340 + }, + { + "epoch": 1.4454065755648762, + "grad_norm": 0.20586957037448883, + "learning_rate": 6.181112316802199e-05, + "loss": 1.1317, + "step": 5342 + }, + { + "epoch": 1.445947774320119, + "grad_norm": 0.2016911506652832, + "learning_rate": 6.178052976314186e-05, + "loss": 1.1356, + "step": 5344 + }, + { + "epoch": 1.446488973075362, + "grad_norm": 0.20564743876457214, + "learning_rate": 6.174993168790022e-05, + "loss": 1.1019, + "step": 5346 + }, + { + "epoch": 1.4470301718306047, + "grad_norm": 0.20427408814430237, + "learning_rate": 6.171932895442762e-05, + "loss": 1.1171, + "step": 5348 + }, + { + "epoch": 1.4475713705858477, + "grad_norm": 0.21171078085899353, + "learning_rate": 6.168872157485641e-05, + "loss": 1.1292, + "step": 5350 + }, + { + "epoch": 1.4481125693410906, + "grad_norm": 0.2225184589624405, + "learning_rate": 6.165810956132082e-05, + "loss": 1.1199, + "step": 5352 + }, + { + "epoch": 1.4486537680963334, + "grad_norm": 0.2076125293970108, + "learning_rate": 6.162749292595693e-05, + "loss": 1.1369, + "step": 5354 + }, + { + "epoch": 1.4491949668515762, + "grad_norm": 0.19879041612148285, + "learning_rate": 6.159687168090259e-05, + "loss": 1.108, + "step": 5356 + }, + { + "epoch": 1.449736165606819, + "grad_norm": 0.24063818156719208, + "learning_rate": 6.156624583829753e-05, + "loss": 1.1098, + "step": 5358 + }, + { + "epoch": 1.450277364362062, + "grad_norm": 0.23444120585918427, + "learning_rate": 6.15356154102833e-05, + "loss": 1.1111, + "step": 5360 + }, + { + "epoch": 1.4508185631173047, + "grad_norm": 0.22161896526813507, + "learning_rate": 6.150498040900325e-05, + "loss": 1.112, + "step": 5362 + }, + { + "epoch": 1.4513597618725478, + "grad_norm": 0.2024424970149994, + "learning_rate": 6.147434084660253e-05, + "loss": 1.1162, + "step": 5364 + }, + { + "epoch": 1.4519009606277906, + "grad_norm": 0.20698332786560059, + "learning_rate": 6.144369673522813e-05, + "loss": 1.1197, + "step": 5366 + }, + { + "epoch": 1.4524421593830334, + "grad_norm": 0.1972983181476593, + "learning_rate": 6.141304808702886e-05, + "loss": 1.1378, + "step": 5368 + }, + { + "epoch": 1.4529833581382763, + "grad_norm": 0.20257173478603363, + "learning_rate": 6.138239491415525e-05, + "loss": 1.1276, + "step": 5370 + }, + { + "epoch": 1.4535245568935191, + "grad_norm": 0.2144201546907425, + "learning_rate": 6.135173722875972e-05, + "loss": 1.1237, + "step": 5372 + }, + { + "epoch": 1.454065755648762, + "grad_norm": 0.2175070196390152, + "learning_rate": 6.132107504299641e-05, + "loss": 1.1047, + "step": 5374 + }, + { + "epoch": 1.4546069544040048, + "grad_norm": 0.22028861939907074, + "learning_rate": 6.129040836902126e-05, + "loss": 1.1263, + "step": 5376 + }, + { + "epoch": 1.4551481531592478, + "grad_norm": 0.23484350740909576, + "learning_rate": 6.125973721899201e-05, + "loss": 1.1198, + "step": 5378 + }, + { + "epoch": 1.4556893519144907, + "grad_norm": 0.24591375887393951, + "learning_rate": 6.122906160506814e-05, + "loss": 1.127, + "step": 5380 + }, + { + "epoch": 1.4562305506697335, + "grad_norm": 0.22519347071647644, + "learning_rate": 6.119838153941095e-05, + "loss": 1.1405, + "step": 5382 + }, + { + "epoch": 1.4567717494249763, + "grad_norm": 0.2102644294500351, + "learning_rate": 6.116769703418347e-05, + "loss": 1.1273, + "step": 5384 + }, + { + "epoch": 1.4573129481802192, + "grad_norm": 9.886588096618652, + "learning_rate": 6.113700810155046e-05, + "loss": 1.1474, + "step": 5386 + }, + { + "epoch": 1.457854146935462, + "grad_norm": 0.24959422647953033, + "learning_rate": 6.110631475367852e-05, + "loss": 1.1304, + "step": 5388 + }, + { + "epoch": 1.4583953456907048, + "grad_norm": 0.2364051342010498, + "learning_rate": 6.107561700273592e-05, + "loss": 1.1316, + "step": 5390 + }, + { + "epoch": 1.458936544445948, + "grad_norm": 0.2226945012807846, + "learning_rate": 6.10449148608927e-05, + "loss": 1.1081, + "step": 5392 + }, + { + "epoch": 1.4594777432011905, + "grad_norm": 0.21343544125556946, + "learning_rate": 6.1014208340320665e-05, + "loss": 1.1128, + "step": 5394 + }, + { + "epoch": 1.4600189419564336, + "grad_norm": 0.21775399148464203, + "learning_rate": 6.098349745319334e-05, + "loss": 1.1365, + "step": 5396 + }, + { + "epoch": 1.4605601407116764, + "grad_norm": 0.21264183521270752, + "learning_rate": 6.0952782211685955e-05, + "loss": 1.1251, + "step": 5398 + }, + { + "epoch": 1.4611013394669192, + "grad_norm": 0.21287740767002106, + "learning_rate": 6.092206262797553e-05, + "loss": 1.1301, + "step": 5400 + }, + { + "epoch": 1.461642538222162, + "grad_norm": 0.212521493434906, + "learning_rate": 6.089133871424074e-05, + "loss": 1.1082, + "step": 5402 + }, + { + "epoch": 1.462183736977405, + "grad_norm": 0.21181194484233856, + "learning_rate": 6.0860610482662005e-05, + "loss": 1.0978, + "step": 5404 + }, + { + "epoch": 1.462724935732648, + "grad_norm": 0.21306376159191132, + "learning_rate": 6.0829877945421464e-05, + "loss": 1.1167, + "step": 5406 + }, + { + "epoch": 1.4632661344878906, + "grad_norm": 0.19244280457496643, + "learning_rate": 6.079914111470295e-05, + "loss": 1.1316, + "step": 5408 + }, + { + "epoch": 1.4638073332431336, + "grad_norm": 0.2017110288143158, + "learning_rate": 6.076840000269199e-05, + "loss": 1.1128, + "step": 5410 + }, + { + "epoch": 1.4643485319983764, + "grad_norm": 0.2144317775964737, + "learning_rate": 6.073765462157586e-05, + "loss": 1.1187, + "step": 5412 + }, + { + "epoch": 1.4648897307536193, + "grad_norm": 0.20555539429187775, + "learning_rate": 6.0706904983543444e-05, + "loss": 1.1234, + "step": 5414 + }, + { + "epoch": 1.4654309295088621, + "grad_norm": 0.2013130784034729, + "learning_rate": 6.0676151100785373e-05, + "loss": 1.1329, + "step": 5416 + }, + { + "epoch": 1.465972128264105, + "grad_norm": 0.19903089106082916, + "learning_rate": 6.0645392985493966e-05, + "loss": 1.1204, + "step": 5418 + }, + { + "epoch": 1.4665133270193478, + "grad_norm": 0.20048683881759644, + "learning_rate": 6.061463064986317e-05, + "loss": 1.1237, + "step": 5420 + }, + { + "epoch": 1.4670545257745906, + "grad_norm": 0.2066463679075241, + "learning_rate": 6.058386410608865e-05, + "loss": 1.1252, + "step": 5422 + }, + { + "epoch": 1.4675957245298337, + "grad_norm": 0.19743682444095612, + "learning_rate": 6.055309336636773e-05, + "loss": 1.1135, + "step": 5424 + }, + { + "epoch": 1.4681369232850765, + "grad_norm": 0.20476719737052917, + "learning_rate": 6.05223184428994e-05, + "loss": 1.1326, + "step": 5426 + }, + { + "epoch": 1.4686781220403193, + "grad_norm": 0.2180105745792389, + "learning_rate": 6.049153934788429e-05, + "loss": 1.1321, + "step": 5428 + }, + { + "epoch": 1.4692193207955622, + "grad_norm": 0.19696936011314392, + "learning_rate": 6.0460756093524684e-05, + "loss": 1.1273, + "step": 5430 + }, + { + "epoch": 1.469760519550805, + "grad_norm": 0.20372888445854187, + "learning_rate": 6.0429968692024544e-05, + "loss": 1.1293, + "step": 5432 + }, + { + "epoch": 1.4703017183060478, + "grad_norm": 0.20151478052139282, + "learning_rate": 6.039917715558945e-05, + "loss": 1.1156, + "step": 5434 + }, + { + "epoch": 1.4708429170612907, + "grad_norm": 0.20314039289951324, + "learning_rate": 6.036838149642664e-05, + "loss": 1.1231, + "step": 5436 + }, + { + "epoch": 1.4713841158165337, + "grad_norm": 0.24197211861610413, + "learning_rate": 6.033758172674495e-05, + "loss": 1.104, + "step": 5438 + }, + { + "epoch": 1.4719253145717766, + "grad_norm": 0.20811395347118378, + "learning_rate": 6.0306777858754915e-05, + "loss": 1.1323, + "step": 5440 + }, + { + "epoch": 1.4724665133270194, + "grad_norm": 0.1977129578590393, + "learning_rate": 6.0275969904668605e-05, + "loss": 1.1314, + "step": 5442 + }, + { + "epoch": 1.4730077120822622, + "grad_norm": 0.2016446590423584, + "learning_rate": 6.0245157876699774e-05, + "loss": 1.1146, + "step": 5444 + }, + { + "epoch": 1.473548910837505, + "grad_norm": 0.18964055180549622, + "learning_rate": 6.0214341787063776e-05, + "loss": 1.1309, + "step": 5446 + }, + { + "epoch": 1.4740901095927479, + "grad_norm": 0.2042587697505951, + "learning_rate": 6.018352164797759e-05, + "loss": 1.1211, + "step": 5448 + }, + { + "epoch": 1.4746313083479907, + "grad_norm": 0.19844447076320648, + "learning_rate": 6.015269747165975e-05, + "loss": 1.121, + "step": 5450 + }, + { + "epoch": 1.4751725071032338, + "grad_norm": 0.19678519666194916, + "learning_rate": 6.012186927033044e-05, + "loss": 1.1441, + "step": 5452 + }, + { + "epoch": 1.4757137058584764, + "grad_norm": 0.20729778707027435, + "learning_rate": 6.009103705621144e-05, + "loss": 1.1155, + "step": 5454 + }, + { + "epoch": 1.4762549046137194, + "grad_norm": 0.21526449918746948, + "learning_rate": 6.006020084152606e-05, + "loss": 1.1449, + "step": 5456 + }, + { + "epoch": 1.4767961033689623, + "grad_norm": 0.21945127844810486, + "learning_rate": 6.0029360638499286e-05, + "loss": 1.1031, + "step": 5458 + }, + { + "epoch": 1.477337302124205, + "grad_norm": 0.1969752162694931, + "learning_rate": 5.9998516459357604e-05, + "loss": 1.1191, + "step": 5460 + }, + { + "epoch": 1.477878500879448, + "grad_norm": 0.22696499526500702, + "learning_rate": 5.996766831632913e-05, + "loss": 1.1128, + "step": 5462 + }, + { + "epoch": 1.4784196996346908, + "grad_norm": 0.26032930612564087, + "learning_rate": 5.993681622164354e-05, + "loss": 1.1213, + "step": 5464 + }, + { + "epoch": 1.4789608983899338, + "grad_norm": 0.21460728347301483, + "learning_rate": 5.990596018753204e-05, + "loss": 1.1081, + "step": 5466 + }, + { + "epoch": 1.4795020971451764, + "grad_norm": 0.20702674984931946, + "learning_rate": 5.987510022622746e-05, + "loss": 1.1147, + "step": 5468 + }, + { + "epoch": 1.4800432959004195, + "grad_norm": 0.19713331758975983, + "learning_rate": 5.9844236349964134e-05, + "loss": 1.1385, + "step": 5470 + }, + { + "epoch": 1.4805844946556623, + "grad_norm": 0.20525701344013214, + "learning_rate": 5.981336857097799e-05, + "loss": 1.1286, + "step": 5472 + }, + { + "epoch": 1.4811256934109052, + "grad_norm": 0.19685855507850647, + "learning_rate": 5.9782496901506444e-05, + "loss": 1.1317, + "step": 5474 + }, + { + "epoch": 1.481666892166148, + "grad_norm": 0.20022651553153992, + "learning_rate": 5.9751621353788535e-05, + "loss": 1.1248, + "step": 5476 + }, + { + "epoch": 1.4822080909213908, + "grad_norm": 0.2085038274526596, + "learning_rate": 5.972074194006476e-05, + "loss": 1.1551, + "step": 5478 + }, + { + "epoch": 1.4827492896766337, + "grad_norm": 0.20097662508487701, + "learning_rate": 5.968985867257721e-05, + "loss": 1.1197, + "step": 5480 + }, + { + "epoch": 1.4832904884318765, + "grad_norm": 0.19229756295681, + "learning_rate": 5.965897156356949e-05, + "loss": 1.1258, + "step": 5482 + }, + { + "epoch": 1.4838316871871196, + "grad_norm": 0.2001403421163559, + "learning_rate": 5.9628080625286665e-05, + "loss": 1.1131, + "step": 5484 + }, + { + "epoch": 1.4843728859423624, + "grad_norm": 0.19318203628063202, + "learning_rate": 5.959718586997542e-05, + "loss": 1.118, + "step": 5486 + }, + { + "epoch": 1.4849140846976052, + "grad_norm": 0.19797928631305695, + "learning_rate": 5.95662873098839e-05, + "loss": 1.103, + "step": 5488 + }, + { + "epoch": 1.485455283452848, + "grad_norm": 0.20442748069763184, + "learning_rate": 5.953538495726172e-05, + "loss": 1.1066, + "step": 5490 + }, + { + "epoch": 1.4859964822080909, + "grad_norm": 0.19988074898719788, + "learning_rate": 5.9504478824360077e-05, + "loss": 1.1022, + "step": 5492 + }, + { + "epoch": 1.4865376809633337, + "grad_norm": 0.20216168463230133, + "learning_rate": 5.947356892343161e-05, + "loss": 1.1222, + "step": 5494 + }, + { + "epoch": 1.4870788797185766, + "grad_norm": 0.2014523148536682, + "learning_rate": 5.944265526673051e-05, + "loss": 1.1327, + "step": 5496 + }, + { + "epoch": 1.4876200784738196, + "grad_norm": 0.1946011334657669, + "learning_rate": 5.941173786651236e-05, + "loss": 1.0985, + "step": 5498 + }, + { + "epoch": 1.4881612772290624, + "grad_norm": 0.19211317598819733, + "learning_rate": 5.938081673503433e-05, + "loss": 1.1405, + "step": 5500 + }, + { + "epoch": 1.4887024759843053, + "grad_norm": 0.19912263751029968, + "learning_rate": 5.934989188455502e-05, + "loss": 1.1234, + "step": 5502 + }, + { + "epoch": 1.489243674739548, + "grad_norm": 0.2064303755760193, + "learning_rate": 5.931896332733451e-05, + "loss": 1.1317, + "step": 5504 + }, + { + "epoch": 1.489784873494791, + "grad_norm": 0.19061782956123352, + "learning_rate": 5.928803107563432e-05, + "loss": 1.1099, + "step": 5506 + }, + { + "epoch": 1.4903260722500338, + "grad_norm": 0.1989295333623886, + "learning_rate": 5.92570951417175e-05, + "loss": 1.1273, + "step": 5508 + }, + { + "epoch": 1.4908672710052766, + "grad_norm": 0.19522297382354736, + "learning_rate": 5.92261555378485e-05, + "loss": 1.1123, + "step": 5510 + }, + { + "epoch": 1.4914084697605197, + "grad_norm": 0.1995413452386856, + "learning_rate": 5.9195212276293255e-05, + "loss": 1.1218, + "step": 5512 + }, + { + "epoch": 1.4919496685157625, + "grad_norm": 0.1953144520521164, + "learning_rate": 5.916426536931915e-05, + "loss": 1.1164, + "step": 5514 + }, + { + "epoch": 1.4924908672710053, + "grad_norm": 0.20359961688518524, + "learning_rate": 5.9133314829195006e-05, + "loss": 1.1134, + "step": 5516 + }, + { + "epoch": 1.4930320660262482, + "grad_norm": 0.20576973259449005, + "learning_rate": 5.9102360668191084e-05, + "loss": 1.1195, + "step": 5518 + }, + { + "epoch": 1.493573264781491, + "grad_norm": 0.2180139273405075, + "learning_rate": 5.907140289857907e-05, + "loss": 1.1179, + "step": 5520 + }, + { + "epoch": 1.4941144635367338, + "grad_norm": 0.2019367218017578, + "learning_rate": 5.9040441532632115e-05, + "loss": 1.1057, + "step": 5522 + }, + { + "epoch": 1.4946556622919767, + "grad_norm": 0.19558557868003845, + "learning_rate": 5.900947658262477e-05, + "loss": 1.1164, + "step": 5524 + }, + { + "epoch": 1.4951968610472197, + "grad_norm": 0.19859646260738373, + "learning_rate": 5.897850806083302e-05, + "loss": 1.1267, + "step": 5526 + }, + { + "epoch": 1.4957380598024623, + "grad_norm": 0.19553080201148987, + "learning_rate": 5.8947535979534244e-05, + "loss": 1.1277, + "step": 5528 + }, + { + "epoch": 1.4962792585577054, + "grad_norm": 0.20781400799751282, + "learning_rate": 5.891656035100724e-05, + "loss": 1.1179, + "step": 5530 + }, + { + "epoch": 1.4968204573129482, + "grad_norm": 0.20255321264266968, + "learning_rate": 5.8885581187532246e-05, + "loss": 1.1243, + "step": 5532 + }, + { + "epoch": 1.497361656068191, + "grad_norm": 0.2056676149368286, + "learning_rate": 5.8854598501390845e-05, + "loss": 1.1329, + "step": 5534 + }, + { + "epoch": 1.4979028548234339, + "grad_norm": 0.20784781873226166, + "learning_rate": 5.8823612304866046e-05, + "loss": 1.1042, + "step": 5536 + }, + { + "epoch": 1.4984440535786767, + "grad_norm": 0.20506465435028076, + "learning_rate": 5.8792622610242275e-05, + "loss": 1.1157, + "step": 5538 + }, + { + "epoch": 1.4989852523339195, + "grad_norm": 0.21897608041763306, + "learning_rate": 5.8761629429805296e-05, + "loss": 1.1311, + "step": 5540 + }, + { + "epoch": 1.4995264510891624, + "grad_norm": 0.21256893873214722, + "learning_rate": 5.87306327758423e-05, + "loss": 1.1106, + "step": 5542 + }, + { + "epoch": 1.5000676498444054, + "grad_norm": 0.2042350023984909, + "learning_rate": 5.86996326606418e-05, + "loss": 1.1029, + "step": 5544 + }, + { + "epoch": 1.5006088485996483, + "grad_norm": 0.20099198818206787, + "learning_rate": 5.866862909649373e-05, + "loss": 1.1068, + "step": 5546 + }, + { + "epoch": 1.501150047354891, + "grad_norm": 0.20550420880317688, + "learning_rate": 5.863762209568938e-05, + "loss": 1.1256, + "step": 5548 + }, + { + "epoch": 1.501691246110134, + "grad_norm": 0.2070087343454361, + "learning_rate": 5.8606611670521404e-05, + "loss": 1.1324, + "step": 5550 + }, + { + "epoch": 1.5022324448653768, + "grad_norm": 0.2023104876279831, + "learning_rate": 5.8575597833283794e-05, + "loss": 1.1188, + "step": 5552 + }, + { + "epoch": 1.5027736436206198, + "grad_norm": 0.1989484578371048, + "learning_rate": 5.854458059627191e-05, + "loss": 1.1313, + "step": 5554 + }, + { + "epoch": 1.5033148423758624, + "grad_norm": 0.19165557622909546, + "learning_rate": 5.851355997178247e-05, + "loss": 1.1385, + "step": 5556 + }, + { + "epoch": 1.5038560411311055, + "grad_norm": 0.20374418795108795, + "learning_rate": 5.848253597211349e-05, + "loss": 1.1304, + "step": 5558 + }, + { + "epoch": 1.504397239886348, + "grad_norm": 0.19242756068706512, + "learning_rate": 5.845150860956441e-05, + "loss": 1.1146, + "step": 5560 + }, + { + "epoch": 1.5049384386415912, + "grad_norm": 0.19632889330387115, + "learning_rate": 5.84204778964359e-05, + "loss": 1.1212, + "step": 5562 + }, + { + "epoch": 1.505479637396834, + "grad_norm": 0.19135157763957977, + "learning_rate": 5.838944384503003e-05, + "loss": 1.1194, + "step": 5564 + }, + { + "epoch": 1.5060208361520768, + "grad_norm": 0.20354853570461273, + "learning_rate": 5.835840646765019e-05, + "loss": 1.1318, + "step": 5566 + }, + { + "epoch": 1.5065620349073197, + "grad_norm": 0.19074110686779022, + "learning_rate": 5.832736577660103e-05, + "loss": 1.1257, + "step": 5568 + }, + { + "epoch": 1.5071032336625625, + "grad_norm": 0.19348789751529694, + "learning_rate": 5.829632178418857e-05, + "loss": 1.0964, + "step": 5570 + }, + { + "epoch": 1.5076444324178055, + "grad_norm": 0.19624833762645721, + "learning_rate": 5.8265274502720134e-05, + "loss": 1.1124, + "step": 5572 + }, + { + "epoch": 1.5081856311730482, + "grad_norm": 0.1924959421157837, + "learning_rate": 5.823422394450434e-05, + "loss": 1.1188, + "step": 5574 + }, + { + "epoch": 1.5087268299282912, + "grad_norm": 0.2001020610332489, + "learning_rate": 5.820317012185108e-05, + "loss": 1.105, + "step": 5576 + }, + { + "epoch": 1.509268028683534, + "grad_norm": 0.1978660523891449, + "learning_rate": 5.817211304707161e-05, + "loss": 1.1073, + "step": 5578 + }, + { + "epoch": 1.5098092274387769, + "grad_norm": 0.1959565430879593, + "learning_rate": 5.8141052732478375e-05, + "loss": 1.12, + "step": 5580 + }, + { + "epoch": 1.5103504261940197, + "grad_norm": 0.1948666125535965, + "learning_rate": 5.81099891903852e-05, + "loss": 1.118, + "step": 5582 + }, + { + "epoch": 1.5108916249492625, + "grad_norm": 0.19219863414764404, + "learning_rate": 5.807892243310713e-05, + "loss": 1.0994, + "step": 5584 + }, + { + "epoch": 1.5114328237045056, + "grad_norm": 0.20111101865768433, + "learning_rate": 5.8047852472960496e-05, + "loss": 1.099, + "step": 5586 + }, + { + "epoch": 1.5119740224597482, + "grad_norm": 0.21085359156131744, + "learning_rate": 5.801677932226293e-05, + "loss": 1.1227, + "step": 5588 + }, + { + "epoch": 1.5125152212149913, + "grad_norm": 0.20131339132785797, + "learning_rate": 5.798570299333329e-05, + "loss": 1.1148, + "step": 5590 + }, + { + "epoch": 1.513056419970234, + "grad_norm": 0.20731684565544128, + "learning_rate": 5.79546234984917e-05, + "loss": 1.1049, + "step": 5592 + }, + { + "epoch": 1.513597618725477, + "grad_norm": 0.19507162272930145, + "learning_rate": 5.792354085005956e-05, + "loss": 1.1032, + "step": 5594 + }, + { + "epoch": 1.5141388174807198, + "grad_norm": 0.19368614256381989, + "learning_rate": 5.78924550603595e-05, + "loss": 1.1197, + "step": 5596 + }, + { + "epoch": 1.5146800162359626, + "grad_norm": 0.20111224055290222, + "learning_rate": 5.7861366141715424e-05, + "loss": 1.1129, + "step": 5598 + }, + { + "epoch": 1.5152212149912057, + "grad_norm": 0.1937616467475891, + "learning_rate": 5.783027410645242e-05, + "loss": 1.1078, + "step": 5600 + }, + { + "epoch": 1.5157624137464483, + "grad_norm": 0.19857732951641083, + "learning_rate": 5.7799178966896885e-05, + "loss": 1.1269, + "step": 5602 + }, + { + "epoch": 1.5163036125016913, + "grad_norm": 0.19797003269195557, + "learning_rate": 5.776808073537637e-05, + "loss": 1.115, + "step": 5604 + }, + { + "epoch": 1.5168448112569342, + "grad_norm": 0.1918354630470276, + "learning_rate": 5.773697942421974e-05, + "loss": 1.1062, + "step": 5606 + }, + { + "epoch": 1.517386010012177, + "grad_norm": 0.20058605074882507, + "learning_rate": 5.7705875045756995e-05, + "loss": 1.1242, + "step": 5608 + }, + { + "epoch": 1.5179272087674198, + "grad_norm": 0.20740623772144318, + "learning_rate": 5.76747676123194e-05, + "loss": 1.1234, + "step": 5610 + }, + { + "epoch": 1.5184684075226627, + "grad_norm": 0.20457324385643005, + "learning_rate": 5.7643657136239416e-05, + "loss": 1.1229, + "step": 5612 + }, + { + "epoch": 1.5190096062779057, + "grad_norm": 0.2054700404405594, + "learning_rate": 5.76125436298507e-05, + "loss": 1.1146, + "step": 5614 + }, + { + "epoch": 1.5195508050331483, + "grad_norm": 0.1983955353498459, + "learning_rate": 5.758142710548816e-05, + "loss": 1.1406, + "step": 5616 + }, + { + "epoch": 1.5200920037883914, + "grad_norm": 0.20349404215812683, + "learning_rate": 5.755030757548784e-05, + "loss": 1.122, + "step": 5618 + }, + { + "epoch": 1.520633202543634, + "grad_norm": 0.2126217931509018, + "learning_rate": 5.751918505218698e-05, + "loss": 1.1192, + "step": 5620 + }, + { + "epoch": 1.521174401298877, + "grad_norm": 0.20240618288516998, + "learning_rate": 5.748805954792407e-05, + "loss": 1.128, + "step": 5622 + }, + { + "epoch": 1.5217156000541199, + "grad_norm": 0.19409391283988953, + "learning_rate": 5.74569310750387e-05, + "loss": 1.116, + "step": 5624 + }, + { + "epoch": 1.5222567988093627, + "grad_norm": 0.19456911087036133, + "learning_rate": 5.74257996458717e-05, + "loss": 1.1168, + "step": 5626 + }, + { + "epoch": 1.5227979975646058, + "grad_norm": 0.20936742424964905, + "learning_rate": 5.7394665272765045e-05, + "loss": 1.1283, + "step": 5628 + }, + { + "epoch": 1.5233391963198484, + "grad_norm": 0.19374299049377441, + "learning_rate": 5.736352796806187e-05, + "loss": 1.1181, + "step": 5630 + }, + { + "epoch": 1.5238803950750914, + "grad_norm": 0.2012673169374466, + "learning_rate": 5.7332387744106475e-05, + "loss": 1.1186, + "step": 5632 + }, + { + "epoch": 1.524421593830334, + "grad_norm": 0.19683314859867096, + "learning_rate": 5.730124461324433e-05, + "loss": 1.1133, + "step": 5634 + }, + { + "epoch": 1.524962792585577, + "grad_norm": 0.20046451687812805, + "learning_rate": 5.7270098587822075e-05, + "loss": 1.0925, + "step": 5636 + }, + { + "epoch": 1.52550399134082, + "grad_norm": 0.20211833715438843, + "learning_rate": 5.723894968018744e-05, + "loss": 1.1286, + "step": 5638 + }, + { + "epoch": 1.5260451900960628, + "grad_norm": 0.19466207921504974, + "learning_rate": 5.7207797902689344e-05, + "loss": 1.1178, + "step": 5640 + }, + { + "epoch": 1.5265863888513056, + "grad_norm": 0.19484566152095795, + "learning_rate": 5.717664326767783e-05, + "loss": 1.1182, + "step": 5642 + }, + { + "epoch": 1.5271275876065484, + "grad_norm": 0.20028981566429138, + "learning_rate": 5.714548578750407e-05, + "loss": 1.1245, + "step": 5644 + }, + { + "epoch": 1.5276687863617915, + "grad_norm": 0.21267130970954895, + "learning_rate": 5.711432547452038e-05, + "loss": 1.1072, + "step": 5646 + }, + { + "epoch": 1.528209985117034, + "grad_norm": 0.20393440127372742, + "learning_rate": 5.708316234108019e-05, + "loss": 1.1179, + "step": 5648 + }, + { + "epoch": 1.5287511838722772, + "grad_norm": 0.19241678714752197, + "learning_rate": 5.705199639953802e-05, + "loss": 1.1438, + "step": 5650 + }, + { + "epoch": 1.52929238262752, + "grad_norm": 0.1952701359987259, + "learning_rate": 5.702082766224957e-05, + "loss": 1.0999, + "step": 5652 + }, + { + "epoch": 1.5298335813827628, + "grad_norm": 0.1929616928100586, + "learning_rate": 5.698965614157157e-05, + "loss": 1.1093, + "step": 5654 + }, + { + "epoch": 1.5303747801380057, + "grad_norm": 0.19738009572029114, + "learning_rate": 5.6958481849861924e-05, + "loss": 1.1225, + "step": 5656 + }, + { + "epoch": 1.5309159788932485, + "grad_norm": 0.21183903515338898, + "learning_rate": 5.6927304799479586e-05, + "loss": 1.1238, + "step": 5658 + }, + { + "epoch": 1.5314571776484915, + "grad_norm": 0.19697804749011993, + "learning_rate": 5.6896125002784605e-05, + "loss": 1.1164, + "step": 5660 + }, + { + "epoch": 1.5319983764037342, + "grad_norm": 0.2036931812763214, + "learning_rate": 5.6864942472138164e-05, + "loss": 1.1144, + "step": 5662 + }, + { + "epoch": 1.5325395751589772, + "grad_norm": 0.1969900131225586, + "learning_rate": 5.683375721990247e-05, + "loss": 1.1071, + "step": 5664 + }, + { + "epoch": 1.53308077391422, + "grad_norm": 0.21038979291915894, + "learning_rate": 5.680256925844085e-05, + "loss": 1.1431, + "step": 5666 + }, + { + "epoch": 1.5336219726694629, + "grad_norm": 0.20183879137039185, + "learning_rate": 5.6771378600117696e-05, + "loss": 1.1036, + "step": 5668 + }, + { + "epoch": 1.5341631714247057, + "grad_norm": 0.1987054944038391, + "learning_rate": 5.674018525729847e-05, + "loss": 1.1207, + "step": 5670 + }, + { + "epoch": 1.5347043701799485, + "grad_norm": 0.19822414219379425, + "learning_rate": 5.670898924234968e-05, + "loss": 1.1396, + "step": 5672 + }, + { + "epoch": 1.5352455689351916, + "grad_norm": 0.21729126572608948, + "learning_rate": 5.6677790567638913e-05, + "loss": 1.1022, + "step": 5674 + }, + { + "epoch": 1.5357867676904342, + "grad_norm": 0.209171324968338, + "learning_rate": 5.664658924553482e-05, + "loss": 1.1315, + "step": 5676 + }, + { + "epoch": 1.5363279664456773, + "grad_norm": 0.19557133316993713, + "learning_rate": 5.661538528840706e-05, + "loss": 1.1161, + "step": 5678 + }, + { + "epoch": 1.5368691652009199, + "grad_norm": 0.19774852693080902, + "learning_rate": 5.65841787086264e-05, + "loss": 1.111, + "step": 5680 + }, + { + "epoch": 1.537410363956163, + "grad_norm": 0.19714070856571198, + "learning_rate": 5.655296951856459e-05, + "loss": 1.1402, + "step": 5682 + }, + { + "epoch": 1.5379515627114058, + "grad_norm": 0.20607705414295197, + "learning_rate": 5.6521757730594425e-05, + "loss": 1.1228, + "step": 5684 + }, + { + "epoch": 1.5384927614666486, + "grad_norm": 0.19225092232227325, + "learning_rate": 5.649054335708975e-05, + "loss": 1.1217, + "step": 5686 + }, + { + "epoch": 1.5390339602218917, + "grad_norm": 0.19334369897842407, + "learning_rate": 5.645932641042544e-05, + "loss": 1.107, + "step": 5688 + }, + { + "epoch": 1.5395751589771343, + "grad_norm": 0.1904376894235611, + "learning_rate": 5.642810690297734e-05, + "loss": 1.1095, + "step": 5690 + }, + { + "epoch": 1.5401163577323773, + "grad_norm": 0.19064228236675262, + "learning_rate": 5.639688484712238e-05, + "loss": 1.109, + "step": 5692 + }, + { + "epoch": 1.54065755648762, + "grad_norm": 0.19435817003250122, + "learning_rate": 5.636566025523844e-05, + "loss": 1.1089, + "step": 5694 + }, + { + "epoch": 1.541198755242863, + "grad_norm": 0.2055196613073349, + "learning_rate": 5.6334433139704455e-05, + "loss": 1.1357, + "step": 5696 + }, + { + "epoch": 1.5417399539981058, + "grad_norm": 0.20478758215904236, + "learning_rate": 5.630320351290032e-05, + "loss": 1.1057, + "step": 5698 + }, + { + "epoch": 1.5422811527533486, + "grad_norm": 0.1930474191904068, + "learning_rate": 5.627197138720694e-05, + "loss": 1.1148, + "step": 5700 + }, + { + "epoch": 1.5428223515085915, + "grad_norm": 0.19690391421318054, + "learning_rate": 5.624073677500622e-05, + "loss": 1.1171, + "step": 5702 + }, + { + "epoch": 1.5433635502638343, + "grad_norm": 0.20076878368854523, + "learning_rate": 5.6209499688681054e-05, + "loss": 1.1039, + "step": 5704 + }, + { + "epoch": 1.5439047490190774, + "grad_norm": 0.21312648057937622, + "learning_rate": 5.617826014061529e-05, + "loss": 1.1015, + "step": 5706 + }, + { + "epoch": 1.54444594777432, + "grad_norm": 0.21611233055591583, + "learning_rate": 5.6147018143193785e-05, + "loss": 1.1231, + "step": 5708 + }, + { + "epoch": 1.544987146529563, + "grad_norm": 0.19640076160430908, + "learning_rate": 5.611577370880234e-05, + "loss": 1.1065, + "step": 5710 + }, + { + "epoch": 1.5455283452848059, + "grad_norm": 0.20446118712425232, + "learning_rate": 5.608452684982772e-05, + "loss": 1.1099, + "step": 5712 + }, + { + "epoch": 1.5460695440400487, + "grad_norm": 0.20652319490909576, + "learning_rate": 5.605327757865769e-05, + "loss": 1.1044, + "step": 5714 + }, + { + "epoch": 1.5466107427952915, + "grad_norm": 0.2292853742837906, + "learning_rate": 5.602202590768094e-05, + "loss": 1.1074, + "step": 5716 + }, + { + "epoch": 1.5471519415505344, + "grad_norm": 0.19580331444740295, + "learning_rate": 5.599077184928709e-05, + "loss": 1.1028, + "step": 5718 + }, + { + "epoch": 1.5476931403057774, + "grad_norm": 0.19520023465156555, + "learning_rate": 5.5959515415866756e-05, + "loss": 1.1035, + "step": 5720 + }, + { + "epoch": 1.54823433906102, + "grad_norm": 0.21405938267707825, + "learning_rate": 5.592825661981148e-05, + "loss": 1.1118, + "step": 5722 + }, + { + "epoch": 1.548775537816263, + "grad_norm": 0.19495871663093567, + "learning_rate": 5.5896995473513715e-05, + "loss": 1.11, + "step": 5724 + }, + { + "epoch": 1.549316736571506, + "grad_norm": 0.19302654266357422, + "learning_rate": 5.586573198936687e-05, + "loss": 1.1142, + "step": 5726 + }, + { + "epoch": 1.5498579353267488, + "grad_norm": 0.19378650188446045, + "learning_rate": 5.5834466179765276e-05, + "loss": 1.1092, + "step": 5728 + }, + { + "epoch": 1.5503991340819916, + "grad_norm": 0.1954188495874405, + "learning_rate": 5.580319805710418e-05, + "loss": 1.1196, + "step": 5730 + }, + { + "epoch": 1.5509403328372344, + "grad_norm": 0.1967216432094574, + "learning_rate": 5.577192763377977e-05, + "loss": 1.1144, + "step": 5732 + }, + { + "epoch": 1.5514815315924775, + "grad_norm": 0.19551461935043335, + "learning_rate": 5.5740654922189104e-05, + "loss": 1.0973, + "step": 5734 + }, + { + "epoch": 1.55202273034772, + "grad_norm": 0.19180738925933838, + "learning_rate": 5.5709379934730186e-05, + "loss": 1.1226, + "step": 5736 + }, + { + "epoch": 1.5525639291029631, + "grad_norm": 0.20024901628494263, + "learning_rate": 5.56781026838019e-05, + "loss": 1.1271, + "step": 5738 + }, + { + "epoch": 1.553105127858206, + "grad_norm": 0.20005659759044647, + "learning_rate": 5.564682318180402e-05, + "loss": 1.119, + "step": 5740 + }, + { + "epoch": 1.5536463266134488, + "grad_norm": 0.197408989071846, + "learning_rate": 5.5615541441137254e-05, + "loss": 1.1046, + "step": 5742 + }, + { + "epoch": 1.5541875253686916, + "grad_norm": 0.19918306171894073, + "learning_rate": 5.558425747420316e-05, + "loss": 1.141, + "step": 5744 + }, + { + "epoch": 1.5547287241239345, + "grad_norm": 0.19122545421123505, + "learning_rate": 5.5552971293404165e-05, + "loss": 1.1088, + "step": 5746 + }, + { + "epoch": 1.5552699228791775, + "grad_norm": 0.19816403090953827, + "learning_rate": 5.5521682911143636e-05, + "loss": 1.0977, + "step": 5748 + }, + { + "epoch": 1.5558111216344201, + "grad_norm": 0.2009589970111847, + "learning_rate": 5.5490392339825734e-05, + "loss": 1.0943, + "step": 5750 + }, + { + "epoch": 1.5563523203896632, + "grad_norm": 0.19680792093276978, + "learning_rate": 5.545909959185555e-05, + "loss": 1.1332, + "step": 5752 + }, + { + "epoch": 1.5568935191449058, + "grad_norm": 0.20506925880908966, + "learning_rate": 5.542780467963902e-05, + "loss": 1.123, + "step": 5754 + }, + { + "epoch": 1.5574347179001489, + "grad_norm": 0.1940237581729889, + "learning_rate": 5.5396507615582926e-05, + "loss": 1.0973, + "step": 5756 + }, + { + "epoch": 1.5579759166553917, + "grad_norm": 0.21322675049304962, + "learning_rate": 5.5365208412094903e-05, + "loss": 1.1264, + "step": 5758 + }, + { + "epoch": 1.5585171154106345, + "grad_norm": 0.2190679907798767, + "learning_rate": 5.533390708158346e-05, + "loss": 1.1048, + "step": 5760 + }, + { + "epoch": 1.5590583141658774, + "grad_norm": 0.2050085961818695, + "learning_rate": 5.530260363645792e-05, + "loss": 1.116, + "step": 5762 + }, + { + "epoch": 1.5595995129211202, + "grad_norm": 0.20305952429771423, + "learning_rate": 5.5271298089128454e-05, + "loss": 1.1054, + "step": 5764 + }, + { + "epoch": 1.5601407116763633, + "grad_norm": 0.19230793416500092, + "learning_rate": 5.523999045200606e-05, + "loss": 1.1285, + "step": 5766 + }, + { + "epoch": 1.5606819104316059, + "grad_norm": 0.20029379427433014, + "learning_rate": 5.5208680737502606e-05, + "loss": 1.088, + "step": 5768 + }, + { + "epoch": 1.561223109186849, + "grad_norm": 0.20273414254188538, + "learning_rate": 5.5177368958030696e-05, + "loss": 1.0858, + "step": 5770 + }, + { + "epoch": 1.5617643079420918, + "grad_norm": 0.2130161076784134, + "learning_rate": 5.5146055126003856e-05, + "loss": 1.1016, + "step": 5772 + }, + { + "epoch": 1.5623055066973346, + "grad_norm": 0.19795767962932587, + "learning_rate": 5.511473925383634e-05, + "loss": 1.1269, + "step": 5774 + }, + { + "epoch": 1.5628467054525774, + "grad_norm": 0.19658271968364716, + "learning_rate": 5.508342135394329e-05, + "loss": 1.1044, + "step": 5776 + }, + { + "epoch": 1.5633879042078203, + "grad_norm": 0.20559827983379364, + "learning_rate": 5.505210143874055e-05, + "loss": 1.1189, + "step": 5778 + }, + { + "epoch": 1.5639291029630633, + "grad_norm": 0.2187706083059311, + "learning_rate": 5.502077952064487e-05, + "loss": 1.1166, + "step": 5780 + }, + { + "epoch": 1.564470301718306, + "grad_norm": 0.26281484961509705, + "learning_rate": 5.4989455612073736e-05, + "loss": 1.0923, + "step": 5782 + }, + { + "epoch": 1.565011500473549, + "grad_norm": 0.21491557359695435, + "learning_rate": 5.495812972544543e-05, + "loss": 1.0851, + "step": 5784 + }, + { + "epoch": 1.5655526992287918, + "grad_norm": 0.240944042801857, + "learning_rate": 5.4926801873179005e-05, + "loss": 1.1075, + "step": 5786 + }, + { + "epoch": 1.5660938979840346, + "grad_norm": 0.21036581695079803, + "learning_rate": 5.489547206769433e-05, + "loss": 1.0999, + "step": 5788 + }, + { + "epoch": 1.5666350967392775, + "grad_norm": 0.252795934677124, + "learning_rate": 5.486414032141203e-05, + "loss": 1.1083, + "step": 5790 + }, + { + "epoch": 1.5671762954945203, + "grad_norm": 0.26431161165237427, + "learning_rate": 5.483280664675347e-05, + "loss": 1.1181, + "step": 5792 + }, + { + "epoch": 1.5677174942497634, + "grad_norm": 0.3089235723018646, + "learning_rate": 5.480147105614084e-05, + "loss": 1.1054, + "step": 5794 + }, + { + "epoch": 1.568258693005006, + "grad_norm": 0.23798713088035583, + "learning_rate": 5.477013356199704e-05, + "loss": 1.1282, + "step": 5796 + }, + { + "epoch": 1.568799891760249, + "grad_norm": 0.25433656573295593, + "learning_rate": 5.473879417674573e-05, + "loss": 1.0991, + "step": 5798 + }, + { + "epoch": 1.5693410905154919, + "grad_norm": 0.21360009908676147, + "learning_rate": 5.470745291281135e-05, + "loss": 1.1239, + "step": 5800 + }, + { + "epoch": 1.5698822892707347, + "grad_norm": 0.19481489062309265, + "learning_rate": 5.4676109782619054e-05, + "loss": 1.1135, + "step": 5802 + }, + { + "epoch": 1.5704234880259775, + "grad_norm": 0.21150141954421997, + "learning_rate": 5.464476479859476e-05, + "loss": 1.0996, + "step": 5804 + }, + { + "epoch": 1.5709646867812204, + "grad_norm": 0.20039090514183044, + "learning_rate": 5.4613417973165106e-05, + "loss": 1.1222, + "step": 5806 + }, + { + "epoch": 1.5715058855364634, + "grad_norm": 0.2008035033941269, + "learning_rate": 5.4582069318757454e-05, + "loss": 1.0975, + "step": 5808 + }, + { + "epoch": 1.572047084291706, + "grad_norm": 0.20237408578395844, + "learning_rate": 5.45507188477999e-05, + "loss": 1.0975, + "step": 5810 + }, + { + "epoch": 1.572588283046949, + "grad_norm": 0.197785422205925, + "learning_rate": 5.451936657272125e-05, + "loss": 1.1312, + "step": 5812 + }, + { + "epoch": 1.5731294818021917, + "grad_norm": 0.1958705335855484, + "learning_rate": 5.448801250595106e-05, + "loss": 1.1196, + "step": 5814 + }, + { + "epoch": 1.5736706805574348, + "grad_norm": 0.20273469388484955, + "learning_rate": 5.445665665991957e-05, + "loss": 1.114, + "step": 5816 + }, + { + "epoch": 1.5742118793126776, + "grad_norm": 0.19480599462985992, + "learning_rate": 5.44252990470577e-05, + "loss": 1.1044, + "step": 5818 + }, + { + "epoch": 1.5747530780679204, + "grad_norm": 0.1951165795326233, + "learning_rate": 5.4393939679797115e-05, + "loss": 1.1124, + "step": 5820 + }, + { + "epoch": 1.5752942768231633, + "grad_norm": 0.20022697746753693, + "learning_rate": 5.4362578570570155e-05, + "loss": 1.1039, + "step": 5822 + }, + { + "epoch": 1.575835475578406, + "grad_norm": 0.1950424760580063, + "learning_rate": 5.433121573180985e-05, + "loss": 1.1334, + "step": 5824 + }, + { + "epoch": 1.5763766743336491, + "grad_norm": 0.20176029205322266, + "learning_rate": 5.429985117594992e-05, + "loss": 1.1147, + "step": 5826 + }, + { + "epoch": 1.5769178730888918, + "grad_norm": 0.20517152547836304, + "learning_rate": 5.4268484915424766e-05, + "loss": 1.1169, + "step": 5828 + }, + { + "epoch": 1.5774590718441348, + "grad_norm": 0.19474323093891144, + "learning_rate": 5.4237116962669465e-05, + "loss": 1.1063, + "step": 5830 + }, + { + "epoch": 1.5780002705993776, + "grad_norm": 0.20473884046077728, + "learning_rate": 5.420574733011976e-05, + "loss": 1.1203, + "step": 5832 + }, + { + "epoch": 1.5785414693546205, + "grad_norm": 0.1987096667289734, + "learning_rate": 5.417437603021207e-05, + "loss": 1.0905, + "step": 5834 + }, + { + "epoch": 1.5790826681098633, + "grad_norm": 0.20204855501651764, + "learning_rate": 5.414300307538347e-05, + "loss": 1.111, + "step": 5836 + }, + { + "epoch": 1.5796238668651061, + "grad_norm": 0.202619269490242, + "learning_rate": 5.4111628478071666e-05, + "loss": 1.1168, + "step": 5838 + }, + { + "epoch": 1.5801650656203492, + "grad_norm": 0.1962268203496933, + "learning_rate": 5.408025225071507e-05, + "loss": 1.0939, + "step": 5840 + }, + { + "epoch": 1.5807062643755918, + "grad_norm": 0.1889873594045639, + "learning_rate": 5.4048874405752694e-05, + "loss": 1.1008, + "step": 5842 + }, + { + "epoch": 1.5812474631308349, + "grad_norm": 0.19847267866134644, + "learning_rate": 5.401749495562419e-05, + "loss": 1.0867, + "step": 5844 + }, + { + "epoch": 1.5817886618860777, + "grad_norm": 0.20052090287208557, + "learning_rate": 5.3986113912769884e-05, + "loss": 1.1224, + "step": 5846 + }, + { + "epoch": 1.5823298606413205, + "grad_norm": 0.20266535878181458, + "learning_rate": 5.3954731289630724e-05, + "loss": 1.1078, + "step": 5848 + }, + { + "epoch": 1.5828710593965634, + "grad_norm": 0.19795338809490204, + "learning_rate": 5.392334709864822e-05, + "loss": 1.1194, + "step": 5850 + }, + { + "epoch": 1.5834122581518062, + "grad_norm": 0.19428643584251404, + "learning_rate": 5.389196135226461e-05, + "loss": 1.1155, + "step": 5852 + }, + { + "epoch": 1.5839534569070493, + "grad_norm": 0.1965980976819992, + "learning_rate": 5.3860574062922675e-05, + "loss": 1.1066, + "step": 5854 + }, + { + "epoch": 1.5844946556622919, + "grad_norm": 0.19217641651630402, + "learning_rate": 5.3829185243065816e-05, + "loss": 1.1186, + "step": 5856 + }, + { + "epoch": 1.585035854417535, + "grad_norm": 0.20498093962669373, + "learning_rate": 5.379779490513808e-05, + "loss": 1.1073, + "step": 5858 + }, + { + "epoch": 1.5855770531727778, + "grad_norm": 0.21169058978557587, + "learning_rate": 5.376640306158406e-05, + "loss": 1.1193, + "step": 5860 + }, + { + "epoch": 1.5861182519280206, + "grad_norm": 0.2154383659362793, + "learning_rate": 5.3735009724849e-05, + "loss": 1.1317, + "step": 5862 + }, + { + "epoch": 1.5866594506832634, + "grad_norm": 0.2106177806854248, + "learning_rate": 5.3703614907378675e-05, + "loss": 1.0965, + "step": 5864 + }, + { + "epoch": 1.5872006494385063, + "grad_norm": 0.20630276203155518, + "learning_rate": 5.3672218621619496e-05, + "loss": 1.0948, + "step": 5866 + }, + { + "epoch": 1.5877418481937493, + "grad_norm": 0.21020294725894928, + "learning_rate": 5.3640820880018475e-05, + "loss": 1.0984, + "step": 5868 + }, + { + "epoch": 1.588283046948992, + "grad_norm": 0.2151305079460144, + "learning_rate": 5.360942169502312e-05, + "loss": 1.1296, + "step": 5870 + }, + { + "epoch": 1.588824245704235, + "grad_norm": 0.21035653352737427, + "learning_rate": 5.3578021079081576e-05, + "loss": 1.1061, + "step": 5872 + }, + { + "epoch": 1.5893654444594776, + "grad_norm": 0.20419679582118988, + "learning_rate": 5.3546619044642545e-05, + "loss": 1.122, + "step": 5874 + }, + { + "epoch": 1.5899066432147206, + "grad_norm": 0.22793719172477722, + "learning_rate": 5.3515215604155275e-05, + "loss": 1.1051, + "step": 5876 + }, + { + "epoch": 1.5904478419699635, + "grad_norm": 0.2197612076997757, + "learning_rate": 5.34838107700696e-05, + "loss": 1.1173, + "step": 5878 + }, + { + "epoch": 1.5909890407252063, + "grad_norm": 0.20234552025794983, + "learning_rate": 5.345240455483587e-05, + "loss": 1.1125, + "step": 5880 + }, + { + "epoch": 1.5915302394804494, + "grad_norm": 0.201350137591362, + "learning_rate": 5.3420996970905e-05, + "loss": 1.1121, + "step": 5882 + }, + { + "epoch": 1.592071438235692, + "grad_norm": 0.1958758533000946, + "learning_rate": 5.338958803072845e-05, + "loss": 1.0994, + "step": 5884 + }, + { + "epoch": 1.592612636990935, + "grad_norm": 0.2031422108411789, + "learning_rate": 5.335817774675824e-05, + "loss": 1.1088, + "step": 5886 + }, + { + "epoch": 1.5931538357461776, + "grad_norm": 0.24101684987545013, + "learning_rate": 5.332676613144687e-05, + "loss": 1.1216, + "step": 5888 + }, + { + "epoch": 1.5936950345014207, + "grad_norm": 0.21759966015815735, + "learning_rate": 5.3295353197247386e-05, + "loss": 1.0963, + "step": 5890 + }, + { + "epoch": 1.5942362332566635, + "grad_norm": 0.22368492186069489, + "learning_rate": 5.3263938956613404e-05, + "loss": 1.1262, + "step": 5892 + }, + { + "epoch": 1.5947774320119064, + "grad_norm": 0.22949428856372833, + "learning_rate": 5.323252342199898e-05, + "loss": 1.106, + "step": 5894 + }, + { + "epoch": 1.5953186307671492, + "grad_norm": 0.20724032819271088, + "learning_rate": 5.320110660585877e-05, + "loss": 1.1051, + "step": 5896 + }, + { + "epoch": 1.595859829522392, + "grad_norm": 0.20730967819690704, + "learning_rate": 5.316968852064786e-05, + "loss": 1.1136, + "step": 5898 + }, + { + "epoch": 1.596401028277635, + "grad_norm": 0.20740945637226105, + "learning_rate": 5.313826917882188e-05, + "loss": 1.0943, + "step": 5900 + }, + { + "epoch": 1.5969422270328777, + "grad_norm": 0.2070166915655136, + "learning_rate": 5.3106848592836946e-05, + "loss": 1.1056, + "step": 5902 + }, + { + "epoch": 1.5974834257881207, + "grad_norm": 0.21423617005348206, + "learning_rate": 5.307542677514966e-05, + "loss": 1.0848, + "step": 5904 + }, + { + "epoch": 1.5980246245433636, + "grad_norm": 0.24238221347332, + "learning_rate": 5.304400373821714e-05, + "loss": 1.1147, + "step": 5906 + }, + { + "epoch": 1.5985658232986064, + "grad_norm": 0.20399025082588196, + "learning_rate": 5.3012579494496985e-05, + "loss": 1.0912, + "step": 5908 + }, + { + "epoch": 1.5991070220538492, + "grad_norm": 0.19967523217201233, + "learning_rate": 5.2981154056447225e-05, + "loss": 1.1107, + "step": 5910 + }, + { + "epoch": 1.599648220809092, + "grad_norm": 0.21315614879131317, + "learning_rate": 5.294972743652641e-05, + "loss": 1.1142, + "step": 5912 + }, + { + "epoch": 1.6001894195643351, + "grad_norm": 0.19814157485961914, + "learning_rate": 5.291829964719355e-05, + "loss": 1.1295, + "step": 5914 + }, + { + "epoch": 1.6007306183195777, + "grad_norm": 0.20252948999404907, + "learning_rate": 5.2886870700908097e-05, + "loss": 1.1028, + "step": 5916 + }, + { + "epoch": 1.6012718170748208, + "grad_norm": 0.20225170254707336, + "learning_rate": 5.2855440610129994e-05, + "loss": 1.1217, + "step": 5918 + }, + { + "epoch": 1.6018130158300636, + "grad_norm": 0.199290931224823, + "learning_rate": 5.282400938731962e-05, + "loss": 1.1344, + "step": 5920 + }, + { + "epoch": 1.6023542145853065, + "grad_norm": 0.19729246199131012, + "learning_rate": 5.2792577044937805e-05, + "loss": 1.1157, + "step": 5922 + }, + { + "epoch": 1.6028954133405493, + "grad_norm": 0.19747509062290192, + "learning_rate": 5.276114359544581e-05, + "loss": 1.1131, + "step": 5924 + }, + { + "epoch": 1.6034366120957921, + "grad_norm": 0.2086389809846878, + "learning_rate": 5.272970905130536e-05, + "loss": 1.1209, + "step": 5926 + }, + { + "epoch": 1.6039778108510352, + "grad_norm": 0.19091291725635529, + "learning_rate": 5.2698273424978584e-05, + "loss": 1.1144, + "step": 5928 + }, + { + "epoch": 1.6045190096062778, + "grad_norm": 0.20486097037792206, + "learning_rate": 5.266683672892807e-05, + "loss": 1.0971, + "step": 5930 + }, + { + "epoch": 1.6050602083615209, + "grad_norm": 0.1944921612739563, + "learning_rate": 5.263539897561683e-05, + "loss": 1.1222, + "step": 5932 + }, + { + "epoch": 1.6056014071167635, + "grad_norm": 0.20008689165115356, + "learning_rate": 5.260396017750826e-05, + "loss": 1.1147, + "step": 5934 + }, + { + "epoch": 1.6061426058720065, + "grad_norm": 0.19609950482845306, + "learning_rate": 5.257252034706622e-05, + "loss": 1.0801, + "step": 5936 + }, + { + "epoch": 1.6066838046272494, + "grad_norm": 0.20734445750713348, + "learning_rate": 5.2541079496754933e-05, + "loss": 1.1051, + "step": 5938 + }, + { + "epoch": 1.6072250033824922, + "grad_norm": 0.20087632536888123, + "learning_rate": 5.2509637639039024e-05, + "loss": 1.0988, + "step": 5940 + }, + { + "epoch": 1.6077662021377352, + "grad_norm": 0.2000436931848526, + "learning_rate": 5.2478194786383584e-05, + "loss": 1.1011, + "step": 5942 + }, + { + "epoch": 1.6083074008929779, + "grad_norm": 0.20579595863819122, + "learning_rate": 5.244675095125403e-05, + "loss": 1.1024, + "step": 5944 + }, + { + "epoch": 1.608848599648221, + "grad_norm": 0.21479883790016174, + "learning_rate": 5.241530614611618e-05, + "loss": 1.1171, + "step": 5946 + }, + { + "epoch": 1.6093897984034635, + "grad_norm": 0.2454638034105301, + "learning_rate": 5.2383860383436266e-05, + "loss": 1.0975, + "step": 5948 + }, + { + "epoch": 1.6099309971587066, + "grad_norm": 0.2089093029499054, + "learning_rate": 5.235241367568089e-05, + "loss": 1.1091, + "step": 5950 + }, + { + "epoch": 1.6104721959139494, + "grad_norm": 0.23135189712047577, + "learning_rate": 5.232096603531699e-05, + "loss": 1.1249, + "step": 5952 + }, + { + "epoch": 1.6110133946691922, + "grad_norm": 0.24319973587989807, + "learning_rate": 5.228951747481191e-05, + "loss": 1.123, + "step": 5954 + }, + { + "epoch": 1.611554593424435, + "grad_norm": 0.22288009524345398, + "learning_rate": 5.2258068006633385e-05, + "loss": 1.1062, + "step": 5956 + }, + { + "epoch": 1.612095792179678, + "grad_norm": 0.2177572399377823, + "learning_rate": 5.222661764324946e-05, + "loss": 1.0975, + "step": 5958 + }, + { + "epoch": 1.612636990934921, + "grad_norm": 0.20513436198234558, + "learning_rate": 5.2195166397128536e-05, + "loss": 1.0995, + "step": 5960 + }, + { + "epoch": 1.6131781896901636, + "grad_norm": 0.2116641104221344, + "learning_rate": 5.216371428073941e-05, + "loss": 1.0973, + "step": 5962 + }, + { + "epoch": 1.6137193884454066, + "grad_norm": 0.206109419465065, + "learning_rate": 5.213226130655117e-05, + "loss": 1.117, + "step": 5964 + }, + { + "epoch": 1.6142605872006495, + "grad_norm": 0.2107187956571579, + "learning_rate": 5.210080748703328e-05, + "loss": 1.1254, + "step": 5966 + }, + { + "epoch": 1.6148017859558923, + "grad_norm": 0.22307105362415314, + "learning_rate": 5.206935283465553e-05, + "loss": 1.1065, + "step": 5968 + }, + { + "epoch": 1.6153429847111351, + "grad_norm": 0.2044670134782791, + "learning_rate": 5.203789736188802e-05, + "loss": 1.1119, + "step": 5970 + }, + { + "epoch": 1.615884183466378, + "grad_norm": 0.18756787478923798, + "learning_rate": 5.2006441081201216e-05, + "loss": 1.1188, + "step": 5972 + }, + { + "epoch": 1.616425382221621, + "grad_norm": 0.19725120067596436, + "learning_rate": 5.197498400506587e-05, + "loss": 1.1027, + "step": 5974 + }, + { + "epoch": 1.6169665809768636, + "grad_norm": 0.196888267993927, + "learning_rate": 5.1943526145953066e-05, + "loss": 1.0968, + "step": 5976 + }, + { + "epoch": 1.6175077797321067, + "grad_norm": 0.19184596836566925, + "learning_rate": 5.191206751633419e-05, + "loss": 1.1074, + "step": 5978 + }, + { + "epoch": 1.6180489784873495, + "grad_norm": 0.19667644798755646, + "learning_rate": 5.1880608128680955e-05, + "loss": 1.1036, + "step": 5980 + }, + { + "epoch": 1.6185901772425924, + "grad_norm": 0.21635574102401733, + "learning_rate": 5.184914799546532e-05, + "loss": 1.0962, + "step": 5982 + }, + { + "epoch": 1.6191313759978352, + "grad_norm": 0.20050860941410065, + "learning_rate": 5.181768712915961e-05, + "loss": 1.1164, + "step": 5984 + }, + { + "epoch": 1.619672574753078, + "grad_norm": 0.20226170122623444, + "learning_rate": 5.178622554223638e-05, + "loss": 1.123, + "step": 5986 + }, + { + "epoch": 1.620213773508321, + "grad_norm": 0.20348498225212097, + "learning_rate": 5.175476324716852e-05, + "loss": 1.1103, + "step": 5988 + }, + { + "epoch": 1.6207549722635637, + "grad_norm": 0.21365690231323242, + "learning_rate": 5.172330025642916e-05, + "loss": 1.0926, + "step": 5990 + }, + { + "epoch": 1.6212961710188067, + "grad_norm": 0.21233424544334412, + "learning_rate": 5.169183658249173e-05, + "loss": 1.124, + "step": 5992 + }, + { + "epoch": 1.6218373697740494, + "grad_norm": 0.20547015964984894, + "learning_rate": 5.166037223782993e-05, + "loss": 1.1016, + "step": 5994 + }, + { + "epoch": 1.6223785685292924, + "grad_norm": 0.1983044296503067, + "learning_rate": 5.162890723491773e-05, + "loss": 1.1006, + "step": 5996 + }, + { + "epoch": 1.6229197672845352, + "grad_norm": 0.19740712642669678, + "learning_rate": 5.159744158622932e-05, + "loss": 1.1077, + "step": 5998 + }, + { + "epoch": 1.623460966039778, + "grad_norm": 0.20450197160243988, + "learning_rate": 5.156597530423921e-05, + "loss": 1.0955, + "step": 6000 + }, + { + "epoch": 1.6240021647950211, + "grad_norm": 0.20513780415058136, + "learning_rate": 5.15345084014221e-05, + "loss": 1.1081, + "step": 6002 + }, + { + "epoch": 1.6245433635502637, + "grad_norm": 0.19694030284881592, + "learning_rate": 5.1503040890252994e-05, + "loss": 1.1173, + "step": 6004 + }, + { + "epoch": 1.6250845623055068, + "grad_norm": 0.19904263317584991, + "learning_rate": 5.147157278320708e-05, + "loss": 1.1275, + "step": 6006 + }, + { + "epoch": 1.6256257610607494, + "grad_norm": 0.19937120378017426, + "learning_rate": 5.144010409275983e-05, + "loss": 1.102, + "step": 6008 + }, + { + "epoch": 1.6261669598159925, + "grad_norm": 0.18996426463127136, + "learning_rate": 5.1408634831386915e-05, + "loss": 1.0981, + "step": 6010 + }, + { + "epoch": 1.6267081585712353, + "grad_norm": 0.20067033171653748, + "learning_rate": 5.137716501156427e-05, + "loss": 1.1108, + "step": 6012 + }, + { + "epoch": 1.6272493573264781, + "grad_norm": 0.18880565464496613, + "learning_rate": 5.134569464576798e-05, + "loss": 1.1079, + "step": 6014 + }, + { + "epoch": 1.627790556081721, + "grad_norm": 0.19722242653369904, + "learning_rate": 5.131422374647444e-05, + "loss": 1.101, + "step": 6016 + }, + { + "epoch": 1.6283317548369638, + "grad_norm": 0.20435649156570435, + "learning_rate": 5.1282752326160186e-05, + "loss": 1.1181, + "step": 6018 + }, + { + "epoch": 1.6288729535922069, + "grad_norm": 0.19975650310516357, + "learning_rate": 5.125128039730198e-05, + "loss": 1.1084, + "step": 6020 + }, + { + "epoch": 1.6294141523474495, + "grad_norm": 0.2203846126794815, + "learning_rate": 5.121980797237681e-05, + "loss": 1.1009, + "step": 6022 + }, + { + "epoch": 1.6299553511026925, + "grad_norm": 0.20882205665111542, + "learning_rate": 5.1188335063861813e-05, + "loss": 1.1261, + "step": 6024 + }, + { + "epoch": 1.6304965498579354, + "grad_norm": 0.21228885650634766, + "learning_rate": 5.115686168423436e-05, + "loss": 1.1055, + "step": 6026 + }, + { + "epoch": 1.6310377486131782, + "grad_norm": 0.20111221075057983, + "learning_rate": 5.1125387845971996e-05, + "loss": 1.0966, + "step": 6028 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.19490067660808563, + "learning_rate": 5.109391356155243e-05, + "loss": 1.0951, + "step": 6030 + }, + { + "epoch": 1.6321201461236639, + "grad_norm": 0.19791452586650848, + "learning_rate": 5.1062438843453574e-05, + "loss": 1.1156, + "step": 6032 + }, + { + "epoch": 1.632661344878907, + "grad_norm": 0.2057853639125824, + "learning_rate": 5.103096370415351e-05, + "loss": 1.099, + "step": 6034 + }, + { + "epoch": 1.6332025436341495, + "grad_norm": 0.2035697102546692, + "learning_rate": 5.099948815613047e-05, + "loss": 1.112, + "step": 6036 + }, + { + "epoch": 1.6337437423893926, + "grad_norm": 0.2258489727973938, + "learning_rate": 5.0968012211862845e-05, + "loss": 1.0983, + "step": 6038 + }, + { + "epoch": 1.6342849411446354, + "grad_norm": 0.20500481128692627, + "learning_rate": 5.0936535883829216e-05, + "loss": 1.1126, + "step": 6040 + }, + { + "epoch": 1.6348261398998782, + "grad_norm": 0.19694970548152924, + "learning_rate": 5.0905059184508275e-05, + "loss": 1.1089, + "step": 6042 + }, + { + "epoch": 1.635367338655121, + "grad_norm": 0.21168774366378784, + "learning_rate": 5.0873582126378885e-05, + "loss": 1.1041, + "step": 6044 + }, + { + "epoch": 1.635908537410364, + "grad_norm": 0.1914626508951187, + "learning_rate": 5.084210472192006e-05, + "loss": 1.0989, + "step": 6046 + }, + { + "epoch": 1.636449736165607, + "grad_norm": 0.19588778913021088, + "learning_rate": 5.0810626983610934e-05, + "loss": 1.0963, + "step": 6048 + }, + { + "epoch": 1.6369909349208496, + "grad_norm": 0.1954125612974167, + "learning_rate": 5.077914892393075e-05, + "loss": 1.0995, + "step": 6050 + }, + { + "epoch": 1.6375321336760926, + "grad_norm": 0.19976812601089478, + "learning_rate": 5.074767055535894e-05, + "loss": 1.1035, + "step": 6052 + }, + { + "epoch": 1.6380733324313355, + "grad_norm": 0.2050340324640274, + "learning_rate": 5.071619189037501e-05, + "loss": 1.0782, + "step": 6054 + }, + { + "epoch": 1.6386145311865783, + "grad_norm": 0.19455774128437042, + "learning_rate": 5.068471294145859e-05, + "loss": 1.1051, + "step": 6056 + }, + { + "epoch": 1.6391557299418211, + "grad_norm": 0.1886800080537796, + "learning_rate": 5.0653233721089456e-05, + "loss": 1.0953, + "step": 6058 + }, + { + "epoch": 1.639696928697064, + "grad_norm": 0.2028556764125824, + "learning_rate": 5.062175424174744e-05, + "loss": 1.1252, + "step": 6060 + }, + { + "epoch": 1.640238127452307, + "grad_norm": 0.2425209879875183, + "learning_rate": 5.059027451591252e-05, + "loss": 1.1148, + "step": 6062 + }, + { + "epoch": 1.6407793262075496, + "grad_norm": 0.21492329239845276, + "learning_rate": 5.055879455606475e-05, + "loss": 1.0911, + "step": 6064 + }, + { + "epoch": 1.6413205249627927, + "grad_norm": 0.20221753418445587, + "learning_rate": 5.0527314374684263e-05, + "loss": 1.1082, + "step": 6066 + }, + { + "epoch": 1.6418617237180353, + "grad_norm": 0.19085150957107544, + "learning_rate": 5.049583398425133e-05, + "loss": 1.108, + "step": 6068 + }, + { + "epoch": 1.6424029224732783, + "grad_norm": 0.19889535009860992, + "learning_rate": 5.046435339724626e-05, + "loss": 1.1019, + "step": 6070 + }, + { + "epoch": 1.6429441212285212, + "grad_norm": 0.1931409239768982, + "learning_rate": 5.043287262614943e-05, + "loss": 1.1152, + "step": 6072 + }, + { + "epoch": 1.643485319983764, + "grad_norm": 0.2050054371356964, + "learning_rate": 5.0401391683441344e-05, + "loss": 1.1276, + "step": 6074 + }, + { + "epoch": 1.6440265187390068, + "grad_norm": 0.20617274940013885, + "learning_rate": 5.036991058160253e-05, + "loss": 1.0877, + "step": 6076 + }, + { + "epoch": 1.6445677174942497, + "grad_norm": 0.19923336803913116, + "learning_rate": 5.033842933311359e-05, + "loss": 1.1253, + "step": 6078 + }, + { + "epoch": 1.6451089162494927, + "grad_norm": 0.19882087409496307, + "learning_rate": 5.030694795045519e-05, + "loss": 1.0867, + "step": 6080 + }, + { + "epoch": 1.6456501150047353, + "grad_norm": 0.2002253383398056, + "learning_rate": 5.0275466446108045e-05, + "loss": 1.1202, + "step": 6082 + }, + { + "epoch": 1.6461913137599784, + "grad_norm": 0.20727257430553436, + "learning_rate": 5.024398483255292e-05, + "loss": 1.1106, + "step": 6084 + }, + { + "epoch": 1.6467325125152212, + "grad_norm": 0.2989455461502075, + "learning_rate": 5.021250312227062e-05, + "loss": 1.1213, + "step": 6086 + }, + { + "epoch": 1.647273711270464, + "grad_norm": 0.2180749773979187, + "learning_rate": 5.018102132774198e-05, + "loss": 1.0855, + "step": 6088 + }, + { + "epoch": 1.647814910025707, + "grad_norm": 0.2092970758676529, + "learning_rate": 5.0149539461447895e-05, + "loss": 1.1104, + "step": 6090 + }, + { + "epoch": 1.6483561087809497, + "grad_norm": 0.20346704125404358, + "learning_rate": 5.011805753586927e-05, + "loss": 1.1026, + "step": 6092 + }, + { + "epoch": 1.6488973075361928, + "grad_norm": 0.20743313431739807, + "learning_rate": 5.0086575563487016e-05, + "loss": 1.1244, + "step": 6094 + }, + { + "epoch": 1.6494385062914354, + "grad_norm": 0.19220343232154846, + "learning_rate": 5.005509355678211e-05, + "loss": 1.1151, + "step": 6096 + }, + { + "epoch": 1.6499797050466785, + "grad_norm": 0.19584113359451294, + "learning_rate": 5.0023611528235506e-05, + "loss": 1.1035, + "step": 6098 + }, + { + "epoch": 1.6505209038019213, + "grad_norm": 0.19244736433029175, + "learning_rate": 4.9992129490328154e-05, + "loss": 1.1117, + "step": 6100 + }, + { + "epoch": 1.6510621025571641, + "grad_norm": 0.2030666321516037, + "learning_rate": 4.996064745554106e-05, + "loss": 1.1117, + "step": 6102 + }, + { + "epoch": 1.651603301312407, + "grad_norm": 0.1975410431623459, + "learning_rate": 4.992916543635518e-05, + "loss": 1.0907, + "step": 6104 + }, + { + "epoch": 1.6521445000676498, + "grad_norm": 0.19762983918190002, + "learning_rate": 4.9897683445251476e-05, + "loss": 1.0791, + "step": 6106 + }, + { + "epoch": 1.6526856988228928, + "grad_norm": 0.20722368359565735, + "learning_rate": 4.986620149471093e-05, + "loss": 1.0901, + "step": 6108 + }, + { + "epoch": 1.6532268975781355, + "grad_norm": 0.20464728772640228, + "learning_rate": 4.9834719597214435e-05, + "loss": 1.0952, + "step": 6110 + }, + { + "epoch": 1.6537680963333785, + "grad_norm": 0.20351345837116241, + "learning_rate": 4.980323776524297e-05, + "loss": 1.1168, + "step": 6112 + }, + { + "epoch": 1.6543092950886213, + "grad_norm": 0.20605972409248352, + "learning_rate": 4.97717560112774e-05, + "loss": 1.098, + "step": 6114 + }, + { + "epoch": 1.6548504938438642, + "grad_norm": 0.21736709773540497, + "learning_rate": 4.974027434779859e-05, + "loss": 1.1111, + "step": 6116 + }, + { + "epoch": 1.655391692599107, + "grad_norm": 0.21413607895374298, + "learning_rate": 4.9708792787287355e-05, + "loss": 1.0787, + "step": 6118 + }, + { + "epoch": 1.6559328913543498, + "grad_norm": 0.20902234315872192, + "learning_rate": 4.9677311342224475e-05, + "loss": 1.0973, + "step": 6120 + }, + { + "epoch": 1.656474090109593, + "grad_norm": 0.20447993278503418, + "learning_rate": 4.964583002509072e-05, + "loss": 1.094, + "step": 6122 + }, + { + "epoch": 1.6570152888648355, + "grad_norm": 0.20252478122711182, + "learning_rate": 4.961434884836677e-05, + "loss": 1.0898, + "step": 6124 + }, + { + "epoch": 1.6575564876200786, + "grad_norm": 0.1981995403766632, + "learning_rate": 4.958286782453325e-05, + "loss": 1.0723, + "step": 6126 + }, + { + "epoch": 1.6580976863753212, + "grad_norm": 0.19710524380207062, + "learning_rate": 4.955138696607074e-05, + "loss": 1.1059, + "step": 6128 + }, + { + "epoch": 1.6586388851305642, + "grad_norm": 0.20728375017642975, + "learning_rate": 4.951990628545973e-05, + "loss": 1.1087, + "step": 6130 + }, + { + "epoch": 1.659180083885807, + "grad_norm": 0.21482379734516144, + "learning_rate": 4.948842579518067e-05, + "loss": 1.1075, + "step": 6132 + }, + { + "epoch": 1.65972128264105, + "grad_norm": 0.19691164791584015, + "learning_rate": 4.9456945507713914e-05, + "loss": 1.0988, + "step": 6134 + }, + { + "epoch": 1.6602624813962927, + "grad_norm": 0.218665212392807, + "learning_rate": 4.942546543553977e-05, + "loss": 1.1055, + "step": 6136 + }, + { + "epoch": 1.6608036801515356, + "grad_norm": 0.21065405011177063, + "learning_rate": 4.939398559113841e-05, + "loss": 1.1173, + "step": 6138 + }, + { + "epoch": 1.6613448789067786, + "grad_norm": 0.20526757836341858, + "learning_rate": 4.936250598698995e-05, + "loss": 1.0959, + "step": 6140 + }, + { + "epoch": 1.6618860776620212, + "grad_norm": 0.21178075671195984, + "learning_rate": 4.933102663557439e-05, + "loss": 1.079, + "step": 6142 + }, + { + "epoch": 1.6624272764172643, + "grad_norm": 0.21773749589920044, + "learning_rate": 4.9299547549371624e-05, + "loss": 1.0977, + "step": 6144 + }, + { + "epoch": 1.6629684751725071, + "grad_norm": 0.21717792749404907, + "learning_rate": 4.92680687408615e-05, + "loss": 1.1109, + "step": 6146 + }, + { + "epoch": 1.66350967392775, + "grad_norm": 0.22109778225421906, + "learning_rate": 4.923659022252368e-05, + "loss": 1.0851, + "step": 6148 + }, + { + "epoch": 1.6640508726829928, + "grad_norm": 0.21108923852443695, + "learning_rate": 4.920511200683776e-05, + "loss": 1.0802, + "step": 6150 + }, + { + "epoch": 1.6645920714382356, + "grad_norm": 0.23644781112670898, + "learning_rate": 4.917363410628319e-05, + "loss": 1.1202, + "step": 6152 + }, + { + "epoch": 1.6651332701934787, + "grad_norm": 0.2017291635274887, + "learning_rate": 4.9142156533339294e-05, + "loss": 1.1047, + "step": 6154 + }, + { + "epoch": 1.6656744689487213, + "grad_norm": 0.2093794196844101, + "learning_rate": 4.911067930048531e-05, + "loss": 1.0952, + "step": 6156 + }, + { + "epoch": 1.6662156677039643, + "grad_norm": 0.22405418753623962, + "learning_rate": 4.907920242020028e-05, + "loss": 1.1025, + "step": 6158 + }, + { + "epoch": 1.6667568664592072, + "grad_norm": 0.2150014191865921, + "learning_rate": 4.9047725904963146e-05, + "loss": 1.1002, + "step": 6160 + }, + { + "epoch": 1.66729806521445, + "grad_norm": 0.21797533333301544, + "learning_rate": 4.901624976725269e-05, + "loss": 1.1001, + "step": 6162 + }, + { + "epoch": 1.6678392639696928, + "grad_norm": 0.20575197041034698, + "learning_rate": 4.898477401954755e-05, + "loss": 1.0773, + "step": 6164 + }, + { + "epoch": 1.6683804627249357, + "grad_norm": 0.2048603594303131, + "learning_rate": 4.895329867432619e-05, + "loss": 1.113, + "step": 6166 + }, + { + "epoch": 1.6689216614801787, + "grad_norm": 0.22338087856769562, + "learning_rate": 4.8921823744066964e-05, + "loss": 1.1002, + "step": 6168 + }, + { + "epoch": 1.6694628602354213, + "grad_norm": 0.19425320625305176, + "learning_rate": 4.889034924124802e-05, + "loss": 1.0919, + "step": 6170 + }, + { + "epoch": 1.6700040589906644, + "grad_norm": 0.21217811107635498, + "learning_rate": 4.885887517834733e-05, + "loss": 1.0953, + "step": 6172 + }, + { + "epoch": 1.6705452577459072, + "grad_norm": 0.21486474573612213, + "learning_rate": 4.8827401567842725e-05, + "loss": 1.0968, + "step": 6174 + }, + { + "epoch": 1.67108645650115, + "grad_norm": 0.2081034928560257, + "learning_rate": 4.879592842221182e-05, + "loss": 1.1169, + "step": 6176 + }, + { + "epoch": 1.671627655256393, + "grad_norm": 0.20809540152549744, + "learning_rate": 4.876445575393206e-05, + "loss": 1.1165, + "step": 6178 + }, + { + "epoch": 1.6721688540116357, + "grad_norm": 0.2050357460975647, + "learning_rate": 4.873298357548074e-05, + "loss": 1.1106, + "step": 6180 + }, + { + "epoch": 1.6727100527668788, + "grad_norm": 0.19713976979255676, + "learning_rate": 4.8701511899334905e-05, + "loss": 1.1049, + "step": 6182 + }, + { + "epoch": 1.6732512515221214, + "grad_norm": 0.20866985619068146, + "learning_rate": 4.867004073797142e-05, + "loss": 1.0902, + "step": 6184 + }, + { + "epoch": 1.6737924502773645, + "grad_norm": 0.20040997862815857, + "learning_rate": 4.863857010386695e-05, + "loss": 1.103, + "step": 6186 + }, + { + "epoch": 1.674333649032607, + "grad_norm": 0.1946992427110672, + "learning_rate": 4.860710000949796e-05, + "loss": 1.1068, + "step": 6188 + }, + { + "epoch": 1.6748748477878501, + "grad_norm": 0.21326212584972382, + "learning_rate": 4.8575630467340656e-05, + "loss": 1.1104, + "step": 6190 + }, + { + "epoch": 1.675416046543093, + "grad_norm": 0.199645534157753, + "learning_rate": 4.854416148987111e-05, + "loss": 1.0824, + "step": 6192 + }, + { + "epoch": 1.6759572452983358, + "grad_norm": 0.20009875297546387, + "learning_rate": 4.851269308956509e-05, + "loss": 1.1174, + "step": 6194 + }, + { + "epoch": 1.6764984440535788, + "grad_norm": 0.21479113399982452, + "learning_rate": 4.848122527889817e-05, + "loss": 1.0889, + "step": 6196 + }, + { + "epoch": 1.6770396428088215, + "grad_norm": 0.19752614200115204, + "learning_rate": 4.8449758070345676e-05, + "loss": 1.0884, + "step": 6198 + }, + { + "epoch": 1.6775808415640645, + "grad_norm": 0.19362103939056396, + "learning_rate": 4.841829147638269e-05, + "loss": 1.1154, + "step": 6200 + }, + { + "epoch": 1.6781220403193071, + "grad_norm": 0.19553016126155853, + "learning_rate": 4.8386825509484115e-05, + "loss": 1.0965, + "step": 6202 + }, + { + "epoch": 1.6786632390745502, + "grad_norm": 0.19789254665374756, + "learning_rate": 4.8355360182124517e-05, + "loss": 1.1, + "step": 6204 + }, + { + "epoch": 1.679204437829793, + "grad_norm": 0.20107004046440125, + "learning_rate": 4.8323895506778247e-05, + "loss": 1.104, + "step": 6206 + }, + { + "epoch": 1.6797456365850358, + "grad_norm": 0.2045447677373886, + "learning_rate": 4.82924314959194e-05, + "loss": 1.1114, + "step": 6208 + }, + { + "epoch": 1.6802868353402787, + "grad_norm": 0.22341173887252808, + "learning_rate": 4.826096816202183e-05, + "loss": 1.1003, + "step": 6210 + }, + { + "epoch": 1.6808280340955215, + "grad_norm": 0.20418062806129456, + "learning_rate": 4.822950551755903e-05, + "loss": 1.1035, + "step": 6212 + }, + { + "epoch": 1.6813692328507646, + "grad_norm": 0.19770026206970215, + "learning_rate": 4.819804357500437e-05, + "loss": 1.1029, + "step": 6214 + }, + { + "epoch": 1.6819104316060072, + "grad_norm": 0.19675873219966888, + "learning_rate": 4.816658234683081e-05, + "loss": 1.1004, + "step": 6216 + }, + { + "epoch": 1.6824516303612502, + "grad_norm": 0.20119518041610718, + "learning_rate": 4.813512184551109e-05, + "loss": 1.0937, + "step": 6218 + }, + { + "epoch": 1.682992829116493, + "grad_norm": 0.19882871210575104, + "learning_rate": 4.810366208351764e-05, + "loss": 1.0961, + "step": 6220 + }, + { + "epoch": 1.683534027871736, + "grad_norm": 0.215384840965271, + "learning_rate": 4.80722030733226e-05, + "loss": 1.0991, + "step": 6222 + }, + { + "epoch": 1.6840752266269787, + "grad_norm": 0.21123012900352478, + "learning_rate": 4.8040744827397805e-05, + "loss": 1.0994, + "step": 6224 + }, + { + "epoch": 1.6846164253822216, + "grad_norm": 0.19668787717819214, + "learning_rate": 4.8009287358214833e-05, + "loss": 1.093, + "step": 6226 + }, + { + "epoch": 1.6851576241374646, + "grad_norm": 0.20249195396900177, + "learning_rate": 4.79778306782449e-05, + "loss": 1.1019, + "step": 6228 + }, + { + "epoch": 1.6856988228927072, + "grad_norm": 0.1980815827846527, + "learning_rate": 4.794637479995893e-05, + "loss": 1.0892, + "step": 6230 + }, + { + "epoch": 1.6862400216479503, + "grad_norm": 0.20097172260284424, + "learning_rate": 4.7914919735827506e-05, + "loss": 1.1173, + "step": 6232 + }, + { + "epoch": 1.6867812204031931, + "grad_norm": 0.1997818648815155, + "learning_rate": 4.7883465498320936e-05, + "loss": 1.0952, + "step": 6234 + }, + { + "epoch": 1.687322419158436, + "grad_norm": 0.2143600881099701, + "learning_rate": 4.785201209990917e-05, + "loss": 1.0842, + "step": 6236 + }, + { + "epoch": 1.6878636179136788, + "grad_norm": 0.20444436371326447, + "learning_rate": 4.782055955306182e-05, + "loss": 1.1043, + "step": 6238 + }, + { + "epoch": 1.6884048166689216, + "grad_norm": 0.19443412125110626, + "learning_rate": 4.7789107870248174e-05, + "loss": 1.1011, + "step": 6240 + }, + { + "epoch": 1.6889460154241647, + "grad_norm": 0.19847697019577026, + "learning_rate": 4.7757657063937185e-05, + "loss": 1.0947, + "step": 6242 + }, + { + "epoch": 1.6894872141794073, + "grad_norm": 0.20564959943294525, + "learning_rate": 4.7726207146597425e-05, + "loss": 1.0963, + "step": 6244 + }, + { + "epoch": 1.6900284129346503, + "grad_norm": 0.2089342474937439, + "learning_rate": 4.7694758130697124e-05, + "loss": 1.0888, + "step": 6246 + }, + { + "epoch": 1.690569611689893, + "grad_norm": 0.19679591059684753, + "learning_rate": 4.7663310028704214e-05, + "loss": 1.1011, + "step": 6248 + }, + { + "epoch": 1.691110810445136, + "grad_norm": 0.21167321503162384, + "learning_rate": 4.763186285308618e-05, + "loss": 1.0839, + "step": 6250 + }, + { + "epoch": 1.6916520092003788, + "grad_norm": 0.20337669551372528, + "learning_rate": 4.760041661631018e-05, + "loss": 1.1143, + "step": 6252 + }, + { + "epoch": 1.6921932079556217, + "grad_norm": 0.21780863404273987, + "learning_rate": 4.756897133084301e-05, + "loss": 1.0929, + "step": 6254 + }, + { + "epoch": 1.6927344067108647, + "grad_norm": 0.21046027541160583, + "learning_rate": 4.753752700915105e-05, + "loss": 1.1005, + "step": 6256 + }, + { + "epoch": 1.6932756054661073, + "grad_norm": 0.22077994048595428, + "learning_rate": 4.750608366370033e-05, + "loss": 1.1022, + "step": 6258 + }, + { + "epoch": 1.6938168042213504, + "grad_norm": 0.22833840548992157, + "learning_rate": 4.7474641306956514e-05, + "loss": 1.1079, + "step": 6260 + }, + { + "epoch": 1.694358002976593, + "grad_norm": 0.21717151999473572, + "learning_rate": 4.744319995138481e-05, + "loss": 1.1035, + "step": 6262 + }, + { + "epoch": 1.694899201731836, + "grad_norm": 0.2176828384399414, + "learning_rate": 4.7411759609450085e-05, + "loss": 1.1009, + "step": 6264 + }, + { + "epoch": 1.695440400487079, + "grad_norm": 0.20053714513778687, + "learning_rate": 4.7380320293616774e-05, + "loss": 1.0962, + "step": 6266 + }, + { + "epoch": 1.6959815992423217, + "grad_norm": 0.21354977786540985, + "learning_rate": 4.734888201634893e-05, + "loss": 1.0957, + "step": 6268 + }, + { + "epoch": 1.6965227979975646, + "grad_norm": 0.22005771100521088, + "learning_rate": 4.731744479011013e-05, + "loss": 1.11, + "step": 6270 + }, + { + "epoch": 1.6970639967528074, + "grad_norm": 0.25545334815979004, + "learning_rate": 4.7286008627363656e-05, + "loss": 1.0929, + "step": 6272 + }, + { + "epoch": 1.6976051955080504, + "grad_norm": 0.22265726327896118, + "learning_rate": 4.725457354057226e-05, + "loss": 1.1003, + "step": 6274 + }, + { + "epoch": 1.698146394263293, + "grad_norm": 0.2123861014842987, + "learning_rate": 4.72231395421983e-05, + "loss": 1.1065, + "step": 6276 + }, + { + "epoch": 1.6986875930185361, + "grad_norm": 0.2084009051322937, + "learning_rate": 4.7191706644703706e-05, + "loss": 1.1044, + "step": 6278 + }, + { + "epoch": 1.699228791773779, + "grad_norm": 0.20307877659797668, + "learning_rate": 4.716027486054996e-05, + "loss": 1.0841, + "step": 6280 + }, + { + "epoch": 1.6997699905290218, + "grad_norm": 0.19163917005062103, + "learning_rate": 4.7128844202198144e-05, + "loss": 1.0904, + "step": 6282 + }, + { + "epoch": 1.7003111892842646, + "grad_norm": 0.20670676231384277, + "learning_rate": 4.7097414682108855e-05, + "loss": 1.1193, + "step": 6284 + }, + { + "epoch": 1.7008523880395074, + "grad_norm": 0.197808176279068, + "learning_rate": 4.706598631274223e-05, + "loss": 1.1111, + "step": 6286 + }, + { + "epoch": 1.7013935867947505, + "grad_norm": 0.19535589218139648, + "learning_rate": 4.703455910655799e-05, + "loss": 1.101, + "step": 6288 + }, + { + "epoch": 1.7019347855499931, + "grad_norm": 0.19631515443325043, + "learning_rate": 4.700313307601537e-05, + "loss": 1.0996, + "step": 6290 + }, + { + "epoch": 1.7024759843052362, + "grad_norm": 0.20223629474639893, + "learning_rate": 4.6971708233573094e-05, + "loss": 1.0919, + "step": 6292 + }, + { + "epoch": 1.703017183060479, + "grad_norm": 0.1986115723848343, + "learning_rate": 4.694028459168953e-05, + "loss": 1.1044, + "step": 6294 + }, + { + "epoch": 1.7035583818157218, + "grad_norm": 0.1945781409740448, + "learning_rate": 4.690886216282248e-05, + "loss": 1.08, + "step": 6296 + }, + { + "epoch": 1.7040995805709647, + "grad_norm": 0.23026058077812195, + "learning_rate": 4.687744095942926e-05, + "loss": 1.1126, + "step": 6298 + }, + { + "epoch": 1.7046407793262075, + "grad_norm": 0.19715817272663116, + "learning_rate": 4.684602099396676e-05, + "loss": 1.0997, + "step": 6300 + }, + { + "epoch": 1.7051819780814506, + "grad_norm": 0.20487914979457855, + "learning_rate": 4.681460227889134e-05, + "loss": 1.099, + "step": 6302 + }, + { + "epoch": 1.7057231768366932, + "grad_norm": 0.2061152309179306, + "learning_rate": 4.678318482665883e-05, + "loss": 1.0836, + "step": 6304 + }, + { + "epoch": 1.7062643755919362, + "grad_norm": 0.19450321793556213, + "learning_rate": 4.675176864972467e-05, + "loss": 1.1035, + "step": 6306 + }, + { + "epoch": 1.706805574347179, + "grad_norm": 0.20388276875019073, + "learning_rate": 4.672035376054368e-05, + "loss": 1.0919, + "step": 6308 + }, + { + "epoch": 1.7073467731024219, + "grad_norm": 0.1996920257806778, + "learning_rate": 4.668894017157021e-05, + "loss": 1.0991, + "step": 6310 + }, + { + "epoch": 1.7078879718576647, + "grad_norm": 0.3231523036956787, + "learning_rate": 4.665752789525812e-05, + "loss": 1.0994, + "step": 6312 + }, + { + "epoch": 1.7084291706129076, + "grad_norm": 0.19108951091766357, + "learning_rate": 4.662611694406072e-05, + "loss": 1.0961, + "step": 6314 + }, + { + "epoch": 1.7089703693681506, + "grad_norm": 0.20860451459884644, + "learning_rate": 4.65947073304308e-05, + "loss": 1.1023, + "step": 6316 + }, + { + "epoch": 1.7095115681233932, + "grad_norm": 0.20047715306282043, + "learning_rate": 4.656329906682063e-05, + "loss": 1.1009, + "step": 6318 + }, + { + "epoch": 1.7100527668786363, + "grad_norm": 0.1992918699979782, + "learning_rate": 4.653189216568194e-05, + "loss": 1.0875, + "step": 6320 + }, + { + "epoch": 1.7105939656338789, + "grad_norm": 0.20760871469974518, + "learning_rate": 4.650048663946591e-05, + "loss": 1.1005, + "step": 6322 + }, + { + "epoch": 1.711135164389122, + "grad_norm": 0.20540857315063477, + "learning_rate": 4.646908250062318e-05, + "loss": 1.106, + "step": 6324 + }, + { + "epoch": 1.7116763631443648, + "grad_norm": 0.2165304571390152, + "learning_rate": 4.643767976160383e-05, + "loss": 1.1136, + "step": 6326 + }, + { + "epoch": 1.7122175618996076, + "grad_norm": 0.21409067511558533, + "learning_rate": 4.640627843485742e-05, + "loss": 1.0926, + "step": 6328 + }, + { + "epoch": 1.7127587606548504, + "grad_norm": 0.21867913007736206, + "learning_rate": 4.637487853283293e-05, + "loss": 1.1066, + "step": 6330 + }, + { + "epoch": 1.7132999594100933, + "grad_norm": 0.21393905580043793, + "learning_rate": 4.634348006797876e-05, + "loss": 1.0871, + "step": 6332 + }, + { + "epoch": 1.7138411581653363, + "grad_norm": 0.21339541673660278, + "learning_rate": 4.6312083052742736e-05, + "loss": 1.0835, + "step": 6334 + }, + { + "epoch": 1.714382356920579, + "grad_norm": 0.21182167530059814, + "learning_rate": 4.628068749957216e-05, + "loss": 1.0905, + "step": 6336 + }, + { + "epoch": 1.714923555675822, + "grad_norm": 0.23677603900432587, + "learning_rate": 4.6249293420913696e-05, + "loss": 1.0825, + "step": 6338 + }, + { + "epoch": 1.7154647544310648, + "grad_norm": 0.2179262340068817, + "learning_rate": 4.621790082921346e-05, + "loss": 1.1057, + "step": 6340 + }, + { + "epoch": 1.7160059531863077, + "grad_norm": 0.18723274767398834, + "learning_rate": 4.6186509736916964e-05, + "loss": 1.1255, + "step": 6342 + }, + { + "epoch": 1.7165471519415505, + "grad_norm": 0.2081994116306305, + "learning_rate": 4.615512015646913e-05, + "loss": 1.0916, + "step": 6344 + }, + { + "epoch": 1.7170883506967933, + "grad_norm": 0.2164573073387146, + "learning_rate": 4.612373210031428e-05, + "loss": 1.1004, + "step": 6346 + }, + { + "epoch": 1.7176295494520364, + "grad_norm": 0.20629379153251648, + "learning_rate": 4.60923455808961e-05, + "loss": 1.0943, + "step": 6348 + }, + { + "epoch": 1.718170748207279, + "grad_norm": 0.20933091640472412, + "learning_rate": 4.6060960610657747e-05, + "loss": 1.1148, + "step": 6350 + }, + { + "epoch": 1.718711946962522, + "grad_norm": 0.18730591237545013, + "learning_rate": 4.6029577202041686e-05, + "loss": 1.0982, + "step": 6352 + }, + { + "epoch": 1.7192531457177649, + "grad_norm": 0.20031727850437164, + "learning_rate": 4.5998195367489794e-05, + "loss": 1.0872, + "step": 6354 + }, + { + "epoch": 1.7197943444730077, + "grad_norm": 0.19143308699131012, + "learning_rate": 4.5966815119443324e-05, + "loss": 1.079, + "step": 6356 + }, + { + "epoch": 1.7203355432282506, + "grad_norm": 0.19414161145687103, + "learning_rate": 4.593543647034288e-05, + "loss": 1.0778, + "step": 6358 + }, + { + "epoch": 1.7208767419834934, + "grad_norm": 0.2009509950876236, + "learning_rate": 4.590405943262846e-05, + "loss": 1.1031, + "step": 6360 + }, + { + "epoch": 1.7214179407387364, + "grad_norm": 0.20481033623218536, + "learning_rate": 4.587268401873943e-05, + "loss": 1.0899, + "step": 6362 + }, + { + "epoch": 1.721959139493979, + "grad_norm": 0.2116551250219345, + "learning_rate": 4.5841310241114466e-05, + "loss": 1.1196, + "step": 6364 + }, + { + "epoch": 1.722500338249222, + "grad_norm": 0.19716574251651764, + "learning_rate": 4.580993811219164e-05, + "loss": 1.0933, + "step": 6366 + }, + { + "epoch": 1.723041537004465, + "grad_norm": 0.19840463995933533, + "learning_rate": 4.5778567644408354e-05, + "loss": 1.0772, + "step": 6368 + }, + { + "epoch": 1.7235827357597078, + "grad_norm": 0.19723396003246307, + "learning_rate": 4.574719885020135e-05, + "loss": 1.0695, + "step": 6370 + }, + { + "epoch": 1.7241239345149506, + "grad_norm": 0.19954833388328552, + "learning_rate": 4.571583174200668e-05, + "loss": 1.1042, + "step": 6372 + }, + { + "epoch": 1.7246651332701934, + "grad_norm": 0.20160029828548431, + "learning_rate": 4.568446633225981e-05, + "loss": 1.1117, + "step": 6374 + }, + { + "epoch": 1.7252063320254365, + "grad_norm": 0.20232245326042175, + "learning_rate": 4.565310263339544e-05, + "loss": 1.0953, + "step": 6376 + }, + { + "epoch": 1.725747530780679, + "grad_norm": 0.2009570151567459, + "learning_rate": 4.562174065784764e-05, + "loss": 1.1032, + "step": 6378 + }, + { + "epoch": 1.7262887295359222, + "grad_norm": 0.1986301690340042, + "learning_rate": 4.5590380418049786e-05, + "loss": 1.1, + "step": 6380 + }, + { + "epoch": 1.7268299282911648, + "grad_norm": 0.1958642154932022, + "learning_rate": 4.5559021926434554e-05, + "loss": 1.1014, + "step": 6382 + }, + { + "epoch": 1.7273711270464078, + "grad_norm": 0.19796167314052582, + "learning_rate": 4.5527665195433946e-05, + "loss": 1.0982, + "step": 6384 + }, + { + "epoch": 1.7279123258016507, + "grad_norm": 0.1887817084789276, + "learning_rate": 4.549631023747928e-05, + "loss": 1.1114, + "step": 6386 + }, + { + "epoch": 1.7284535245568935, + "grad_norm": 0.19255441427230835, + "learning_rate": 4.546495706500112e-05, + "loss": 1.1003, + "step": 6388 + }, + { + "epoch": 1.7289947233121363, + "grad_norm": 0.20663224160671234, + "learning_rate": 4.543360569042939e-05, + "loss": 1.0865, + "step": 6390 + }, + { + "epoch": 1.7295359220673792, + "grad_norm": 0.1899230033159256, + "learning_rate": 4.540225612619323e-05, + "loss": 1.103, + "step": 6392 + }, + { + "epoch": 1.7300771208226222, + "grad_norm": 0.20343951880931854, + "learning_rate": 4.537090838472109e-05, + "loss": 1.0944, + "step": 6394 + }, + { + "epoch": 1.7306183195778648, + "grad_norm": 0.4894717335700989, + "learning_rate": 4.533956247844074e-05, + "loss": 1.1254, + "step": 6396 + }, + { + "epoch": 1.7311595183331079, + "grad_norm": 0.20982342958450317, + "learning_rate": 4.530821841977917e-05, + "loss": 1.0878, + "step": 6398 + }, + { + "epoch": 1.7317007170883507, + "grad_norm": 0.19965919852256775, + "learning_rate": 4.527687622116265e-05, + "loss": 1.0979, + "step": 6400 + }, + { + "epoch": 1.7322419158435935, + "grad_norm": 2.418462038040161, + "learning_rate": 4.524553589501671e-05, + "loss": 1.0996, + "step": 6402 + }, + { + "epoch": 1.7327831145988364, + "grad_norm": 0.227882981300354, + "learning_rate": 4.521419745376616e-05, + "loss": 1.1097, + "step": 6404 + }, + { + "epoch": 1.7333243133540792, + "grad_norm": 0.275223970413208, + "learning_rate": 4.518286090983501e-05, + "loss": 1.1213, + "step": 6406 + }, + { + "epoch": 1.7338655121093223, + "grad_norm": 29.966249465942383, + "learning_rate": 4.5151526275646606e-05, + "loss": 1.0799, + "step": 6408 + }, + { + "epoch": 1.7344067108645649, + "grad_norm": 0.22100159525871277, + "learning_rate": 4.512019356362345e-05, + "loss": 1.0993, + "step": 6410 + }, + { + "epoch": 1.734947909619808, + "grad_norm": 0.3507586717605591, + "learning_rate": 4.508886278618733e-05, + "loss": 1.1173, + "step": 6412 + }, + { + "epoch": 1.7354891083750508, + "grad_norm": 0.3754311800003052, + "learning_rate": 4.505753395575926e-05, + "loss": 1.094, + "step": 6414 + }, + { + "epoch": 1.7360303071302936, + "grad_norm": 0.2857351005077362, + "learning_rate": 4.502620708475946e-05, + "loss": 1.1005, + "step": 6416 + }, + { + "epoch": 1.7365715058855364, + "grad_norm": 0.2881494164466858, + "learning_rate": 4.4994882185607385e-05, + "loss": 1.099, + "step": 6418 + }, + { + "epoch": 1.7371127046407793, + "grad_norm": 0.24020306766033173, + "learning_rate": 4.4963559270721744e-05, + "loss": 1.1237, + "step": 6420 + }, + { + "epoch": 1.7376539033960223, + "grad_norm": 0.24890685081481934, + "learning_rate": 4.493223835252041e-05, + "loss": 1.1084, + "step": 6422 + }, + { + "epoch": 1.738195102151265, + "grad_norm": 0.24650399386882782, + "learning_rate": 4.490091944342049e-05, + "loss": 1.0938, + "step": 6424 + }, + { + "epoch": 1.738736300906508, + "grad_norm": 0.24110651016235352, + "learning_rate": 4.486960255583827e-05, + "loss": 1.1078, + "step": 6426 + }, + { + "epoch": 1.7392774996617508, + "grad_norm": 0.21163125336170197, + "learning_rate": 4.4838287702189234e-05, + "loss": 1.1004, + "step": 6428 + }, + { + "epoch": 1.7398186984169937, + "grad_norm": 0.21215718984603882, + "learning_rate": 4.480697489488813e-05, + "loss": 1.0921, + "step": 6430 + }, + { + "epoch": 1.7403598971722365, + "grad_norm": 0.20836880803108215, + "learning_rate": 4.4775664146348815e-05, + "loss": 1.1004, + "step": 6432 + }, + { + "epoch": 1.7409010959274793, + "grad_norm": 0.2075803130865097, + "learning_rate": 4.4744355468984364e-05, + "loss": 1.0712, + "step": 6434 + }, + { + "epoch": 1.7414422946827224, + "grad_norm": 0.22689945995807648, + "learning_rate": 4.4713048875207006e-05, + "loss": 1.0956, + "step": 6436 + }, + { + "epoch": 1.741983493437965, + "grad_norm": 4.691305160522461, + "learning_rate": 4.468174437742818e-05, + "loss": 1.0959, + "step": 6438 + }, + { + "epoch": 1.742524692193208, + "grad_norm": 0.3774552345275879, + "learning_rate": 4.465044198805846e-05, + "loss": 1.1038, + "step": 6440 + }, + { + "epoch": 1.7430658909484507, + "grad_norm": 0.23061533272266388, + "learning_rate": 4.4619141719507604e-05, + "loss": 1.0786, + "step": 6442 + }, + { + "epoch": 1.7436070897036937, + "grad_norm": 0.2104322612285614, + "learning_rate": 4.458784358418454e-05, + "loss": 1.0988, + "step": 6444 + }, + { + "epoch": 1.7441482884589365, + "grad_norm": 0.21337291598320007, + "learning_rate": 4.455654759449733e-05, + "loss": 1.0804, + "step": 6446 + }, + { + "epoch": 1.7446894872141794, + "grad_norm": 0.25351959466934204, + "learning_rate": 4.4525253762853183e-05, + "loss": 1.088, + "step": 6448 + }, + { + "epoch": 1.7452306859694224, + "grad_norm": 0.22051605582237244, + "learning_rate": 4.449396210165847e-05, + "loss": 1.0786, + "step": 6450 + }, + { + "epoch": 1.745771884724665, + "grad_norm": 0.20295509696006775, + "learning_rate": 4.446267262331866e-05, + "loss": 1.093, + "step": 6452 + }, + { + "epoch": 1.746313083479908, + "grad_norm": 0.2147730588912964, + "learning_rate": 4.4431385340238425e-05, + "loss": 1.1052, + "step": 6454 + }, + { + "epoch": 1.7468542822351507, + "grad_norm": 0.20842982828617096, + "learning_rate": 4.4400100264821526e-05, + "loss": 1.1097, + "step": 6456 + }, + { + "epoch": 1.7473954809903938, + "grad_norm": 0.19626913964748383, + "learning_rate": 4.436881740947084e-05, + "loss": 1.1013, + "step": 6458 + }, + { + "epoch": 1.7479366797456366, + "grad_norm": 0.202324777841568, + "learning_rate": 4.433753678658838e-05, + "loss": 1.0974, + "step": 6460 + }, + { + "epoch": 1.7484778785008794, + "grad_norm": 0.20163972675800323, + "learning_rate": 4.430625840857527e-05, + "loss": 1.0912, + "step": 6462 + }, + { + "epoch": 1.7490190772561223, + "grad_norm": 0.1973981410264969, + "learning_rate": 4.427498228783174e-05, + "loss": 1.0986, + "step": 6464 + }, + { + "epoch": 1.749560276011365, + "grad_norm": 0.1972910761833191, + "learning_rate": 4.424370843675714e-05, + "loss": 1.1089, + "step": 6466 + }, + { + "epoch": 1.7501014747666082, + "grad_norm": 0.19948570430278778, + "learning_rate": 4.4212436867749905e-05, + "loss": 1.0904, + "step": 6468 + }, + { + "epoch": 1.7506426735218508, + "grad_norm": 0.19768944382667542, + "learning_rate": 4.4181167593207584e-05, + "loss": 1.0879, + "step": 6470 + }, + { + "epoch": 1.7511838722770938, + "grad_norm": 0.1974615752696991, + "learning_rate": 4.4149900625526787e-05, + "loss": 1.0912, + "step": 6472 + }, + { + "epoch": 1.7517250710323367, + "grad_norm": 0.1955399364233017, + "learning_rate": 4.41186359771032e-05, + "loss": 1.0811, + "step": 6474 + }, + { + "epoch": 1.7522662697875795, + "grad_norm": 0.20192265510559082, + "learning_rate": 4.4087373660331666e-05, + "loss": 1.1031, + "step": 6476 + }, + { + "epoch": 1.7528074685428223, + "grad_norm": 0.19073598086833954, + "learning_rate": 4.4056113687606034e-05, + "loss": 1.1014, + "step": 6478 + }, + { + "epoch": 1.7533486672980652, + "grad_norm": 0.19759954512119293, + "learning_rate": 4.402485607131923e-05, + "loss": 1.0738, + "step": 6480 + }, + { + "epoch": 1.7538898660533082, + "grad_norm": 0.20035560429096222, + "learning_rate": 4.399360082386326e-05, + "loss": 1.097, + "step": 6482 + }, + { + "epoch": 1.7544310648085508, + "grad_norm": 0.19551776349544525, + "learning_rate": 4.396234795762919e-05, + "loss": 1.0851, + "step": 6484 + }, + { + "epoch": 1.7549722635637939, + "grad_norm": 0.1970243901014328, + "learning_rate": 4.393109748500714e-05, + "loss": 1.0942, + "step": 6486 + }, + { + "epoch": 1.7555134623190367, + "grad_norm": 0.20147021114826202, + "learning_rate": 4.38998494183863e-05, + "loss": 1.0852, + "step": 6488 + }, + { + "epoch": 1.7560546610742795, + "grad_norm": 0.20297984778881073, + "learning_rate": 4.3868603770154845e-05, + "loss": 1.0848, + "step": 6490 + }, + { + "epoch": 1.7565958598295224, + "grad_norm": 0.19104255735874176, + "learning_rate": 4.383736055270008e-05, + "loss": 1.085, + "step": 6492 + }, + { + "epoch": 1.7571370585847652, + "grad_norm": 0.19445335865020752, + "learning_rate": 4.380611977840829e-05, + "loss": 1.0634, + "step": 6494 + }, + { + "epoch": 1.7576782573400083, + "grad_norm": 0.19129939377307892, + "learning_rate": 4.3774881459664785e-05, + "loss": 1.091, + "step": 6496 + }, + { + "epoch": 1.7582194560952509, + "grad_norm": 0.19413970410823822, + "learning_rate": 4.3743645608853905e-05, + "loss": 1.0781, + "step": 6498 + }, + { + "epoch": 1.758760654850494, + "grad_norm": 0.19401364028453827, + "learning_rate": 4.371241223835907e-05, + "loss": 1.0988, + "step": 6500 + }, + { + "epoch": 1.7593018536057365, + "grad_norm": 0.18937474489212036, + "learning_rate": 4.3681181360562646e-05, + "loss": 1.0939, + "step": 6502 + }, + { + "epoch": 1.7598430523609796, + "grad_norm": 0.195633202791214, + "learning_rate": 4.3649952987846035e-05, + "loss": 1.0799, + "step": 6504 + }, + { + "epoch": 1.7603842511162224, + "grad_norm": 0.18419165909290314, + "learning_rate": 4.3618727132589655e-05, + "loss": 1.1024, + "step": 6506 + }, + { + "epoch": 1.7609254498714653, + "grad_norm": 0.1886375993490219, + "learning_rate": 4.358750380717288e-05, + "loss": 1.0999, + "step": 6508 + }, + { + "epoch": 1.7614666486267083, + "grad_norm": 0.19577690958976746, + "learning_rate": 4.3556283023974185e-05, + "loss": 1.0933, + "step": 6510 + }, + { + "epoch": 1.762007847381951, + "grad_norm": 0.20096082985401154, + "learning_rate": 4.352506479537093e-05, + "loss": 1.1118, + "step": 6512 + }, + { + "epoch": 1.762549046137194, + "grad_norm": 0.19489717483520508, + "learning_rate": 4.3493849133739503e-05, + "loss": 1.0708, + "step": 6514 + }, + { + "epoch": 1.7630902448924366, + "grad_norm": 0.19973506033420563, + "learning_rate": 4.3462636051455305e-05, + "loss": 1.1236, + "step": 6516 + }, + { + "epoch": 1.7636314436476797, + "grad_norm": 0.20452648401260376, + "learning_rate": 4.3431425560892655e-05, + "loss": 1.097, + "step": 6518 + }, + { + "epoch": 1.7641726424029225, + "grad_norm": 0.19926029443740845, + "learning_rate": 4.3400217674424856e-05, + "loss": 1.1087, + "step": 6520 + }, + { + "epoch": 1.7647138411581653, + "grad_norm": 0.2000960111618042, + "learning_rate": 4.3369012404424246e-05, + "loss": 1.0851, + "step": 6522 + }, + { + "epoch": 1.7652550399134082, + "grad_norm": 0.19704541563987732, + "learning_rate": 4.333780976326205e-05, + "loss": 1.0934, + "step": 6524 + }, + { + "epoch": 1.765796238668651, + "grad_norm": 0.20251207053661346, + "learning_rate": 4.330660976330848e-05, + "loss": 1.0879, + "step": 6526 + }, + { + "epoch": 1.766337437423894, + "grad_norm": 0.1936361938714981, + "learning_rate": 4.3275412416932705e-05, + "loss": 1.0703, + "step": 6528 + }, + { + "epoch": 1.7668786361791367, + "grad_norm": 0.19449754059314728, + "learning_rate": 4.324421773650281e-05, + "loss": 1.0956, + "step": 6530 + }, + { + "epoch": 1.7674198349343797, + "grad_norm": 0.19861850142478943, + "learning_rate": 4.321302573438586e-05, + "loss": 1.1031, + "step": 6532 + }, + { + "epoch": 1.7679610336896225, + "grad_norm": 0.19280153512954712, + "learning_rate": 4.318183642294786e-05, + "loss": 1.0701, + "step": 6534 + }, + { + "epoch": 1.7685022324448654, + "grad_norm": 0.20537976920604706, + "learning_rate": 4.315064981455373e-05, + "loss": 1.0755, + "step": 6536 + }, + { + "epoch": 1.7690434312001082, + "grad_norm": 0.18868936598300934, + "learning_rate": 4.3119465921567303e-05, + "loss": 1.0894, + "step": 6538 + }, + { + "epoch": 1.769584629955351, + "grad_norm": 0.19705283641815186, + "learning_rate": 4.3088284756351384e-05, + "loss": 1.107, + "step": 6540 + }, + { + "epoch": 1.770125828710594, + "grad_norm": 0.19849726557731628, + "learning_rate": 4.3057106331267655e-05, + "loss": 1.1024, + "step": 6542 + }, + { + "epoch": 1.7706670274658367, + "grad_norm": 0.21062889695167542, + "learning_rate": 4.30259306586767e-05, + "loss": 1.0831, + "step": 6544 + }, + { + "epoch": 1.7712082262210798, + "grad_norm": 0.21323548257350922, + "learning_rate": 4.2994757750938075e-05, + "loss": 1.0833, + "step": 6546 + }, + { + "epoch": 1.7717494249763226, + "grad_norm": 0.19871409237384796, + "learning_rate": 4.29635876204102e-05, + "loss": 1.1002, + "step": 6548 + }, + { + "epoch": 1.7722906237315654, + "grad_norm": 0.19860664010047913, + "learning_rate": 4.293242027945036e-05, + "loss": 1.1054, + "step": 6550 + }, + { + "epoch": 1.7728318224868083, + "grad_norm": 0.19428445398807526, + "learning_rate": 4.29012557404148e-05, + "loss": 1.1035, + "step": 6552 + }, + { + "epoch": 1.773373021242051, + "grad_norm": 0.20563170313835144, + "learning_rate": 4.2870094015658595e-05, + "loss": 1.0977, + "step": 6554 + }, + { + "epoch": 1.7739142199972942, + "grad_norm": 0.2111564427614212, + "learning_rate": 4.283893511753577e-05, + "loss": 1.0908, + "step": 6556 + }, + { + "epoch": 1.7744554187525368, + "grad_norm": 0.2181028574705124, + "learning_rate": 4.280777905839918e-05, + "loss": 1.0916, + "step": 6558 + }, + { + "epoch": 1.7749966175077798, + "grad_norm": 0.22550417482852936, + "learning_rate": 4.277662585060055e-05, + "loss": 1.0863, + "step": 6560 + }, + { + "epoch": 1.7755378162630227, + "grad_norm": 0.23247095942497253, + "learning_rate": 4.27454755064905e-05, + "loss": 1.1046, + "step": 6562 + }, + { + "epoch": 1.7760790150182655, + "grad_norm": 0.22413510084152222, + "learning_rate": 4.27143280384185e-05, + "loss": 1.0833, + "step": 6564 + }, + { + "epoch": 1.7766202137735083, + "grad_norm": 0.21419657766819, + "learning_rate": 4.26831834587329e-05, + "loss": 1.0879, + "step": 6566 + }, + { + "epoch": 1.7771614125287511, + "grad_norm": 0.20207151770591736, + "learning_rate": 4.265204177978087e-05, + "loss": 1.0889, + "step": 6568 + }, + { + "epoch": 1.7777026112839942, + "grad_norm": 0.19237180054187775, + "learning_rate": 4.262090301390848e-05, + "loss": 1.0808, + "step": 6570 + }, + { + "epoch": 1.7782438100392368, + "grad_norm": 0.19792573153972626, + "learning_rate": 4.25897671734606e-05, + "loss": 1.098, + "step": 6572 + }, + { + "epoch": 1.7787850087944799, + "grad_norm": 0.1927460879087448, + "learning_rate": 4.255863427078095e-05, + "loss": 1.1095, + "step": 6574 + }, + { + "epoch": 1.7793262075497225, + "grad_norm": 0.20088867843151093, + "learning_rate": 4.2527504318212104e-05, + "loss": 1.0657, + "step": 6576 + }, + { + "epoch": 1.7798674063049655, + "grad_norm": 0.19744569063186646, + "learning_rate": 4.249637732809541e-05, + "loss": 1.0986, + "step": 6578 + }, + { + "epoch": 1.7804086050602084, + "grad_norm": 0.34829336404800415, + "learning_rate": 4.246525331277116e-05, + "loss": 1.0857, + "step": 6580 + }, + { + "epoch": 1.7809498038154512, + "grad_norm": 0.20506344735622406, + "learning_rate": 4.2434132284578345e-05, + "loss": 1.0996, + "step": 6582 + }, + { + "epoch": 1.781491002570694, + "grad_norm": 0.2044580727815628, + "learning_rate": 4.2403014255854825e-05, + "loss": 1.0861, + "step": 6584 + }, + { + "epoch": 1.7820322013259369, + "grad_norm": 0.20686107873916626, + "learning_rate": 4.2371899238937265e-05, + "loss": 1.0872, + "step": 6586 + }, + { + "epoch": 1.78257340008118, + "grad_norm": 0.20514465868473053, + "learning_rate": 4.234078724616113e-05, + "loss": 1.0829, + "step": 6588 + }, + { + "epoch": 1.7831145988364225, + "grad_norm": 0.21564225852489471, + "learning_rate": 4.2309678289860704e-05, + "loss": 1.0984, + "step": 6590 + }, + { + "epoch": 1.7836557975916656, + "grad_norm": 0.21428346633911133, + "learning_rate": 4.2278572382369045e-05, + "loss": 1.0856, + "step": 6592 + }, + { + "epoch": 1.7841969963469084, + "grad_norm": 0.2111758589744568, + "learning_rate": 4.224746953601803e-05, + "loss": 1.0783, + "step": 6594 + }, + { + "epoch": 1.7847381951021513, + "grad_norm": 0.21367578208446503, + "learning_rate": 4.2216369763138284e-05, + "loss": 1.093, + "step": 6596 + }, + { + "epoch": 1.785279393857394, + "grad_norm": 0.20083215832710266, + "learning_rate": 4.2185273076059246e-05, + "loss": 1.0666, + "step": 6598 + }, + { + "epoch": 1.785820592612637, + "grad_norm": 0.19775110483169556, + "learning_rate": 4.2154179487109084e-05, + "loss": 1.1041, + "step": 6600 + }, + { + "epoch": 1.78636179136788, + "grad_norm": 0.20243553817272186, + "learning_rate": 4.212308900861483e-05, + "loss": 1.0881, + "step": 6602 + }, + { + "epoch": 1.7869029901231226, + "grad_norm": 0.20611883699893951, + "learning_rate": 4.209200165290221e-05, + "loss": 1.1025, + "step": 6604 + }, + { + "epoch": 1.7874441888783656, + "grad_norm": 0.1935873180627823, + "learning_rate": 4.20609174322957e-05, + "loss": 1.1095, + "step": 6606 + }, + { + "epoch": 1.7879853876336085, + "grad_norm": 0.20548474788665771, + "learning_rate": 4.20298363591186e-05, + "loss": 1.0853, + "step": 6608 + }, + { + "epoch": 1.7885265863888513, + "grad_norm": 0.20183724164962769, + "learning_rate": 4.1998758445692874e-05, + "loss": 1.0675, + "step": 6610 + }, + { + "epoch": 1.7890677851440941, + "grad_norm": 0.19128645956516266, + "learning_rate": 4.1967683704339326e-05, + "loss": 1.1009, + "step": 6612 + }, + { + "epoch": 1.789608983899337, + "grad_norm": 0.20994137227535248, + "learning_rate": 4.193661214737745e-05, + "loss": 1.1067, + "step": 6614 + }, + { + "epoch": 1.79015018265458, + "grad_norm": 0.19996902346611023, + "learning_rate": 4.1905543787125476e-05, + "loss": 1.0934, + "step": 6616 + }, + { + "epoch": 1.7906913814098226, + "grad_norm": 0.23378756642341614, + "learning_rate": 4.187447863590039e-05, + "loss": 1.0834, + "step": 6618 + }, + { + "epoch": 1.7912325801650657, + "grad_norm": 0.21649529039859772, + "learning_rate": 4.184341670601788e-05, + "loss": 1.1146, + "step": 6620 + }, + { + "epoch": 1.7917737789203085, + "grad_norm": 0.2031160295009613, + "learning_rate": 4.18123580097924e-05, + "loss": 1.1049, + "step": 6622 + }, + { + "epoch": 1.7923149776755514, + "grad_norm": 0.19968315958976746, + "learning_rate": 4.178130255953703e-05, + "loss": 1.0898, + "step": 6624 + }, + { + "epoch": 1.7928561764307942, + "grad_norm": 0.19984765350818634, + "learning_rate": 4.1750250367563695e-05, + "loss": 1.0969, + "step": 6626 + }, + { + "epoch": 1.793397375186037, + "grad_norm": 0.20172137022018433, + "learning_rate": 4.171920144618292e-05, + "loss": 1.0932, + "step": 6628 + }, + { + "epoch": 1.79393857394128, + "grad_norm": 0.19911104440689087, + "learning_rate": 4.168815580770399e-05, + "loss": 1.0803, + "step": 6630 + }, + { + "epoch": 1.7944797726965227, + "grad_norm": 0.19864173233509064, + "learning_rate": 4.165711346443485e-05, + "loss": 1.0818, + "step": 6632 + }, + { + "epoch": 1.7950209714517658, + "grad_norm": 0.22264103591442108, + "learning_rate": 4.162607442868217e-05, + "loss": 1.0991, + "step": 6634 + }, + { + "epoch": 1.7955621702070084, + "grad_norm": 0.19450636208057404, + "learning_rate": 4.15950387127513e-05, + "loss": 1.0711, + "step": 6636 + }, + { + "epoch": 1.7961033689622514, + "grad_norm": 0.20430341362953186, + "learning_rate": 4.156400632894628e-05, + "loss": 1.093, + "step": 6638 + }, + { + "epoch": 1.7966445677174943, + "grad_norm": 0.19968783855438232, + "learning_rate": 4.15329772895698e-05, + "loss": 1.0836, + "step": 6640 + }, + { + "epoch": 1.797185766472737, + "grad_norm": 0.19790159165859222, + "learning_rate": 4.1501951606923274e-05, + "loss": 1.0781, + "step": 6642 + }, + { + "epoch": 1.79772696522798, + "grad_norm": 0.19310256838798523, + "learning_rate": 4.147092929330673e-05, + "loss": 1.0802, + "step": 6644 + }, + { + "epoch": 1.7982681639832228, + "grad_norm": 0.20688797533512115, + "learning_rate": 4.1439910361018876e-05, + "loss": 1.1096, + "step": 6646 + }, + { + "epoch": 1.7988093627384658, + "grad_norm": 0.20901572704315186, + "learning_rate": 4.140889482235712e-05, + "loss": 1.0849, + "step": 6648 + }, + { + "epoch": 1.7993505614937084, + "grad_norm": 0.2252446860074997, + "learning_rate": 4.13778826896175e-05, + "loss": 1.0879, + "step": 6650 + }, + { + "epoch": 1.7998917602489515, + "grad_norm": 0.20364707708358765, + "learning_rate": 4.1346873975094674e-05, + "loss": 1.0936, + "step": 6652 + }, + { + "epoch": 1.8004329590041943, + "grad_norm": 0.19577133655548096, + "learning_rate": 4.1315868691081985e-05, + "loss": 1.1156, + "step": 6654 + }, + { + "epoch": 1.8009741577594371, + "grad_norm": 0.21345022320747375, + "learning_rate": 4.128486684987138e-05, + "loss": 1.0778, + "step": 6656 + }, + { + "epoch": 1.80151535651468, + "grad_norm": 0.21753264963626862, + "learning_rate": 4.125386846375346e-05, + "loss": 1.0729, + "step": 6658 + }, + { + "epoch": 1.8020565552699228, + "grad_norm": 0.19566558301448822, + "learning_rate": 4.122287354501749e-05, + "loss": 1.0969, + "step": 6660 + }, + { + "epoch": 1.8025977540251659, + "grad_norm": 0.18708518147468567, + "learning_rate": 4.1191882105951304e-05, + "loss": 1.0817, + "step": 6662 + }, + { + "epoch": 1.8031389527804085, + "grad_norm": 0.2051536738872528, + "learning_rate": 4.116089415884138e-05, + "loss": 1.0959, + "step": 6664 + }, + { + "epoch": 1.8036801515356515, + "grad_norm": 0.19509519636631012, + "learning_rate": 4.112990971597283e-05, + "loss": 1.1013, + "step": 6666 + }, + { + "epoch": 1.8042213502908944, + "grad_norm": 0.19669197499752045, + "learning_rate": 4.109892878962933e-05, + "loss": 1.0798, + "step": 6668 + }, + { + "epoch": 1.8047625490461372, + "grad_norm": 0.18896794319152832, + "learning_rate": 4.106795139209321e-05, + "loss": 1.0775, + "step": 6670 + }, + { + "epoch": 1.80530374780138, + "grad_norm": 0.19769933819770813, + "learning_rate": 4.1036977535645385e-05, + "loss": 1.0935, + "step": 6672 + }, + { + "epoch": 1.8058449465566229, + "grad_norm": 0.20387202501296997, + "learning_rate": 4.100600723256536e-05, + "loss": 1.0804, + "step": 6674 + }, + { + "epoch": 1.806386145311866, + "grad_norm": 0.21471519768238068, + "learning_rate": 4.0975040495131235e-05, + "loss": 1.0624, + "step": 6676 + }, + { + "epoch": 1.8069273440671085, + "grad_norm": 0.19445520639419556, + "learning_rate": 4.094407733561968e-05, + "loss": 1.08, + "step": 6678 + }, + { + "epoch": 1.8074685428223516, + "grad_norm": 0.19775503873825073, + "learning_rate": 4.091311776630596e-05, + "loss": 1.1051, + "step": 6680 + }, + { + "epoch": 1.8080097415775944, + "grad_norm": 0.1946769505739212, + "learning_rate": 4.088216179946395e-05, + "loss": 1.0724, + "step": 6682 + }, + { + "epoch": 1.8085509403328373, + "grad_norm": 0.19768846035003662, + "learning_rate": 4.085120944736604e-05, + "loss": 1.0862, + "step": 6684 + }, + { + "epoch": 1.80909213908808, + "grad_norm": 0.1936732679605484, + "learning_rate": 4.082026072228322e-05, + "loss": 1.0697, + "step": 6686 + }, + { + "epoch": 1.809633337843323, + "grad_norm": 0.19827993214130402, + "learning_rate": 4.0789315636485026e-05, + "loss": 1.0802, + "step": 6688 + }, + { + "epoch": 1.810174536598566, + "grad_norm": 0.1980011910200119, + "learning_rate": 4.075837420223958e-05, + "loss": 1.0776, + "step": 6690 + }, + { + "epoch": 1.8107157353538086, + "grad_norm": 0.19554691016674042, + "learning_rate": 4.072743643181352e-05, + "loss": 1.0856, + "step": 6692 + }, + { + "epoch": 1.8112569341090516, + "grad_norm": 0.21835459768772125, + "learning_rate": 4.069650233747203e-05, + "loss": 1.0798, + "step": 6694 + }, + { + "epoch": 1.8117981328642943, + "grad_norm": 0.2064054161310196, + "learning_rate": 4.06655719314789e-05, + "loss": 1.1061, + "step": 6696 + }, + { + "epoch": 1.8123393316195373, + "grad_norm": 0.2178279608488083, + "learning_rate": 4.0634645226096386e-05, + "loss": 1.0927, + "step": 6698 + }, + { + "epoch": 1.8128805303747801, + "grad_norm": 0.20535053312778473, + "learning_rate": 4.0603722233585306e-05, + "loss": 1.0818, + "step": 6700 + }, + { + "epoch": 1.813421729130023, + "grad_norm": 0.18977800011634827, + "learning_rate": 4.057280296620499e-05, + "loss": 1.077, + "step": 6702 + }, + { + "epoch": 1.813962927885266, + "grad_norm": 0.19911536574363708, + "learning_rate": 4.0541887436213304e-05, + "loss": 1.0959, + "step": 6704 + }, + { + "epoch": 1.8145041266405086, + "grad_norm": 0.2092689573764801, + "learning_rate": 4.051097565586666e-05, + "loss": 1.0899, + "step": 6706 + }, + { + "epoch": 1.8150453253957517, + "grad_norm": 0.1950940638780594, + "learning_rate": 4.048006763741994e-05, + "loss": 1.0772, + "step": 6708 + }, + { + "epoch": 1.8155865241509943, + "grad_norm": 0.19229792058467865, + "learning_rate": 4.0449163393126555e-05, + "loss": 1.0726, + "step": 6710 + }, + { + "epoch": 1.8161277229062374, + "grad_norm": 0.2023068070411682, + "learning_rate": 4.0418262935238406e-05, + "loss": 1.0984, + "step": 6712 + }, + { + "epoch": 1.8166689216614802, + "grad_norm": 0.2057054489850998, + "learning_rate": 4.0387366276005875e-05, + "loss": 1.1125, + "step": 6714 + }, + { + "epoch": 1.817210120416723, + "grad_norm": 0.20108623802661896, + "learning_rate": 4.035647342767793e-05, + "loss": 1.0698, + "step": 6716 + }, + { + "epoch": 1.8177513191719659, + "grad_norm": 0.2006451040506363, + "learning_rate": 4.032558440250191e-05, + "loss": 1.1069, + "step": 6718 + }, + { + "epoch": 1.8182925179272087, + "grad_norm": 0.19579680263996124, + "learning_rate": 4.029469921272373e-05, + "loss": 1.0842, + "step": 6720 + }, + { + "epoch": 1.8188337166824518, + "grad_norm": 0.1893213838338852, + "learning_rate": 4.026381787058772e-05, + "loss": 1.0998, + "step": 6722 + }, + { + "epoch": 1.8193749154376944, + "grad_norm": 0.19480760395526886, + "learning_rate": 4.0232940388336724e-05, + "loss": 1.0822, + "step": 6724 + }, + { + "epoch": 1.8199161141929374, + "grad_norm": 0.1998399943113327, + "learning_rate": 4.020206677821201e-05, + "loss": 1.0743, + "step": 6726 + }, + { + "epoch": 1.8204573129481803, + "grad_norm": 0.2122696340084076, + "learning_rate": 4.0171197052453394e-05, + "loss": 1.0884, + "step": 6728 + }, + { + "epoch": 1.820998511703423, + "grad_norm": 0.21512769162654877, + "learning_rate": 4.014033122329908e-05, + "loss": 1.0959, + "step": 6730 + }, + { + "epoch": 1.821539710458666, + "grad_norm": 0.21467286348342896, + "learning_rate": 4.010946930298573e-05, + "loss": 1.0827, + "step": 6732 + }, + { + "epoch": 1.8220809092139088, + "grad_norm": 0.2198825478553772, + "learning_rate": 4.007861130374851e-05, + "loss": 1.0973, + "step": 6734 + }, + { + "epoch": 1.8226221079691518, + "grad_norm": 0.22405289113521576, + "learning_rate": 4.004775723782097e-05, + "loss": 1.0723, + "step": 6736 + }, + { + "epoch": 1.8231633067243944, + "grad_norm": 0.2315000593662262, + "learning_rate": 4.0016907117435106e-05, + "loss": 1.0889, + "step": 6738 + }, + { + "epoch": 1.8237045054796375, + "grad_norm": 0.20799678564071655, + "learning_rate": 3.9986060954821424e-05, + "loss": 1.0626, + "step": 6740 + }, + { + "epoch": 1.8242457042348803, + "grad_norm": 0.19263596832752228, + "learning_rate": 3.995521876220878e-05, + "loss": 1.0778, + "step": 6742 + }, + { + "epoch": 1.8247869029901231, + "grad_norm": 0.2035963535308838, + "learning_rate": 3.9924380551824495e-05, + "loss": 1.0873, + "step": 6744 + }, + { + "epoch": 1.825328101745366, + "grad_norm": 0.20593370497226715, + "learning_rate": 3.989354633589428e-05, + "loss": 1.0886, + "step": 6746 + }, + { + "epoch": 1.8258693005006088, + "grad_norm": 0.1969778835773468, + "learning_rate": 3.9862716126642285e-05, + "loss": 1.0811, + "step": 6748 + }, + { + "epoch": 1.8264104992558519, + "grad_norm": 0.20104999840259552, + "learning_rate": 3.98318899362911e-05, + "loss": 1.0796, + "step": 6750 + }, + { + "epoch": 1.8269516980110945, + "grad_norm": 0.19378796219825745, + "learning_rate": 3.980106777706166e-05, + "loss": 1.0806, + "step": 6752 + }, + { + "epoch": 1.8274928967663375, + "grad_norm": 0.18916435539722443, + "learning_rate": 3.9770249661173354e-05, + "loss": 1.0702, + "step": 6754 + }, + { + "epoch": 1.8280340955215801, + "grad_norm": 0.1983213573694229, + "learning_rate": 3.9739435600843936e-05, + "loss": 1.0983, + "step": 6756 + }, + { + "epoch": 1.8285752942768232, + "grad_norm": 0.1911832094192505, + "learning_rate": 3.970862560828956e-05, + "loss": 1.0863, + "step": 6758 + }, + { + "epoch": 1.829116493032066, + "grad_norm": 0.2137800008058548, + "learning_rate": 3.967781969572474e-05, + "loss": 1.0846, + "step": 6760 + }, + { + "epoch": 1.8296576917873089, + "grad_norm": 0.22685086727142334, + "learning_rate": 3.9647017875362474e-05, + "loss": 1.074, + "step": 6762 + }, + { + "epoch": 1.830198890542552, + "grad_norm": 0.20144490897655487, + "learning_rate": 3.9616220159414016e-05, + "loss": 1.0905, + "step": 6764 + }, + { + "epoch": 1.8307400892977945, + "grad_norm": 0.22347332537174225, + "learning_rate": 3.958542656008906e-05, + "loss": 1.0786, + "step": 6766 + }, + { + "epoch": 1.8312812880530376, + "grad_norm": 0.21963992714881897, + "learning_rate": 3.955463708959564e-05, + "loss": 1.0899, + "step": 6768 + }, + { + "epoch": 1.8318224868082802, + "grad_norm": 0.20576758682727814, + "learning_rate": 3.9523851760140176e-05, + "loss": 1.086, + "step": 6770 + }, + { + "epoch": 1.8323636855635232, + "grad_norm": 0.1927286684513092, + "learning_rate": 3.949307058392742e-05, + "loss": 1.071, + "step": 6772 + }, + { + "epoch": 1.832904884318766, + "grad_norm": 0.21448402106761932, + "learning_rate": 3.9462293573160515e-05, + "loss": 1.1007, + "step": 6774 + }, + { + "epoch": 1.833446083074009, + "grad_norm": 0.22515618801116943, + "learning_rate": 3.9431520740040914e-05, + "loss": 1.1049, + "step": 6776 + }, + { + "epoch": 1.8339872818292517, + "grad_norm": 0.21115495264530182, + "learning_rate": 3.940075209676843e-05, + "loss": 1.0868, + "step": 6778 + }, + { + "epoch": 1.8345284805844946, + "grad_norm": 0.21134918928146362, + "learning_rate": 3.936998765554123e-05, + "loss": 1.0909, + "step": 6780 + }, + { + "epoch": 1.8350696793397376, + "grad_norm": 0.21711567044258118, + "learning_rate": 3.9339227428555776e-05, + "loss": 1.0832, + "step": 6782 + }, + { + "epoch": 1.8356108780949802, + "grad_norm": 0.2019776701927185, + "learning_rate": 3.930847142800688e-05, + "loss": 1.0804, + "step": 6784 + }, + { + "epoch": 1.8361520768502233, + "grad_norm": 0.19144585728645325, + "learning_rate": 3.9277719666087706e-05, + "loss": 1.0855, + "step": 6786 + }, + { + "epoch": 1.8366932756054661, + "grad_norm": 0.19285689294338226, + "learning_rate": 3.924697215498971e-05, + "loss": 1.0717, + "step": 6788 + }, + { + "epoch": 1.837234474360709, + "grad_norm": 0.2029266208410263, + "learning_rate": 3.9216228906902655e-05, + "loss": 1.092, + "step": 6790 + }, + { + "epoch": 1.8377756731159518, + "grad_norm": 0.20512457191944122, + "learning_rate": 3.9185489934014614e-05, + "loss": 1.0768, + "step": 6792 + }, + { + "epoch": 1.8383168718711946, + "grad_norm": 0.19526542723178864, + "learning_rate": 3.9154755248511996e-05, + "loss": 1.0808, + "step": 6794 + }, + { + "epoch": 1.8388580706264377, + "grad_norm": 0.19210286438465118, + "learning_rate": 3.912402486257947e-05, + "loss": 1.0992, + "step": 6796 + }, + { + "epoch": 1.8393992693816803, + "grad_norm": 0.21016091108322144, + "learning_rate": 3.9093298788400055e-05, + "loss": 1.1029, + "step": 6798 + }, + { + "epoch": 1.8399404681369234, + "grad_norm": 0.207411989569664, + "learning_rate": 3.9062577038155004e-05, + "loss": 1.0769, + "step": 6800 + }, + { + "epoch": 1.8404816668921662, + "grad_norm": 0.20119555294513702, + "learning_rate": 3.903185962402388e-05, + "loss": 1.0971, + "step": 6802 + }, + { + "epoch": 1.841022865647409, + "grad_norm": 0.19205377995967865, + "learning_rate": 3.900114655818452e-05, + "loss": 1.0748, + "step": 6804 + }, + { + "epoch": 1.8415640644026519, + "grad_norm": 0.18573497235774994, + "learning_rate": 3.897043785281302e-05, + "loss": 1.0875, + "step": 6806 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.19591739773750305, + "learning_rate": 3.8939733520083826e-05, + "loss": 1.0803, + "step": 6808 + }, + { + "epoch": 1.8426464619131377, + "grad_norm": 0.19275245070457458, + "learning_rate": 3.8909033572169556e-05, + "loss": 1.0773, + "step": 6810 + }, + { + "epoch": 1.8431876606683804, + "grad_norm": 0.19375787675380707, + "learning_rate": 3.8878338021241134e-05, + "loss": 1.0583, + "step": 6812 + }, + { + "epoch": 1.8437288594236234, + "grad_norm": 0.19528155028820038, + "learning_rate": 3.884764687946774e-05, + "loss": 1.1114, + "step": 6814 + }, + { + "epoch": 1.844270058178866, + "grad_norm": 0.20921604335308075, + "learning_rate": 3.881696015901678e-05, + "loss": 1.1055, + "step": 6816 + }, + { + "epoch": 1.844811256934109, + "grad_norm": 0.19633661210536957, + "learning_rate": 3.878627787205394e-05, + "loss": 1.0962, + "step": 6818 + }, + { + "epoch": 1.845352455689352, + "grad_norm": 0.2025715708732605, + "learning_rate": 3.875560003074314e-05, + "loss": 1.0714, + "step": 6820 + }, + { + "epoch": 1.8458936544445947, + "grad_norm": 0.21309030055999756, + "learning_rate": 3.872492664724654e-05, + "loss": 1.1004, + "step": 6822 + }, + { + "epoch": 1.8464348531998378, + "grad_norm": 0.1951216161251068, + "learning_rate": 3.869425773372451e-05, + "loss": 1.1007, + "step": 6824 + }, + { + "epoch": 1.8469760519550804, + "grad_norm": 0.19131900370121002, + "learning_rate": 3.866359330233568e-05, + "loss": 1.0928, + "step": 6826 + }, + { + "epoch": 1.8475172507103235, + "grad_norm": 0.20170724391937256, + "learning_rate": 3.863293336523686e-05, + "loss": 1.0938, + "step": 6828 + }, + { + "epoch": 1.848058449465566, + "grad_norm": 0.19565552473068237, + "learning_rate": 3.860227793458314e-05, + "loss": 1.0875, + "step": 6830 + }, + { + "epoch": 1.8485996482208091, + "grad_norm": 0.19799013435840607, + "learning_rate": 3.8571627022527765e-05, + "loss": 1.1031, + "step": 6832 + }, + { + "epoch": 1.849140846976052, + "grad_norm": 0.22882051765918732, + "learning_rate": 3.854098064122223e-05, + "loss": 1.1012, + "step": 6834 + }, + { + "epoch": 1.8496820457312948, + "grad_norm": 0.1981499344110489, + "learning_rate": 3.85103388028162e-05, + "loss": 1.0804, + "step": 6836 + }, + { + "epoch": 1.8502232444865376, + "grad_norm": 0.20329636335372925, + "learning_rate": 3.8479701519457564e-05, + "loss": 1.0777, + "step": 6838 + }, + { + "epoch": 1.8507644432417805, + "grad_norm": 0.19371087849140167, + "learning_rate": 3.8449068803292366e-05, + "loss": 1.1031, + "step": 6840 + }, + { + "epoch": 1.8513056419970235, + "grad_norm": 0.2147005945444107, + "learning_rate": 3.841844066646492e-05, + "loss": 1.0833, + "step": 6842 + }, + { + "epoch": 1.8518468407522661, + "grad_norm": 0.1998402327299118, + "learning_rate": 3.8387817121117646e-05, + "loss": 1.0796, + "step": 6844 + }, + { + "epoch": 1.8523880395075092, + "grad_norm": 0.19730143249034882, + "learning_rate": 3.835719817939117e-05, + "loss": 1.0995, + "step": 6846 + }, + { + "epoch": 1.852929238262752, + "grad_norm": 0.20164614915847778, + "learning_rate": 3.8326583853424306e-05, + "loss": 1.0654, + "step": 6848 + }, + { + "epoch": 1.8534704370179949, + "grad_norm": 0.20535439252853394, + "learning_rate": 3.8295974155354024e-05, + "loss": 1.0871, + "step": 6850 + }, + { + "epoch": 1.8540116357732377, + "grad_norm": 0.20539341866970062, + "learning_rate": 3.826536909731544e-05, + "loss": 1.1022, + "step": 6852 + }, + { + "epoch": 1.8545528345284805, + "grad_norm": 0.20159456133842468, + "learning_rate": 3.823476869144189e-05, + "loss": 1.078, + "step": 6854 + }, + { + "epoch": 1.8550940332837236, + "grad_norm": 0.2013360559940338, + "learning_rate": 3.820417294986481e-05, + "loss": 1.0775, + "step": 6856 + }, + { + "epoch": 1.8556352320389662, + "grad_norm": 0.20098362863063812, + "learning_rate": 3.8173581884713816e-05, + "loss": 1.0677, + "step": 6858 + }, + { + "epoch": 1.8561764307942092, + "grad_norm": 0.20573690533638, + "learning_rate": 3.8142995508116644e-05, + "loss": 1.0831, + "step": 6860 + }, + { + "epoch": 1.856717629549452, + "grad_norm": 0.20562604069709778, + "learning_rate": 3.811241383219918e-05, + "loss": 1.0739, + "step": 6862 + }, + { + "epoch": 1.857258828304695, + "grad_norm": 0.2088104784488678, + "learning_rate": 3.8081836869085454e-05, + "loss": 1.0712, + "step": 6864 + }, + { + "epoch": 1.8578000270599377, + "grad_norm": 0.18918459117412567, + "learning_rate": 3.805126463089765e-05, + "loss": 1.0866, + "step": 6866 + }, + { + "epoch": 1.8583412258151806, + "grad_norm": 0.19981247186660767, + "learning_rate": 3.802069712975605e-05, + "loss": 1.0899, + "step": 6868 + }, + { + "epoch": 1.8588824245704236, + "grad_norm": 0.19627903401851654, + "learning_rate": 3.799013437777903e-05, + "loss": 1.0664, + "step": 6870 + }, + { + "epoch": 1.8594236233256662, + "grad_norm": 0.2044699788093567, + "learning_rate": 3.795957638708316e-05, + "loss": 1.1034, + "step": 6872 + }, + { + "epoch": 1.8599648220809093, + "grad_norm": 0.23277227580547333, + "learning_rate": 3.792902316978304e-05, + "loss": 1.0776, + "step": 6874 + }, + { + "epoch": 1.8605060208361521, + "grad_norm": 0.2099398672580719, + "learning_rate": 3.7898474737991454e-05, + "loss": 1.0918, + "step": 6876 + }, + { + "epoch": 1.861047219591395, + "grad_norm": 0.20346395671367645, + "learning_rate": 3.786793110381924e-05, + "loss": 1.0966, + "step": 6878 + }, + { + "epoch": 1.8615884183466378, + "grad_norm": 0.232667475938797, + "learning_rate": 3.783739227937533e-05, + "loss": 1.0926, + "step": 6880 + }, + { + "epoch": 1.8621296171018806, + "grad_norm": 0.21606087684631348, + "learning_rate": 3.780685827676678e-05, + "loss": 1.0871, + "step": 6882 + }, + { + "epoch": 1.8626708158571237, + "grad_norm": 0.20770125091075897, + "learning_rate": 3.777632910809871e-05, + "loss": 1.088, + "step": 6884 + }, + { + "epoch": 1.8632120146123663, + "grad_norm": 0.2001635581254959, + "learning_rate": 3.7745804785474304e-05, + "loss": 1.0859, + "step": 6886 + }, + { + "epoch": 1.8637532133676094, + "grad_norm": 0.19649547338485718, + "learning_rate": 3.771528532099491e-05, + "loss": 1.0875, + "step": 6888 + }, + { + "epoch": 1.864294412122852, + "grad_norm": 0.2008652687072754, + "learning_rate": 3.768477072675988e-05, + "loss": 1.0809, + "step": 6890 + }, + { + "epoch": 1.864835610878095, + "grad_norm": 0.20074871182441711, + "learning_rate": 3.7654261014866624e-05, + "loss": 1.0813, + "step": 6892 + }, + { + "epoch": 1.8653768096333379, + "grad_norm": 0.19595515727996826, + "learning_rate": 3.7623756197410634e-05, + "loss": 1.0888, + "step": 6894 + }, + { + "epoch": 1.8659180083885807, + "grad_norm": 0.2046647071838379, + "learning_rate": 3.759325628648551e-05, + "loss": 1.0769, + "step": 6896 + }, + { + "epoch": 1.8664592071438235, + "grad_norm": 0.20076891779899597, + "learning_rate": 3.7562761294182815e-05, + "loss": 1.1192, + "step": 6898 + }, + { + "epoch": 1.8670004058990664, + "grad_norm": 0.19799132645130157, + "learning_rate": 3.753227123259226e-05, + "loss": 1.0612, + "step": 6900 + }, + { + "epoch": 1.8675416046543094, + "grad_norm": 0.19283054769039154, + "learning_rate": 3.750178611380153e-05, + "loss": 1.0883, + "step": 6902 + }, + { + "epoch": 1.868082803409552, + "grad_norm": 0.1865224689245224, + "learning_rate": 3.7471305949896386e-05, + "loss": 1.0879, + "step": 6904 + }, + { + "epoch": 1.868624002164795, + "grad_norm": 0.19918258488178253, + "learning_rate": 3.7440830752960585e-05, + "loss": 1.0941, + "step": 6906 + }, + { + "epoch": 1.869165200920038, + "grad_norm": 0.19885693490505219, + "learning_rate": 3.741036053507596e-05, + "loss": 1.0939, + "step": 6908 + }, + { + "epoch": 1.8697063996752807, + "grad_norm": 0.19469766318798065, + "learning_rate": 3.737989530832237e-05, + "loss": 1.0894, + "step": 6910 + }, + { + "epoch": 1.8702475984305236, + "grad_norm": 0.20550334453582764, + "learning_rate": 3.734943508477765e-05, + "loss": 1.091, + "step": 6912 + }, + { + "epoch": 1.8707887971857664, + "grad_norm": 0.19922946393489838, + "learning_rate": 3.731897987651769e-05, + "loss": 1.0761, + "step": 6914 + }, + { + "epoch": 1.8713299959410095, + "grad_norm": 0.2074526846408844, + "learning_rate": 3.728852969561639e-05, + "loss": 1.0754, + "step": 6916 + }, + { + "epoch": 1.871871194696252, + "grad_norm": 0.19522885978221893, + "learning_rate": 3.725808455414563e-05, + "loss": 1.0898, + "step": 6918 + }, + { + "epoch": 1.8724123934514951, + "grad_norm": 0.18808268010616302, + "learning_rate": 3.722764446417532e-05, + "loss": 1.0835, + "step": 6920 + }, + { + "epoch": 1.872953592206738, + "grad_norm": 0.19153587520122528, + "learning_rate": 3.719720943777335e-05, + "loss": 1.0795, + "step": 6922 + }, + { + "epoch": 1.8734947909619808, + "grad_norm": 0.20617860555648804, + "learning_rate": 3.716677948700562e-05, + "loss": 1.0881, + "step": 6924 + }, + { + "epoch": 1.8740359897172236, + "grad_norm": 0.196294903755188, + "learning_rate": 3.713635462393601e-05, + "loss": 1.0753, + "step": 6926 + }, + { + "epoch": 1.8745771884724665, + "grad_norm": 0.20135530829429626, + "learning_rate": 3.710593486062638e-05, + "loss": 1.0766, + "step": 6928 + }, + { + "epoch": 1.8751183872277095, + "grad_norm": 0.1895909607410431, + "learning_rate": 3.7075520209136564e-05, + "loss": 1.0969, + "step": 6930 + }, + { + "epoch": 1.8756595859829521, + "grad_norm": 0.5106291770935059, + "learning_rate": 3.7045110681524345e-05, + "loss": 1.0856, + "step": 6932 + }, + { + "epoch": 1.8762007847381952, + "grad_norm": 0.20290765166282654, + "learning_rate": 3.701470628984556e-05, + "loss": 1.0749, + "step": 6934 + }, + { + "epoch": 1.876741983493438, + "grad_norm": 0.2311553955078125, + "learning_rate": 3.6984307046153934e-05, + "loss": 1.0834, + "step": 6936 + }, + { + "epoch": 1.8772831822486808, + "grad_norm": 0.24637652933597565, + "learning_rate": 3.695391296250115e-05, + "loss": 1.0962, + "step": 6938 + }, + { + "epoch": 1.8778243810039237, + "grad_norm": 0.2694454789161682, + "learning_rate": 3.692352405093689e-05, + "loss": 1.0624, + "step": 6940 + }, + { + "epoch": 1.8783655797591665, + "grad_norm": 0.23916806280612946, + "learning_rate": 3.6893140323508734e-05, + "loss": 1.0804, + "step": 6942 + }, + { + "epoch": 1.8789067785144096, + "grad_norm": 0.23160186409950256, + "learning_rate": 3.6862761792262254e-05, + "loss": 1.0874, + "step": 6944 + }, + { + "epoch": 1.8794479772696522, + "grad_norm": 0.2252962291240692, + "learning_rate": 3.683238846924096e-05, + "loss": 1.0778, + "step": 6946 + }, + { + "epoch": 1.8799891760248952, + "grad_norm": 0.20364049077033997, + "learning_rate": 3.6802020366486246e-05, + "loss": 1.0805, + "step": 6948 + }, + { + "epoch": 1.8805303747801378, + "grad_norm": 0.22888129949569702, + "learning_rate": 3.67716574960375e-05, + "loss": 1.1125, + "step": 6950 + }, + { + "epoch": 1.881071573535381, + "grad_norm": 0.2272316813468933, + "learning_rate": 3.6741299869931986e-05, + "loss": 1.0641, + "step": 6952 + }, + { + "epoch": 1.8816127722906237, + "grad_norm": 0.23863543570041656, + "learning_rate": 3.6710947500204895e-05, + "loss": 1.0792, + "step": 6954 + }, + { + "epoch": 1.8821539710458666, + "grad_norm": 0.21758796274662018, + "learning_rate": 3.6680600398889386e-05, + "loss": 1.0842, + "step": 6956 + }, + { + "epoch": 1.8826951698011094, + "grad_norm": 0.19649061560630798, + "learning_rate": 3.665025857801647e-05, + "loss": 1.0875, + "step": 6958 + }, + { + "epoch": 1.8832363685563522, + "grad_norm": 0.20000873506069183, + "learning_rate": 3.6619922049615096e-05, + "loss": 1.0994, + "step": 6960 + }, + { + "epoch": 1.8837775673115953, + "grad_norm": 0.21018992364406586, + "learning_rate": 3.65895908257121e-05, + "loss": 1.1178, + "step": 6962 + }, + { + "epoch": 1.884318766066838, + "grad_norm": 0.22726866602897644, + "learning_rate": 3.6559264918332205e-05, + "loss": 1.0826, + "step": 6964 + }, + { + "epoch": 1.884859964822081, + "grad_norm": 0.20268107950687408, + "learning_rate": 3.652894433949804e-05, + "loss": 1.0992, + "step": 6966 + }, + { + "epoch": 1.8854011635773238, + "grad_norm": 0.1961902678012848, + "learning_rate": 3.649862910123015e-05, + "loss": 1.0832, + "step": 6968 + }, + { + "epoch": 1.8859423623325666, + "grad_norm": 0.2033190131187439, + "learning_rate": 3.646831921554692e-05, + "loss": 1.0766, + "step": 6970 + }, + { + "epoch": 1.8864835610878095, + "grad_norm": 0.2002858817577362, + "learning_rate": 3.643801469446462e-05, + "loss": 1.0722, + "step": 6972 + }, + { + "epoch": 1.8870247598430523, + "grad_norm": 0.19256839156150818, + "learning_rate": 3.640771554999742e-05, + "loss": 1.0853, + "step": 6974 + }, + { + "epoch": 1.8875659585982953, + "grad_norm": 0.19123882055282593, + "learning_rate": 3.637742179415733e-05, + "loss": 1.0693, + "step": 6976 + }, + { + "epoch": 1.888107157353538, + "grad_norm": 0.19901691377162933, + "learning_rate": 3.634713343895422e-05, + "loss": 1.0936, + "step": 6978 + }, + { + "epoch": 1.888648356108781, + "grad_norm": 0.19063329696655273, + "learning_rate": 3.631685049639586e-05, + "loss": 1.0957, + "step": 6980 + }, + { + "epoch": 1.8891895548640238, + "grad_norm": 0.19534985721111298, + "learning_rate": 3.628657297848783e-05, + "loss": 1.0912, + "step": 6982 + }, + { + "epoch": 1.8897307536192667, + "grad_norm": 0.20418201386928558, + "learning_rate": 3.625630089723359e-05, + "loss": 1.075, + "step": 6984 + }, + { + "epoch": 1.8902719523745095, + "grad_norm": 0.19935591518878937, + "learning_rate": 3.62260342646344e-05, + "loss": 1.0876, + "step": 6986 + }, + { + "epoch": 1.8908131511297523, + "grad_norm": 0.19858041405677795, + "learning_rate": 3.619577309268941e-05, + "loss": 1.0759, + "step": 6988 + }, + { + "epoch": 1.8913543498849954, + "grad_norm": 0.20118165016174316, + "learning_rate": 3.6165517393395593e-05, + "loss": 1.0821, + "step": 6990 + }, + { + "epoch": 1.891895548640238, + "grad_norm": 0.20047937333583832, + "learning_rate": 3.6135267178747746e-05, + "loss": 1.0771, + "step": 6992 + }, + { + "epoch": 1.892436747395481, + "grad_norm": 0.1953369379043579, + "learning_rate": 3.610502246073848e-05, + "loss": 1.088, + "step": 6994 + }, + { + "epoch": 1.892977946150724, + "grad_norm": 0.1911652833223343, + "learning_rate": 3.607478325135822e-05, + "loss": 1.0677, + "step": 6996 + }, + { + "epoch": 1.8935191449059667, + "grad_norm": 0.19671845436096191, + "learning_rate": 3.6044549562595276e-05, + "loss": 1.0749, + "step": 6998 + }, + { + "epoch": 1.8940603436612096, + "grad_norm": 0.20309677720069885, + "learning_rate": 3.6014321406435664e-05, + "loss": 1.0894, + "step": 7000 + }, + { + "epoch": 1.8946015424164524, + "grad_norm": 0.2059335559606552, + "learning_rate": 3.598409879486329e-05, + "loss": 1.1025, + "step": 7002 + }, + { + "epoch": 1.8951427411716955, + "grad_norm": 0.2229032665491104, + "learning_rate": 3.595388173985983e-05, + "loss": 1.063, + "step": 7004 + }, + { + "epoch": 1.895683939926938, + "grad_norm": 0.21180075407028198, + "learning_rate": 3.592367025340476e-05, + "loss": 1.0689, + "step": 7006 + }, + { + "epoch": 1.8962251386821811, + "grad_norm": 0.21449512243270874, + "learning_rate": 3.589346434747535e-05, + "loss": 1.0882, + "step": 7008 + }, + { + "epoch": 1.8967663374374237, + "grad_norm": 0.21245978772640228, + "learning_rate": 3.5863264034046646e-05, + "loss": 1.0762, + "step": 7010 + }, + { + "epoch": 1.8973075361926668, + "grad_norm": 0.21630197763442993, + "learning_rate": 3.583306932509147e-05, + "loss": 1.0867, + "step": 7012 + }, + { + "epoch": 1.8978487349479096, + "grad_norm": 0.1897646188735962, + "learning_rate": 3.580288023258048e-05, + "loss": 1.0707, + "step": 7014 + }, + { + "epoch": 1.8983899337031525, + "grad_norm": 0.22349651157855988, + "learning_rate": 3.577269676848205e-05, + "loss": 1.0651, + "step": 7016 + }, + { + "epoch": 1.8989311324583955, + "grad_norm": 0.23416654765605927, + "learning_rate": 3.574251894476233e-05, + "loss": 1.0685, + "step": 7018 + }, + { + "epoch": 1.8994723312136381, + "grad_norm": 0.208474799990654, + "learning_rate": 3.571234677338524e-05, + "loss": 1.0818, + "step": 7020 + }, + { + "epoch": 1.9000135299688812, + "grad_norm": 0.1950254738330841, + "learning_rate": 3.568218026631248e-05, + "loss": 1.0758, + "step": 7022 + }, + { + "epoch": 1.9005547287241238, + "grad_norm": 0.20036619901657104, + "learning_rate": 3.5652019435503456e-05, + "loss": 1.088, + "step": 7024 + }, + { + "epoch": 1.9010959274793668, + "grad_norm": 0.19547797739505768, + "learning_rate": 3.562186429291538e-05, + "loss": 1.0588, + "step": 7026 + }, + { + "epoch": 1.9016371262346097, + "grad_norm": 0.19309277832508087, + "learning_rate": 3.5591714850503185e-05, + "loss": 1.0695, + "step": 7028 + }, + { + "epoch": 1.9021783249898525, + "grad_norm": 0.2112337052822113, + "learning_rate": 3.556157112021953e-05, + "loss": 1.0817, + "step": 7030 + }, + { + "epoch": 1.9027195237450953, + "grad_norm": 0.1964777112007141, + "learning_rate": 3.5531433114014836e-05, + "loss": 1.0928, + "step": 7032 + }, + { + "epoch": 1.9032607225003382, + "grad_norm": 0.19426238536834717, + "learning_rate": 3.550130084383719e-05, + "loss": 1.0851, + "step": 7034 + }, + { + "epoch": 1.9038019212555812, + "grad_norm": 0.2040168046951294, + "learning_rate": 3.547117432163252e-05, + "loss": 1.0686, + "step": 7036 + }, + { + "epoch": 1.9043431200108238, + "grad_norm": 0.18765003979206085, + "learning_rate": 3.544105355934439e-05, + "loss": 1.0921, + "step": 7038 + }, + { + "epoch": 1.904884318766067, + "grad_norm": 0.1980425864458084, + "learning_rate": 3.5410938568914084e-05, + "loss": 1.071, + "step": 7040 + }, + { + "epoch": 1.9054255175213097, + "grad_norm": 0.1978006809949875, + "learning_rate": 3.5380829362280624e-05, + "loss": 1.0722, + "step": 7042 + }, + { + "epoch": 1.9059667162765526, + "grad_norm": 0.20557118952274323, + "learning_rate": 3.535072595138073e-05, + "loss": 1.0676, + "step": 7044 + }, + { + "epoch": 1.9065079150317954, + "grad_norm": 0.2010992020368576, + "learning_rate": 3.532062834814881e-05, + "loss": 1.0908, + "step": 7046 + }, + { + "epoch": 1.9070491137870382, + "grad_norm": 0.19449837505817413, + "learning_rate": 3.5290536564517016e-05, + "loss": 1.0781, + "step": 7048 + }, + { + "epoch": 1.9075903125422813, + "grad_norm": 0.20721149444580078, + "learning_rate": 3.526045061241513e-05, + "loss": 1.073, + "step": 7050 + }, + { + "epoch": 1.908131511297524, + "grad_norm": 0.20579661428928375, + "learning_rate": 3.523037050377067e-05, + "loss": 1.0826, + "step": 7052 + }, + { + "epoch": 1.908672710052767, + "grad_norm": 0.2040823996067047, + "learning_rate": 3.520029625050882e-05, + "loss": 1.078, + "step": 7054 + }, + { + "epoch": 1.9092139088080098, + "grad_norm": 0.2008407860994339, + "learning_rate": 3.517022786455243e-05, + "loss": 1.0781, + "step": 7056 + }, + { + "epoch": 1.9097551075632526, + "grad_norm": 0.1955711841583252, + "learning_rate": 3.514016535782202e-05, + "loss": 1.0815, + "step": 7058 + }, + { + "epoch": 1.9102963063184955, + "grad_norm": 0.19567127525806427, + "learning_rate": 3.5110108742235845e-05, + "loss": 1.0716, + "step": 7060 + }, + { + "epoch": 1.9108375050737383, + "grad_norm": 0.19317661225795746, + "learning_rate": 3.5080058029709736e-05, + "loss": 1.0525, + "step": 7062 + }, + { + "epoch": 1.9113787038289813, + "grad_norm": 0.19287818670272827, + "learning_rate": 3.505001323215725e-05, + "loss": 1.08, + "step": 7064 + }, + { + "epoch": 1.911919902584224, + "grad_norm": 0.2001931369304657, + "learning_rate": 3.501997436148954e-05, + "loss": 1.0767, + "step": 7066 + }, + { + "epoch": 1.912461101339467, + "grad_norm": 0.19853448867797852, + "learning_rate": 3.498994142961545e-05, + "loss": 1.0627, + "step": 7068 + }, + { + "epoch": 1.9130023000947096, + "grad_norm": 0.20263823866844177, + "learning_rate": 3.495991444844148e-05, + "loss": 1.0984, + "step": 7070 + }, + { + "epoch": 1.9135434988499527, + "grad_norm": 0.1982932984828949, + "learning_rate": 3.492989342987173e-05, + "loss": 1.0837, + "step": 7072 + }, + { + "epoch": 1.9140846976051955, + "grad_norm": 0.20725077390670776, + "learning_rate": 3.489987838580797e-05, + "loss": 1.0882, + "step": 7074 + }, + { + "epoch": 1.9146258963604383, + "grad_norm": 0.20248323678970337, + "learning_rate": 3.486986932814959e-05, + "loss": 1.0693, + "step": 7076 + }, + { + "epoch": 1.9151670951156814, + "grad_norm": 0.19514189660549164, + "learning_rate": 3.48398662687936e-05, + "loss": 1.0911, + "step": 7078 + }, + { + "epoch": 1.915708293870924, + "grad_norm": 0.21956880390644073, + "learning_rate": 3.4809869219634615e-05, + "loss": 1.0601, + "step": 7080 + }, + { + "epoch": 1.916249492626167, + "grad_norm": 0.21801330149173737, + "learning_rate": 3.477987819256494e-05, + "loss": 1.08, + "step": 7082 + }, + { + "epoch": 1.9167906913814097, + "grad_norm": 0.20598360896110535, + "learning_rate": 3.4749893199474414e-05, + "loss": 1.0877, + "step": 7084 + }, + { + "epoch": 1.9173318901366527, + "grad_norm": 0.19161012768745422, + "learning_rate": 3.4719914252250516e-05, + "loss": 1.1058, + "step": 7086 + }, + { + "epoch": 1.9178730888918956, + "grad_norm": 0.19513128697872162, + "learning_rate": 3.468994136277832e-05, + "loss": 1.0918, + "step": 7088 + }, + { + "epoch": 1.9184142876471384, + "grad_norm": 0.1921873688697815, + "learning_rate": 3.465997454294051e-05, + "loss": 1.0707, + "step": 7090 + }, + { + "epoch": 1.9189554864023812, + "grad_norm": 0.19686387479305267, + "learning_rate": 3.4630013804617326e-05, + "loss": 1.0892, + "step": 7092 + }, + { + "epoch": 1.919496685157624, + "grad_norm": 0.19348624348640442, + "learning_rate": 3.460005915968668e-05, + "loss": 1.1039, + "step": 7094 + }, + { + "epoch": 1.9200378839128671, + "grad_norm": 0.2056368589401245, + "learning_rate": 3.457011062002399e-05, + "loss": 1.0808, + "step": 7096 + }, + { + "epoch": 1.9205790826681097, + "grad_norm": 0.21135970950126648, + "learning_rate": 3.454016819750226e-05, + "loss": 1.0957, + "step": 7098 + }, + { + "epoch": 1.9211202814233528, + "grad_norm": 0.23276221752166748, + "learning_rate": 3.451023190399214e-05, + "loss": 1.0863, + "step": 7100 + }, + { + "epoch": 1.9216614801785956, + "grad_norm": 0.20852985978126526, + "learning_rate": 3.448030175136173e-05, + "loss": 1.0875, + "step": 7102 + }, + { + "epoch": 1.9222026789338384, + "grad_norm": 0.20848432183265686, + "learning_rate": 3.445037775147682e-05, + "loss": 1.0672, + "step": 7104 + }, + { + "epoch": 1.9227438776890813, + "grad_norm": 0.21013838052749634, + "learning_rate": 3.442045991620069e-05, + "loss": 1.067, + "step": 7106 + }, + { + "epoch": 1.9232850764443241, + "grad_norm": 0.20575588941574097, + "learning_rate": 3.4390548257394183e-05, + "loss": 1.0922, + "step": 7108 + }, + { + "epoch": 1.9238262751995672, + "grad_norm": 0.20153602957725525, + "learning_rate": 3.4360642786915705e-05, + "loss": 1.0825, + "step": 7110 + }, + { + "epoch": 1.9243674739548098, + "grad_norm": 0.2115265280008316, + "learning_rate": 3.433074351662121e-05, + "loss": 1.0815, + "step": 7112 + }, + { + "epoch": 1.9249086727100528, + "grad_norm": 0.21177321672439575, + "learning_rate": 3.430085045836415e-05, + "loss": 1.096, + "step": 7114 + }, + { + "epoch": 1.9254498714652957, + "grad_norm": 0.22927716374397278, + "learning_rate": 3.427096362399561e-05, + "loss": 1.1029, + "step": 7116 + }, + { + "epoch": 1.9259910702205385, + "grad_norm": 0.1915036141872406, + "learning_rate": 3.424108302536414e-05, + "loss": 1.0766, + "step": 7118 + }, + { + "epoch": 1.9265322689757813, + "grad_norm": 0.20599964261054993, + "learning_rate": 3.42112086743158e-05, + "loss": 1.0803, + "step": 7120 + }, + { + "epoch": 1.9270734677310242, + "grad_norm": 0.21761341392993927, + "learning_rate": 3.418134058269421e-05, + "loss": 1.0801, + "step": 7122 + }, + { + "epoch": 1.9276146664862672, + "grad_norm": 0.2250652313232422, + "learning_rate": 3.415147876234052e-05, + "loss": 1.0716, + "step": 7124 + }, + { + "epoch": 1.9281558652415098, + "grad_norm": 0.21000199019908905, + "learning_rate": 3.4121623225093346e-05, + "loss": 1.0695, + "step": 7126 + }, + { + "epoch": 1.928697063996753, + "grad_norm": 0.1999553143978119, + "learning_rate": 3.4091773982788866e-05, + "loss": 1.0771, + "step": 7128 + }, + { + "epoch": 1.9292382627519957, + "grad_norm": 0.19224688410758972, + "learning_rate": 3.406193104726072e-05, + "loss": 1.0636, + "step": 7130 + }, + { + "epoch": 1.9297794615072386, + "grad_norm": 0.1926104724407196, + "learning_rate": 3.403209443034009e-05, + "loss": 1.064, + "step": 7132 + }, + { + "epoch": 1.9303206602624814, + "grad_norm": 0.19845491647720337, + "learning_rate": 3.40022641438556e-05, + "loss": 1.0837, + "step": 7134 + }, + { + "epoch": 1.9308618590177242, + "grad_norm": 0.19521653652191162, + "learning_rate": 3.3972440199633404e-05, + "loss": 1.0719, + "step": 7136 + }, + { + "epoch": 1.9314030577729673, + "grad_norm": 0.19986054301261902, + "learning_rate": 3.39426226094971e-05, + "loss": 1.0771, + "step": 7138 + }, + { + "epoch": 1.93194425652821, + "grad_norm": 1.7982203960418701, + "learning_rate": 3.3912811385267856e-05, + "loss": 1.1079, + "step": 7140 + }, + { + "epoch": 1.932485455283453, + "grad_norm": 0.24166445434093475, + "learning_rate": 3.388300653876422e-05, + "loss": 1.0707, + "step": 7142 + }, + { + "epoch": 1.9330266540386956, + "grad_norm": 0.250700443983078, + "learning_rate": 3.385320808180226e-05, + "loss": 1.0891, + "step": 7144 + }, + { + "epoch": 1.9335678527939386, + "grad_norm": 0.2176661342382431, + "learning_rate": 3.382341602619548e-05, + "loss": 1.0848, + "step": 7146 + }, + { + "epoch": 1.9341090515491814, + "grad_norm": 0.22913646697998047, + "learning_rate": 3.379363038375488e-05, + "loss": 1.0779, + "step": 7148 + }, + { + "epoch": 1.9346502503044243, + "grad_norm": 0.2078842669725418, + "learning_rate": 3.3763851166288915e-05, + "loss": 1.0774, + "step": 7150 + }, + { + "epoch": 1.9351914490596671, + "grad_norm": 0.21032822132110596, + "learning_rate": 3.373407838560346e-05, + "loss": 1.0541, + "step": 7152 + }, + { + "epoch": 1.93573264781491, + "grad_norm": 0.20584969222545624, + "learning_rate": 3.370431205350188e-05, + "loss": 1.0589, + "step": 7154 + }, + { + "epoch": 1.936273846570153, + "grad_norm": 0.1954772025346756, + "learning_rate": 3.367455218178495e-05, + "loss": 1.0832, + "step": 7156 + }, + { + "epoch": 1.9368150453253956, + "grad_norm": 0.2397390902042389, + "learning_rate": 3.36447987822509e-05, + "loss": 1.0856, + "step": 7158 + }, + { + "epoch": 1.9373562440806387, + "grad_norm": 0.20190705358982086, + "learning_rate": 3.361505186669537e-05, + "loss": 1.0882, + "step": 7160 + }, + { + "epoch": 1.9378974428358815, + "grad_norm": 0.2014457732439041, + "learning_rate": 3.3585311446911485e-05, + "loss": 1.0727, + "step": 7162 + }, + { + "epoch": 1.9384386415911243, + "grad_norm": 0.19649088382720947, + "learning_rate": 3.355557753468974e-05, + "loss": 1.0771, + "step": 7164 + }, + { + "epoch": 1.9389798403463672, + "grad_norm": 0.19902083277702332, + "learning_rate": 3.352585014181808e-05, + "loss": 1.0736, + "step": 7166 + }, + { + "epoch": 1.93952103910161, + "grad_norm": 0.20133136212825775, + "learning_rate": 3.349612928008186e-05, + "loss": 1.089, + "step": 7168 + }, + { + "epoch": 1.940062237856853, + "grad_norm": 0.20350739359855652, + "learning_rate": 3.3466414961263804e-05, + "loss": 1.0851, + "step": 7170 + }, + { + "epoch": 1.9406034366120957, + "grad_norm": 0.19824256002902985, + "learning_rate": 3.343670719714411e-05, + "loss": 1.0536, + "step": 7172 + }, + { + "epoch": 1.9411446353673387, + "grad_norm": 0.20827580988407135, + "learning_rate": 3.3407005999500356e-05, + "loss": 1.0668, + "step": 7174 + }, + { + "epoch": 1.9416858341225816, + "grad_norm": 0.19485266506671906, + "learning_rate": 3.337731138010748e-05, + "loss": 1.0723, + "step": 7176 + }, + { + "epoch": 1.9422270328778244, + "grad_norm": 0.19142813980579376, + "learning_rate": 3.3347623350737867e-05, + "loss": 1.0805, + "step": 7178 + }, + { + "epoch": 1.9427682316330672, + "grad_norm": 0.19015726447105408, + "learning_rate": 3.331794192316125e-05, + "loss": 1.0648, + "step": 7180 + }, + { + "epoch": 1.94330943038831, + "grad_norm": 0.19703617691993713, + "learning_rate": 3.328826710914473e-05, + "loss": 1.0833, + "step": 7182 + }, + { + "epoch": 1.943850629143553, + "grad_norm": 0.20594839751720428, + "learning_rate": 3.325859892045286e-05, + "loss": 1.0736, + "step": 7184 + }, + { + "epoch": 1.9443918278987957, + "grad_norm": 0.24551725387573242, + "learning_rate": 3.322893736884749e-05, + "loss": 1.0748, + "step": 7186 + }, + { + "epoch": 1.9449330266540388, + "grad_norm": 0.2049313485622406, + "learning_rate": 3.319928246608788e-05, + "loss": 1.0669, + "step": 7188 + }, + { + "epoch": 1.9454742254092816, + "grad_norm": 0.21290773153305054, + "learning_rate": 3.316963422393064e-05, + "loss": 1.0729, + "step": 7190 + }, + { + "epoch": 1.9460154241645244, + "grad_norm": 0.18959276378154755, + "learning_rate": 3.313999265412973e-05, + "loss": 1.0717, + "step": 7192 + }, + { + "epoch": 1.9465566229197673, + "grad_norm": 0.19647051393985748, + "learning_rate": 3.3110357768436465e-05, + "loss": 1.0639, + "step": 7194 + }, + { + "epoch": 1.94709782167501, + "grad_norm": 0.214079350233078, + "learning_rate": 3.3080729578599556e-05, + "loss": 1.0744, + "step": 7196 + }, + { + "epoch": 1.9476390204302532, + "grad_norm": 0.1902647763490677, + "learning_rate": 3.3051108096365015e-05, + "loss": 1.0821, + "step": 7198 + }, + { + "epoch": 1.9481802191854958, + "grad_norm": 0.2037125825881958, + "learning_rate": 3.302149333347618e-05, + "loss": 1.0774, + "step": 7200 + }, + { + "epoch": 1.9487214179407388, + "grad_norm": 0.18941730260849, + "learning_rate": 3.299188530167379e-05, + "loss": 1.0857, + "step": 7202 + }, + { + "epoch": 1.9492626166959814, + "grad_norm": 0.19084425270557404, + "learning_rate": 3.296228401269584e-05, + "loss": 1.0779, + "step": 7204 + }, + { + "epoch": 1.9498038154512245, + "grad_norm": 0.21471716463565826, + "learning_rate": 3.293268947827769e-05, + "loss": 1.0683, + "step": 7206 + }, + { + "epoch": 1.9503450142064673, + "grad_norm": 0.1984710693359375, + "learning_rate": 3.290310171015204e-05, + "loss": 1.0901, + "step": 7208 + }, + { + "epoch": 1.9508862129617102, + "grad_norm": 0.2048645317554474, + "learning_rate": 3.2873520720048876e-05, + "loss": 1.075, + "step": 7210 + }, + { + "epoch": 1.951427411716953, + "grad_norm": 0.19725827872753143, + "learning_rate": 3.284394651969551e-05, + "loss": 1.0846, + "step": 7212 + }, + { + "epoch": 1.9519686104721958, + "grad_norm": 0.20842018723487854, + "learning_rate": 3.281437912081655e-05, + "loss": 1.0681, + "step": 7214 + }, + { + "epoch": 1.9525098092274389, + "grad_norm": 0.20301663875579834, + "learning_rate": 3.278481853513393e-05, + "loss": 1.0688, + "step": 7216 + }, + { + "epoch": 1.9530510079826815, + "grad_norm": 0.19450542330741882, + "learning_rate": 3.2755264774366834e-05, + "loss": 1.0515, + "step": 7218 + }, + { + "epoch": 1.9535922067379246, + "grad_norm": 0.19653567671775818, + "learning_rate": 3.272571785023183e-05, + "loss": 1.0831, + "step": 7220 + }, + { + "epoch": 1.9541334054931674, + "grad_norm": 0.2116047441959381, + "learning_rate": 3.26961777744427e-05, + "loss": 1.0809, + "step": 7222 + }, + { + "epoch": 1.9546746042484102, + "grad_norm": 0.20507092773914337, + "learning_rate": 3.266664455871052e-05, + "loss": 1.0758, + "step": 7224 + }, + { + "epoch": 1.955215803003653, + "grad_norm": 0.20406030118465424, + "learning_rate": 3.263711821474368e-05, + "loss": 1.0924, + "step": 7226 + }, + { + "epoch": 1.9557570017588959, + "grad_norm": 0.20105832815170288, + "learning_rate": 3.2607598754247797e-05, + "loss": 1.091, + "step": 7228 + }, + { + "epoch": 1.956298200514139, + "grad_norm": 0.19092315435409546, + "learning_rate": 3.257808618892581e-05, + "loss": 1.1026, + "step": 7230 + }, + { + "epoch": 1.9568393992693816, + "grad_norm": 0.19637537002563477, + "learning_rate": 3.254858053047791e-05, + "loss": 1.0636, + "step": 7232 + }, + { + "epoch": 1.9573805980246246, + "grad_norm": 0.19321343302726746, + "learning_rate": 3.2519081790601525e-05, + "loss": 1.0925, + "step": 7234 + }, + { + "epoch": 1.9579217967798674, + "grad_norm": 0.19708101451396942, + "learning_rate": 3.248958998099136e-05, + "loss": 1.0804, + "step": 7236 + }, + { + "epoch": 1.9584629955351103, + "grad_norm": 0.19389650225639343, + "learning_rate": 3.246010511333937e-05, + "loss": 1.069, + "step": 7238 + }, + { + "epoch": 1.959004194290353, + "grad_norm": 0.19223317503929138, + "learning_rate": 3.243062719933473e-05, + "loss": 1.0695, + "step": 7240 + }, + { + "epoch": 1.959545393045596, + "grad_norm": 0.20129427313804626, + "learning_rate": 3.2401156250663934e-05, + "loss": 1.0628, + "step": 7242 + }, + { + "epoch": 1.960086591800839, + "grad_norm": 0.20165562629699707, + "learning_rate": 3.237169227901064e-05, + "loss": 1.0731, + "step": 7244 + }, + { + "epoch": 1.9606277905560816, + "grad_norm": 0.19325421750545502, + "learning_rate": 3.234223529605577e-05, + "loss": 1.0659, + "step": 7246 + }, + { + "epoch": 1.9611689893113247, + "grad_norm": 0.200506791472435, + "learning_rate": 3.2312785313477453e-05, + "loss": 1.0854, + "step": 7248 + }, + { + "epoch": 1.9617101880665675, + "grad_norm": 0.20187094807624817, + "learning_rate": 3.2283342342951095e-05, + "loss": 1.0809, + "step": 7250 + }, + { + "epoch": 1.9622513868218103, + "grad_norm": 0.22668886184692383, + "learning_rate": 3.2253906396149255e-05, + "loss": 1.0804, + "step": 7252 + }, + { + "epoch": 1.9627925855770532, + "grad_norm": 0.1926940679550171, + "learning_rate": 3.222447748474176e-05, + "loss": 1.066, + "step": 7254 + }, + { + "epoch": 1.963333784332296, + "grad_norm": 0.19703729450702667, + "learning_rate": 3.219505562039563e-05, + "loss": 1.0716, + "step": 7256 + }, + { + "epoch": 1.963874983087539, + "grad_norm": 0.19300486147403717, + "learning_rate": 3.216564081477509e-05, + "loss": 1.058, + "step": 7258 + }, + { + "epoch": 1.9644161818427817, + "grad_norm": 0.22398415207862854, + "learning_rate": 3.213623307954155e-05, + "loss": 1.0689, + "step": 7260 + }, + { + "epoch": 1.9649573805980247, + "grad_norm": 0.23281800746917725, + "learning_rate": 3.2106832426353624e-05, + "loss": 1.0479, + "step": 7262 + }, + { + "epoch": 1.9654985793532673, + "grad_norm": 0.20088189840316772, + "learning_rate": 3.207743886686716e-05, + "loss": 1.0802, + "step": 7264 + }, + { + "epoch": 1.9660397781085104, + "grad_norm": 0.20714959502220154, + "learning_rate": 3.204805241273515e-05, + "loss": 1.0737, + "step": 7266 + }, + { + "epoch": 1.9665809768637532, + "grad_norm": 0.19426997005939484, + "learning_rate": 3.201867307560778e-05, + "loss": 1.0999, + "step": 7268 + }, + { + "epoch": 1.967122175618996, + "grad_norm": 0.2388717085123062, + "learning_rate": 3.198930086713241e-05, + "loss": 1.0781, + "step": 7270 + }, + { + "epoch": 1.967663374374239, + "grad_norm": 0.2533293068408966, + "learning_rate": 3.195993579895357e-05, + "loss": 1.0581, + "step": 7272 + }, + { + "epoch": 1.9682045731294817, + "grad_norm": 0.22910656034946442, + "learning_rate": 3.193057788271296e-05, + "loss": 1.0694, + "step": 7274 + }, + { + "epoch": 1.9687457718847248, + "grad_norm": 0.19577403366565704, + "learning_rate": 3.1901227130049494e-05, + "loss": 1.075, + "step": 7276 + }, + { + "epoch": 1.9692869706399674, + "grad_norm": 0.1928378939628601, + "learning_rate": 3.187188355259917e-05, + "loss": 1.086, + "step": 7278 + }, + { + "epoch": 1.9698281693952104, + "grad_norm": 0.21802791953086853, + "learning_rate": 3.1842547161995194e-05, + "loss": 1.0679, + "step": 7280 + }, + { + "epoch": 1.9703693681504533, + "grad_norm": 0.2115304321050644, + "learning_rate": 3.181321796986789e-05, + "loss": 1.0812, + "step": 7282 + }, + { + "epoch": 1.970910566905696, + "grad_norm": 0.19090840220451355, + "learning_rate": 3.1783895987844753e-05, + "loss": 1.0688, + "step": 7284 + }, + { + "epoch": 1.971451765660939, + "grad_norm": 0.18902477622032166, + "learning_rate": 3.1754581227550384e-05, + "loss": 1.079, + "step": 7286 + }, + { + "epoch": 1.9719929644161818, + "grad_norm": 0.21566107869148254, + "learning_rate": 3.1725273700606584e-05, + "loss": 1.0536, + "step": 7288 + }, + { + "epoch": 1.9725341631714248, + "grad_norm": 0.19657929241657257, + "learning_rate": 3.169597341863223e-05, + "loss": 1.0418, + "step": 7290 + }, + { + "epoch": 1.9730753619266674, + "grad_norm": 0.1881057769060135, + "learning_rate": 3.166668039324336e-05, + "loss": 1.0487, + "step": 7292 + }, + { + "epoch": 1.9736165606819105, + "grad_norm": 0.2057676613330841, + "learning_rate": 3.16373946360531e-05, + "loss": 1.0828, + "step": 7294 + }, + { + "epoch": 1.9741577594371533, + "grad_norm": 0.1998111754655838, + "learning_rate": 3.160811615867171e-05, + "loss": 1.0816, + "step": 7296 + }, + { + "epoch": 1.9746989581923962, + "grad_norm": 0.1907121241092682, + "learning_rate": 3.157884497270658e-05, + "loss": 1.0568, + "step": 7298 + }, + { + "epoch": 1.975240156947639, + "grad_norm": 0.19950172305107117, + "learning_rate": 3.154958108976222e-05, + "loss": 1.0842, + "step": 7300 + }, + { + "epoch": 1.9757813557028818, + "grad_norm": 0.2000495046377182, + "learning_rate": 3.152032452144017e-05, + "loss": 1.07, + "step": 7302 + }, + { + "epoch": 1.9763225544581249, + "grad_norm": 0.20422512292861938, + "learning_rate": 3.1491075279339186e-05, + "loss": 1.0804, + "step": 7304 + }, + { + "epoch": 1.9768637532133675, + "grad_norm": 0.19462010264396667, + "learning_rate": 3.146183337505502e-05, + "loss": 1.0698, + "step": 7306 + }, + { + "epoch": 1.9774049519686105, + "grad_norm": 0.1909000128507614, + "learning_rate": 3.143259882018054e-05, + "loss": 1.0772, + "step": 7308 + }, + { + "epoch": 1.9779461507238534, + "grad_norm": 0.20225849747657776, + "learning_rate": 3.1403371626305755e-05, + "loss": 1.0499, + "step": 7310 + }, + { + "epoch": 1.9784873494790962, + "grad_norm": 0.19416281580924988, + "learning_rate": 3.137415180501768e-05, + "loss": 1.0707, + "step": 7312 + }, + { + "epoch": 1.979028548234339, + "grad_norm": 0.18877071142196655, + "learning_rate": 3.134493936790046e-05, + "loss": 1.073, + "step": 7314 + }, + { + "epoch": 1.9795697469895819, + "grad_norm": 0.18901939690113068, + "learning_rate": 3.131573432653527e-05, + "loss": 1.0635, + "step": 7316 + }, + { + "epoch": 1.980110945744825, + "grad_norm": 0.20818552374839783, + "learning_rate": 3.128653669250039e-05, + "loss": 1.0542, + "step": 7318 + }, + { + "epoch": 1.9806521445000675, + "grad_norm": 0.201040580868721, + "learning_rate": 3.125734647737112e-05, + "loss": 1.0489, + "step": 7320 + }, + { + "epoch": 1.9811933432553106, + "grad_norm": 0.1945294588804245, + "learning_rate": 3.1228163692719894e-05, + "loss": 1.0614, + "step": 7322 + }, + { + "epoch": 1.9817345420105532, + "grad_norm": 0.1898927241563797, + "learning_rate": 3.119898835011614e-05, + "loss": 1.0623, + "step": 7324 + }, + { + "epoch": 1.9822757407657963, + "grad_norm": 0.19310621917247772, + "learning_rate": 3.1169820461126325e-05, + "loss": 1.0788, + "step": 7326 + }, + { + "epoch": 1.982816939521039, + "grad_norm": 0.19416488707065582, + "learning_rate": 3.114066003731402e-05, + "loss": 1.0737, + "step": 7328 + }, + { + "epoch": 1.983358138276282, + "grad_norm": 0.2104490101337433, + "learning_rate": 3.111150709023978e-05, + "loss": 1.0843, + "step": 7330 + }, + { + "epoch": 1.983899337031525, + "grad_norm": 0.2151290625333786, + "learning_rate": 3.10823616314612e-05, + "loss": 1.0643, + "step": 7332 + }, + { + "epoch": 1.9844405357867676, + "grad_norm": 0.24143965542316437, + "learning_rate": 3.105322367253296e-05, + "loss": 1.0692, + "step": 7334 + }, + { + "epoch": 1.9849817345420107, + "grad_norm": 0.21232156455516815, + "learning_rate": 3.102409322500673e-05, + "loss": 1.0671, + "step": 7336 + }, + { + "epoch": 1.9855229332972533, + "grad_norm": 0.22918109595775604, + "learning_rate": 3.0994970300431184e-05, + "loss": 1.0745, + "step": 7338 + }, + { + "epoch": 1.9860641320524963, + "grad_norm": 0.23817414045333862, + "learning_rate": 3.0965854910352037e-05, + "loss": 1.0506, + "step": 7340 + }, + { + "epoch": 1.9866053308077392, + "grad_norm": 0.2065417319536209, + "learning_rate": 3.0936747066312e-05, + "loss": 1.0436, + "step": 7342 + }, + { + "epoch": 1.987146529562982, + "grad_norm": 0.20867787301540375, + "learning_rate": 3.090764677985083e-05, + "loss": 1.0541, + "step": 7344 + }, + { + "epoch": 1.9876877283182248, + "grad_norm": 0.24583370983600616, + "learning_rate": 3.087855406250526e-05, + "loss": 1.0795, + "step": 7346 + }, + { + "epoch": 1.9882289270734677, + "grad_norm": 0.24740496277809143, + "learning_rate": 3.084946892580901e-05, + "loss": 1.078, + "step": 7348 + }, + { + "epoch": 1.9887701258287107, + "grad_norm": 0.2057471126317978, + "learning_rate": 3.0820391381292805e-05, + "loss": 1.0821, + "step": 7350 + }, + { + "epoch": 1.9893113245839533, + "grad_norm": 0.20828630030155182, + "learning_rate": 3.079132144048439e-05, + "loss": 1.0774, + "step": 7352 + }, + { + "epoch": 1.9898525233391964, + "grad_norm": 0.2172783613204956, + "learning_rate": 3.0762259114908445e-05, + "loss": 1.0786, + "step": 7354 + }, + { + "epoch": 1.9903937220944392, + "grad_norm": 0.21654413640499115, + "learning_rate": 3.073320441608667e-05, + "loss": 1.0698, + "step": 7356 + }, + { + "epoch": 1.990934920849682, + "grad_norm": 0.19531476497650146, + "learning_rate": 3.0704157355537735e-05, + "loss": 1.0732, + "step": 7358 + }, + { + "epoch": 1.9914761196049249, + "grad_norm": 0.20442189276218414, + "learning_rate": 3.0675117944777256e-05, + "loss": 1.07, + "step": 7360 + }, + { + "epoch": 1.9920173183601677, + "grad_norm": 0.206589475274086, + "learning_rate": 3.064608619531785e-05, + "loss": 1.0787, + "step": 7362 + }, + { + "epoch": 1.9925585171154108, + "grad_norm": 0.1964482069015503, + "learning_rate": 3.061706211866906e-05, + "loss": 1.0642, + "step": 7364 + }, + { + "epoch": 1.9930997158706534, + "grad_norm": 0.20698732137680054, + "learning_rate": 3.0588045726337416e-05, + "loss": 1.0572, + "step": 7366 + }, + { + "epoch": 1.9936409146258964, + "grad_norm": 0.19766584038734436, + "learning_rate": 3.055903702982641e-05, + "loss": 1.0564, + "step": 7368 + }, + { + "epoch": 1.9941821133811393, + "grad_norm": 0.20464429259300232, + "learning_rate": 3.0530036040636466e-05, + "loss": 1.0715, + "step": 7370 + }, + { + "epoch": 1.994723312136382, + "grad_norm": 0.19982165098190308, + "learning_rate": 3.050104277026494e-05, + "loss": 1.0578, + "step": 7372 + }, + { + "epoch": 1.995264510891625, + "grad_norm": 0.19134065508842468, + "learning_rate": 3.047205723020614e-05, + "loss": 1.0646, + "step": 7374 + }, + { + "epoch": 1.9958057096468678, + "grad_norm": 0.19620591402053833, + "learning_rate": 3.044307943195133e-05, + "loss": 1.0735, + "step": 7376 + }, + { + "epoch": 1.9963469084021108, + "grad_norm": 0.19285741448402405, + "learning_rate": 3.0414109386988654e-05, + "loss": 1.064, + "step": 7378 + }, + { + "epoch": 1.9968881071573534, + "grad_norm": 0.1897832751274109, + "learning_rate": 3.0385147106803235e-05, + "loss": 1.0802, + "step": 7380 + }, + { + "epoch": 1.9974293059125965, + "grad_norm": 0.1987927407026291, + "learning_rate": 3.035619260287711e-05, + "loss": 1.0866, + "step": 7382 + }, + { + "epoch": 1.997970504667839, + "grad_norm": 0.19874325394630432, + "learning_rate": 3.0327245886689193e-05, + "loss": 1.0706, + "step": 7384 + }, + { + "epoch": 1.9985117034230822, + "grad_norm": 0.20415663719177246, + "learning_rate": 3.029830696971535e-05, + "loss": 1.0866, + "step": 7386 + }, + { + "epoch": 1.999052902178325, + "grad_norm": 0.2038622349500656, + "learning_rate": 3.0269375863428324e-05, + "loss": 1.0709, + "step": 7388 + }, + { + "epoch": 1.9995941009335678, + "grad_norm": 0.20079196989536285, + "learning_rate": 3.024045257929781e-05, + "loss": 1.1005, + "step": 7390 + }, + { + "epoch": 2.0, + "grad_norm": 0.2497442662715912, + "learning_rate": 3.021153712879036e-05, + "loss": 1.0736, + "step": 7392 + }, + { + "epoch": 2.000541198755243, + "grad_norm": 0.4766577184200287, + "learning_rate": 3.018262952336942e-05, + "loss": 0.9819, + "step": 7394 + }, + { + "epoch": 2.0010823975104857, + "grad_norm": 0.3478733003139496, + "learning_rate": 3.0153729774495355e-05, + "loss": 0.9411, + "step": 7396 + }, + { + "epoch": 2.0016235962657287, + "grad_norm": 0.27517619729042053, + "learning_rate": 3.0124837893625378e-05, + "loss": 0.9386, + "step": 7398 + }, + { + "epoch": 2.0021647950209713, + "grad_norm": 0.24881546199321747, + "learning_rate": 3.0095953892213613e-05, + "loss": 0.9348, + "step": 7400 + }, + { + "epoch": 2.0027059937762144, + "grad_norm": 0.23649244010448456, + "learning_rate": 3.0067077781711056e-05, + "loss": 0.9639, + "step": 7402 + }, + { + "epoch": 2.003247192531457, + "grad_norm": 0.2691260576248169, + "learning_rate": 3.0038209573565556e-05, + "loss": 0.9421, + "step": 7404 + }, + { + "epoch": 2.0037883912867, + "grad_norm": 0.24442261457443237, + "learning_rate": 3.0009349279221865e-05, + "loss": 0.9353, + "step": 7406 + }, + { + "epoch": 2.004329590041943, + "grad_norm": 0.2566124498844147, + "learning_rate": 2.9980496910121548e-05, + "loss": 0.9405, + "step": 7408 + }, + { + "epoch": 2.0048707887971857, + "grad_norm": 0.2458662986755371, + "learning_rate": 2.9951652477703074e-05, + "loss": 0.9408, + "step": 7410 + }, + { + "epoch": 2.0054119875524288, + "grad_norm": 0.2390550822019577, + "learning_rate": 2.992281599340171e-05, + "loss": 0.945, + "step": 7412 + }, + { + "epoch": 2.0059531863076714, + "grad_norm": 0.22238920629024506, + "learning_rate": 2.989398746864966e-05, + "loss": 0.9499, + "step": 7414 + }, + { + "epoch": 2.0064943850629144, + "grad_norm": 0.22608213126659393, + "learning_rate": 2.9865166914875896e-05, + "loss": 0.9514, + "step": 7416 + }, + { + "epoch": 2.007035583818157, + "grad_norm": 0.21794183552265167, + "learning_rate": 2.9836354343506245e-05, + "loss": 0.9436, + "step": 7418 + }, + { + "epoch": 2.0075767825734, + "grad_norm": 0.22811618447303772, + "learning_rate": 2.9807549765963393e-05, + "loss": 0.9754, + "step": 7420 + }, + { + "epoch": 2.008117981328643, + "grad_norm": 0.8486079573631287, + "learning_rate": 2.977875319366682e-05, + "loss": 0.9618, + "step": 7422 + }, + { + "epoch": 2.0086591800838858, + "grad_norm": 0.2298651933670044, + "learning_rate": 2.974996463803289e-05, + "loss": 0.9549, + "step": 7424 + }, + { + "epoch": 2.009200378839129, + "grad_norm": 0.24675211310386658, + "learning_rate": 2.9721184110474732e-05, + "loss": 0.9513, + "step": 7426 + }, + { + "epoch": 2.0097415775943714, + "grad_norm": 0.22576522827148438, + "learning_rate": 2.9692411622402317e-05, + "loss": 0.962, + "step": 7428 + }, + { + "epoch": 2.0102827763496145, + "grad_norm": 0.22054821252822876, + "learning_rate": 2.9663647185222426e-05, + "loss": 0.9592, + "step": 7430 + }, + { + "epoch": 2.010823975104857, + "grad_norm": 0.22029328346252441, + "learning_rate": 2.963489081033865e-05, + "loss": 0.9686, + "step": 7432 + }, + { + "epoch": 2.0113651738601, + "grad_norm": 0.22338277101516724, + "learning_rate": 2.9606142509151365e-05, + "loss": 0.9565, + "step": 7434 + }, + { + "epoch": 2.0119063726153428, + "grad_norm": 0.23180079460144043, + "learning_rate": 2.9577402293057792e-05, + "loss": 0.9576, + "step": 7436 + }, + { + "epoch": 2.012447571370586, + "grad_norm": 0.23672650754451752, + "learning_rate": 2.954867017345191e-05, + "loss": 0.9494, + "step": 7438 + }, + { + "epoch": 2.012988770125829, + "grad_norm": 0.21766048669815063, + "learning_rate": 2.9519946161724486e-05, + "loss": 0.9656, + "step": 7440 + }, + { + "epoch": 2.0135299688810715, + "grad_norm": 0.22322192788124084, + "learning_rate": 2.949123026926309e-05, + "loss": 0.9361, + "step": 7442 + }, + { + "epoch": 2.0140711676363146, + "grad_norm": 0.20866787433624268, + "learning_rate": 2.9462522507452056e-05, + "loss": 0.9421, + "step": 7444 + }, + { + "epoch": 2.014612366391557, + "grad_norm": 0.22801345586776733, + "learning_rate": 2.943382288767249e-05, + "loss": 0.9281, + "step": 7446 + }, + { + "epoch": 2.0151535651468, + "grad_norm": 0.2315617799758911, + "learning_rate": 2.940513142130233e-05, + "loss": 0.9497, + "step": 7448 + }, + { + "epoch": 2.015694763902043, + "grad_norm": 0.25177010893821716, + "learning_rate": 2.9376448119716205e-05, + "loss": 0.9647, + "step": 7450 + }, + { + "epoch": 2.016235962657286, + "grad_norm": 0.22845017910003662, + "learning_rate": 2.934777299428554e-05, + "loss": 0.9558, + "step": 7452 + }, + { + "epoch": 2.016777161412529, + "grad_norm": 0.23017989099025726, + "learning_rate": 2.931910605637852e-05, + "loss": 0.9795, + "step": 7454 + }, + { + "epoch": 2.0173183601677716, + "grad_norm": 0.23005174100399017, + "learning_rate": 2.9290447317360086e-05, + "loss": 0.9345, + "step": 7456 + }, + { + "epoch": 2.0178595589230146, + "grad_norm": 0.23430398106575012, + "learning_rate": 2.92617967885919e-05, + "loss": 0.9302, + "step": 7458 + }, + { + "epoch": 2.018400757678257, + "grad_norm": 0.23327389359474182, + "learning_rate": 2.9233154481432423e-05, + "loss": 0.9478, + "step": 7460 + }, + { + "epoch": 2.0189419564335003, + "grad_norm": 0.21937346458435059, + "learning_rate": 2.9204520407236825e-05, + "loss": 0.9523, + "step": 7462 + }, + { + "epoch": 2.019483155188743, + "grad_norm": 0.23134754598140717, + "learning_rate": 2.9175894577356976e-05, + "loss": 0.9444, + "step": 7464 + }, + { + "epoch": 2.020024353943986, + "grad_norm": 0.22234463691711426, + "learning_rate": 2.9147277003141572e-05, + "loss": 0.9478, + "step": 7466 + }, + { + "epoch": 2.020565552699229, + "grad_norm": 0.2396455705165863, + "learning_rate": 2.911866769593592e-05, + "loss": 0.9392, + "step": 7468 + }, + { + "epoch": 2.0211067514544716, + "grad_norm": 0.23214465379714966, + "learning_rate": 2.9090066667082137e-05, + "loss": 0.9409, + "step": 7470 + }, + { + "epoch": 2.0216479502097147, + "grad_norm": 0.23044392466545105, + "learning_rate": 2.9061473927919026e-05, + "loss": 0.9369, + "step": 7472 + }, + { + "epoch": 2.0221891489649573, + "grad_norm": 0.22520564496517181, + "learning_rate": 2.9032889489782122e-05, + "loss": 0.9684, + "step": 7474 + }, + { + "epoch": 2.0227303477202003, + "grad_norm": 0.21495433151721954, + "learning_rate": 2.9004313364003625e-05, + "loss": 0.9539, + "step": 7476 + }, + { + "epoch": 2.023271546475443, + "grad_norm": 0.22534355521202087, + "learning_rate": 2.8975745561912503e-05, + "loss": 0.9733, + "step": 7478 + }, + { + "epoch": 2.023812745230686, + "grad_norm": 0.2338736206293106, + "learning_rate": 2.8947186094834332e-05, + "loss": 0.9503, + "step": 7480 + }, + { + "epoch": 2.024353943985929, + "grad_norm": 0.21245649456977844, + "learning_rate": 2.8918634974091512e-05, + "loss": 0.9714, + "step": 7482 + }, + { + "epoch": 2.0248951427411717, + "grad_norm": 0.20702339708805084, + "learning_rate": 2.889009221100301e-05, + "loss": 0.9447, + "step": 7484 + }, + { + "epoch": 2.0254363414964147, + "grad_norm": 0.21514731645584106, + "learning_rate": 2.8861557816884575e-05, + "loss": 0.9361, + "step": 7486 + }, + { + "epoch": 2.0259775402516573, + "grad_norm": 0.21352145075798035, + "learning_rate": 2.8833031803048554e-05, + "loss": 0.9504, + "step": 7488 + }, + { + "epoch": 2.0265187390069004, + "grad_norm": 0.2188720405101776, + "learning_rate": 2.880451418080403e-05, + "loss": 0.954, + "step": 7490 + }, + { + "epoch": 2.027059937762143, + "grad_norm": 0.21520425379276276, + "learning_rate": 2.877600496145675e-05, + "loss": 0.9495, + "step": 7492 + }, + { + "epoch": 2.027601136517386, + "grad_norm": 0.2130945473909378, + "learning_rate": 2.874750415630912e-05, + "loss": 0.9433, + "step": 7494 + }, + { + "epoch": 2.0281423352726287, + "grad_norm": 0.22284521162509918, + "learning_rate": 2.8719011776660228e-05, + "loss": 0.955, + "step": 7496 + }, + { + "epoch": 2.0286835340278717, + "grad_norm": 0.2262052744626999, + "learning_rate": 2.869052783380577e-05, + "loss": 0.952, + "step": 7498 + }, + { + "epoch": 2.0292247327831148, + "grad_norm": 0.21521814167499542, + "learning_rate": 2.866205233903817e-05, + "loss": 0.9463, + "step": 7500 + }, + { + "epoch": 2.0297659315383574, + "grad_norm": 0.21867747604846954, + "learning_rate": 2.8633585303646416e-05, + "loss": 0.9631, + "step": 7502 + }, + { + "epoch": 2.0303071302936004, + "grad_norm": 0.21973881125450134, + "learning_rate": 2.8605126738916265e-05, + "loss": 0.9515, + "step": 7504 + }, + { + "epoch": 2.030848329048843, + "grad_norm": 0.2146976739168167, + "learning_rate": 2.857667665612999e-05, + "loss": 0.9225, + "step": 7506 + }, + { + "epoch": 2.031389527804086, + "grad_norm": 0.21844498813152313, + "learning_rate": 2.8548235066566586e-05, + "loss": 0.9641, + "step": 7508 + }, + { + "epoch": 2.0319307265593287, + "grad_norm": 0.21708504855632782, + "learning_rate": 2.8519801981501613e-05, + "loss": 0.9468, + "step": 7510 + }, + { + "epoch": 2.0324719253145718, + "grad_norm": 0.22518758475780487, + "learning_rate": 2.8491377412207347e-05, + "loss": 0.9493, + "step": 7512 + }, + { + "epoch": 2.033013124069815, + "grad_norm": 0.2251872569322586, + "learning_rate": 2.846296136995256e-05, + "loss": 0.9522, + "step": 7514 + }, + { + "epoch": 2.0335543228250574, + "grad_norm": 0.22971048951148987, + "learning_rate": 2.8434553866002818e-05, + "loss": 0.9593, + "step": 7516 + }, + { + "epoch": 2.0340955215803005, + "grad_norm": 0.222760409116745, + "learning_rate": 2.840615491162013e-05, + "loss": 0.9415, + "step": 7518 + }, + { + "epoch": 2.034636720335543, + "grad_norm": 0.21448856592178345, + "learning_rate": 2.837776451806321e-05, + "loss": 0.9556, + "step": 7520 + }, + { + "epoch": 2.035177919090786, + "grad_norm": 0.21855683624744415, + "learning_rate": 2.8349382696587394e-05, + "loss": 0.9621, + "step": 7522 + }, + { + "epoch": 2.0357191178460288, + "grad_norm": 0.22361056506633759, + "learning_rate": 2.8321009458444538e-05, + "loss": 0.9196, + "step": 7524 + }, + { + "epoch": 2.036260316601272, + "grad_norm": 0.21855206787586212, + "learning_rate": 2.8292644814883145e-05, + "loss": 0.9487, + "step": 7526 + }, + { + "epoch": 2.036801515356515, + "grad_norm": 0.22322331368923187, + "learning_rate": 2.8264288777148328e-05, + "loss": 0.952, + "step": 7528 + }, + { + "epoch": 2.0373427141117575, + "grad_norm": 0.22281445562839508, + "learning_rate": 2.8235941356481775e-05, + "loss": 0.9479, + "step": 7530 + }, + { + "epoch": 2.0378839128670005, + "grad_norm": 0.2184039056301117, + "learning_rate": 2.8207602564121716e-05, + "loss": 0.9662, + "step": 7532 + }, + { + "epoch": 2.038425111622243, + "grad_norm": 0.21754087507724762, + "learning_rate": 2.817927241130303e-05, + "loss": 0.9494, + "step": 7534 + }, + { + "epoch": 2.038966310377486, + "grad_norm": 0.2201545089483261, + "learning_rate": 2.81509509092571e-05, + "loss": 0.9567, + "step": 7536 + }, + { + "epoch": 2.039507509132729, + "grad_norm": 0.2164948433637619, + "learning_rate": 2.8122638069211927e-05, + "loss": 0.9459, + "step": 7538 + }, + { + "epoch": 2.040048707887972, + "grad_norm": 0.219279482960701, + "learning_rate": 2.8094333902392073e-05, + "loss": 0.9442, + "step": 7540 + }, + { + "epoch": 2.040589906643215, + "grad_norm": 0.22091278433799744, + "learning_rate": 2.806603842001867e-05, + "loss": 0.9666, + "step": 7542 + }, + { + "epoch": 2.0411311053984575, + "grad_norm": 0.22501921653747559, + "learning_rate": 2.803775163330935e-05, + "loss": 0.9684, + "step": 7544 + }, + { + "epoch": 2.0416723041537006, + "grad_norm": 0.2181495875120163, + "learning_rate": 2.8009473553478365e-05, + "loss": 0.959, + "step": 7546 + }, + { + "epoch": 2.042213502908943, + "grad_norm": 0.22720569372177124, + "learning_rate": 2.7981204191736482e-05, + "loss": 0.95, + "step": 7548 + }, + { + "epoch": 2.0427547016641863, + "grad_norm": 0.21670077741146088, + "learning_rate": 2.7952943559291023e-05, + "loss": 0.9469, + "step": 7550 + }, + { + "epoch": 2.043295900419429, + "grad_norm": 0.22416341304779053, + "learning_rate": 2.792469166734586e-05, + "loss": 0.9576, + "step": 7552 + }, + { + "epoch": 2.043837099174672, + "grad_norm": 0.22936950623989105, + "learning_rate": 2.789644852710135e-05, + "loss": 0.9595, + "step": 7554 + }, + { + "epoch": 2.0443782979299145, + "grad_norm": 0.21276643872261047, + "learning_rate": 2.7868214149754456e-05, + "loss": 0.963, + "step": 7556 + }, + { + "epoch": 2.0449194966851576, + "grad_norm": 0.21516387164592743, + "learning_rate": 2.7839988546498575e-05, + "loss": 0.9485, + "step": 7558 + }, + { + "epoch": 2.0454606954404007, + "grad_norm": 0.22228024899959564, + "learning_rate": 2.7811771728523718e-05, + "loss": 0.9523, + "step": 7560 + }, + { + "epoch": 2.0460018941956433, + "grad_norm": 0.22026000916957855, + "learning_rate": 2.7783563707016346e-05, + "loss": 0.9524, + "step": 7562 + }, + { + "epoch": 2.0465430929508863, + "grad_norm": 0.2245919108390808, + "learning_rate": 2.7755364493159496e-05, + "loss": 0.9576, + "step": 7564 + }, + { + "epoch": 2.047084291706129, + "grad_norm": 0.2183576226234436, + "learning_rate": 2.772717409813263e-05, + "loss": 0.9526, + "step": 7566 + }, + { + "epoch": 2.047625490461372, + "grad_norm": 0.2127113789319992, + "learning_rate": 2.7698992533111778e-05, + "loss": 0.9489, + "step": 7568 + }, + { + "epoch": 2.0481666892166146, + "grad_norm": 0.22545476257801056, + "learning_rate": 2.7670819809269475e-05, + "loss": 0.9475, + "step": 7570 + }, + { + "epoch": 2.0487078879718577, + "grad_norm": 0.20920006930828094, + "learning_rate": 2.7642655937774675e-05, + "loss": 0.9574, + "step": 7572 + }, + { + "epoch": 2.0492490867271007, + "grad_norm": 0.22011259198188782, + "learning_rate": 2.761450092979293e-05, + "loss": 0.9506, + "step": 7574 + }, + { + "epoch": 2.0497902854823433, + "grad_norm": 0.2137351781129837, + "learning_rate": 2.758635479648618e-05, + "loss": 0.9551, + "step": 7576 + }, + { + "epoch": 2.0503314842375864, + "grad_norm": 0.21463316679000854, + "learning_rate": 2.7558217549012932e-05, + "loss": 0.9428, + "step": 7578 + }, + { + "epoch": 2.050872682992829, + "grad_norm": 0.2303556650876999, + "learning_rate": 2.7530089198528076e-05, + "loss": 0.9388, + "step": 7580 + }, + { + "epoch": 2.051413881748072, + "grad_norm": 0.21915993094444275, + "learning_rate": 2.750196975618305e-05, + "loss": 0.9551, + "step": 7582 + }, + { + "epoch": 2.0519550805033147, + "grad_norm": 0.22042939066886902, + "learning_rate": 2.7473859233125736e-05, + "loss": 0.9647, + "step": 7584 + }, + { + "epoch": 2.0524962792585577, + "grad_norm": 0.2282932549715042, + "learning_rate": 2.7445757640500504e-05, + "loss": 0.9717, + "step": 7586 + }, + { + "epoch": 2.0530374780138008, + "grad_norm": 0.22055798768997192, + "learning_rate": 2.7417664989448122e-05, + "loss": 0.9364, + "step": 7588 + }, + { + "epoch": 2.0535786767690434, + "grad_norm": 0.21404729783535004, + "learning_rate": 2.7389581291105882e-05, + "loss": 0.9454, + "step": 7590 + }, + { + "epoch": 2.0541198755242864, + "grad_norm": 0.21840506792068481, + "learning_rate": 2.736150655660746e-05, + "loss": 0.9579, + "step": 7592 + }, + { + "epoch": 2.054661074279529, + "grad_norm": 0.2240770161151886, + "learning_rate": 2.7333440797083032e-05, + "loss": 0.9498, + "step": 7594 + }, + { + "epoch": 2.055202273034772, + "grad_norm": 0.2156870812177658, + "learning_rate": 2.730538402365921e-05, + "loss": 0.9443, + "step": 7596 + }, + { + "epoch": 2.0557434717900147, + "grad_norm": 0.2223811000585556, + "learning_rate": 2.7277336247459018e-05, + "loss": 0.9566, + "step": 7598 + }, + { + "epoch": 2.0562846705452578, + "grad_norm": 0.22407159209251404, + "learning_rate": 2.7249297479601942e-05, + "loss": 0.9595, + "step": 7600 + }, + { + "epoch": 2.056825869300501, + "grad_norm": 0.215648353099823, + "learning_rate": 2.7221267731203858e-05, + "loss": 0.9662, + "step": 7602 + }, + { + "epoch": 2.0573670680557434, + "grad_norm": 0.22349536418914795, + "learning_rate": 2.7193247013377128e-05, + "loss": 0.9392, + "step": 7604 + }, + { + "epoch": 2.0579082668109865, + "grad_norm": 0.22904209792613983, + "learning_rate": 2.716523533723042e-05, + "loss": 0.9447, + "step": 7606 + }, + { + "epoch": 2.058449465566229, + "grad_norm": 0.2158462107181549, + "learning_rate": 2.7137232713868986e-05, + "loss": 0.9577, + "step": 7608 + }, + { + "epoch": 2.058990664321472, + "grad_norm": 0.22222615778446198, + "learning_rate": 2.7109239154394328e-05, + "loss": 0.9517, + "step": 7610 + }, + { + "epoch": 2.0595318630767148, + "grad_norm": 0.2167874127626419, + "learning_rate": 2.7081254669904472e-05, + "loss": 0.9307, + "step": 7612 + }, + { + "epoch": 2.060073061831958, + "grad_norm": 0.21957142651081085, + "learning_rate": 2.7053279271493748e-05, + "loss": 0.9341, + "step": 7614 + }, + { + "epoch": 2.060614260587201, + "grad_norm": 0.2196885198354721, + "learning_rate": 2.7025312970252958e-05, + "loss": 0.9561, + "step": 7616 + }, + { + "epoch": 2.0611554593424435, + "grad_norm": 0.22702725231647491, + "learning_rate": 2.699735577726926e-05, + "loss": 0.9341, + "step": 7618 + }, + { + "epoch": 2.0616966580976865, + "grad_norm": 0.21746306121349335, + "learning_rate": 2.6969407703626253e-05, + "loss": 0.9501, + "step": 7620 + }, + { + "epoch": 2.062237856852929, + "grad_norm": 0.2231265902519226, + "learning_rate": 2.694146876040383e-05, + "loss": 0.9333, + "step": 7622 + }, + { + "epoch": 2.062779055608172, + "grad_norm": 0.8074358105659485, + "learning_rate": 2.691353895867833e-05, + "loss": 0.9379, + "step": 7624 + }, + { + "epoch": 2.063320254363415, + "grad_norm": 0.25972089171409607, + "learning_rate": 2.6885618309522486e-05, + "loss": 0.9611, + "step": 7626 + }, + { + "epoch": 2.063861453118658, + "grad_norm": 0.22194311022758484, + "learning_rate": 2.6857706824005298e-05, + "loss": 0.9498, + "step": 7628 + }, + { + "epoch": 2.0644026518739005, + "grad_norm": 0.24714423716068268, + "learning_rate": 2.6829804513192285e-05, + "loss": 0.9516, + "step": 7630 + }, + { + "epoch": 2.0649438506291435, + "grad_norm": 0.21855002641677856, + "learning_rate": 2.6801911388145196e-05, + "loss": 0.9592, + "step": 7632 + }, + { + "epoch": 2.0654850493843866, + "grad_norm": 0.22424137592315674, + "learning_rate": 2.6774027459922223e-05, + "loss": 0.9577, + "step": 7634 + }, + { + "epoch": 2.066026248139629, + "grad_norm": 0.23414380848407745, + "learning_rate": 2.6746152739577846e-05, + "loss": 0.9441, + "step": 7636 + }, + { + "epoch": 2.0665674468948723, + "grad_norm": 0.23106548190116882, + "learning_rate": 2.6718287238162963e-05, + "loss": 0.9456, + "step": 7638 + }, + { + "epoch": 2.067108645650115, + "grad_norm": 0.21523398160934448, + "learning_rate": 2.6690430966724723e-05, + "loss": 0.9669, + "step": 7640 + }, + { + "epoch": 2.067649844405358, + "grad_norm": 0.2224549949169159, + "learning_rate": 2.6662583936306752e-05, + "loss": 0.9543, + "step": 7642 + }, + { + "epoch": 2.0681910431606005, + "grad_norm": 0.25353899598121643, + "learning_rate": 2.6634746157948887e-05, + "loss": 0.9347, + "step": 7644 + }, + { + "epoch": 2.0687322419158436, + "grad_norm": 0.237226665019989, + "learning_rate": 2.6606917642687357e-05, + "loss": 0.9553, + "step": 7646 + }, + { + "epoch": 2.0692734406710867, + "grad_norm": 0.21808220446109772, + "learning_rate": 2.657909840155472e-05, + "loss": 0.9535, + "step": 7648 + }, + { + "epoch": 2.0698146394263293, + "grad_norm": 0.2249707281589508, + "learning_rate": 2.6551288445579813e-05, + "loss": 0.9478, + "step": 7650 + }, + { + "epoch": 2.0703558381815723, + "grad_norm": 0.21939726173877716, + "learning_rate": 2.652348778578785e-05, + "loss": 0.9431, + "step": 7652 + }, + { + "epoch": 2.070897036936815, + "grad_norm": 0.21499034762382507, + "learning_rate": 2.6495696433200324e-05, + "loss": 0.9514, + "step": 7654 + }, + { + "epoch": 2.071438235692058, + "grad_norm": 0.27530205249786377, + "learning_rate": 2.646791439883507e-05, + "loss": 0.9613, + "step": 7656 + }, + { + "epoch": 2.0719794344473006, + "grad_norm": 0.2797897756099701, + "learning_rate": 2.6440141693706157e-05, + "loss": 0.9621, + "step": 7658 + }, + { + "epoch": 2.0725206332025436, + "grad_norm": 0.270762175321579, + "learning_rate": 2.641237832882406e-05, + "loss": 0.9293, + "step": 7660 + }, + { + "epoch": 2.0730618319577867, + "grad_norm": 0.25451114773750305, + "learning_rate": 2.638462431519542e-05, + "loss": 0.9461, + "step": 7662 + }, + { + "epoch": 2.0736030307130293, + "grad_norm": 0.2576145529747009, + "learning_rate": 2.635687966382334e-05, + "loss": 0.9473, + "step": 7664 + }, + { + "epoch": 2.0741442294682724, + "grad_norm": 0.2652926445007324, + "learning_rate": 2.632914438570705e-05, + "loss": 0.9509, + "step": 7666 + }, + { + "epoch": 2.074685428223515, + "grad_norm": 0.22931689023971558, + "learning_rate": 2.6301418491842167e-05, + "loss": 0.9628, + "step": 7668 + }, + { + "epoch": 2.075226626978758, + "grad_norm": 0.31552034616470337, + "learning_rate": 2.6273701993220522e-05, + "loss": 0.9419, + "step": 7670 + }, + { + "epoch": 2.0757678257340006, + "grad_norm": 0.22639602422714233, + "learning_rate": 2.6245994900830262e-05, + "loss": 0.959, + "step": 7672 + }, + { + "epoch": 2.0763090244892437, + "grad_norm": 0.2326960563659668, + "learning_rate": 2.621829722565581e-05, + "loss": 0.9318, + "step": 7674 + }, + { + "epoch": 2.0768502232444863, + "grad_norm": 0.22966636717319489, + "learning_rate": 2.6190608978677823e-05, + "loss": 0.9601, + "step": 7676 + }, + { + "epoch": 2.0773914219997294, + "grad_norm": 0.23487944900989532, + "learning_rate": 2.6162930170873263e-05, + "loss": 0.9616, + "step": 7678 + }, + { + "epoch": 2.0779326207549724, + "grad_norm": 0.23548728227615356, + "learning_rate": 2.6135260813215283e-05, + "loss": 0.9192, + "step": 7680 + }, + { + "epoch": 2.078473819510215, + "grad_norm": 0.226890429854393, + "learning_rate": 2.6107600916673374e-05, + "loss": 0.9449, + "step": 7682 + }, + { + "epoch": 2.079015018265458, + "grad_norm": 0.22991536557674408, + "learning_rate": 2.607995049221319e-05, + "loss": 0.9657, + "step": 7684 + }, + { + "epoch": 2.0795562170207007, + "grad_norm": 0.22618654370307922, + "learning_rate": 2.6052309550796695e-05, + "loss": 0.943, + "step": 7686 + }, + { + "epoch": 2.0800974157759438, + "grad_norm": 0.2274552434682846, + "learning_rate": 2.602467810338207e-05, + "loss": 0.9288, + "step": 7688 + }, + { + "epoch": 2.0806386145311864, + "grad_norm": 0.2106860727071762, + "learning_rate": 2.599705616092375e-05, + "loss": 0.9464, + "step": 7690 + }, + { + "epoch": 2.0811798132864294, + "grad_norm": 0.22152534127235413, + "learning_rate": 2.5969443734372344e-05, + "loss": 0.9474, + "step": 7692 + }, + { + "epoch": 2.0817210120416725, + "grad_norm": 0.22242993116378784, + "learning_rate": 2.594184083467476e-05, + "loss": 0.9519, + "step": 7694 + }, + { + "epoch": 2.082262210796915, + "grad_norm": 0.22864559292793274, + "learning_rate": 2.59142474727741e-05, + "loss": 0.9508, + "step": 7696 + }, + { + "epoch": 2.082803409552158, + "grad_norm": 0.21396631002426147, + "learning_rate": 2.588666365960967e-05, + "loss": 0.9565, + "step": 7698 + }, + { + "epoch": 2.0833446083074008, + "grad_norm": 0.2228645533323288, + "learning_rate": 2.585908940611699e-05, + "loss": 0.9395, + "step": 7700 + }, + { + "epoch": 2.083885807062644, + "grad_norm": 0.23101234436035156, + "learning_rate": 2.583152472322783e-05, + "loss": 0.9526, + "step": 7702 + }, + { + "epoch": 2.0844270058178864, + "grad_norm": 0.22011829912662506, + "learning_rate": 2.580396962187015e-05, + "loss": 0.9571, + "step": 7704 + }, + { + "epoch": 2.0849682045731295, + "grad_norm": 0.22112277150154114, + "learning_rate": 2.577642411296806e-05, + "loss": 0.946, + "step": 7706 + }, + { + "epoch": 2.0855094033283725, + "grad_norm": 0.23127155005931854, + "learning_rate": 2.5748888207441934e-05, + "loss": 0.941, + "step": 7708 + }, + { + "epoch": 2.086050602083615, + "grad_norm": 0.22508841753005981, + "learning_rate": 2.572136191620831e-05, + "loss": 0.9347, + "step": 7710 + }, + { + "epoch": 2.086591800838858, + "grad_norm": 0.2130221128463745, + "learning_rate": 2.5693845250179927e-05, + "loss": 0.929, + "step": 7712 + }, + { + "epoch": 2.087132999594101, + "grad_norm": 0.22937741875648499, + "learning_rate": 2.5666338220265672e-05, + "loss": 0.9427, + "step": 7714 + }, + { + "epoch": 2.087674198349344, + "grad_norm": 0.2199690043926239, + "learning_rate": 2.563884083737067e-05, + "loss": 0.9477, + "step": 7716 + }, + { + "epoch": 2.0882153971045865, + "grad_norm": 0.22044318914413452, + "learning_rate": 2.5611353112396154e-05, + "loss": 0.9284, + "step": 7718 + }, + { + "epoch": 2.0887565958598295, + "grad_norm": 0.21888893842697144, + "learning_rate": 2.558387505623958e-05, + "loss": 0.9369, + "step": 7720 + }, + { + "epoch": 2.0892977946150726, + "grad_norm": 0.21437235176563263, + "learning_rate": 2.5556406679794563e-05, + "loss": 0.9467, + "step": 7722 + }, + { + "epoch": 2.089838993370315, + "grad_norm": 0.21653996407985687, + "learning_rate": 2.5528947993950857e-05, + "loss": 0.9703, + "step": 7724 + }, + { + "epoch": 2.0903801921255583, + "grad_norm": 0.21470144391059875, + "learning_rate": 2.550149900959442e-05, + "loss": 0.9595, + "step": 7726 + }, + { + "epoch": 2.090921390880801, + "grad_norm": 0.21421150863170624, + "learning_rate": 2.5474059737607297e-05, + "loss": 0.9443, + "step": 7728 + }, + { + "epoch": 2.091462589636044, + "grad_norm": 0.22067682445049286, + "learning_rate": 2.5446630188867747e-05, + "loss": 0.9402, + "step": 7730 + }, + { + "epoch": 2.0920037883912865, + "grad_norm": 0.21614603698253632, + "learning_rate": 2.5419210374250096e-05, + "loss": 0.9662, + "step": 7732 + }, + { + "epoch": 2.0925449871465296, + "grad_norm": 0.22540171444416046, + "learning_rate": 2.5391800304624937e-05, + "loss": 0.9582, + "step": 7734 + }, + { + "epoch": 2.0930861859017726, + "grad_norm": 0.31395238637924194, + "learning_rate": 2.5364399990858855e-05, + "loss": 0.9321, + "step": 7736 + }, + { + "epoch": 2.0936273846570153, + "grad_norm": 0.22442223131656647, + "learning_rate": 2.5337009443814686e-05, + "loss": 0.9491, + "step": 7738 + }, + { + "epoch": 2.0941685834122583, + "grad_norm": 0.22935397922992706, + "learning_rate": 2.5309628674351304e-05, + "loss": 0.9684, + "step": 7740 + }, + { + "epoch": 2.094709782167501, + "grad_norm": 0.3103565275669098, + "learning_rate": 2.5282257693323765e-05, + "loss": 0.9248, + "step": 7742 + }, + { + "epoch": 2.095250980922744, + "grad_norm": 0.2358006089925766, + "learning_rate": 2.5254896511583214e-05, + "loss": 0.941, + "step": 7744 + }, + { + "epoch": 2.0957921796779866, + "grad_norm": 0.22335706651210785, + "learning_rate": 2.522754513997696e-05, + "loss": 0.9337, + "step": 7746 + }, + { + "epoch": 2.0963333784332296, + "grad_norm": 0.2343682497739792, + "learning_rate": 2.5200203589348348e-05, + "loss": 0.9289, + "step": 7748 + }, + { + "epoch": 2.0968745771884723, + "grad_norm": 0.22699034214019775, + "learning_rate": 2.5172871870536863e-05, + "loss": 0.9517, + "step": 7750 + }, + { + "epoch": 2.0974157759437153, + "grad_norm": 0.2185050994157791, + "learning_rate": 2.5145549994378136e-05, + "loss": 0.9559, + "step": 7752 + }, + { + "epoch": 2.0979569746989584, + "grad_norm": 0.2319442629814148, + "learning_rate": 2.51182379717038e-05, + "loss": 0.9546, + "step": 7754 + }, + { + "epoch": 2.098498173454201, + "grad_norm": 0.22405529022216797, + "learning_rate": 2.5090935813341703e-05, + "loss": 0.9594, + "step": 7756 + }, + { + "epoch": 2.099039372209444, + "grad_norm": 0.21981188654899597, + "learning_rate": 2.5063643530115664e-05, + "loss": 0.9466, + "step": 7758 + }, + { + "epoch": 2.0995805709646866, + "grad_norm": 0.22078728675842285, + "learning_rate": 2.5036361132845686e-05, + "loss": 0.9481, + "step": 7760 + }, + { + "epoch": 2.1001217697199297, + "grad_norm": 0.2291024625301361, + "learning_rate": 2.5009088632347756e-05, + "loss": 0.9574, + "step": 7762 + }, + { + "epoch": 2.1006629684751723, + "grad_norm": 0.21931250393390656, + "learning_rate": 2.4981826039434035e-05, + "loss": 0.9264, + "step": 7764 + }, + { + "epoch": 2.1012041672304154, + "grad_norm": 0.21614545583724976, + "learning_rate": 2.4954573364912652e-05, + "loss": 0.9608, + "step": 7766 + }, + { + "epoch": 2.1017453659856584, + "grad_norm": 0.22134146094322205, + "learning_rate": 2.4927330619587934e-05, + "loss": 0.9476, + "step": 7768 + }, + { + "epoch": 2.102286564740901, + "grad_norm": 0.21246960759162903, + "learning_rate": 2.490009781426015e-05, + "loss": 0.9432, + "step": 7770 + }, + { + "epoch": 2.102827763496144, + "grad_norm": 0.22484076023101807, + "learning_rate": 2.4872874959725683e-05, + "loss": 0.9665, + "step": 7772 + }, + { + "epoch": 2.1033689622513867, + "grad_norm": 0.2102733999490738, + "learning_rate": 2.4845662066777e-05, + "loss": 0.9431, + "step": 7774 + }, + { + "epoch": 2.1039101610066298, + "grad_norm": 0.22462059557437897, + "learning_rate": 2.4818459146202526e-05, + "loss": 0.9546, + "step": 7776 + }, + { + "epoch": 2.1044513597618724, + "grad_norm": 0.21615788340568542, + "learning_rate": 2.4791266208786833e-05, + "loss": 0.9457, + "step": 7778 + }, + { + "epoch": 2.1049925585171154, + "grad_norm": 0.21363696455955505, + "learning_rate": 2.4764083265310478e-05, + "loss": 0.951, + "step": 7780 + }, + { + "epoch": 2.1055337572723585, + "grad_norm": 0.2130512297153473, + "learning_rate": 2.4736910326550094e-05, + "loss": 0.9337, + "step": 7782 + }, + { + "epoch": 2.106074956027601, + "grad_norm": 0.23000115156173706, + "learning_rate": 2.470974740327829e-05, + "loss": 0.9447, + "step": 7784 + }, + { + "epoch": 2.106616154782844, + "grad_norm": 0.22016383707523346, + "learning_rate": 2.4682594506263773e-05, + "loss": 0.9366, + "step": 7786 + }, + { + "epoch": 2.1071573535380868, + "grad_norm": 0.22897587716579437, + "learning_rate": 2.4655451646271177e-05, + "loss": 0.9452, + "step": 7788 + }, + { + "epoch": 2.10769855229333, + "grad_norm": 0.22692860662937164, + "learning_rate": 2.462831883406131e-05, + "loss": 0.9527, + "step": 7790 + }, + { + "epoch": 2.1082397510485724, + "grad_norm": 0.22521987557411194, + "learning_rate": 2.4601196080390843e-05, + "loss": 0.9257, + "step": 7792 + }, + { + "epoch": 2.1087809498038155, + "grad_norm": 0.21891172230243683, + "learning_rate": 2.4574083396012564e-05, + "loss": 0.9336, + "step": 7794 + }, + { + "epoch": 2.109322148559058, + "grad_norm": 0.218449667096138, + "learning_rate": 2.454698079167519e-05, + "loss": 0.9641, + "step": 7796 + }, + { + "epoch": 2.109863347314301, + "grad_norm": 0.22452402114868164, + "learning_rate": 2.4519888278123498e-05, + "loss": 0.9563, + "step": 7798 + }, + { + "epoch": 2.110404546069544, + "grad_norm": 0.22462545335292816, + "learning_rate": 2.449280586609825e-05, + "loss": 0.9504, + "step": 7800 + }, + { + "epoch": 2.110945744824787, + "grad_norm": 0.22184407711029053, + "learning_rate": 2.4465733566336195e-05, + "loss": 0.9432, + "step": 7802 + }, + { + "epoch": 2.11148694358003, + "grad_norm": 0.22373095154762268, + "learning_rate": 2.4438671389570105e-05, + "loss": 0.954, + "step": 7804 + }, + { + "epoch": 2.1120281423352725, + "grad_norm": 0.21508368849754333, + "learning_rate": 2.4411619346528674e-05, + "loss": 0.9504, + "step": 7806 + }, + { + "epoch": 2.1125693410905155, + "grad_norm": 0.21892188489437103, + "learning_rate": 2.4384577447936648e-05, + "loss": 0.9412, + "step": 7808 + }, + { + "epoch": 2.113110539845758, + "grad_norm": 0.2197813093662262, + "learning_rate": 2.4357545704514685e-05, + "loss": 0.9444, + "step": 7810 + }, + { + "epoch": 2.113651738601001, + "grad_norm": 0.22589881718158722, + "learning_rate": 2.433052412697947e-05, + "loss": 0.9422, + "step": 7812 + }, + { + "epoch": 2.1141929373562443, + "grad_norm": 0.24351909756660461, + "learning_rate": 2.4303512726043643e-05, + "loss": 0.9472, + "step": 7814 + }, + { + "epoch": 2.114734136111487, + "grad_norm": 0.23213231563568115, + "learning_rate": 2.427651151241582e-05, + "loss": 0.9338, + "step": 7816 + }, + { + "epoch": 2.11527533486673, + "grad_norm": 0.22738105058670044, + "learning_rate": 2.424952049680053e-05, + "loss": 0.9524, + "step": 7818 + }, + { + "epoch": 2.1158165336219725, + "grad_norm": 0.23027917742729187, + "learning_rate": 2.4222539689898317e-05, + "loss": 0.9439, + "step": 7820 + }, + { + "epoch": 2.1163577323772156, + "grad_norm": 0.2254500389099121, + "learning_rate": 2.4195569102405647e-05, + "loss": 0.9422, + "step": 7822 + }, + { + "epoch": 2.116898931132458, + "grad_norm": 0.22430899739265442, + "learning_rate": 2.416860874501496e-05, + "loss": 0.9447, + "step": 7824 + }, + { + "epoch": 2.1174401298877013, + "grad_norm": 0.22458522021770477, + "learning_rate": 2.414165862841459e-05, + "loss": 0.9465, + "step": 7826 + }, + { + "epoch": 2.1179813286429443, + "grad_norm": 0.21511007845401764, + "learning_rate": 2.4114718763288857e-05, + "loss": 0.9664, + "step": 7828 + }, + { + "epoch": 2.118522527398187, + "grad_norm": 0.21453900635242462, + "learning_rate": 2.4087789160318024e-05, + "loss": 0.9405, + "step": 7830 + }, + { + "epoch": 2.11906372615343, + "grad_norm": 0.22213365137577057, + "learning_rate": 2.406086983017823e-05, + "loss": 0.9331, + "step": 7832 + }, + { + "epoch": 2.1196049249086726, + "grad_norm": 0.22342199087142944, + "learning_rate": 2.403396078354159e-05, + "loss": 0.941, + "step": 7834 + }, + { + "epoch": 2.1201461236639156, + "grad_norm": 0.2279137820005417, + "learning_rate": 2.4007062031076133e-05, + "loss": 0.9696, + "step": 7836 + }, + { + "epoch": 2.1206873224191582, + "grad_norm": 0.22755584120750427, + "learning_rate": 2.3980173583445815e-05, + "loss": 0.9465, + "step": 7838 + }, + { + "epoch": 2.1212285211744013, + "grad_norm": 0.2181105762720108, + "learning_rate": 2.395329545131046e-05, + "loss": 0.9434, + "step": 7840 + }, + { + "epoch": 2.1217697199296444, + "grad_norm": 0.21755985915660858, + "learning_rate": 2.3926427645325877e-05, + "loss": 0.9541, + "step": 7842 + }, + { + "epoch": 2.122310918684887, + "grad_norm": 0.23180459439754486, + "learning_rate": 2.389957017614369e-05, + "loss": 0.933, + "step": 7844 + }, + { + "epoch": 2.12285211744013, + "grad_norm": 0.22180335223674774, + "learning_rate": 2.387272305441151e-05, + "loss": 0.9481, + "step": 7846 + }, + { + "epoch": 2.1233933161953726, + "grad_norm": 0.22434116899967194, + "learning_rate": 2.38458862907728e-05, + "loss": 0.9171, + "step": 7848 + }, + { + "epoch": 2.1239345149506157, + "grad_norm": 0.22076992690563202, + "learning_rate": 2.3819059895866936e-05, + "loss": 0.9559, + "step": 7850 + }, + { + "epoch": 2.1244757137058583, + "grad_norm": 0.22003455460071564, + "learning_rate": 2.3792243880329186e-05, + "loss": 0.9632, + "step": 7852 + }, + { + "epoch": 2.1250169124611014, + "grad_norm": 0.2267758697271347, + "learning_rate": 2.3765438254790657e-05, + "loss": 0.9441, + "step": 7854 + }, + { + "epoch": 2.1255581112163444, + "grad_norm": 0.21586257219314575, + "learning_rate": 2.3738643029878394e-05, + "loss": 0.9394, + "step": 7856 + }, + { + "epoch": 2.126099309971587, + "grad_norm": 0.21697446703910828, + "learning_rate": 2.3711858216215295e-05, + "loss": 0.9389, + "step": 7858 + }, + { + "epoch": 2.12664050872683, + "grad_norm": 0.22206568717956543, + "learning_rate": 2.3685083824420147e-05, + "loss": 0.9532, + "step": 7860 + }, + { + "epoch": 2.1271817074820727, + "grad_norm": 0.2361227571964264, + "learning_rate": 2.365831986510756e-05, + "loss": 0.9671, + "step": 7862 + }, + { + "epoch": 2.1277229062373157, + "grad_norm": 0.23557588458061218, + "learning_rate": 2.3631566348888073e-05, + "loss": 0.9218, + "step": 7864 + }, + { + "epoch": 2.1282641049925584, + "grad_norm": 0.21931879222393036, + "learning_rate": 2.360482328636801e-05, + "loss": 0.9428, + "step": 7866 + }, + { + "epoch": 2.1288053037478014, + "grad_norm": 0.24976031482219696, + "learning_rate": 2.3578090688149614e-05, + "loss": 0.936, + "step": 7868 + }, + { + "epoch": 2.1293465025030445, + "grad_norm": 0.25675225257873535, + "learning_rate": 2.3551368564830957e-05, + "loss": 0.9567, + "step": 7870 + }, + { + "epoch": 2.129887701258287, + "grad_norm": 0.26582902669906616, + "learning_rate": 2.352465692700597e-05, + "loss": 0.9666, + "step": 7872 + }, + { + "epoch": 2.13042890001353, + "grad_norm": 0.23450426757335663, + "learning_rate": 2.3497955785264385e-05, + "loss": 0.9515, + "step": 7874 + }, + { + "epoch": 2.1309700987687727, + "grad_norm": 0.22431163489818573, + "learning_rate": 2.3471265150191817e-05, + "loss": 0.9225, + "step": 7876 + }, + { + "epoch": 2.131511297524016, + "grad_norm": 0.22514396905899048, + "learning_rate": 2.3444585032369725e-05, + "loss": 0.9559, + "step": 7878 + }, + { + "epoch": 2.1320524962792584, + "grad_norm": 0.21655204892158508, + "learning_rate": 2.3417915442375316e-05, + "loss": 0.9419, + "step": 7880 + }, + { + "epoch": 2.1325936950345015, + "grad_norm": 0.22209306061267853, + "learning_rate": 2.339125639078175e-05, + "loss": 0.9455, + "step": 7882 + }, + { + "epoch": 2.133134893789744, + "grad_norm": 0.22619475424289703, + "learning_rate": 2.3364607888157897e-05, + "loss": 0.9397, + "step": 7884 + }, + { + "epoch": 2.133676092544987, + "grad_norm": 0.22519052028656006, + "learning_rate": 2.333796994506852e-05, + "loss": 0.9542, + "step": 7886 + }, + { + "epoch": 2.13421729130023, + "grad_norm": 0.22170260548591614, + "learning_rate": 2.331134257207413e-05, + "loss": 0.9319, + "step": 7888 + }, + { + "epoch": 2.134758490055473, + "grad_norm": 0.23741647601127625, + "learning_rate": 2.3284725779731114e-05, + "loss": 0.9506, + "step": 7890 + }, + { + "epoch": 2.135299688810716, + "grad_norm": 0.22754010558128357, + "learning_rate": 2.325811957859158e-05, + "loss": 0.9441, + "step": 7892 + }, + { + "epoch": 2.1358408875659585, + "grad_norm": 0.2147742211818695, + "learning_rate": 2.3231523979203568e-05, + "loss": 0.9438, + "step": 7894 + }, + { + "epoch": 2.1363820863212015, + "grad_norm": 0.23172889649868011, + "learning_rate": 2.320493899211078e-05, + "loss": 0.9375, + "step": 7896 + }, + { + "epoch": 2.136923285076444, + "grad_norm": 0.24967627227306366, + "learning_rate": 2.3178364627852784e-05, + "loss": 0.9459, + "step": 7898 + }, + { + "epoch": 2.137464483831687, + "grad_norm": 0.23095029592514038, + "learning_rate": 2.315180089696493e-05, + "loss": 0.9318, + "step": 7900 + }, + { + "epoch": 2.1380056825869302, + "grad_norm": 0.2277185320854187, + "learning_rate": 2.3125247809978324e-05, + "loss": 0.9237, + "step": 7902 + }, + { + "epoch": 2.138546881342173, + "grad_norm": 0.22815929353237152, + "learning_rate": 2.3098705377419877e-05, + "loss": 0.9251, + "step": 7904 + }, + { + "epoch": 2.139088080097416, + "grad_norm": 0.2270124852657318, + "learning_rate": 2.307217360981227e-05, + "loss": 0.9436, + "step": 7906 + }, + { + "epoch": 2.1396292788526585, + "grad_norm": 0.23960675299167633, + "learning_rate": 2.3045652517673978e-05, + "loss": 0.9334, + "step": 7908 + }, + { + "epoch": 2.1401704776079016, + "grad_norm": 0.23153585195541382, + "learning_rate": 2.301914211151918e-05, + "loss": 0.9246, + "step": 7910 + }, + { + "epoch": 2.140711676363144, + "grad_norm": 0.22093063592910767, + "learning_rate": 2.2992642401857904e-05, + "loss": 0.9468, + "step": 7912 + }, + { + "epoch": 2.1412528751183872, + "grad_norm": 0.22165392339229584, + "learning_rate": 2.2966153399195832e-05, + "loss": 0.9502, + "step": 7914 + }, + { + "epoch": 2.14179407387363, + "grad_norm": 0.21936124563217163, + "learning_rate": 2.2939675114034543e-05, + "loss": 0.944, + "step": 7916 + }, + { + "epoch": 2.142335272628873, + "grad_norm": 0.229405015707016, + "learning_rate": 2.2913207556871225e-05, + "loss": 0.953, + "step": 7918 + }, + { + "epoch": 2.142876471384116, + "grad_norm": 0.22054095566272736, + "learning_rate": 2.288675073819891e-05, + "loss": 0.9307, + "step": 7920 + }, + { + "epoch": 2.1434176701393586, + "grad_norm": 0.2344048023223877, + "learning_rate": 2.286030466850631e-05, + "loss": 0.9353, + "step": 7922 + }, + { + "epoch": 2.1439588688946016, + "grad_norm": 0.22292739152908325, + "learning_rate": 2.2833869358277908e-05, + "loss": 0.9616, + "step": 7924 + }, + { + "epoch": 2.1445000676498442, + "grad_norm": 0.23201830685138702, + "learning_rate": 2.2807444817993935e-05, + "loss": 0.9473, + "step": 7926 + }, + { + "epoch": 2.1450412664050873, + "grad_norm": 0.21303871273994446, + "learning_rate": 2.2781031058130315e-05, + "loss": 0.9469, + "step": 7928 + }, + { + "epoch": 2.14558246516033, + "grad_norm": 0.22408634424209595, + "learning_rate": 2.2754628089158746e-05, + "loss": 0.9605, + "step": 7930 + }, + { + "epoch": 2.146123663915573, + "grad_norm": 0.20855221152305603, + "learning_rate": 2.272823592154658e-05, + "loss": 0.9104, + "step": 7932 + }, + { + "epoch": 2.146664862670816, + "grad_norm": 0.21597841382026672, + "learning_rate": 2.2701854565756963e-05, + "loss": 0.9506, + "step": 7934 + }, + { + "epoch": 2.1472060614260586, + "grad_norm": 0.23572826385498047, + "learning_rate": 2.267548403224866e-05, + "loss": 0.9477, + "step": 7936 + }, + { + "epoch": 2.1477472601813017, + "grad_norm": 0.25431180000305176, + "learning_rate": 2.2649124331476283e-05, + "loss": 0.9096, + "step": 7938 + }, + { + "epoch": 2.1482884589365443, + "grad_norm": 0.23999497294425964, + "learning_rate": 2.2622775473890005e-05, + "loss": 0.9577, + "step": 7940 + }, + { + "epoch": 2.1488296576917874, + "grad_norm": 0.22671611607074738, + "learning_rate": 2.2596437469935804e-05, + "loss": 0.9379, + "step": 7942 + }, + { + "epoch": 2.14937085644703, + "grad_norm": 0.22205355763435364, + "learning_rate": 2.2570110330055282e-05, + "loss": 0.9655, + "step": 7944 + }, + { + "epoch": 2.149912055202273, + "grad_norm": 0.23694594204425812, + "learning_rate": 2.2543794064685785e-05, + "loss": 0.9666, + "step": 7946 + }, + { + "epoch": 2.150453253957516, + "grad_norm": 0.25069737434387207, + "learning_rate": 2.251748868426033e-05, + "loss": 0.957, + "step": 7948 + }, + { + "epoch": 2.1509944527127587, + "grad_norm": 0.23108243942260742, + "learning_rate": 2.2491194199207634e-05, + "loss": 0.9375, + "step": 7950 + }, + { + "epoch": 2.1515356514680017, + "grad_norm": 0.20701000094413757, + "learning_rate": 2.246491061995205e-05, + "loss": 0.9321, + "step": 7952 + }, + { + "epoch": 2.1520768502232444, + "grad_norm": 0.21403051912784576, + "learning_rate": 2.243863795691365e-05, + "loss": 0.9356, + "step": 7954 + }, + { + "epoch": 2.1526180489784874, + "grad_norm": 0.2154771387577057, + "learning_rate": 2.2412376220508186e-05, + "loss": 0.9286, + "step": 7956 + }, + { + "epoch": 2.15315924773373, + "grad_norm": 0.24486608803272247, + "learning_rate": 2.2386125421147018e-05, + "loss": 0.9612, + "step": 7958 + }, + { + "epoch": 2.153700446488973, + "grad_norm": 0.22294406592845917, + "learning_rate": 2.235988556923723e-05, + "loss": 0.9353, + "step": 7960 + }, + { + "epoch": 2.154241645244216, + "grad_norm": 0.22138181328773499, + "learning_rate": 2.2333656675181545e-05, + "loss": 0.9406, + "step": 7962 + }, + { + "epoch": 2.1547828439994587, + "grad_norm": 0.22416189312934875, + "learning_rate": 2.2307438749378366e-05, + "loss": 0.9298, + "step": 7964 + }, + { + "epoch": 2.155324042754702, + "grad_norm": 0.22987906634807587, + "learning_rate": 2.2281231802221686e-05, + "loss": 0.9639, + "step": 7966 + }, + { + "epoch": 2.1558652415099444, + "grad_norm": 0.23663191497325897, + "learning_rate": 2.225503584410121e-05, + "loss": 0.9377, + "step": 7968 + }, + { + "epoch": 2.1564064402651875, + "grad_norm": 0.2200813889503479, + "learning_rate": 2.2228850885402232e-05, + "loss": 0.9484, + "step": 7970 + }, + { + "epoch": 2.15694763902043, + "grad_norm": 0.21985627710819244, + "learning_rate": 2.220267693650573e-05, + "loss": 0.9436, + "step": 7972 + }, + { + "epoch": 2.157488837775673, + "grad_norm": 0.22890211641788483, + "learning_rate": 2.21765140077883e-05, + "loss": 0.9438, + "step": 7974 + }, + { + "epoch": 2.158030036530916, + "grad_norm": 0.22592753171920776, + "learning_rate": 2.2150362109622174e-05, + "loss": 0.9339, + "step": 7976 + }, + { + "epoch": 2.158571235286159, + "grad_norm": 0.22675001621246338, + "learning_rate": 2.2124221252375216e-05, + "loss": 0.9342, + "step": 7978 + }, + { + "epoch": 2.159112434041402, + "grad_norm": 0.2262718826532364, + "learning_rate": 2.2098091446410873e-05, + "loss": 0.9477, + "step": 7980 + }, + { + "epoch": 2.1596536327966445, + "grad_norm": 0.2256879359483719, + "learning_rate": 2.207197270208826e-05, + "loss": 0.9604, + "step": 7982 + }, + { + "epoch": 2.1601948315518875, + "grad_norm": 0.2155425101518631, + "learning_rate": 2.204586502976208e-05, + "loss": 0.9568, + "step": 7984 + }, + { + "epoch": 2.16073603030713, + "grad_norm": 0.2202017903327942, + "learning_rate": 2.2019768439782686e-05, + "loss": 0.9448, + "step": 7986 + }, + { + "epoch": 2.161277229062373, + "grad_norm": 0.21422246098518372, + "learning_rate": 2.199368294249596e-05, + "loss": 0.935, + "step": 7988 + }, + { + "epoch": 2.1618184278176162, + "grad_norm": 0.2188245803117752, + "learning_rate": 2.1967608548243462e-05, + "loss": 0.9437, + "step": 7990 + }, + { + "epoch": 2.162359626572859, + "grad_norm": 0.21898794174194336, + "learning_rate": 2.19415452673623e-05, + "loss": 0.9592, + "step": 7992 + }, + { + "epoch": 2.162900825328102, + "grad_norm": 0.22789134085178375, + "learning_rate": 2.1915493110185205e-05, + "loss": 0.9378, + "step": 7994 + }, + { + "epoch": 2.1634420240833445, + "grad_norm": 0.2148413062095642, + "learning_rate": 2.1889452087040485e-05, + "loss": 0.9482, + "step": 7996 + }, + { + "epoch": 2.1639832228385876, + "grad_norm": 0.22526615858078003, + "learning_rate": 2.1863422208252064e-05, + "loss": 0.9235, + "step": 7998 + }, + { + "epoch": 2.16452442159383, + "grad_norm": 0.21775515377521515, + "learning_rate": 2.183740348413939e-05, + "loss": 0.9368, + "step": 8000 + }, + { + "epoch": 2.1650656203490732, + "grad_norm": 0.22225214540958405, + "learning_rate": 2.181139592501753e-05, + "loss": 0.9599, + "step": 8002 + }, + { + "epoch": 2.1656068191043163, + "grad_norm": 0.22834616899490356, + "learning_rate": 2.1785399541197133e-05, + "loss": 0.9396, + "step": 8004 + }, + { + "epoch": 2.166148017859559, + "grad_norm": 0.2112305760383606, + "learning_rate": 2.175941434298435e-05, + "loss": 0.9376, + "step": 8006 + }, + { + "epoch": 2.166689216614802, + "grad_norm": 0.2193281054496765, + "learning_rate": 2.1733440340681026e-05, + "loss": 0.9479, + "step": 8008 + }, + { + "epoch": 2.1672304153700446, + "grad_norm": 0.23251795768737793, + "learning_rate": 2.1707477544584426e-05, + "loss": 0.9722, + "step": 8010 + }, + { + "epoch": 2.1677716141252876, + "grad_norm": 0.22384493052959442, + "learning_rate": 2.1681525964987477e-05, + "loss": 0.9497, + "step": 8012 + }, + { + "epoch": 2.1683128128805302, + "grad_norm": 0.22541068494319916, + "learning_rate": 2.1655585612178574e-05, + "loss": 0.9536, + "step": 8014 + }, + { + "epoch": 2.1688540116357733, + "grad_norm": 0.212521493434906, + "learning_rate": 2.1629656496441735e-05, + "loss": 0.9404, + "step": 8016 + }, + { + "epoch": 2.169395210391016, + "grad_norm": 0.21923743188381195, + "learning_rate": 2.160373862805648e-05, + "loss": 0.9441, + "step": 8018 + }, + { + "epoch": 2.169936409146259, + "grad_norm": 0.22538289427757263, + "learning_rate": 2.157783201729791e-05, + "loss": 0.9414, + "step": 8020 + }, + { + "epoch": 2.170477607901502, + "grad_norm": 0.2310040146112442, + "learning_rate": 2.1551936674436596e-05, + "loss": 0.9556, + "step": 8022 + }, + { + "epoch": 2.1710188066567446, + "grad_norm": 0.22321900725364685, + "learning_rate": 2.15260526097387e-05, + "loss": 0.9559, + "step": 8024 + }, + { + "epoch": 2.1715600054119877, + "grad_norm": 0.21206355094909668, + "learning_rate": 2.1500179833465913e-05, + "loss": 0.9143, + "step": 8026 + }, + { + "epoch": 2.1721012041672303, + "grad_norm": 0.2201893925666809, + "learning_rate": 2.1474318355875395e-05, + "loss": 0.9608, + "step": 8028 + }, + { + "epoch": 2.1726424029224733, + "grad_norm": 0.216016948223114, + "learning_rate": 2.1448468187219888e-05, + "loss": 0.9636, + "step": 8030 + }, + { + "epoch": 2.173183601677716, + "grad_norm": 0.22232617437839508, + "learning_rate": 2.1422629337747618e-05, + "loss": 0.9441, + "step": 8032 + }, + { + "epoch": 2.173724800432959, + "grad_norm": 0.2273850440979004, + "learning_rate": 2.1396801817702355e-05, + "loss": 0.9274, + "step": 8034 + }, + { + "epoch": 2.174265999188202, + "grad_norm": 0.21438546478748322, + "learning_rate": 2.1370985637323322e-05, + "loss": 0.9339, + "step": 8036 + }, + { + "epoch": 2.1748071979434447, + "grad_norm": 0.2264583706855774, + "learning_rate": 2.1345180806845305e-05, + "loss": 0.9312, + "step": 8038 + }, + { + "epoch": 2.1753483966986877, + "grad_norm": 0.22321751713752747, + "learning_rate": 2.1319387336498518e-05, + "loss": 0.9219, + "step": 8040 + }, + { + "epoch": 2.1758895954539303, + "grad_norm": 0.21017922461032867, + "learning_rate": 2.129360523650879e-05, + "loss": 0.9479, + "step": 8042 + }, + { + "epoch": 2.1764307942091734, + "grad_norm": 0.24168282747268677, + "learning_rate": 2.1267834517097314e-05, + "loss": 0.9284, + "step": 8044 + }, + { + "epoch": 2.176971992964416, + "grad_norm": 0.22116367518901825, + "learning_rate": 2.124207518848086e-05, + "loss": 0.9325, + "step": 8046 + }, + { + "epoch": 2.177513191719659, + "grad_norm": 0.21654056012630463, + "learning_rate": 2.1216327260871622e-05, + "loss": 0.9282, + "step": 8048 + }, + { + "epoch": 2.1780543904749017, + "grad_norm": 0.22288241982460022, + "learning_rate": 2.1190590744477308e-05, + "loss": 0.924, + "step": 8050 + }, + { + "epoch": 2.1785955892301447, + "grad_norm": 0.23226431012153625, + "learning_rate": 2.1164865649501102e-05, + "loss": 0.9438, + "step": 8052 + }, + { + "epoch": 2.179136787985388, + "grad_norm": 0.2278352975845337, + "learning_rate": 2.1139151986141654e-05, + "loss": 0.9318, + "step": 8054 + }, + { + "epoch": 2.1796779867406304, + "grad_norm": 0.22836346924304962, + "learning_rate": 2.1113449764593095e-05, + "loss": 0.9321, + "step": 8056 + }, + { + "epoch": 2.1802191854958735, + "grad_norm": 0.2221662849187851, + "learning_rate": 2.1087758995044975e-05, + "loss": 0.9604, + "step": 8058 + }, + { + "epoch": 2.180760384251116, + "grad_norm": 0.22639283537864685, + "learning_rate": 2.1062079687682372e-05, + "loss": 0.9495, + "step": 8060 + }, + { + "epoch": 2.181301583006359, + "grad_norm": 0.23551926016807556, + "learning_rate": 2.103641185268572e-05, + "loss": 0.9394, + "step": 8062 + }, + { + "epoch": 2.1818427817616017, + "grad_norm": 0.2246224582195282, + "learning_rate": 2.1010755500231054e-05, + "loss": 0.9521, + "step": 8064 + }, + { + "epoch": 2.182383980516845, + "grad_norm": 0.2231868952512741, + "learning_rate": 2.0985110640489707e-05, + "loss": 0.9475, + "step": 8066 + }, + { + "epoch": 2.182925179272088, + "grad_norm": 0.2231374979019165, + "learning_rate": 2.0959477283628554e-05, + "loss": 0.961, + "step": 8068 + }, + { + "epoch": 2.1834663780273305, + "grad_norm": 0.22598937153816223, + "learning_rate": 2.0933855439809847e-05, + "loss": 0.942, + "step": 8070 + }, + { + "epoch": 2.1840075767825735, + "grad_norm": 0.21894758939743042, + "learning_rate": 2.0908245119191312e-05, + "loss": 0.9297, + "step": 8072 + }, + { + "epoch": 2.184548775537816, + "grad_norm": 0.2219132035970688, + "learning_rate": 2.0882646331926105e-05, + "loss": 0.9491, + "step": 8074 + }, + { + "epoch": 2.185089974293059, + "grad_norm": 0.2236965149641037, + "learning_rate": 2.0857059088162806e-05, + "loss": 0.9348, + "step": 8076 + }, + { + "epoch": 2.185631173048302, + "grad_norm": 0.21840029954910278, + "learning_rate": 2.083148339804539e-05, + "loss": 0.9545, + "step": 8078 + }, + { + "epoch": 2.186172371803545, + "grad_norm": 0.2195041924715042, + "learning_rate": 2.0805919271713288e-05, + "loss": 0.9563, + "step": 8080 + }, + { + "epoch": 2.186713570558788, + "grad_norm": 0.22610372304916382, + "learning_rate": 2.0780366719301353e-05, + "loss": 0.9369, + "step": 8082 + }, + { + "epoch": 2.1872547693140305, + "grad_norm": 0.22029927372932434, + "learning_rate": 2.075482575093979e-05, + "loss": 0.9437, + "step": 8084 + }, + { + "epoch": 2.1877959680692736, + "grad_norm": 0.2290394902229309, + "learning_rate": 2.0729296376754276e-05, + "loss": 0.923, + "step": 8086 + }, + { + "epoch": 2.188337166824516, + "grad_norm": 0.21997421979904175, + "learning_rate": 2.070377860686587e-05, + "loss": 0.9311, + "step": 8088 + }, + { + "epoch": 2.1888783655797592, + "grad_norm": 0.23235318064689636, + "learning_rate": 2.0678272451391035e-05, + "loss": 0.9528, + "step": 8090 + }, + { + "epoch": 2.189419564335002, + "grad_norm": 0.23188577592372894, + "learning_rate": 2.0652777920441597e-05, + "loss": 0.948, + "step": 8092 + }, + { + "epoch": 2.189960763090245, + "grad_norm": 0.2300158441066742, + "learning_rate": 2.0627295024124828e-05, + "loss": 0.9523, + "step": 8094 + }, + { + "epoch": 2.190501961845488, + "grad_norm": 0.24667437374591827, + "learning_rate": 2.0601823772543304e-05, + "loss": 0.9375, + "step": 8096 + }, + { + "epoch": 2.1910431606007306, + "grad_norm": 0.22055894136428833, + "learning_rate": 2.0576364175795122e-05, + "loss": 0.9467, + "step": 8098 + }, + { + "epoch": 2.1915843593559736, + "grad_norm": 0.21852728724479675, + "learning_rate": 2.0550916243973607e-05, + "loss": 0.9394, + "step": 8100 + }, + { + "epoch": 2.1921255581112162, + "grad_norm": 0.2328782081604004, + "learning_rate": 2.052547998716755e-05, + "loss": 0.9479, + "step": 8102 + }, + { + "epoch": 2.1926667568664593, + "grad_norm": 0.2291097640991211, + "learning_rate": 2.0500055415461116e-05, + "loss": 0.9337, + "step": 8104 + }, + { + "epoch": 2.193207955621702, + "grad_norm": 0.21956700086593628, + "learning_rate": 2.047464253893377e-05, + "loss": 0.928, + "step": 8106 + }, + { + "epoch": 2.193749154376945, + "grad_norm": 0.22167332470417023, + "learning_rate": 2.0449241367660403e-05, + "loss": 0.963, + "step": 8108 + }, + { + "epoch": 2.194290353132188, + "grad_norm": 0.23299533128738403, + "learning_rate": 2.0423851911711243e-05, + "loss": 0.9401, + "step": 8110 + }, + { + "epoch": 2.1948315518874306, + "grad_norm": 0.22299495339393616, + "learning_rate": 2.0398474181151904e-05, + "loss": 0.943, + "step": 8112 + }, + { + "epoch": 2.1953727506426737, + "grad_norm": 0.21900831162929535, + "learning_rate": 2.0373108186043277e-05, + "loss": 0.9321, + "step": 8114 + }, + { + "epoch": 2.1959139493979163, + "grad_norm": 0.21487180888652802, + "learning_rate": 2.0347753936441683e-05, + "loss": 0.9303, + "step": 8116 + }, + { + "epoch": 2.1964551481531593, + "grad_norm": 0.22404739260673523, + "learning_rate": 2.0322411442398726e-05, + "loss": 0.957, + "step": 8118 + }, + { + "epoch": 2.196996346908402, + "grad_norm": 0.2316906750202179, + "learning_rate": 2.0297080713961382e-05, + "loss": 0.9492, + "step": 8120 + }, + { + "epoch": 2.197537545663645, + "grad_norm": 0.21779528260231018, + "learning_rate": 2.0271761761171953e-05, + "loss": 0.9604, + "step": 8122 + }, + { + "epoch": 2.198078744418888, + "grad_norm": 0.21846908330917358, + "learning_rate": 2.0246454594068103e-05, + "loss": 0.9604, + "step": 8124 + }, + { + "epoch": 2.1986199431741307, + "grad_norm": 0.2262936234474182, + "learning_rate": 2.022115922268275e-05, + "loss": 0.9209, + "step": 8126 + }, + { + "epoch": 2.1991611419293737, + "grad_norm": 0.22368140518665314, + "learning_rate": 2.01958756570442e-05, + "loss": 0.9327, + "step": 8128 + }, + { + "epoch": 2.1997023406846163, + "grad_norm": 0.21511292457580566, + "learning_rate": 2.0170603907176082e-05, + "loss": 0.9375, + "step": 8130 + }, + { + "epoch": 2.2002435394398594, + "grad_norm": 0.22573569416999817, + "learning_rate": 2.0145343983097263e-05, + "loss": 0.9544, + "step": 8132 + }, + { + "epoch": 2.200784738195102, + "grad_norm": 0.21807391941547394, + "learning_rate": 2.0120095894822045e-05, + "loss": 0.9264, + "step": 8134 + }, + { + "epoch": 2.201325936950345, + "grad_norm": 0.21751199662685394, + "learning_rate": 2.0094859652359915e-05, + "loss": 0.9521, + "step": 8136 + }, + { + "epoch": 2.2018671357055877, + "grad_norm": 0.2182418406009674, + "learning_rate": 2.0069635265715754e-05, + "loss": 0.9344, + "step": 8138 + }, + { + "epoch": 2.2024083344608307, + "grad_norm": 0.23446698486804962, + "learning_rate": 2.0044422744889675e-05, + "loss": 0.9263, + "step": 8140 + }, + { + "epoch": 2.202949533216074, + "grad_norm": 0.22580544650554657, + "learning_rate": 2.0019222099877134e-05, + "loss": 0.9408, + "step": 8142 + }, + { + "epoch": 2.2034907319713164, + "grad_norm": 0.22245322167873383, + "learning_rate": 1.9994033340668868e-05, + "loss": 0.9393, + "step": 8144 + }, + { + "epoch": 2.2040319307265595, + "grad_norm": 0.216224804520607, + "learning_rate": 1.9968856477250903e-05, + "loss": 0.933, + "step": 8146 + }, + { + "epoch": 2.204573129481802, + "grad_norm": 0.22503700852394104, + "learning_rate": 1.9943691519604524e-05, + "loss": 0.961, + "step": 8148 + }, + { + "epoch": 2.205114328237045, + "grad_norm": 0.22365839779376984, + "learning_rate": 1.9918538477706318e-05, + "loss": 0.9428, + "step": 8150 + }, + { + "epoch": 2.2056555269922877, + "grad_norm": 0.22850005328655243, + "learning_rate": 1.9893397361528177e-05, + "loss": 0.954, + "step": 8152 + }, + { + "epoch": 2.206196725747531, + "grad_norm": 0.21970248222351074, + "learning_rate": 1.9868268181037185e-05, + "loss": 0.9369, + "step": 8154 + }, + { + "epoch": 2.206737924502774, + "grad_norm": 0.21925634145736694, + "learning_rate": 1.9843150946195772e-05, + "loss": 0.9441, + "step": 8156 + }, + { + "epoch": 2.2072791232580165, + "grad_norm": 0.2227938324213028, + "learning_rate": 1.98180456669616e-05, + "loss": 0.9413, + "step": 8158 + }, + { + "epoch": 2.2078203220132595, + "grad_norm": 0.2233031988143921, + "learning_rate": 1.9792952353287604e-05, + "loss": 0.9472, + "step": 8160 + }, + { + "epoch": 2.208361520768502, + "grad_norm": 0.22490589320659637, + "learning_rate": 1.9767871015121935e-05, + "loss": 0.9476, + "step": 8162 + }, + { + "epoch": 2.208902719523745, + "grad_norm": 0.21852964162826538, + "learning_rate": 1.9742801662408062e-05, + "loss": 0.9318, + "step": 8164 + }, + { + "epoch": 2.209443918278988, + "grad_norm": 0.22299662232398987, + "learning_rate": 1.9717744305084618e-05, + "loss": 0.9261, + "step": 8166 + }, + { + "epoch": 2.209985117034231, + "grad_norm": 0.22347654402256012, + "learning_rate": 1.96926989530856e-05, + "loss": 0.9599, + "step": 8168 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.20997770130634308, + "learning_rate": 1.9667665616340108e-05, + "loss": 0.9317, + "step": 8170 + }, + { + "epoch": 2.2110675145447165, + "grad_norm": 0.2271878868341446, + "learning_rate": 1.96426443047726e-05, + "loss": 0.9466, + "step": 8172 + }, + { + "epoch": 2.2116087132999596, + "grad_norm": 0.22561673820018768, + "learning_rate": 1.9617635028302666e-05, + "loss": 0.908, + "step": 8174 + }, + { + "epoch": 2.212149912055202, + "grad_norm": 0.2214626520872116, + "learning_rate": 1.9592637796845185e-05, + "loss": 0.9401, + "step": 8176 + }, + { + "epoch": 2.2126911108104452, + "grad_norm": 0.2159542292356491, + "learning_rate": 1.9567652620310262e-05, + "loss": 0.9524, + "step": 8178 + }, + { + "epoch": 2.213232309565688, + "grad_norm": 1.0248746871948242, + "learning_rate": 1.9542679508603194e-05, + "loss": 0.9422, + "step": 8180 + }, + { + "epoch": 2.213773508320931, + "grad_norm": 0.23673224449157715, + "learning_rate": 1.9517718471624537e-05, + "loss": 0.9507, + "step": 8182 + }, + { + "epoch": 2.2143147070761735, + "grad_norm": 0.2888081669807434, + "learning_rate": 1.9492769519269983e-05, + "loss": 0.9303, + "step": 8184 + }, + { + "epoch": 2.2148559058314166, + "grad_norm": 0.24307818710803986, + "learning_rate": 1.9467832661430523e-05, + "loss": 0.9511, + "step": 8186 + }, + { + "epoch": 2.2153971045866596, + "grad_norm": 0.22916708886623383, + "learning_rate": 1.9442907907992264e-05, + "loss": 0.9337, + "step": 8188 + }, + { + "epoch": 2.2159383033419022, + "grad_norm": 0.22617366909980774, + "learning_rate": 1.9417995268836632e-05, + "loss": 0.9284, + "step": 8190 + }, + { + "epoch": 2.2164795020971453, + "grad_norm": 0.22174227237701416, + "learning_rate": 1.939309475384012e-05, + "loss": 0.9234, + "step": 8192 + }, + { + "epoch": 2.217020700852388, + "grad_norm": 0.21963894367218018, + "learning_rate": 1.936820637287451e-05, + "loss": 0.9441, + "step": 8194 + }, + { + "epoch": 2.217561899607631, + "grad_norm": 0.21718013286590576, + "learning_rate": 1.9343330135806707e-05, + "loss": 0.9642, + "step": 8196 + }, + { + "epoch": 2.2181030983628736, + "grad_norm": 0.2226404845714569, + "learning_rate": 1.9318466052498873e-05, + "loss": 0.9302, + "step": 8198 + }, + { + "epoch": 2.2186442971181166, + "grad_norm": 0.2200288325548172, + "learning_rate": 1.9293614132808253e-05, + "loss": 0.9342, + "step": 8200 + }, + { + "epoch": 2.2191854958733597, + "grad_norm": 0.2228095829486847, + "learning_rate": 1.9268774386587402e-05, + "loss": 0.9443, + "step": 8202 + }, + { + "epoch": 2.2197266946286023, + "grad_norm": 0.22220438718795776, + "learning_rate": 1.924394682368391e-05, + "loss": 0.9468, + "step": 8204 + }, + { + "epoch": 2.2202678933838453, + "grad_norm": 0.2198549211025238, + "learning_rate": 1.9219131453940635e-05, + "loss": 0.9451, + "step": 8206 + }, + { + "epoch": 2.220809092139088, + "grad_norm": 0.2270076721906662, + "learning_rate": 1.9194328287195584e-05, + "loss": 0.9234, + "step": 8208 + }, + { + "epoch": 2.221350290894331, + "grad_norm": 0.21973510086536407, + "learning_rate": 1.916953733328187e-05, + "loss": 0.9597, + "step": 8210 + }, + { + "epoch": 2.2218914896495736, + "grad_norm": 0.23056700825691223, + "learning_rate": 1.9144758602027822e-05, + "loss": 0.9373, + "step": 8212 + }, + { + "epoch": 2.2224326884048167, + "grad_norm": 0.22011063992977142, + "learning_rate": 1.9119992103256906e-05, + "loss": 0.9563, + "step": 8214 + }, + { + "epoch": 2.2229738871600597, + "grad_norm": 0.22602880001068115, + "learning_rate": 1.9095237846787763e-05, + "loss": 0.9746, + "step": 8216 + }, + { + "epoch": 2.2235150859153023, + "grad_norm": 0.21703122556209564, + "learning_rate": 1.9070495842434107e-05, + "loss": 0.936, + "step": 8218 + }, + { + "epoch": 2.2240562846705454, + "grad_norm": 0.21804013848304749, + "learning_rate": 1.904576610000489e-05, + "loss": 0.9387, + "step": 8220 + }, + { + "epoch": 2.224597483425788, + "grad_norm": 0.21394406259059906, + "learning_rate": 1.90210486293041e-05, + "loss": 0.9397, + "step": 8222 + }, + { + "epoch": 2.225138682181031, + "grad_norm": 0.2190692126750946, + "learning_rate": 1.899634344013099e-05, + "loss": 0.9467, + "step": 8224 + }, + { + "epoch": 2.2256798809362737, + "grad_norm": 0.2185709923505783, + "learning_rate": 1.8971650542279813e-05, + "loss": 0.9446, + "step": 8226 + }, + { + "epoch": 2.2262210796915167, + "grad_norm": 0.22372521460056305, + "learning_rate": 1.8946969945540045e-05, + "loss": 0.9439, + "step": 8228 + }, + { + "epoch": 2.22676227844676, + "grad_norm": 0.22480180859565735, + "learning_rate": 1.8922301659696208e-05, + "loss": 0.959, + "step": 8230 + }, + { + "epoch": 2.2273034772020024, + "grad_norm": 0.22251246869564056, + "learning_rate": 1.8897645694528006e-05, + "loss": 0.95, + "step": 8232 + }, + { + "epoch": 2.2278446759572454, + "grad_norm": 0.21326804161071777, + "learning_rate": 1.8873002059810225e-05, + "loss": 0.9693, + "step": 8234 + }, + { + "epoch": 2.228385874712488, + "grad_norm": 0.2242264598608017, + "learning_rate": 1.884837076531278e-05, + "loss": 0.9383, + "step": 8236 + }, + { + "epoch": 2.228927073467731, + "grad_norm": 0.21798130869865417, + "learning_rate": 1.88237518208007e-05, + "loss": 0.9535, + "step": 8238 + }, + { + "epoch": 2.2294682722229737, + "grad_norm": 0.2249882072210312, + "learning_rate": 1.8799145236034073e-05, + "loss": 0.9177, + "step": 8240 + }, + { + "epoch": 2.230009470978217, + "grad_norm": 0.2334979921579361, + "learning_rate": 1.8774551020768144e-05, + "loss": 0.9453, + "step": 8242 + }, + { + "epoch": 2.23055066973346, + "grad_norm": 0.2171546220779419, + "learning_rate": 1.8749969184753204e-05, + "loss": 0.9258, + "step": 8244 + }, + { + "epoch": 2.2310918684887024, + "grad_norm": 0.216659277677536, + "learning_rate": 1.8725399737734668e-05, + "loss": 0.9439, + "step": 8246 + }, + { + "epoch": 2.2316330672439455, + "grad_norm": 0.2247028350830078, + "learning_rate": 1.8700842689453042e-05, + "loss": 0.9256, + "step": 8248 + }, + { + "epoch": 2.232174265999188, + "grad_norm": 0.21485809981822968, + "learning_rate": 1.8676298049643914e-05, + "loss": 0.9477, + "step": 8250 + }, + { + "epoch": 2.232715464754431, + "grad_norm": 0.22285142540931702, + "learning_rate": 1.8651765828037916e-05, + "loss": 0.9144, + "step": 8252 + }, + { + "epoch": 2.233256663509674, + "grad_norm": 0.22161146998405457, + "learning_rate": 1.8627246034360795e-05, + "loss": 0.9213, + "step": 8254 + }, + { + "epoch": 2.233797862264917, + "grad_norm": 0.21765734255313873, + "learning_rate": 1.8602738678333374e-05, + "loss": 0.9402, + "step": 8256 + }, + { + "epoch": 2.23433906102016, + "grad_norm": 0.21162736415863037, + "learning_rate": 1.8578243769671526e-05, + "loss": 0.939, + "step": 8258 + }, + { + "epoch": 2.2348802597754025, + "grad_norm": 0.21345964074134827, + "learning_rate": 1.8553761318086217e-05, + "loss": 0.9416, + "step": 8260 + }, + { + "epoch": 2.2354214585306456, + "grad_norm": 0.22210225462913513, + "learning_rate": 1.8529291333283417e-05, + "loss": 0.954, + "step": 8262 + }, + { + "epoch": 2.235962657285888, + "grad_norm": 0.21198150515556335, + "learning_rate": 1.850483382496422e-05, + "loss": 0.9247, + "step": 8264 + }, + { + "epoch": 2.236503856041131, + "grad_norm": 0.21502578258514404, + "learning_rate": 1.8480388802824716e-05, + "loss": 0.9328, + "step": 8266 + }, + { + "epoch": 2.237045054796374, + "grad_norm": 0.22311265766620636, + "learning_rate": 1.8455956276556085e-05, + "loss": 0.9153, + "step": 8268 + }, + { + "epoch": 2.237586253551617, + "grad_norm": 0.21108628809452057, + "learning_rate": 1.8431536255844546e-05, + "loss": 0.93, + "step": 8270 + }, + { + "epoch": 2.2381274523068595, + "grad_norm": 0.22904331982135773, + "learning_rate": 1.8407128750371365e-05, + "loss": 0.9222, + "step": 8272 + }, + { + "epoch": 2.2386686510621026, + "grad_norm": 0.21671108901500702, + "learning_rate": 1.838273376981281e-05, + "loss": 0.9262, + "step": 8274 + }, + { + "epoch": 2.2392098498173456, + "grad_norm": 0.2247387319803238, + "learning_rate": 1.8358351323840238e-05, + "loss": 0.9482, + "step": 8276 + }, + { + "epoch": 2.239751048572588, + "grad_norm": 0.228335440158844, + "learning_rate": 1.8333981422119983e-05, + "loss": 0.9269, + "step": 8278 + }, + { + "epoch": 2.2402922473278313, + "grad_norm": 0.22127707302570343, + "learning_rate": 1.8309624074313437e-05, + "loss": 0.9429, + "step": 8280 + }, + { + "epoch": 2.240833446083074, + "grad_norm": 0.23253074288368225, + "learning_rate": 1.828527929007702e-05, + "loss": 0.9504, + "step": 8282 + }, + { + "epoch": 2.241374644838317, + "grad_norm": 0.22181758284568787, + "learning_rate": 1.8260947079062152e-05, + "loss": 0.9498, + "step": 8284 + }, + { + "epoch": 2.2419158435935596, + "grad_norm": 0.2262001782655716, + "learning_rate": 1.82366274509153e-05, + "loss": 0.9202, + "step": 8286 + }, + { + "epoch": 2.2424570423488026, + "grad_norm": 0.219094917178154, + "learning_rate": 1.821232041527788e-05, + "loss": 0.9438, + "step": 8288 + }, + { + "epoch": 2.2429982411040457, + "grad_norm": 0.22481976449489594, + "learning_rate": 1.8188025981786393e-05, + "loss": 0.9334, + "step": 8290 + }, + { + "epoch": 2.2435394398592883, + "grad_norm": 0.22234976291656494, + "learning_rate": 1.816374416007226e-05, + "loss": 0.9557, + "step": 8292 + }, + { + "epoch": 2.2440806386145313, + "grad_norm": 0.2202659249305725, + "learning_rate": 1.8139474959762008e-05, + "loss": 0.947, + "step": 8294 + }, + { + "epoch": 2.244621837369774, + "grad_norm": 0.23074808716773987, + "learning_rate": 1.8115218390477057e-05, + "loss": 0.9346, + "step": 8296 + }, + { + "epoch": 2.245163036125017, + "grad_norm": 0.21704703569412231, + "learning_rate": 1.8090974461833892e-05, + "loss": 0.9249, + "step": 8298 + }, + { + "epoch": 2.2457042348802596, + "grad_norm": 0.2185417264699936, + "learning_rate": 1.8066743183443923e-05, + "loss": 0.9163, + "step": 8300 + }, + { + "epoch": 2.2462454336355027, + "grad_norm": 0.23099231719970703, + "learning_rate": 1.8042524564913606e-05, + "loss": 0.938, + "step": 8302 + }, + { + "epoch": 2.2467866323907453, + "grad_norm": 0.2459992915391922, + "learning_rate": 1.8018318615844333e-05, + "loss": 0.9309, + "step": 8304 + }, + { + "epoch": 2.2473278311459883, + "grad_norm": 0.21223576366901398, + "learning_rate": 1.799412534583253e-05, + "loss": 0.9458, + "step": 8306 + }, + { + "epoch": 2.2478690299012314, + "grad_norm": 0.2513757348060608, + "learning_rate": 1.796994476446951e-05, + "loss": 0.928, + "step": 8308 + }, + { + "epoch": 2.248410228656474, + "grad_norm": 0.2156715840101242, + "learning_rate": 1.7945776881341625e-05, + "loss": 0.963, + "step": 8310 + }, + { + "epoch": 2.248951427411717, + "grad_norm": 0.2152716964483261, + "learning_rate": 1.792162170603019e-05, + "loss": 0.9237, + "step": 8312 + }, + { + "epoch": 2.2494926261669597, + "grad_norm": 0.21866580843925476, + "learning_rate": 1.789747924811141e-05, + "loss": 0.9612, + "step": 8314 + }, + { + "epoch": 2.2500338249222027, + "grad_norm": 0.21946164965629578, + "learning_rate": 1.7873349517156574e-05, + "loss": 0.94, + "step": 8316 + }, + { + "epoch": 2.2505750236774453, + "grad_norm": 0.21115411818027496, + "learning_rate": 1.7849232522731797e-05, + "loss": 0.9332, + "step": 8318 + }, + { + "epoch": 2.2511162224326884, + "grad_norm": 0.2224206179380417, + "learning_rate": 1.782512827439824e-05, + "loss": 0.937, + "step": 8320 + }, + { + "epoch": 2.2516574211879314, + "grad_norm": 0.22236225008964539, + "learning_rate": 1.7801036781711938e-05, + "loss": 0.907, + "step": 8322 + }, + { + "epoch": 2.252198619943174, + "grad_norm": 0.21866074204444885, + "learning_rate": 1.7776958054223937e-05, + "loss": 0.9174, + "step": 8324 + }, + { + "epoch": 2.252739818698417, + "grad_norm": 0.21248647570610046, + "learning_rate": 1.775289210148013e-05, + "loss": 0.9649, + "step": 8326 + }, + { + "epoch": 2.2532810174536597, + "grad_norm": 0.2248833179473877, + "learning_rate": 1.7728838933021475e-05, + "loss": 0.9328, + "step": 8328 + }, + { + "epoch": 2.2538222162089028, + "grad_norm": 0.2298186868429184, + "learning_rate": 1.7704798558383744e-05, + "loss": 0.9345, + "step": 8330 + }, + { + "epoch": 2.2543634149641454, + "grad_norm": 0.2212422788143158, + "learning_rate": 1.76807709870977e-05, + "loss": 0.9448, + "step": 8332 + }, + { + "epoch": 2.2549046137193884, + "grad_norm": 0.2190183848142624, + "learning_rate": 1.7656756228689026e-05, + "loss": 0.9337, + "step": 8334 + }, + { + "epoch": 2.2554458124746315, + "grad_norm": 0.21614205837249756, + "learning_rate": 1.763275429267826e-05, + "loss": 0.9519, + "step": 8336 + }, + { + "epoch": 2.255987011229874, + "grad_norm": 0.22599539160728455, + "learning_rate": 1.7608765188580984e-05, + "loss": 0.9386, + "step": 8338 + }, + { + "epoch": 2.256528209985117, + "grad_norm": 0.21762165427207947, + "learning_rate": 1.7584788925907563e-05, + "loss": 0.9424, + "step": 8340 + }, + { + "epoch": 2.2570694087403598, + "grad_norm": 0.21818070113658905, + "learning_rate": 1.7560825514163366e-05, + "loss": 0.9392, + "step": 8342 + }, + { + "epoch": 2.257610607495603, + "grad_norm": 0.2244783192873001, + "learning_rate": 1.7536874962848586e-05, + "loss": 0.9595, + "step": 8344 + }, + { + "epoch": 2.2581518062508454, + "grad_norm": 0.22719615697860718, + "learning_rate": 1.7512937281458397e-05, + "loss": 0.9231, + "step": 8346 + }, + { + "epoch": 2.2586930050060885, + "grad_norm": 0.23232480883598328, + "learning_rate": 1.748901247948278e-05, + "loss": 0.9378, + "step": 8348 + }, + { + "epoch": 2.2592342037613315, + "grad_norm": 0.21811732649803162, + "learning_rate": 1.7465100566406727e-05, + "loss": 0.9525, + "step": 8350 + }, + { + "epoch": 2.259775402516574, + "grad_norm": 0.22353313863277435, + "learning_rate": 1.744120155171002e-05, + "loss": 0.9565, + "step": 8352 + }, + { + "epoch": 2.260316601271817, + "grad_norm": 0.22709165513515472, + "learning_rate": 1.7417315444867377e-05, + "loss": 0.9314, + "step": 8354 + }, + { + "epoch": 2.26085780002706, + "grad_norm": 0.21276603639125824, + "learning_rate": 1.7393442255348365e-05, + "loss": 0.9392, + "step": 8356 + }, + { + "epoch": 2.261398998782303, + "grad_norm": 0.23095040023326874, + "learning_rate": 1.7369581992617462e-05, + "loss": 0.9493, + "step": 8358 + }, + { + "epoch": 2.2619401975375455, + "grad_norm": 0.24651546776294708, + "learning_rate": 1.734573466613401e-05, + "loss": 0.9253, + "step": 8360 + }, + { + "epoch": 2.2624813962927885, + "grad_norm": 0.21786443889141083, + "learning_rate": 1.7321900285352228e-05, + "loss": 0.9345, + "step": 8362 + }, + { + "epoch": 2.2630225950480316, + "grad_norm": 0.23249082267284393, + "learning_rate": 1.7298078859721205e-05, + "loss": 0.9344, + "step": 8364 + }, + { + "epoch": 2.263563793803274, + "grad_norm": 0.22898095846176147, + "learning_rate": 1.7274270398684855e-05, + "loss": 0.9543, + "step": 8366 + }, + { + "epoch": 2.2641049925585173, + "grad_norm": 0.2174808233976364, + "learning_rate": 1.7250474911682018e-05, + "loss": 0.9257, + "step": 8368 + }, + { + "epoch": 2.26464619131376, + "grad_norm": 0.21760667860507965, + "learning_rate": 1.7226692408146316e-05, + "loss": 0.9411, + "step": 8370 + }, + { + "epoch": 2.265187390069003, + "grad_norm": 0.23155853152275085, + "learning_rate": 1.720292289750629e-05, + "loss": 0.9486, + "step": 8372 + }, + { + "epoch": 2.2657285888242455, + "grad_norm": 0.22152674198150635, + "learning_rate": 1.7179166389185293e-05, + "loss": 0.9295, + "step": 8374 + }, + { + "epoch": 2.2662697875794886, + "grad_norm": 0.24739988148212433, + "learning_rate": 1.7155422892601553e-05, + "loss": 0.9472, + "step": 8376 + }, + { + "epoch": 2.2668109863347317, + "grad_norm": 0.22437885403633118, + "learning_rate": 1.7131692417168082e-05, + "loss": 0.9461, + "step": 8378 + }, + { + "epoch": 2.2673521850899743, + "grad_norm": 0.22114497423171997, + "learning_rate": 1.7107974972292795e-05, + "loss": 0.9717, + "step": 8380 + }, + { + "epoch": 2.2678933838452173, + "grad_norm": 0.22452475130558014, + "learning_rate": 1.708427056737841e-05, + "loss": 0.9415, + "step": 8382 + }, + { + "epoch": 2.26843458260046, + "grad_norm": 0.22174549102783203, + "learning_rate": 1.7060579211822486e-05, + "loss": 0.9033, + "step": 8384 + }, + { + "epoch": 2.268975781355703, + "grad_norm": 0.21649079024791718, + "learning_rate": 1.703690091501738e-05, + "loss": 0.9269, + "step": 8386 + }, + { + "epoch": 2.2695169801109456, + "grad_norm": 0.22001945972442627, + "learning_rate": 1.7013235686350303e-05, + "loss": 0.9409, + "step": 8388 + }, + { + "epoch": 2.2700581788661887, + "grad_norm": 0.23418515920639038, + "learning_rate": 1.6989583535203295e-05, + "loss": 0.9392, + "step": 8390 + }, + { + "epoch": 2.2705993776214317, + "grad_norm": 0.21749669313430786, + "learning_rate": 1.696594447095316e-05, + "loss": 0.9147, + "step": 8392 + }, + { + "epoch": 2.2711405763766743, + "grad_norm": 0.22158776223659515, + "learning_rate": 1.694231850297156e-05, + "loss": 0.9242, + "step": 8394 + }, + { + "epoch": 2.2716817751319174, + "grad_norm": 0.22903724014759064, + "learning_rate": 1.6918705640624948e-05, + "loss": 0.9493, + "step": 8396 + }, + { + "epoch": 2.27222297388716, + "grad_norm": 0.22212767601013184, + "learning_rate": 1.689510589327461e-05, + "loss": 0.9507, + "step": 8398 + }, + { + "epoch": 2.272764172642403, + "grad_norm": 0.22192035615444183, + "learning_rate": 1.6871519270276563e-05, + "loss": 0.924, + "step": 8400 + }, + { + "epoch": 2.2733053713976457, + "grad_norm": 0.22290875017642975, + "learning_rate": 1.6847945780981713e-05, + "loss": 0.9163, + "step": 8402 + }, + { + "epoch": 2.2738465701528887, + "grad_norm": 0.24206216633319855, + "learning_rate": 1.6824385434735655e-05, + "loss": 0.9509, + "step": 8404 + }, + { + "epoch": 2.2743877689081313, + "grad_norm": 0.21022555232048035, + "learning_rate": 1.6800838240878864e-05, + "loss": 0.9548, + "step": 8406 + }, + { + "epoch": 2.2749289676633744, + "grad_norm": 0.21892598271369934, + "learning_rate": 1.6777304208746557e-05, + "loss": 0.9323, + "step": 8408 + }, + { + "epoch": 2.275470166418617, + "grad_norm": 0.2142675518989563, + "learning_rate": 1.675378334766875e-05, + "loss": 0.942, + "step": 8410 + }, + { + "epoch": 2.27601136517386, + "grad_norm": 0.20939147472381592, + "learning_rate": 1.6730275666970236e-05, + "loss": 0.9371, + "step": 8412 + }, + { + "epoch": 2.276552563929103, + "grad_norm": 0.21476370096206665, + "learning_rate": 1.6706781175970548e-05, + "loss": 0.9325, + "step": 8414 + }, + { + "epoch": 2.2770937626843457, + "grad_norm": 0.215154767036438, + "learning_rate": 1.668329988398404e-05, + "loss": 0.9324, + "step": 8416 + }, + { + "epoch": 2.2776349614395888, + "grad_norm": 0.21878786385059357, + "learning_rate": 1.6659831800319803e-05, + "loss": 0.9656, + "step": 8418 + }, + { + "epoch": 2.2781761601948314, + "grad_norm": 0.2570030391216278, + "learning_rate": 1.6636376934281718e-05, + "loss": 0.9318, + "step": 8420 + }, + { + "epoch": 2.2787173589500744, + "grad_norm": 0.23528580367565155, + "learning_rate": 1.661293529516838e-05, + "loss": 0.9247, + "step": 8422 + }, + { + "epoch": 2.279258557705317, + "grad_norm": 0.2314990907907486, + "learning_rate": 1.6589506892273195e-05, + "loss": 0.9346, + "step": 8424 + }, + { + "epoch": 2.27979975646056, + "grad_norm": 0.219787135720253, + "learning_rate": 1.6566091734884266e-05, + "loss": 0.9422, + "step": 8426 + }, + { + "epoch": 2.280340955215803, + "grad_norm": 0.21865639090538025, + "learning_rate": 1.6542689832284487e-05, + "loss": 0.9331, + "step": 8428 + }, + { + "epoch": 2.2808821539710458, + "grad_norm": 0.21845749020576477, + "learning_rate": 1.651930119375148e-05, + "loss": 0.9331, + "step": 8430 + }, + { + "epoch": 2.281423352726289, + "grad_norm": 0.2267097681760788, + "learning_rate": 1.649592582855764e-05, + "loss": 0.9372, + "step": 8432 + }, + { + "epoch": 2.2819645514815314, + "grad_norm": 0.22816410660743713, + "learning_rate": 1.6472563745970025e-05, + "loss": 0.9416, + "step": 8434 + }, + { + "epoch": 2.2825057502367745, + "grad_norm": 0.22096987068653107, + "learning_rate": 1.6449214955250507e-05, + "loss": 0.94, + "step": 8436 + }, + { + "epoch": 2.283046948992017, + "grad_norm": 0.22847194969654083, + "learning_rate": 1.642587946565566e-05, + "loss": 0.9392, + "step": 8438 + }, + { + "epoch": 2.28358814774726, + "grad_norm": 0.2178688794374466, + "learning_rate": 1.640255728643673e-05, + "loss": 0.929, + "step": 8440 + }, + { + "epoch": 2.284129346502503, + "grad_norm": 0.21666762232780457, + "learning_rate": 1.637924842683981e-05, + "loss": 0.924, + "step": 8442 + }, + { + "epoch": 2.284670545257746, + "grad_norm": 0.2161392718553543, + "learning_rate": 1.6355952896105587e-05, + "loss": 0.9315, + "step": 8444 + }, + { + "epoch": 2.285211744012989, + "grad_norm": 0.22145044803619385, + "learning_rate": 1.6332670703469548e-05, + "loss": 0.9585, + "step": 8446 + }, + { + "epoch": 2.2857529427682315, + "grad_norm": 0.2220619022846222, + "learning_rate": 1.630940185816182e-05, + "loss": 0.9334, + "step": 8448 + }, + { + "epoch": 2.2862941415234745, + "grad_norm": 0.22773829102516174, + "learning_rate": 1.6286146369407324e-05, + "loss": 0.9306, + "step": 8450 + }, + { + "epoch": 2.286835340278717, + "grad_norm": 0.22408922016620636, + "learning_rate": 1.6262904246425585e-05, + "loss": 0.9396, + "step": 8452 + }, + { + "epoch": 2.28737653903396, + "grad_norm": 0.21050365269184113, + "learning_rate": 1.6239675498430952e-05, + "loss": 0.934, + "step": 8454 + }, + { + "epoch": 2.2879177377892033, + "grad_norm": 0.2264346331357956, + "learning_rate": 1.6216460134632343e-05, + "loss": 0.935, + "step": 8456 + }, + { + "epoch": 2.288458936544446, + "grad_norm": 0.2180469036102295, + "learning_rate": 1.6193258164233466e-05, + "loss": 0.9432, + "step": 8458 + }, + { + "epoch": 2.289000135299689, + "grad_norm": 0.209541454911232, + "learning_rate": 1.617006959643269e-05, + "loss": 0.951, + "step": 8460 + }, + { + "epoch": 2.2895413340549315, + "grad_norm": 0.22174447774887085, + "learning_rate": 1.6146894440423034e-05, + "loss": 0.9301, + "step": 8462 + }, + { + "epoch": 2.2900825328101746, + "grad_norm": 0.2211778163909912, + "learning_rate": 1.612373270539224e-05, + "loss": 0.9434, + "step": 8464 + }, + { + "epoch": 2.290623731565417, + "grad_norm": 0.21868322789669037, + "learning_rate": 1.6100584400522727e-05, + "loss": 0.9282, + "step": 8466 + }, + { + "epoch": 2.2911649303206603, + "grad_norm": 0.21750010550022125, + "learning_rate": 1.60774495349916e-05, + "loss": 0.9445, + "step": 8468 + }, + { + "epoch": 2.2917061290759033, + "grad_norm": 0.23710940778255463, + "learning_rate": 1.6054328117970585e-05, + "loss": 0.9239, + "step": 8470 + }, + { + "epoch": 2.292247327831146, + "grad_norm": 0.22122523188591003, + "learning_rate": 1.603122015862614e-05, + "loss": 0.9506, + "step": 8472 + }, + { + "epoch": 2.292788526586389, + "grad_norm": 0.22228212654590607, + "learning_rate": 1.6008125666119312e-05, + "loss": 0.9355, + "step": 8474 + }, + { + "epoch": 2.2933297253416316, + "grad_norm": 0.2161496877670288, + "learning_rate": 1.5985044649605912e-05, + "loss": 0.9481, + "step": 8476 + }, + { + "epoch": 2.2938709240968747, + "grad_norm": 0.2154066562652588, + "learning_rate": 1.596197711823631e-05, + "loss": 0.9207, + "step": 8478 + }, + { + "epoch": 2.2944121228521173, + "grad_norm": 0.21459943056106567, + "learning_rate": 1.5938923081155606e-05, + "loss": 0.9293, + "step": 8480 + }, + { + "epoch": 2.2949533216073603, + "grad_norm": 0.22525747120380402, + "learning_rate": 1.5915882547503468e-05, + "loss": 0.9536, + "step": 8482 + }, + { + "epoch": 2.2954945203626034, + "grad_norm": 0.2165573537349701, + "learning_rate": 1.5892855526414292e-05, + "loss": 0.9533, + "step": 8484 + }, + { + "epoch": 2.296035719117846, + "grad_norm": 0.22310984134674072, + "learning_rate": 1.5869842027017075e-05, + "loss": 0.9319, + "step": 8486 + }, + { + "epoch": 2.296576917873089, + "grad_norm": 0.24109339714050293, + "learning_rate": 1.5846842058435458e-05, + "loss": 0.9325, + "step": 8488 + }, + { + "epoch": 2.2971181166283317, + "grad_norm": 0.2258816510438919, + "learning_rate": 1.5823855629787736e-05, + "loss": 0.9206, + "step": 8490 + }, + { + "epoch": 2.2976593153835747, + "grad_norm": 0.21822375059127808, + "learning_rate": 1.5800882750186802e-05, + "loss": 0.9228, + "step": 8492 + }, + { + "epoch": 2.2982005141388173, + "grad_norm": 0.2541802227497101, + "learning_rate": 1.5777923428740217e-05, + "loss": 0.9345, + "step": 8494 + }, + { + "epoch": 2.2987417128940604, + "grad_norm": 0.21995232999324799, + "learning_rate": 1.5754977674550098e-05, + "loss": 0.9123, + "step": 8496 + }, + { + "epoch": 2.2992829116493034, + "grad_norm": 0.2235877513885498, + "learning_rate": 1.5732045496713298e-05, + "loss": 0.9496, + "step": 8498 + }, + { + "epoch": 2.299824110404546, + "grad_norm": 0.22375322878360748, + "learning_rate": 1.570912690432118e-05, + "loss": 0.9329, + "step": 8500 + }, + { + "epoch": 2.300365309159789, + "grad_norm": 0.21761257946491241, + "learning_rate": 1.5686221906459786e-05, + "loss": 0.9402, + "step": 8502 + }, + { + "epoch": 2.3009065079150317, + "grad_norm": 0.22465704381465912, + "learning_rate": 1.5663330512209712e-05, + "loss": 0.9389, + "step": 8504 + }, + { + "epoch": 2.3014477066702748, + "grad_norm": 0.22578164935112, + "learning_rate": 1.5640452730646215e-05, + "loss": 0.9317, + "step": 8506 + }, + { + "epoch": 2.3019889054255174, + "grad_norm": 0.22684746980667114, + "learning_rate": 1.5617588570839137e-05, + "loss": 0.9176, + "step": 8508 + }, + { + "epoch": 2.3025301041807604, + "grad_norm": 0.21788343787193298, + "learning_rate": 1.5594738041852923e-05, + "loss": 0.9372, + "step": 8510 + }, + { + "epoch": 2.3030713029360035, + "grad_norm": 0.21320442855358124, + "learning_rate": 1.5571901152746588e-05, + "loss": 0.9387, + "step": 8512 + }, + { + "epoch": 2.303612501691246, + "grad_norm": 0.21150639653205872, + "learning_rate": 1.554907791257377e-05, + "loss": 0.9462, + "step": 8514 + }, + { + "epoch": 2.304153700446489, + "grad_norm": 0.21134264767169952, + "learning_rate": 1.5526268330382694e-05, + "loss": 0.9339, + "step": 8516 + }, + { + "epoch": 2.3046948992017318, + "grad_norm": 0.22074440121650696, + "learning_rate": 1.5503472415216135e-05, + "loss": 0.9525, + "step": 8518 + }, + { + "epoch": 2.305236097956975, + "grad_norm": 0.2259265035390854, + "learning_rate": 1.54806901761115e-05, + "loss": 0.9169, + "step": 8520 + }, + { + "epoch": 2.3057772967122174, + "grad_norm": 0.22465473413467407, + "learning_rate": 1.545792162210074e-05, + "loss": 0.928, + "step": 8522 + }, + { + "epoch": 2.3063184954674605, + "grad_norm": 0.21389897167682648, + "learning_rate": 1.5435166762210412e-05, + "loss": 0.9368, + "step": 8524 + }, + { + "epoch": 2.306859694222703, + "grad_norm": 0.21718525886535645, + "learning_rate": 1.5412425605461594e-05, + "loss": 0.9338, + "step": 8526 + }, + { + "epoch": 2.307400892977946, + "grad_norm": 0.2210254818201065, + "learning_rate": 1.5389698160869985e-05, + "loss": 0.9345, + "step": 8528 + }, + { + "epoch": 2.307942091733189, + "grad_norm": 0.21286837756633759, + "learning_rate": 1.536698443744578e-05, + "loss": 0.9353, + "step": 8530 + }, + { + "epoch": 2.308483290488432, + "grad_norm": 0.21803608536720276, + "learning_rate": 1.5344284444193847e-05, + "loss": 0.9565, + "step": 8532 + }, + { + "epoch": 2.309024489243675, + "grad_norm": 0.2314945012331009, + "learning_rate": 1.5321598190113486e-05, + "loss": 0.9457, + "step": 8534 + }, + { + "epoch": 2.3095656879989175, + "grad_norm": 0.22384703159332275, + "learning_rate": 1.5298925684198617e-05, + "loss": 0.9443, + "step": 8536 + }, + { + "epoch": 2.3101068867541605, + "grad_norm": 0.9141970872879028, + "learning_rate": 1.5276266935437724e-05, + "loss": 0.924, + "step": 8538 + }, + { + "epoch": 2.310648085509403, + "grad_norm": 0.2239556461572647, + "learning_rate": 1.525362195281378e-05, + "loss": 0.9474, + "step": 8540 + }, + { + "epoch": 2.311189284264646, + "grad_norm": 0.22693516314029694, + "learning_rate": 1.5230990745304336e-05, + "loss": 0.9467, + "step": 8542 + }, + { + "epoch": 2.311730483019889, + "grad_norm": 0.21579955518245697, + "learning_rate": 1.5208373321881491e-05, + "loss": 0.9292, + "step": 8544 + }, + { + "epoch": 2.312271681775132, + "grad_norm": 0.22664520144462585, + "learning_rate": 1.518576969151187e-05, + "loss": 0.9296, + "step": 8546 + }, + { + "epoch": 2.312812880530375, + "grad_norm": 0.2308238446712494, + "learning_rate": 1.51631798631566e-05, + "loss": 0.9434, + "step": 8548 + }, + { + "epoch": 2.3133540792856175, + "grad_norm": 0.22895975410938263, + "learning_rate": 1.51406038457714e-05, + "loss": 0.9519, + "step": 8550 + }, + { + "epoch": 2.3138952780408606, + "grad_norm": 0.21857978403568268, + "learning_rate": 1.511804164830643e-05, + "loss": 0.9244, + "step": 8552 + }, + { + "epoch": 2.314436476796103, + "grad_norm": 0.22424811124801636, + "learning_rate": 1.5095493279706441e-05, + "loss": 0.9272, + "step": 8554 + }, + { + "epoch": 2.3149776755513463, + "grad_norm": 0.21575234830379486, + "learning_rate": 1.5072958748910677e-05, + "loss": 0.9376, + "step": 8556 + }, + { + "epoch": 2.315518874306589, + "grad_norm": 0.21893800795078278, + "learning_rate": 1.5050438064852911e-05, + "loss": 0.9363, + "step": 8558 + }, + { + "epoch": 2.316060073061832, + "grad_norm": 0.21939443051815033, + "learning_rate": 1.5027931236461379e-05, + "loss": 0.9369, + "step": 8560 + }, + { + "epoch": 2.316601271817075, + "grad_norm": 0.21399757266044617, + "learning_rate": 1.5005438272658873e-05, + "loss": 0.9393, + "step": 8562 + }, + { + "epoch": 2.3171424705723176, + "grad_norm": 0.22195394337177277, + "learning_rate": 1.4982959182362688e-05, + "loss": 0.9343, + "step": 8564 + }, + { + "epoch": 2.3176836693275606, + "grad_norm": 0.2234540432691574, + "learning_rate": 1.4960493974484558e-05, + "loss": 0.9341, + "step": 8566 + }, + { + "epoch": 2.3182248680828033, + "grad_norm": 0.2244921773672104, + "learning_rate": 1.4938042657930817e-05, + "loss": 0.9408, + "step": 8568 + }, + { + "epoch": 2.3187660668380463, + "grad_norm": 0.22262895107269287, + "learning_rate": 1.4915605241602188e-05, + "loss": 0.9352, + "step": 8570 + }, + { + "epoch": 2.319307265593289, + "grad_norm": 0.22237823903560638, + "learning_rate": 1.4893181734393963e-05, + "loss": 0.9595, + "step": 8572 + }, + { + "epoch": 2.319848464348532, + "grad_norm": 0.21250148117542267, + "learning_rate": 1.4870772145195843e-05, + "loss": 0.9427, + "step": 8574 + }, + { + "epoch": 2.320389663103775, + "grad_norm": 0.2166733592748642, + "learning_rate": 1.4848376482892085e-05, + "loss": 0.9346, + "step": 8576 + }, + { + "epoch": 2.3209308618590176, + "grad_norm": 0.21839259564876556, + "learning_rate": 1.4825994756361382e-05, + "loss": 0.9299, + "step": 8578 + }, + { + "epoch": 2.3214720606142607, + "grad_norm": 0.22401659190654755, + "learning_rate": 1.4803626974476936e-05, + "loss": 0.9345, + "step": 8580 + }, + { + "epoch": 2.3220132593695033, + "grad_norm": 0.24845761060714722, + "learning_rate": 1.4781273146106361e-05, + "loss": 0.9177, + "step": 8582 + }, + { + "epoch": 2.3225544581247464, + "grad_norm": 0.21930010616779327, + "learning_rate": 1.4758933280111797e-05, + "loss": 0.9226, + "step": 8584 + }, + { + "epoch": 2.323095656879989, + "grad_norm": 0.2250330150127411, + "learning_rate": 1.4736607385349838e-05, + "loss": 0.9395, + "step": 8586 + }, + { + "epoch": 2.323636855635232, + "grad_norm": 0.21829088032245636, + "learning_rate": 1.4714295470671501e-05, + "loss": 0.9215, + "step": 8588 + }, + { + "epoch": 2.324178054390475, + "grad_norm": 0.2159838229417801, + "learning_rate": 1.4691997544922303e-05, + "loss": 0.9411, + "step": 8590 + }, + { + "epoch": 2.3247192531457177, + "grad_norm": 0.22546617686748505, + "learning_rate": 1.4669713616942199e-05, + "loss": 0.9265, + "step": 8592 + }, + { + "epoch": 2.3252604519009608, + "grad_norm": 0.22809801995754242, + "learning_rate": 1.4647443695565622e-05, + "loss": 0.938, + "step": 8594 + }, + { + "epoch": 2.3258016506562034, + "grad_norm": 0.2190861701965332, + "learning_rate": 1.462518778962138e-05, + "loss": 0.9386, + "step": 8596 + }, + { + "epoch": 2.3263428494114464, + "grad_norm": 0.21500615775585175, + "learning_rate": 1.4602945907932814e-05, + "loss": 0.9305, + "step": 8598 + }, + { + "epoch": 2.326884048166689, + "grad_norm": 0.23055079579353333, + "learning_rate": 1.458071805931761e-05, + "loss": 0.9271, + "step": 8600 + }, + { + "epoch": 2.327425246921932, + "grad_norm": 0.23448796570301056, + "learning_rate": 1.4558504252588007e-05, + "loss": 0.9357, + "step": 8602 + }, + { + "epoch": 2.327966445677175, + "grad_norm": 0.23531349003314972, + "learning_rate": 1.4536304496550567e-05, + "loss": 0.9239, + "step": 8604 + }, + { + "epoch": 2.3285076444324178, + "grad_norm": 0.22408518195152283, + "learning_rate": 1.451411880000636e-05, + "loss": 0.9395, + "step": 8606 + }, + { + "epoch": 2.329048843187661, + "grad_norm": 0.24350731074810028, + "learning_rate": 1.4491947171750814e-05, + "loss": 0.9595, + "step": 8608 + }, + { + "epoch": 2.3295900419429034, + "grad_norm": 0.2234962284564972, + "learning_rate": 1.4469789620573832e-05, + "loss": 0.9409, + "step": 8610 + }, + { + "epoch": 2.3301312406981465, + "grad_norm": 0.2321082204580307, + "learning_rate": 1.4447646155259726e-05, + "loss": 0.9222, + "step": 8612 + }, + { + "epoch": 2.330672439453389, + "grad_norm": 0.2343311756849289, + "learning_rate": 1.4425516784587207e-05, + "loss": 0.9303, + "step": 8614 + }, + { + "epoch": 2.331213638208632, + "grad_norm": 0.21780988574028015, + "learning_rate": 1.4403401517329434e-05, + "loss": 0.9457, + "step": 8616 + }, + { + "epoch": 2.331754836963875, + "grad_norm": 0.22351613640785217, + "learning_rate": 1.4381300362253913e-05, + "loss": 0.9341, + "step": 8618 + }, + { + "epoch": 2.332296035719118, + "grad_norm": 0.23017454147338867, + "learning_rate": 1.4359213328122622e-05, + "loss": 0.9375, + "step": 8620 + }, + { + "epoch": 2.332837234474361, + "grad_norm": 0.22003257274627686, + "learning_rate": 1.433714042369187e-05, + "loss": 0.926, + "step": 8622 + }, + { + "epoch": 2.3333784332296035, + "grad_norm": 0.22800569236278534, + "learning_rate": 1.431508165771246e-05, + "loss": 0.9665, + "step": 8624 + }, + { + "epoch": 2.3339196319848465, + "grad_norm": 0.21841825544834137, + "learning_rate": 1.4293037038929485e-05, + "loss": 0.933, + "step": 8626 + }, + { + "epoch": 2.334460830740089, + "grad_norm": 0.22250762581825256, + "learning_rate": 1.427100657608252e-05, + "loss": 0.9329, + "step": 8628 + }, + { + "epoch": 2.335002029495332, + "grad_norm": 0.21878018975257874, + "learning_rate": 1.424899027790545e-05, + "loss": 0.9186, + "step": 8630 + }, + { + "epoch": 2.3355432282505753, + "grad_norm": 0.2194346934556961, + "learning_rate": 1.4226988153126598e-05, + "loss": 0.9492, + "step": 8632 + }, + { + "epoch": 2.336084427005818, + "grad_norm": 0.2183569371700287, + "learning_rate": 1.4205000210468645e-05, + "loss": 0.9327, + "step": 8634 + }, + { + "epoch": 2.336625625761061, + "grad_norm": 0.2122499942779541, + "learning_rate": 1.4183026458648685e-05, + "loss": 0.9507, + "step": 8636 + }, + { + "epoch": 2.3371668245163035, + "grad_norm": 0.21272961795330048, + "learning_rate": 1.4161066906378113e-05, + "loss": 0.9462, + "step": 8638 + }, + { + "epoch": 2.3377080232715466, + "grad_norm": 0.21495187282562256, + "learning_rate": 1.4139121562362755e-05, + "loss": 0.9243, + "step": 8640 + }, + { + "epoch": 2.338249222026789, + "grad_norm": 0.2101580947637558, + "learning_rate": 1.4117190435302812e-05, + "loss": 0.907, + "step": 8642 + }, + { + "epoch": 2.3387904207820323, + "grad_norm": 0.2280527502298355, + "learning_rate": 1.409527353389279e-05, + "loss": 0.9212, + "step": 8644 + }, + { + "epoch": 2.3393316195372753, + "grad_norm": 0.21973061561584473, + "learning_rate": 1.40733708668216e-05, + "loss": 0.9205, + "step": 8646 + }, + { + "epoch": 2.339872818292518, + "grad_norm": 0.21221718192100525, + "learning_rate": 1.4051482442772507e-05, + "loss": 0.944, + "step": 8648 + }, + { + "epoch": 2.340414017047761, + "grad_norm": 0.2334270179271698, + "learning_rate": 1.4029608270423128e-05, + "loss": 0.9377, + "step": 8650 + }, + { + "epoch": 2.3409552158030036, + "grad_norm": 0.22014257311820984, + "learning_rate": 1.40077483584454e-05, + "loss": 0.9194, + "step": 8652 + }, + { + "epoch": 2.3414964145582466, + "grad_norm": 0.2164536565542221, + "learning_rate": 1.3985902715505661e-05, + "loss": 0.9237, + "step": 8654 + }, + { + "epoch": 2.3420376133134893, + "grad_norm": 0.21519671380519867, + "learning_rate": 1.3964071350264513e-05, + "loss": 0.9275, + "step": 8656 + }, + { + "epoch": 2.3425788120687323, + "grad_norm": 0.2254631221294403, + "learning_rate": 1.3942254271377003e-05, + "loss": 0.9427, + "step": 8658 + }, + { + "epoch": 2.343120010823975, + "grad_norm": 0.21623113751411438, + "learning_rate": 1.3920451487492419e-05, + "loss": 0.9338, + "step": 8660 + }, + { + "epoch": 2.343661209579218, + "grad_norm": 0.21347075700759888, + "learning_rate": 1.3898663007254436e-05, + "loss": 0.9364, + "step": 8662 + }, + { + "epoch": 2.3442024083344606, + "grad_norm": 0.2184230238199234, + "learning_rate": 1.3876888839301056e-05, + "loss": 0.9382, + "step": 8664 + }, + { + "epoch": 2.3447436070897036, + "grad_norm": 0.21815039217472076, + "learning_rate": 1.3855128992264554e-05, + "loss": 0.9322, + "step": 8666 + }, + { + "epoch": 2.3452848058449467, + "grad_norm": 0.21277959644794464, + "learning_rate": 1.3833383474771593e-05, + "loss": 0.935, + "step": 8668 + }, + { + "epoch": 2.3458260046001893, + "grad_norm": 0.217478409409523, + "learning_rate": 1.3811652295443122e-05, + "loss": 0.9218, + "step": 8670 + }, + { + "epoch": 2.3463672033554324, + "grad_norm": 0.219157874584198, + "learning_rate": 1.3789935462894438e-05, + "loss": 0.9131, + "step": 8672 + }, + { + "epoch": 2.346908402110675, + "grad_norm": 0.22979259490966797, + "learning_rate": 1.376823298573508e-05, + "loss": 0.9405, + "step": 8674 + }, + { + "epoch": 2.347449600865918, + "grad_norm": 0.2176111340522766, + "learning_rate": 1.3746544872568978e-05, + "loss": 0.935, + "step": 8676 + }, + { + "epoch": 2.3479907996211606, + "grad_norm": 0.22646884620189667, + "learning_rate": 1.3724871131994305e-05, + "loss": 0.9274, + "step": 8678 + }, + { + "epoch": 2.3485319983764037, + "grad_norm": 0.2228287160396576, + "learning_rate": 1.3703211772603569e-05, + "loss": 0.9456, + "step": 8680 + }, + { + "epoch": 2.3490731971316468, + "grad_norm": 0.21606303751468658, + "learning_rate": 1.368156680298357e-05, + "loss": 0.9223, + "step": 8682 + }, + { + "epoch": 2.3496143958868894, + "grad_norm": 0.21647122502326965, + "learning_rate": 1.365993623171542e-05, + "loss": 0.9146, + "step": 8684 + }, + { + "epoch": 2.3501555946421324, + "grad_norm": 0.21432216465473175, + "learning_rate": 1.3638320067374477e-05, + "loss": 0.9371, + "step": 8686 + }, + { + "epoch": 2.350696793397375, + "grad_norm": 0.21945980191230774, + "learning_rate": 1.361671831853042e-05, + "loss": 0.9518, + "step": 8688 + }, + { + "epoch": 2.351237992152618, + "grad_norm": 0.22580721974372864, + "learning_rate": 1.3595130993747217e-05, + "loss": 0.9311, + "step": 8690 + }, + { + "epoch": 2.3517791909078607, + "grad_norm": 0.21623489260673523, + "learning_rate": 1.3573558101583106e-05, + "loss": 0.9321, + "step": 8692 + }, + { + "epoch": 2.3523203896631038, + "grad_norm": 0.22183172404766083, + "learning_rate": 1.3551999650590619e-05, + "loss": 0.9422, + "step": 8694 + }, + { + "epoch": 2.352861588418347, + "grad_norm": 0.21768032014369965, + "learning_rate": 1.353045564931652e-05, + "loss": 0.9526, + "step": 8696 + }, + { + "epoch": 2.3534027871735894, + "grad_norm": 0.22030091285705566, + "learning_rate": 1.3508926106301911e-05, + "loss": 0.929, + "step": 8698 + }, + { + "epoch": 2.3539439859288325, + "grad_norm": 0.2295338213443756, + "learning_rate": 1.3487411030082093e-05, + "loss": 0.9261, + "step": 8700 + }, + { + "epoch": 2.354485184684075, + "grad_norm": 0.2111324965953827, + "learning_rate": 1.3465910429186673e-05, + "loss": 0.9143, + "step": 8702 + }, + { + "epoch": 2.355026383439318, + "grad_norm": 0.2206672579050064, + "learning_rate": 1.3444424312139514e-05, + "loss": 0.9352, + "step": 8704 + }, + { + "epoch": 2.3555675821945607, + "grad_norm": 0.2182813137769699, + "learning_rate": 1.3422952687458745e-05, + "loss": 0.9374, + "step": 8706 + }, + { + "epoch": 2.356108780949804, + "grad_norm": 0.22040089964866638, + "learning_rate": 1.3401495563656718e-05, + "loss": 0.9323, + "step": 8708 + }, + { + "epoch": 2.356649979705047, + "grad_norm": 0.22553257644176483, + "learning_rate": 1.3380052949240062e-05, + "loss": 0.9067, + "step": 8710 + }, + { + "epoch": 2.3571911784602895, + "grad_norm": 0.21787984669208527, + "learning_rate": 1.335862485270966e-05, + "loss": 0.9214, + "step": 8712 + }, + { + "epoch": 2.3577323772155325, + "grad_norm": 0.21449196338653564, + "learning_rate": 1.3337211282560608e-05, + "loss": 0.9613, + "step": 8714 + }, + { + "epoch": 2.358273575970775, + "grad_norm": 0.22243960201740265, + "learning_rate": 1.3315812247282272e-05, + "loss": 0.9527, + "step": 8716 + }, + { + "epoch": 2.358814774726018, + "grad_norm": 0.22194528579711914, + "learning_rate": 1.3294427755358241e-05, + "loss": 0.9231, + "step": 8718 + }, + { + "epoch": 2.359355973481261, + "grad_norm": 0.22155918180942535, + "learning_rate": 1.3273057815266365e-05, + "loss": 0.9148, + "step": 8720 + }, + { + "epoch": 2.359897172236504, + "grad_norm": 0.2432713508605957, + "learning_rate": 1.3251702435478669e-05, + "loss": 0.9517, + "step": 8722 + }, + { + "epoch": 2.360438370991747, + "grad_norm": 0.21524882316589355, + "learning_rate": 1.3230361624461474e-05, + "loss": 0.9312, + "step": 8724 + }, + { + "epoch": 2.3609795697469895, + "grad_norm": 0.21939432621002197, + "learning_rate": 1.320903539067524e-05, + "loss": 0.9235, + "step": 8726 + }, + { + "epoch": 2.3615207685022326, + "grad_norm": 0.23470257222652435, + "learning_rate": 1.3187723742574764e-05, + "loss": 0.9393, + "step": 8728 + }, + { + "epoch": 2.362061967257475, + "grad_norm": 0.22075091302394867, + "learning_rate": 1.3166426688608952e-05, + "loss": 0.946, + "step": 8730 + }, + { + "epoch": 2.3626031660127182, + "grad_norm": 0.23725062608718872, + "learning_rate": 1.3145144237220992e-05, + "loss": 0.927, + "step": 8732 + }, + { + "epoch": 2.363144364767961, + "grad_norm": 0.2245337814092636, + "learning_rate": 1.3123876396848228e-05, + "loss": 0.9208, + "step": 8734 + }, + { + "epoch": 2.363685563523204, + "grad_norm": 0.21437352895736694, + "learning_rate": 1.3102623175922269e-05, + "loss": 0.9128, + "step": 8736 + }, + { + "epoch": 2.364226762278447, + "grad_norm": 0.2287474125623703, + "learning_rate": 1.3081384582868883e-05, + "loss": 0.946, + "step": 8738 + }, + { + "epoch": 2.3647679610336896, + "grad_norm": 0.22067473828792572, + "learning_rate": 1.306016062610807e-05, + "loss": 0.9292, + "step": 8740 + }, + { + "epoch": 2.3653091597889326, + "grad_norm": 0.2168779820203781, + "learning_rate": 1.3038951314054026e-05, + "loss": 0.9391, + "step": 8742 + }, + { + "epoch": 2.3658503585441752, + "grad_norm": 0.22059370577335358, + "learning_rate": 1.30177566551151e-05, + "loss": 0.9239, + "step": 8744 + }, + { + "epoch": 2.3663915572994183, + "grad_norm": 0.23957104980945587, + "learning_rate": 1.2996576657693893e-05, + "loss": 0.9156, + "step": 8746 + }, + { + "epoch": 2.366932756054661, + "grad_norm": 0.24177046120166779, + "learning_rate": 1.297541133018711e-05, + "loss": 0.933, + "step": 8748 + }, + { + "epoch": 2.367473954809904, + "grad_norm": 0.22190991044044495, + "learning_rate": 1.2954260680985758e-05, + "loss": 0.9291, + "step": 8750 + }, + { + "epoch": 2.368015153565147, + "grad_norm": 0.22796812653541565, + "learning_rate": 1.2933124718474915e-05, + "loss": 0.9433, + "step": 8752 + }, + { + "epoch": 2.3685563523203896, + "grad_norm": 0.21810275316238403, + "learning_rate": 1.2912003451033905e-05, + "loss": 0.9386, + "step": 8754 + }, + { + "epoch": 2.3690975510756327, + "grad_norm": 0.22854852676391602, + "learning_rate": 1.2890896887036175e-05, + "loss": 0.9236, + "step": 8756 + }, + { + "epoch": 2.3696387498308753, + "grad_norm": 0.24949362874031067, + "learning_rate": 1.2869805034849385e-05, + "loss": 0.9242, + "step": 8758 + }, + { + "epoch": 2.3701799485861184, + "grad_norm": 0.20780764520168304, + "learning_rate": 1.284872790283535e-05, + "loss": 0.9201, + "step": 8760 + }, + { + "epoch": 2.370721147341361, + "grad_norm": 0.24522796273231506, + "learning_rate": 1.282766549935005e-05, + "loss": 0.9144, + "step": 8762 + }, + { + "epoch": 2.371262346096604, + "grad_norm": 0.22862307727336884, + "learning_rate": 1.2806617832743607e-05, + "loss": 0.9318, + "step": 8764 + }, + { + "epoch": 2.371803544851847, + "grad_norm": 0.22889967262744904, + "learning_rate": 1.2785584911360315e-05, + "loss": 0.9373, + "step": 8766 + }, + { + "epoch": 2.3723447436070897, + "grad_norm": 0.23068967461585999, + "learning_rate": 1.2764566743538647e-05, + "loss": 0.9224, + "step": 8768 + }, + { + "epoch": 2.3728859423623327, + "grad_norm": 0.21902476251125336, + "learning_rate": 1.2743563337611164e-05, + "loss": 0.9367, + "step": 8770 + }, + { + "epoch": 2.3734271411175754, + "grad_norm": 0.22904910147190094, + "learning_rate": 1.272257470190466e-05, + "loss": 0.9404, + "step": 8772 + }, + { + "epoch": 2.3739683398728184, + "grad_norm": 0.21688368916511536, + "learning_rate": 1.2701600844739992e-05, + "loss": 0.9407, + "step": 8774 + }, + { + "epoch": 2.374509538628061, + "grad_norm": 0.21657058596611023, + "learning_rate": 1.2680641774432218e-05, + "loss": 0.9346, + "step": 8776 + }, + { + "epoch": 2.375050737383304, + "grad_norm": 0.24167576432228088, + "learning_rate": 1.265969749929048e-05, + "loss": 0.9344, + "step": 8778 + }, + { + "epoch": 2.3755919361385467, + "grad_norm": 0.2237577736377716, + "learning_rate": 1.2638768027618108e-05, + "loss": 0.9469, + "step": 8780 + }, + { + "epoch": 2.3761331348937897, + "grad_norm": 0.21517324447631836, + "learning_rate": 1.2617853367712513e-05, + "loss": 0.9308, + "step": 8782 + }, + { + "epoch": 2.376674333649033, + "grad_norm": 0.2111019790172577, + "learning_rate": 1.2596953527865296e-05, + "loss": 0.9262, + "step": 8784 + }, + { + "epoch": 2.3772155324042754, + "grad_norm": 0.21131522953510284, + "learning_rate": 1.2576068516362115e-05, + "loss": 0.9528, + "step": 8786 + }, + { + "epoch": 2.3777567311595185, + "grad_norm": 0.22655946016311646, + "learning_rate": 1.2555198341482787e-05, + "loss": 0.9262, + "step": 8788 + }, + { + "epoch": 2.378297929914761, + "grad_norm": 0.219539612531662, + "learning_rate": 1.2534343011501265e-05, + "loss": 0.9403, + "step": 8790 + }, + { + "epoch": 2.378839128670004, + "grad_norm": 0.21194614470005035, + "learning_rate": 1.2513502534685557e-05, + "loss": 0.9279, + "step": 8792 + }, + { + "epoch": 2.3793803274252467, + "grad_norm": 0.2173326015472412, + "learning_rate": 1.2492676919297835e-05, + "loss": 0.936, + "step": 8794 + }, + { + "epoch": 2.37992152618049, + "grad_norm": 0.22489236295223236, + "learning_rate": 1.2471866173594355e-05, + "loss": 0.9443, + "step": 8796 + }, + { + "epoch": 2.3804627249357324, + "grad_norm": 0.21330489218235016, + "learning_rate": 1.2451070305825508e-05, + "loss": 0.9404, + "step": 8798 + }, + { + "epoch": 2.3810039236909755, + "grad_norm": 0.216909721493721, + "learning_rate": 1.2430289324235734e-05, + "loss": 0.9148, + "step": 8800 + }, + { + "epoch": 2.3815451224462185, + "grad_norm": 0.21142475306987762, + "learning_rate": 1.2409523237063626e-05, + "loss": 0.9249, + "step": 8802 + }, + { + "epoch": 2.382086321201461, + "grad_norm": 0.21710895001888275, + "learning_rate": 1.2388772052541819e-05, + "loss": 0.9204, + "step": 8804 + }, + { + "epoch": 2.382627519956704, + "grad_norm": 0.21345914900302887, + "learning_rate": 1.236803577889708e-05, + "loss": 0.9441, + "step": 8806 + }, + { + "epoch": 2.383168718711947, + "grad_norm": 0.21150511503219604, + "learning_rate": 1.2347314424350258e-05, + "loss": 0.9236, + "step": 8808 + }, + { + "epoch": 2.38370991746719, + "grad_norm": 0.2135467380285263, + "learning_rate": 1.2326607997116302e-05, + "loss": 0.9159, + "step": 8810 + }, + { + "epoch": 2.3842511162224325, + "grad_norm": 0.2189696580171585, + "learning_rate": 1.230591650540418e-05, + "loss": 0.9151, + "step": 8812 + }, + { + "epoch": 2.3847923149776755, + "grad_norm": 0.22715598344802856, + "learning_rate": 1.2285239957417006e-05, + "loss": 0.9227, + "step": 8814 + }, + { + "epoch": 2.3853335137329186, + "grad_norm": 0.21056193113327026, + "learning_rate": 1.2264578361351942e-05, + "loss": 0.9375, + "step": 8816 + }, + { + "epoch": 2.385874712488161, + "grad_norm": 0.22025969624519348, + "learning_rate": 1.2243931725400227e-05, + "loss": 0.9474, + "step": 8818 + }, + { + "epoch": 2.3864159112434042, + "grad_norm": 0.20886293053627014, + "learning_rate": 1.2223300057747178e-05, + "loss": 0.9282, + "step": 8820 + }, + { + "epoch": 2.386957109998647, + "grad_norm": 0.21318678557872772, + "learning_rate": 1.2202683366572149e-05, + "loss": 0.9176, + "step": 8822 + }, + { + "epoch": 2.38749830875389, + "grad_norm": 0.214345782995224, + "learning_rate": 1.2182081660048599e-05, + "loss": 0.9366, + "step": 8824 + }, + { + "epoch": 2.3880395075091325, + "grad_norm": 0.21186871826648712, + "learning_rate": 1.2161494946343993e-05, + "loss": 0.9522, + "step": 8826 + }, + { + "epoch": 2.3885807062643756, + "grad_norm": 0.2160114049911499, + "learning_rate": 1.21409232336199e-05, + "loss": 0.9229, + "step": 8828 + }, + { + "epoch": 2.3891219050196186, + "grad_norm": 0.21936126053333282, + "learning_rate": 1.2120366530031917e-05, + "loss": 0.9445, + "step": 8830 + }, + { + "epoch": 2.3896631037748612, + "grad_norm": 0.22110065817832947, + "learning_rate": 1.2099824843729718e-05, + "loss": 0.9096, + "step": 8832 + }, + { + "epoch": 2.3902043025301043, + "grad_norm": 0.22161610424518585, + "learning_rate": 1.2079298182856974e-05, + "loss": 0.9204, + "step": 8834 + }, + { + "epoch": 2.390745501285347, + "grad_norm": 0.2100730836391449, + "learning_rate": 1.2058786555551443e-05, + "loss": 0.9133, + "step": 8836 + }, + { + "epoch": 2.39128670004059, + "grad_norm": 0.21802273392677307, + "learning_rate": 1.2038289969944922e-05, + "loss": 0.9318, + "step": 8838 + }, + { + "epoch": 2.3918278987958326, + "grad_norm": 0.21906720101833344, + "learning_rate": 1.2017808434163197e-05, + "loss": 0.94, + "step": 8840 + }, + { + "epoch": 2.3923690975510756, + "grad_norm": 0.21740037202835083, + "learning_rate": 1.1997341956326142e-05, + "loss": 0.9103, + "step": 8842 + }, + { + "epoch": 2.3929102963063187, + "grad_norm": 0.21796771883964539, + "learning_rate": 1.1976890544547642e-05, + "loss": 0.913, + "step": 8844 + }, + { + "epoch": 2.3934514950615613, + "grad_norm": 0.21872743964195251, + "learning_rate": 1.1956454206935618e-05, + "loss": 0.935, + "step": 8846 + }, + { + "epoch": 2.3939926938168044, + "grad_norm": 0.22379183769226074, + "learning_rate": 1.1936032951591975e-05, + "loss": 0.9018, + "step": 8848 + }, + { + "epoch": 2.394533892572047, + "grad_norm": 0.254353791475296, + "learning_rate": 1.1915626786612689e-05, + "loss": 0.9443, + "step": 8850 + }, + { + "epoch": 2.39507509132729, + "grad_norm": 0.2170911729335785, + "learning_rate": 1.1895235720087728e-05, + "loss": 0.9593, + "step": 8852 + }, + { + "epoch": 2.3956162900825326, + "grad_norm": 0.2107522040605545, + "learning_rate": 1.1874859760101092e-05, + "loss": 0.9167, + "step": 8854 + }, + { + "epoch": 2.3961574888377757, + "grad_norm": 0.21663688123226166, + "learning_rate": 1.1854498914730755e-05, + "loss": 0.9261, + "step": 8856 + }, + { + "epoch": 2.3966986875930187, + "grad_norm": 0.22999665141105652, + "learning_rate": 1.1834153192048753e-05, + "loss": 0.9405, + "step": 8858 + }, + { + "epoch": 2.3972398863482614, + "grad_norm": 0.224771648645401, + "learning_rate": 1.1813822600121067e-05, + "loss": 0.915, + "step": 8860 + }, + { + "epoch": 2.3977810851035044, + "grad_norm": 0.21687954664230347, + "learning_rate": 1.1793507147007715e-05, + "loss": 0.934, + "step": 8862 + }, + { + "epoch": 2.398322283858747, + "grad_norm": 0.22275640070438385, + "learning_rate": 1.1773206840762724e-05, + "loss": 0.9288, + "step": 8864 + }, + { + "epoch": 2.39886348261399, + "grad_norm": 0.22419123351573944, + "learning_rate": 1.1752921689434081e-05, + "loss": 0.9217, + "step": 8866 + }, + { + "epoch": 2.3994046813692327, + "grad_norm": 0.22479164600372314, + "learning_rate": 1.1732651701063813e-05, + "loss": 0.9294, + "step": 8868 + }, + { + "epoch": 2.3999458801244757, + "grad_norm": 0.21974916756153107, + "learning_rate": 1.171239688368787e-05, + "loss": 0.9275, + "step": 8870 + }, + { + "epoch": 2.400487078879719, + "grad_norm": 0.23250152170658112, + "learning_rate": 1.169215724533625e-05, + "loss": 0.9382, + "step": 8872 + }, + { + "epoch": 2.4010282776349614, + "grad_norm": 0.21765121817588806, + "learning_rate": 1.167193279403287e-05, + "loss": 0.9181, + "step": 8874 + }, + { + "epoch": 2.4015694763902045, + "grad_norm": 0.2235615998506546, + "learning_rate": 1.1651723537795716e-05, + "loss": 0.9382, + "step": 8876 + }, + { + "epoch": 2.402110675145447, + "grad_norm": 0.22147420048713684, + "learning_rate": 1.1631529484636661e-05, + "loss": 0.9465, + "step": 8878 + }, + { + "epoch": 2.40265187390069, + "grad_norm": 0.2214687168598175, + "learning_rate": 1.1611350642561608e-05, + "loss": 0.9454, + "step": 8880 + }, + { + "epoch": 2.4031930726559327, + "grad_norm": 0.21026647090911865, + "learning_rate": 1.1591187019570382e-05, + "loss": 0.9387, + "step": 8882 + }, + { + "epoch": 2.403734271411176, + "grad_norm": 0.2267550379037857, + "learning_rate": 1.1571038623656827e-05, + "loss": 0.9336, + "step": 8884 + }, + { + "epoch": 2.404275470166419, + "grad_norm": 0.23210947215557098, + "learning_rate": 1.1550905462808687e-05, + "loss": 0.9295, + "step": 8886 + }, + { + "epoch": 2.4048166689216615, + "grad_norm": 0.2207941859960556, + "learning_rate": 1.1530787545007754e-05, + "loss": 0.938, + "step": 8888 + }, + { + "epoch": 2.4053578676769045, + "grad_norm": 0.22868280112743378, + "learning_rate": 1.1510684878229683e-05, + "loss": 0.9301, + "step": 8890 + }, + { + "epoch": 2.405899066432147, + "grad_norm": 0.2194068282842636, + "learning_rate": 1.1490597470444147e-05, + "loss": 0.9294, + "step": 8892 + }, + { + "epoch": 2.40644026518739, + "grad_norm": 0.22043105959892273, + "learning_rate": 1.1470525329614762e-05, + "loss": 0.9185, + "step": 8894 + }, + { + "epoch": 2.406981463942633, + "grad_norm": 0.23089595139026642, + "learning_rate": 1.1450468463699033e-05, + "loss": 0.9192, + "step": 8896 + }, + { + "epoch": 2.407522662697876, + "grad_norm": 0.21437110006809235, + "learning_rate": 1.1430426880648504e-05, + "loss": 0.9519, + "step": 8898 + }, + { + "epoch": 2.408063861453119, + "grad_norm": 0.23556619882583618, + "learning_rate": 1.1410400588408576e-05, + "loss": 0.9403, + "step": 8900 + }, + { + "epoch": 2.4086050602083615, + "grad_norm": 0.2320043295621872, + "learning_rate": 1.1390389594918649e-05, + "loss": 0.9291, + "step": 8902 + }, + { + "epoch": 2.4091462589636046, + "grad_norm": 0.21659469604492188, + "learning_rate": 1.1370393908112003e-05, + "loss": 0.9219, + "step": 8904 + }, + { + "epoch": 2.409687457718847, + "grad_norm": 0.23196423053741455, + "learning_rate": 1.135041353591591e-05, + "loss": 0.9298, + "step": 8906 + }, + { + "epoch": 2.4102286564740902, + "grad_norm": 0.21749107539653778, + "learning_rate": 1.1330448486251488e-05, + "loss": 0.9255, + "step": 8908 + }, + { + "epoch": 2.410769855229333, + "grad_norm": 0.2233181893825531, + "learning_rate": 1.1310498767033895e-05, + "loss": 0.9377, + "step": 8910 + }, + { + "epoch": 2.411311053984576, + "grad_norm": 0.21337848901748657, + "learning_rate": 1.1290564386172099e-05, + "loss": 0.9285, + "step": 8912 + }, + { + "epoch": 2.4118522527398185, + "grad_norm": 0.21738334000110626, + "learning_rate": 1.1270645351569048e-05, + "loss": 0.9224, + "step": 8914 + }, + { + "epoch": 2.4123934514950616, + "grad_norm": 0.22094985842704773, + "learning_rate": 1.1250741671121617e-05, + "loss": 0.9259, + "step": 8916 + }, + { + "epoch": 2.412934650250304, + "grad_norm": 0.23166124522686005, + "learning_rate": 1.1230853352720532e-05, + "loss": 0.9358, + "step": 8918 + }, + { + "epoch": 2.4134758490055472, + "grad_norm": 0.22243624925613403, + "learning_rate": 1.1210980404250488e-05, + "loss": 0.954, + "step": 8920 + }, + { + "epoch": 2.4140170477607903, + "grad_norm": 0.2145751416683197, + "learning_rate": 1.1191122833590062e-05, + "loss": 0.9142, + "step": 8922 + }, + { + "epoch": 2.414558246516033, + "grad_norm": 0.22256283462047577, + "learning_rate": 1.1171280648611754e-05, + "loss": 0.9088, + "step": 8924 + }, + { + "epoch": 2.415099445271276, + "grad_norm": 0.218195378780365, + "learning_rate": 1.1151453857181921e-05, + "loss": 0.931, + "step": 8926 + }, + { + "epoch": 2.4156406440265186, + "grad_norm": 0.21641704440116882, + "learning_rate": 1.113164246716087e-05, + "loss": 0.914, + "step": 8928 + }, + { + "epoch": 2.4161818427817616, + "grad_norm": 0.22065632045269012, + "learning_rate": 1.1111846486402732e-05, + "loss": 0.9272, + "step": 8930 + }, + { + "epoch": 2.4167230415370042, + "grad_norm": 0.21814922988414764, + "learning_rate": 1.109206592275564e-05, + "loss": 0.9346, + "step": 8932 + }, + { + "epoch": 2.4172642402922473, + "grad_norm": 0.22115272283554077, + "learning_rate": 1.1072300784061502e-05, + "loss": 0.9549, + "step": 8934 + }, + { + "epoch": 2.4178054390474903, + "grad_norm": 0.2396317720413208, + "learning_rate": 1.1052551078156181e-05, + "loss": 0.9327, + "step": 8936 + }, + { + "epoch": 2.418346637802733, + "grad_norm": 0.22320285439491272, + "learning_rate": 1.1032816812869374e-05, + "loss": 0.9275, + "step": 8938 + }, + { + "epoch": 2.418887836557976, + "grad_norm": 0.20963533222675323, + "learning_rate": 1.1013097996024696e-05, + "loss": 0.93, + "step": 8940 + }, + { + "epoch": 2.4194290353132186, + "grad_norm": 0.23613232374191284, + "learning_rate": 1.0993394635439625e-05, + "loss": 0.9225, + "step": 8942 + }, + { + "epoch": 2.4199702340684617, + "grad_norm": 0.21615718305110931, + "learning_rate": 1.0973706738925505e-05, + "loss": 0.9239, + "step": 8944 + }, + { + "epoch": 2.4205114328237043, + "grad_norm": 0.22904038429260254, + "learning_rate": 1.0954034314287565e-05, + "loss": 0.9559, + "step": 8946 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.22816400229930878, + "learning_rate": 1.0934377369324866e-05, + "loss": 0.9111, + "step": 8948 + }, + { + "epoch": 2.4215938303341904, + "grad_norm": 0.24029776453971863, + "learning_rate": 1.0914735911830376e-05, + "loss": 0.934, + "step": 8950 + }, + { + "epoch": 2.422135029089433, + "grad_norm": 0.21745756268501282, + "learning_rate": 1.0895109949590881e-05, + "loss": 0.9391, + "step": 8952 + }, + { + "epoch": 2.422676227844676, + "grad_norm": 0.22948989272117615, + "learning_rate": 1.0875499490387058e-05, + "loss": 0.9428, + "step": 8954 + }, + { + "epoch": 2.4232174265999187, + "grad_norm": 0.21396519243717194, + "learning_rate": 1.0855904541993416e-05, + "loss": 0.9353, + "step": 8956 + }, + { + "epoch": 2.4237586253551617, + "grad_norm": 0.22951596975326538, + "learning_rate": 1.0836325112178347e-05, + "loss": 0.9192, + "step": 8958 + }, + { + "epoch": 2.4242998241104043, + "grad_norm": 0.21889586746692657, + "learning_rate": 1.0816761208704035e-05, + "loss": 0.9458, + "step": 8960 + }, + { + "epoch": 2.4248410228656474, + "grad_norm": 0.22831827402114868, + "learning_rate": 1.0797212839326565e-05, + "loss": 0.9339, + "step": 8962 + }, + { + "epoch": 2.4253822216208905, + "grad_norm": 0.22102707624435425, + "learning_rate": 1.0777680011795815e-05, + "loss": 0.9236, + "step": 8964 + }, + { + "epoch": 2.425923420376133, + "grad_norm": 0.21797436475753784, + "learning_rate": 1.0758162733855543e-05, + "loss": 0.9571, + "step": 8966 + }, + { + "epoch": 2.426464619131376, + "grad_norm": 0.22361426055431366, + "learning_rate": 1.0738661013243317e-05, + "loss": 0.9305, + "step": 8968 + }, + { + "epoch": 2.4270058178866187, + "grad_norm": 0.2192944437265396, + "learning_rate": 1.0719174857690546e-05, + "loss": 0.9416, + "step": 8970 + }, + { + "epoch": 2.427547016641862, + "grad_norm": 0.21529456973075867, + "learning_rate": 1.069970427492249e-05, + "loss": 0.9302, + "step": 8972 + }, + { + "epoch": 2.4280882153971044, + "grad_norm": 0.2117040604352951, + "learning_rate": 1.0680249272658177e-05, + "loss": 0.9382, + "step": 8974 + }, + { + "epoch": 2.4286294141523475, + "grad_norm": 0.21386168897151947, + "learning_rate": 1.066080985861051e-05, + "loss": 0.9256, + "step": 8976 + }, + { + "epoch": 2.4291706129075905, + "grad_norm": 0.21614041924476624, + "learning_rate": 1.0641386040486196e-05, + "loss": 0.9378, + "step": 8978 + }, + { + "epoch": 2.429711811662833, + "grad_norm": 0.21554140746593475, + "learning_rate": 1.0621977825985774e-05, + "loss": 0.9392, + "step": 8980 + }, + { + "epoch": 2.430253010418076, + "grad_norm": 0.21622812747955322, + "learning_rate": 1.0602585222803551e-05, + "loss": 0.9478, + "step": 8982 + }, + { + "epoch": 2.430794209173319, + "grad_norm": 0.22021763026714325, + "learning_rate": 1.0583208238627712e-05, + "loss": 0.9528, + "step": 8984 + }, + { + "epoch": 2.431335407928562, + "grad_norm": 0.22095142304897308, + "learning_rate": 1.0563846881140172e-05, + "loss": 0.9492, + "step": 8986 + }, + { + "epoch": 2.4318766066838045, + "grad_norm": 0.21551690995693207, + "learning_rate": 1.0544501158016724e-05, + "loss": 0.9157, + "step": 8988 + }, + { + "epoch": 2.4324178054390475, + "grad_norm": 0.23666973412036896, + "learning_rate": 1.0525171076926915e-05, + "loss": 0.9231, + "step": 8990 + }, + { + "epoch": 2.4329590041942906, + "grad_norm": 0.22699445486068726, + "learning_rate": 1.050585664553413e-05, + "loss": 0.9415, + "step": 8992 + }, + { + "epoch": 2.433500202949533, + "grad_norm": 0.21800200641155243, + "learning_rate": 1.0486557871495495e-05, + "loss": 0.938, + "step": 8994 + }, + { + "epoch": 2.4340414017047762, + "grad_norm": 0.2277022898197174, + "learning_rate": 1.0467274762461982e-05, + "loss": 0.9643, + "step": 8996 + }, + { + "epoch": 2.434582600460019, + "grad_norm": 0.21464087069034576, + "learning_rate": 1.0448007326078335e-05, + "loss": 0.916, + "step": 8998 + }, + { + "epoch": 2.435123799215262, + "grad_norm": 0.22692528367042542, + "learning_rate": 1.0428755569983045e-05, + "loss": 0.9288, + "step": 9000 + }, + { + "epoch": 2.4356649979705045, + "grad_norm": 0.22359338402748108, + "learning_rate": 1.0409519501808479e-05, + "loss": 0.9389, + "step": 9002 + }, + { + "epoch": 2.4362061967257476, + "grad_norm": 0.21742232143878937, + "learning_rate": 1.0390299129180681e-05, + "loss": 0.9252, + "step": 9004 + }, + { + "epoch": 2.4367473954809906, + "grad_norm": 0.25492212176322937, + "learning_rate": 1.0371094459719555e-05, + "loss": 0.9408, + "step": 9006 + }, + { + "epoch": 2.4372885942362332, + "grad_norm": 0.22658966481685638, + "learning_rate": 1.0351905501038711e-05, + "loss": 0.9586, + "step": 9008 + }, + { + "epoch": 2.4378297929914763, + "grad_norm": 0.22934678196907043, + "learning_rate": 1.0332732260745582e-05, + "loss": 0.9165, + "step": 9010 + }, + { + "epoch": 2.438370991746719, + "grad_norm": 0.22674311697483063, + "learning_rate": 1.031357474644134e-05, + "loss": 0.9354, + "step": 9012 + }, + { + "epoch": 2.438912190501962, + "grad_norm": 0.2146492898464203, + "learning_rate": 1.0294432965720962e-05, + "loss": 0.9243, + "step": 9014 + }, + { + "epoch": 2.4394533892572046, + "grad_norm": 0.235643669962883, + "learning_rate": 1.0275306926173117e-05, + "loss": 0.9214, + "step": 9016 + }, + { + "epoch": 2.4399945880124476, + "grad_norm": 0.21670803427696228, + "learning_rate": 1.0256196635380306e-05, + "loss": 0.9158, + "step": 9018 + }, + { + "epoch": 2.4405357867676907, + "grad_norm": 0.2467275708913803, + "learning_rate": 1.0237102100918756e-05, + "loss": 0.9346, + "step": 9020 + }, + { + "epoch": 2.4410769855229333, + "grad_norm": 0.7340829372406006, + "learning_rate": 1.0218023330358407e-05, + "loss": 0.9169, + "step": 9022 + }, + { + "epoch": 2.4416181842781763, + "grad_norm": 0.22298909723758698, + "learning_rate": 1.0198960331263051e-05, + "loss": 0.9177, + "step": 9024 + }, + { + "epoch": 2.442159383033419, + "grad_norm": 0.2362624704837799, + "learning_rate": 1.0179913111190126e-05, + "loss": 0.9437, + "step": 9026 + }, + { + "epoch": 2.442700581788662, + "grad_norm": 0.2193574756383896, + "learning_rate": 1.0160881677690876e-05, + "loss": 0.9459, + "step": 9028 + }, + { + "epoch": 2.4432417805439046, + "grad_norm": 0.2264617532491684, + "learning_rate": 1.0141866038310239e-05, + "loss": 0.9568, + "step": 9030 + }, + { + "epoch": 2.4437829792991477, + "grad_norm": 0.22021548449993134, + "learning_rate": 1.0122866200586944e-05, + "loss": 0.9207, + "step": 9032 + }, + { + "epoch": 2.4443241780543903, + "grad_norm": 0.22166012227535248, + "learning_rate": 1.0103882172053392e-05, + "loss": 0.9291, + "step": 9034 + }, + { + "epoch": 2.4448653768096333, + "grad_norm": 0.24701930582523346, + "learning_rate": 1.0084913960235808e-05, + "loss": 0.9185, + "step": 9036 + }, + { + "epoch": 2.4454065755648764, + "grad_norm": 0.22556249797344208, + "learning_rate": 1.0065961572654047e-05, + "loss": 0.9276, + "step": 9038 + }, + { + "epoch": 2.445947774320119, + "grad_norm": 0.21386824548244476, + "learning_rate": 1.0047025016821765e-05, + "loss": 0.9429, + "step": 9040 + }, + { + "epoch": 2.446488973075362, + "grad_norm": 0.23600448668003082, + "learning_rate": 1.0028104300246289e-05, + "loss": 0.9212, + "step": 9042 + }, + { + "epoch": 2.4470301718306047, + "grad_norm": 0.22147853672504425, + "learning_rate": 1.0009199430428696e-05, + "loss": 0.936, + "step": 9044 + }, + { + "epoch": 2.4475713705858477, + "grad_norm": 0.22897976636886597, + "learning_rate": 9.990310414863773e-06, + "loss": 0.9503, + "step": 9046 + }, + { + "epoch": 2.4481125693410903, + "grad_norm": 0.21999779343605042, + "learning_rate": 9.971437261040034e-06, + "loss": 0.9298, + "step": 9048 + }, + { + "epoch": 2.4486537680963334, + "grad_norm": 0.22082196176052094, + "learning_rate": 9.952579976439696e-06, + "loss": 0.9521, + "step": 9050 + }, + { + "epoch": 2.449194966851576, + "grad_norm": 0.22231553494930267, + "learning_rate": 9.933738568538663e-06, + "loss": 0.9153, + "step": 9052 + }, + { + "epoch": 2.449736165606819, + "grad_norm": 0.22483180463314056, + "learning_rate": 9.914913044806589e-06, + "loss": 0.9106, + "step": 9054 + }, + { + "epoch": 2.450277364362062, + "grad_norm": 0.21786735951900482, + "learning_rate": 9.896103412706764e-06, + "loss": 0.9298, + "step": 9056 + }, + { + "epoch": 2.4508185631173047, + "grad_norm": 0.22572767734527588, + "learning_rate": 9.877309679696273e-06, + "loss": 0.9335, + "step": 9058 + }, + { + "epoch": 2.451359761872548, + "grad_norm": 0.22724193334579468, + "learning_rate": 9.8585318532258e-06, + "loss": 0.9508, + "step": 9060 + }, + { + "epoch": 2.4519009606277904, + "grad_norm": 0.2230607122182846, + "learning_rate": 9.839769940739801e-06, + "loss": 0.9396, + "step": 9062 + }, + { + "epoch": 2.4524421593830334, + "grad_norm": 0.21672306954860687, + "learning_rate": 9.821023949676362e-06, + "loss": 0.9426, + "step": 9064 + }, + { + "epoch": 2.452983358138276, + "grad_norm": 0.21486173570156097, + "learning_rate": 9.80229388746729e-06, + "loss": 0.9174, + "step": 9066 + }, + { + "epoch": 2.453524556893519, + "grad_norm": 0.21360976994037628, + "learning_rate": 9.783579761538081e-06, + "loss": 0.9194, + "step": 9068 + }, + { + "epoch": 2.454065755648762, + "grad_norm": 0.23886175453662872, + "learning_rate": 9.764881579307911e-06, + "loss": 0.9358, + "step": 9070 + }, + { + "epoch": 2.454606954404005, + "grad_norm": 0.2210131138563156, + "learning_rate": 9.746199348189605e-06, + "loss": 0.934, + "step": 9072 + }, + { + "epoch": 2.455148153159248, + "grad_norm": 0.21905779838562012, + "learning_rate": 9.72753307558969e-06, + "loss": 0.9213, + "step": 9074 + }, + { + "epoch": 2.4556893519144904, + "grad_norm": 0.22789104282855988, + "learning_rate": 9.70888276890839e-06, + "loss": 0.9081, + "step": 9076 + }, + { + "epoch": 2.4562305506697335, + "grad_norm": 0.21520422399044037, + "learning_rate": 9.69024843553954e-06, + "loss": 0.9471, + "step": 9078 + }, + { + "epoch": 2.456771749424976, + "grad_norm": 0.23059752583503723, + "learning_rate": 9.67163008287068e-06, + "loss": 0.9275, + "step": 9080 + }, + { + "epoch": 2.457312948180219, + "grad_norm": 0.2139766663312912, + "learning_rate": 9.653027718283026e-06, + "loss": 0.9191, + "step": 9082 + }, + { + "epoch": 2.4578541469354622, + "grad_norm": 0.2290935069322586, + "learning_rate": 9.634441349151436e-06, + "loss": 0.9461, + "step": 9084 + }, + { + "epoch": 2.458395345690705, + "grad_norm": 0.2176143229007721, + "learning_rate": 9.615870982844411e-06, + "loss": 0.9307, + "step": 9086 + }, + { + "epoch": 2.458936544445948, + "grad_norm": 0.23383669555187225, + "learning_rate": 9.597316626724151e-06, + "loss": 0.952, + "step": 9088 + }, + { + "epoch": 2.4594777432011905, + "grad_norm": 0.21901027858257294, + "learning_rate": 9.578778288146445e-06, + "loss": 0.9212, + "step": 9090 + }, + { + "epoch": 2.4600189419564336, + "grad_norm": 0.25713956356048584, + "learning_rate": 9.560255974460818e-06, + "loss": 0.9429, + "step": 9092 + }, + { + "epoch": 2.460560140711676, + "grad_norm": 0.22989803552627563, + "learning_rate": 9.541749693010371e-06, + "loss": 0.9339, + "step": 9094 + }, + { + "epoch": 2.4611013394669192, + "grad_norm": 0.22521257400512695, + "learning_rate": 9.523259451131866e-06, + "loss": 0.9184, + "step": 9096 + }, + { + "epoch": 2.4616425382221623, + "grad_norm": 0.23861786723136902, + "learning_rate": 9.50478525615574e-06, + "loss": 0.941, + "step": 9098 + }, + { + "epoch": 2.462183736977405, + "grad_norm": 0.24066314101219177, + "learning_rate": 9.486327115406013e-06, + "loss": 0.9305, + "step": 9100 + }, + { + "epoch": 2.462724935732648, + "grad_norm": 0.22237703204154968, + "learning_rate": 9.467885036200374e-06, + "loss": 0.9363, + "step": 9102 + }, + { + "epoch": 2.4632661344878906, + "grad_norm": 0.21756206452846527, + "learning_rate": 9.449459025850149e-06, + "loss": 0.9423, + "step": 9104 + }, + { + "epoch": 2.4638073332431336, + "grad_norm": 0.25314486026763916, + "learning_rate": 9.431049091660294e-06, + "loss": 0.9019, + "step": 9106 + }, + { + "epoch": 2.4643485319983762, + "grad_norm": 0.21412117779254913, + "learning_rate": 9.412655240929353e-06, + "loss": 0.94, + "step": 9108 + }, + { + "epoch": 2.4648897307536193, + "grad_norm": 0.24650193750858307, + "learning_rate": 9.394277480949542e-06, + "loss": 0.9254, + "step": 9110 + }, + { + "epoch": 2.4654309295088623, + "grad_norm": 0.22870595753192902, + "learning_rate": 9.375915819006664e-06, + "loss": 0.9336, + "step": 9112 + }, + { + "epoch": 2.465972128264105, + "grad_norm": 0.2240973711013794, + "learning_rate": 9.357570262380161e-06, + "loss": 0.9306, + "step": 9114 + }, + { + "epoch": 2.466513327019348, + "grad_norm": 0.2161659598350525, + "learning_rate": 9.339240818343087e-06, + "loss": 0.9254, + "step": 9116 + }, + { + "epoch": 2.4670545257745906, + "grad_norm": 0.2153376042842865, + "learning_rate": 9.320927494162108e-06, + "loss": 0.9247, + "step": 9118 + }, + { + "epoch": 2.4675957245298337, + "grad_norm": 0.21541644632816315, + "learning_rate": 9.302630297097475e-06, + "loss": 0.9144, + "step": 9120 + }, + { + "epoch": 2.4681369232850763, + "grad_norm": 0.2231806069612503, + "learning_rate": 9.284349234403084e-06, + "loss": 0.9298, + "step": 9122 + }, + { + "epoch": 2.4686781220403193, + "grad_norm": 0.21714667975902557, + "learning_rate": 9.266084313326417e-06, + "loss": 0.9358, + "step": 9124 + }, + { + "epoch": 2.4692193207955624, + "grad_norm": 0.21613052487373352, + "learning_rate": 9.24783554110853e-06, + "loss": 0.9203, + "step": 9126 + }, + { + "epoch": 2.469760519550805, + "grad_norm": 0.22005732357501984, + "learning_rate": 9.229602924984149e-06, + "loss": 0.9576, + "step": 9128 + }, + { + "epoch": 2.470301718306048, + "grad_norm": 0.21947814524173737, + "learning_rate": 9.21138647218151e-06, + "loss": 0.9276, + "step": 9130 + }, + { + "epoch": 2.4708429170612907, + "grad_norm": 0.23873908817768097, + "learning_rate": 9.193186189922509e-06, + "loss": 0.9128, + "step": 9132 + }, + { + "epoch": 2.4713841158165337, + "grad_norm": 0.22183538973331451, + "learning_rate": 9.175002085422568e-06, + "loss": 0.9474, + "step": 9134 + }, + { + "epoch": 2.4719253145717763, + "grad_norm": 0.21653030812740326, + "learning_rate": 9.156834165890754e-06, + "loss": 0.936, + "step": 9136 + }, + { + "epoch": 2.4724665133270194, + "grad_norm": 0.22118207812309265, + "learning_rate": 9.138682438529684e-06, + "loss": 0.9146, + "step": 9138 + }, + { + "epoch": 2.4730077120822624, + "grad_norm": 0.2176295816898346, + "learning_rate": 9.120546910535583e-06, + "loss": 0.9363, + "step": 9140 + }, + { + "epoch": 2.473548910837505, + "grad_norm": 0.2138061821460724, + "learning_rate": 9.102427589098211e-06, + "loss": 0.9012, + "step": 9142 + }, + { + "epoch": 2.474090109592748, + "grad_norm": 0.23236002027988434, + "learning_rate": 9.084324481400936e-06, + "loss": 0.928, + "step": 9144 + }, + { + "epoch": 2.4746313083479907, + "grad_norm": 0.20726221799850464, + "learning_rate": 9.066237594620703e-06, + "loss": 0.9358, + "step": 9146 + }, + { + "epoch": 2.4751725071032338, + "grad_norm": 0.23319317400455475, + "learning_rate": 9.048166935927987e-06, + "loss": 0.9471, + "step": 9148 + }, + { + "epoch": 2.4757137058584764, + "grad_norm": 0.21616868674755096, + "learning_rate": 9.030112512486876e-06, + "loss": 0.9109, + "step": 9150 + }, + { + "epoch": 2.4762549046137194, + "grad_norm": 0.23894111812114716, + "learning_rate": 9.012074331454989e-06, + "loss": 0.9136, + "step": 9152 + }, + { + "epoch": 2.4767961033689625, + "grad_norm": 0.21648919582366943, + "learning_rate": 8.994052399983538e-06, + "loss": 0.9218, + "step": 9154 + }, + { + "epoch": 2.477337302124205, + "grad_norm": 0.22003208100795746, + "learning_rate": 8.976046725217247e-06, + "loss": 0.9264, + "step": 9156 + }, + { + "epoch": 2.477878500879448, + "grad_norm": 0.2200644314289093, + "learning_rate": 8.958057314294444e-06, + "loss": 0.9388, + "step": 9158 + }, + { + "epoch": 2.4784196996346908, + "grad_norm": 0.21747905015945435, + "learning_rate": 8.940084174346947e-06, + "loss": 0.9296, + "step": 9160 + }, + { + "epoch": 2.478960898389934, + "grad_norm": 0.2293812483549118, + "learning_rate": 8.922127312500212e-06, + "loss": 0.9291, + "step": 9162 + }, + { + "epoch": 2.4795020971451764, + "grad_norm": 0.2231179177761078, + "learning_rate": 8.90418673587316e-06, + "loss": 0.9173, + "step": 9164 + }, + { + "epoch": 2.4800432959004195, + "grad_norm": 0.2299322634935379, + "learning_rate": 8.886262451578309e-06, + "loss": 0.9267, + "step": 9166 + }, + { + "epoch": 2.480584494655662, + "grad_norm": 0.21142719686031342, + "learning_rate": 8.868354466721668e-06, + "loss": 0.941, + "step": 9168 + }, + { + "epoch": 2.481125693410905, + "grad_norm": 0.22220134735107422, + "learning_rate": 8.85046278840283e-06, + "loss": 0.9391, + "step": 9170 + }, + { + "epoch": 2.4816668921661478, + "grad_norm": 0.2188580334186554, + "learning_rate": 8.832587423714905e-06, + "loss": 0.9457, + "step": 9172 + }, + { + "epoch": 2.482208090921391, + "grad_norm": 0.21225586533546448, + "learning_rate": 8.814728379744535e-06, + "loss": 0.9425, + "step": 9174 + }, + { + "epoch": 2.482749289676634, + "grad_norm": 0.2157151699066162, + "learning_rate": 8.796885663571907e-06, + "loss": 0.9304, + "step": 9176 + }, + { + "epoch": 2.4832904884318765, + "grad_norm": 0.2221091091632843, + "learning_rate": 8.779059282270695e-06, + "loss": 0.9254, + "step": 9178 + }, + { + "epoch": 2.4838316871871196, + "grad_norm": 0.224908247590065, + "learning_rate": 8.761249242908154e-06, + "loss": 0.9241, + "step": 9180 + }, + { + "epoch": 2.484372885942362, + "grad_norm": 0.21157538890838623, + "learning_rate": 8.743455552544988e-06, + "loss": 0.9234, + "step": 9182 + }, + { + "epoch": 2.484914084697605, + "grad_norm": 0.23402227461338043, + "learning_rate": 8.725678218235506e-06, + "loss": 0.922, + "step": 9184 + }, + { + "epoch": 2.485455283452848, + "grad_norm": 0.21767190098762512, + "learning_rate": 8.707917247027452e-06, + "loss": 0.9434, + "step": 9186 + }, + { + "epoch": 2.485996482208091, + "grad_norm": 0.21777601540088654, + "learning_rate": 8.690172645962152e-06, + "loss": 0.9518, + "step": 9188 + }, + { + "epoch": 2.486537680963334, + "grad_norm": 0.21638832986354828, + "learning_rate": 8.672444422074372e-06, + "loss": 0.948, + "step": 9190 + }, + { + "epoch": 2.4870788797185766, + "grad_norm": 0.2161891907453537, + "learning_rate": 8.654732582392444e-06, + "loss": 0.9387, + "step": 9192 + }, + { + "epoch": 2.4876200784738196, + "grad_norm": 0.23029199242591858, + "learning_rate": 8.637037133938176e-06, + "loss": 0.9118, + "step": 9194 + }, + { + "epoch": 2.488161277229062, + "grad_norm": 0.21446357667446136, + "learning_rate": 8.619358083726897e-06, + "loss": 0.9443, + "step": 9196 + }, + { + "epoch": 2.4887024759843053, + "grad_norm": 0.21811072528362274, + "learning_rate": 8.601695438767404e-06, + "loss": 0.9252, + "step": 9198 + }, + { + "epoch": 2.489243674739548, + "grad_norm": 0.22181473672389984, + "learning_rate": 8.584049206062017e-06, + "loss": 0.908, + "step": 9200 + }, + { + "epoch": 2.489784873494791, + "grad_norm": 0.23197844624519348, + "learning_rate": 8.566419392606546e-06, + "loss": 0.9229, + "step": 9202 + }, + { + "epoch": 2.490326072250034, + "grad_norm": 0.21416902542114258, + "learning_rate": 8.54880600539027e-06, + "loss": 0.9175, + "step": 9204 + }, + { + "epoch": 2.4908672710052766, + "grad_norm": 0.21921953558921814, + "learning_rate": 8.531209051395982e-06, + "loss": 0.9044, + "step": 9206 + }, + { + "epoch": 2.4914084697605197, + "grad_norm": 0.222914457321167, + "learning_rate": 8.513628537599954e-06, + "loss": 0.9188, + "step": 9208 + }, + { + "epoch": 2.4919496685157623, + "grad_norm": 0.22214266657829285, + "learning_rate": 8.496064470971948e-06, + "loss": 0.9103, + "step": 9210 + }, + { + "epoch": 2.4924908672710053, + "grad_norm": 0.21921002864837646, + "learning_rate": 8.47851685847516e-06, + "loss": 0.9317, + "step": 9212 + }, + { + "epoch": 2.493032066026248, + "grad_norm": 0.2256070375442505, + "learning_rate": 8.460985707066333e-06, + "loss": 0.9305, + "step": 9214 + }, + { + "epoch": 2.493573264781491, + "grad_norm": 0.21288733184337616, + "learning_rate": 8.443471023695603e-06, + "loss": 0.9292, + "step": 9216 + }, + { + "epoch": 2.494114463536734, + "grad_norm": 0.2139580249786377, + "learning_rate": 8.425972815306681e-06, + "loss": 0.9141, + "step": 9218 + }, + { + "epoch": 2.4946556622919767, + "grad_norm": 0.2180006206035614, + "learning_rate": 8.408491088836646e-06, + "loss": 0.9157, + "step": 9220 + }, + { + "epoch": 2.4951968610472197, + "grad_norm": 0.21242035925388336, + "learning_rate": 8.391025851216089e-06, + "loss": 0.9178, + "step": 9222 + }, + { + "epoch": 2.4957380598024623, + "grad_norm": 0.21212564408779144, + "learning_rate": 8.373577109369085e-06, + "loss": 0.9193, + "step": 9224 + }, + { + "epoch": 2.4962792585577054, + "grad_norm": 0.22646214067935944, + "learning_rate": 8.356144870213112e-06, + "loss": 0.9428, + "step": 9226 + }, + { + "epoch": 2.496820457312948, + "grad_norm": 0.2110045999288559, + "learning_rate": 8.338729140659151e-06, + "loss": 0.9293, + "step": 9228 + }, + { + "epoch": 2.497361656068191, + "grad_norm": 0.22855477035045624, + "learning_rate": 8.321329927611632e-06, + "loss": 0.9421, + "step": 9230 + }, + { + "epoch": 2.497902854823434, + "grad_norm": 0.2212769240140915, + "learning_rate": 8.303947237968428e-06, + "loss": 0.9385, + "step": 9232 + }, + { + "epoch": 2.4984440535786767, + "grad_norm": 0.216841459274292, + "learning_rate": 8.286581078620842e-06, + "loss": 0.9084, + "step": 9234 + }, + { + "epoch": 2.4989852523339198, + "grad_norm": 0.21523214876651764, + "learning_rate": 8.26923145645368e-06, + "loss": 0.9156, + "step": 9236 + }, + { + "epoch": 2.4995264510891624, + "grad_norm": 0.22173893451690674, + "learning_rate": 8.251898378345119e-06, + "loss": 0.9302, + "step": 9238 + }, + { + "epoch": 2.5000676498444054, + "grad_norm": 0.22232675552368164, + "learning_rate": 8.234581851166828e-06, + "loss": 0.9273, + "step": 9240 + }, + { + "epoch": 2.500608848599648, + "grad_norm": 0.2142835110425949, + "learning_rate": 8.217281881783912e-06, + "loss": 0.9124, + "step": 9242 + }, + { + "epoch": 2.501150047354891, + "grad_norm": 0.21807673573493958, + "learning_rate": 8.1999984770549e-06, + "loss": 0.9239, + "step": 9244 + }, + { + "epoch": 2.501691246110134, + "grad_norm": 0.22329942882061005, + "learning_rate": 8.18273164383173e-06, + "loss": 0.9259, + "step": 9246 + }, + { + "epoch": 2.5022324448653768, + "grad_norm": 0.22551564872264862, + "learning_rate": 8.165481388959818e-06, + "loss": 0.9364, + "step": 9248 + }, + { + "epoch": 2.50277364362062, + "grad_norm": 0.22128087282180786, + "learning_rate": 8.14824771927798e-06, + "loss": 0.9045, + "step": 9250 + }, + { + "epoch": 2.5033148423758624, + "grad_norm": 0.22071202099323273, + "learning_rate": 8.131030641618453e-06, + "loss": 0.921, + "step": 9252 + }, + { + "epoch": 2.5038560411311055, + "grad_norm": 0.21182720363140106, + "learning_rate": 8.113830162806924e-06, + "loss": 0.9214, + "step": 9254 + }, + { + "epoch": 2.504397239886348, + "grad_norm": 0.2163417935371399, + "learning_rate": 8.096646289662458e-06, + "loss": 0.9345, + "step": 9256 + }, + { + "epoch": 2.504938438641591, + "grad_norm": 0.21256625652313232, + "learning_rate": 8.079479028997572e-06, + "loss": 0.9404, + "step": 9258 + }, + { + "epoch": 2.505479637396834, + "grad_norm": 0.20927195250988007, + "learning_rate": 8.062328387618173e-06, + "loss": 0.9366, + "step": 9260 + }, + { + "epoch": 2.506020836152077, + "grad_norm": 0.2203509360551834, + "learning_rate": 8.045194372323589e-06, + "loss": 0.9194, + "step": 9262 + }, + { + "epoch": 2.5065620349073194, + "grad_norm": 0.21689756214618683, + "learning_rate": 8.028076989906568e-06, + "loss": 0.9083, + "step": 9264 + }, + { + "epoch": 2.5071032336625625, + "grad_norm": 0.20948179066181183, + "learning_rate": 8.010976247153263e-06, + "loss": 0.908, + "step": 9266 + }, + { + "epoch": 2.5076444324178055, + "grad_norm": 0.2324371039867401, + "learning_rate": 7.993892150843184e-06, + "loss": 0.9355, + "step": 9268 + }, + { + "epoch": 2.508185631173048, + "grad_norm": 0.21885903179645538, + "learning_rate": 7.976824707749308e-06, + "loss": 0.8877, + "step": 9270 + }, + { + "epoch": 2.508726829928291, + "grad_norm": 0.22025516629219055, + "learning_rate": 7.959773924637986e-06, + "loss": 0.9163, + "step": 9272 + }, + { + "epoch": 2.5092680286835343, + "grad_norm": 0.22476725280284882, + "learning_rate": 7.94273980826893e-06, + "loss": 0.9287, + "step": 9274 + }, + { + "epoch": 2.509809227438777, + "grad_norm": 0.2088979333639145, + "learning_rate": 7.925722365395288e-06, + "loss": 0.9083, + "step": 9276 + }, + { + "epoch": 2.5103504261940195, + "grad_norm": 0.212238609790802, + "learning_rate": 7.90872160276358e-06, + "loss": 0.9314, + "step": 9278 + }, + { + "epoch": 2.5108916249492625, + "grad_norm": 0.21347813308238983, + "learning_rate": 7.891737527113735e-06, + "loss": 0.9242, + "step": 9280 + }, + { + "epoch": 2.5114328237045056, + "grad_norm": 0.21741822361946106, + "learning_rate": 7.874770145179012e-06, + "loss": 0.9259, + "step": 9282 + }, + { + "epoch": 2.511974022459748, + "grad_norm": 0.21451082825660706, + "learning_rate": 7.857819463686111e-06, + "loss": 0.919, + "step": 9284 + }, + { + "epoch": 2.5125152212149913, + "grad_norm": 0.22058527171611786, + "learning_rate": 7.840885489355076e-06, + "loss": 0.9247, + "step": 9286 + }, + { + "epoch": 2.5130564199702343, + "grad_norm": 0.22194302082061768, + "learning_rate": 7.823968228899359e-06, + "loss": 0.9161, + "step": 9288 + }, + { + "epoch": 2.513597618725477, + "grad_norm": 0.21734069287776947, + "learning_rate": 7.807067689025737e-06, + "loss": 0.9481, + "step": 9290 + }, + { + "epoch": 2.5141388174807195, + "grad_norm": 0.21294578909873962, + "learning_rate": 7.790183876434415e-06, + "loss": 0.9351, + "step": 9292 + }, + { + "epoch": 2.5146800162359626, + "grad_norm": 0.22264544665813446, + "learning_rate": 7.773316797818908e-06, + "loss": 0.9309, + "step": 9294 + }, + { + "epoch": 2.5152212149912057, + "grad_norm": 0.2067432850599289, + "learning_rate": 7.756466459866147e-06, + "loss": 0.9284, + "step": 9296 + }, + { + "epoch": 2.5157624137464483, + "grad_norm": 0.21904096007347107, + "learning_rate": 7.739632869256397e-06, + "loss": 0.9108, + "step": 9298 + }, + { + "epoch": 2.5163036125016913, + "grad_norm": 0.21278297901153564, + "learning_rate": 7.722816032663298e-06, + "loss": 0.9323, + "step": 9300 + }, + { + "epoch": 2.5168448112569344, + "grad_norm": 0.21409979462623596, + "learning_rate": 7.706015956753855e-06, + "loss": 0.9191, + "step": 9302 + }, + { + "epoch": 2.517386010012177, + "grad_norm": 0.22656381130218506, + "learning_rate": 7.689232648188394e-06, + "loss": 0.9357, + "step": 9304 + }, + { + "epoch": 2.5179272087674196, + "grad_norm": 0.20955991744995117, + "learning_rate": 7.672466113620635e-06, + "loss": 0.9136, + "step": 9306 + }, + { + "epoch": 2.5184684075226627, + "grad_norm": 0.2214009016752243, + "learning_rate": 7.65571635969759e-06, + "loss": 0.9323, + "step": 9308 + }, + { + "epoch": 2.5190096062779057, + "grad_norm": 0.2177300602197647, + "learning_rate": 7.638983393059718e-06, + "loss": 0.9299, + "step": 9310 + }, + { + "epoch": 2.5195508050331483, + "grad_norm": 0.21035240590572357, + "learning_rate": 7.622267220340712e-06, + "loss": 0.9172, + "step": 9312 + }, + { + "epoch": 2.5200920037883914, + "grad_norm": 0.2137453258037567, + "learning_rate": 7.605567848167683e-06, + "loss": 0.9285, + "step": 9314 + }, + { + "epoch": 2.520633202543634, + "grad_norm": 0.22579017281532288, + "learning_rate": 7.58888528316104e-06, + "loss": 0.9334, + "step": 9316 + }, + { + "epoch": 2.521174401298877, + "grad_norm": 0.21448515355587006, + "learning_rate": 7.572219531934549e-06, + "loss": 0.921, + "step": 9318 + }, + { + "epoch": 2.5217156000541197, + "grad_norm": 0.22959654033184052, + "learning_rate": 7.555570601095308e-06, + "loss": 0.9142, + "step": 9320 + }, + { + "epoch": 2.5222567988093627, + "grad_norm": 0.2115289866924286, + "learning_rate": 7.5389384972437525e-06, + "loss": 0.9167, + "step": 9322 + }, + { + "epoch": 2.5227979975646058, + "grad_norm": 0.21191760897636414, + "learning_rate": 7.522323226973615e-06, + "loss": 0.9338, + "step": 9324 + }, + { + "epoch": 2.5233391963198484, + "grad_norm": 0.216501846909523, + "learning_rate": 7.505724796871994e-06, + "loss": 0.9182, + "step": 9326 + }, + { + "epoch": 2.5238803950750914, + "grad_norm": 0.21663382649421692, + "learning_rate": 7.489143213519301e-06, + "loss": 0.9257, + "step": 9328 + }, + { + "epoch": 2.524421593830334, + "grad_norm": 0.22709527611732483, + "learning_rate": 7.472578483489235e-06, + "loss": 0.933, + "step": 9330 + }, + { + "epoch": 2.524962792585577, + "grad_norm": 0.21982735395431519, + "learning_rate": 7.4560306133488796e-06, + "loss": 0.933, + "step": 9332 + }, + { + "epoch": 2.5255039913408197, + "grad_norm": 0.22381721436977386, + "learning_rate": 7.4394996096585735e-06, + "loss": 0.9332, + "step": 9334 + }, + { + "epoch": 2.5260451900960628, + "grad_norm": 0.21767573058605194, + "learning_rate": 7.422985478972006e-06, + "loss": 0.9507, + "step": 9336 + }, + { + "epoch": 2.526586388851306, + "grad_norm": 0.21965819597244263, + "learning_rate": 7.406488227836139e-06, + "loss": 0.9247, + "step": 9338 + }, + { + "epoch": 2.5271275876065484, + "grad_norm": 0.21398931741714478, + "learning_rate": 7.3900078627912985e-06, + "loss": 0.9274, + "step": 9340 + }, + { + "epoch": 2.5276687863617915, + "grad_norm": 0.2142874002456665, + "learning_rate": 7.373544390371034e-06, + "loss": 0.9344, + "step": 9342 + }, + { + "epoch": 2.528209985117034, + "grad_norm": 0.22144971787929535, + "learning_rate": 7.35709781710231e-06, + "loss": 0.925, + "step": 9344 + }, + { + "epoch": 2.528751183872277, + "grad_norm": 0.2120436280965805, + "learning_rate": 7.340668149505287e-06, + "loss": 0.932, + "step": 9346 + }, + { + "epoch": 2.5292923826275198, + "grad_norm": 0.2348075956106186, + "learning_rate": 7.324255394093477e-06, + "loss": 0.9327, + "step": 9348 + }, + { + "epoch": 2.529833581382763, + "grad_norm": 0.2130444049835205, + "learning_rate": 7.307859557373686e-06, + "loss": 0.9295, + "step": 9350 + }, + { + "epoch": 2.530374780138006, + "grad_norm": 0.21393387019634247, + "learning_rate": 7.291480645845988e-06, + "loss": 0.9306, + "step": 9352 + }, + { + "epoch": 2.5309159788932485, + "grad_norm": 0.21368271112442017, + "learning_rate": 7.275118666003761e-06, + "loss": 0.9375, + "step": 9354 + }, + { + "epoch": 2.5314571776484915, + "grad_norm": 0.21483318507671356, + "learning_rate": 7.258773624333676e-06, + "loss": 0.9538, + "step": 9356 + }, + { + "epoch": 2.531998376403734, + "grad_norm": 0.20913955569267273, + "learning_rate": 7.2424455273157e-06, + "loss": 0.9384, + "step": 9358 + }, + { + "epoch": 2.532539575158977, + "grad_norm": 0.21114329993724823, + "learning_rate": 7.226134381423039e-06, + "loss": 0.9107, + "step": 9360 + }, + { + "epoch": 2.53308077391422, + "grad_norm": 0.20708878338336945, + "learning_rate": 7.209840193122225e-06, + "loss": 0.9149, + "step": 9362 + }, + { + "epoch": 2.533621972669463, + "grad_norm": 0.22181889414787292, + "learning_rate": 7.193562968873019e-06, + "loss": 0.9248, + "step": 9364 + }, + { + "epoch": 2.534163171424706, + "grad_norm": 0.2157425731420517, + "learning_rate": 7.177302715128531e-06, + "loss": 0.9465, + "step": 9366 + }, + { + "epoch": 2.5347043701799485, + "grad_norm": 0.21364335715770721, + "learning_rate": 7.16105943833506e-06, + "loss": 0.9458, + "step": 9368 + }, + { + "epoch": 2.5352455689351916, + "grad_norm": 0.22688333690166473, + "learning_rate": 7.144833144932239e-06, + "loss": 0.9166, + "step": 9370 + }, + { + "epoch": 2.535786767690434, + "grad_norm": 0.21751616895198822, + "learning_rate": 7.128623841352916e-06, + "loss": 0.9395, + "step": 9372 + }, + { + "epoch": 2.5363279664456773, + "grad_norm": 0.220353364944458, + "learning_rate": 7.112431534023245e-06, + "loss": 0.916, + "step": 9374 + }, + { + "epoch": 2.53686916520092, + "grad_norm": 0.21988412737846375, + "learning_rate": 7.096256229362619e-06, + "loss": 0.9192, + "step": 9376 + }, + { + "epoch": 2.537410363956163, + "grad_norm": 0.21356196701526642, + "learning_rate": 7.080097933783702e-06, + "loss": 0.9248, + "step": 9378 + }, + { + "epoch": 2.537951562711406, + "grad_norm": 0.2283223569393158, + "learning_rate": 7.063956653692422e-06, + "loss": 0.9395, + "step": 9380 + }, + { + "epoch": 2.5384927614666486, + "grad_norm": 0.2209359109401703, + "learning_rate": 7.0478323954879185e-06, + "loss": 0.9277, + "step": 9382 + }, + { + "epoch": 2.5390339602218917, + "grad_norm": 0.21657121181488037, + "learning_rate": 7.031725165562641e-06, + "loss": 0.9428, + "step": 9384 + }, + { + "epoch": 2.5395751589771343, + "grad_norm": 0.21016372740268707, + "learning_rate": 7.015634970302243e-06, + "loss": 0.9186, + "step": 9386 + }, + { + "epoch": 2.5401163577323773, + "grad_norm": 0.21861585974693298, + "learning_rate": 6.999561816085648e-06, + "loss": 0.8966, + "step": 9388 + }, + { + "epoch": 2.54065755648762, + "grad_norm": 0.20906995236873627, + "learning_rate": 6.983505709285015e-06, + "loss": 0.9444, + "step": 9390 + }, + { + "epoch": 2.541198755242863, + "grad_norm": 0.21254542469978333, + "learning_rate": 6.967466656265759e-06, + "loss": 0.9428, + "step": 9392 + }, + { + "epoch": 2.541739953998106, + "grad_norm": 0.22033677995204926, + "learning_rate": 6.951444663386508e-06, + "loss": 0.9187, + "step": 9394 + }, + { + "epoch": 2.5422811527533486, + "grad_norm": 0.21230098605155945, + "learning_rate": 6.935439736999144e-06, + "loss": 0.9319, + "step": 9396 + }, + { + "epoch": 2.5428223515085913, + "grad_norm": 0.22501398622989655, + "learning_rate": 6.919451883448797e-06, + "loss": 0.8884, + "step": 9398 + }, + { + "epoch": 2.5433635502638343, + "grad_norm": 0.21757538616657257, + "learning_rate": 6.903481109073778e-06, + "loss": 0.9016, + "step": 9400 + }, + { + "epoch": 2.5439047490190774, + "grad_norm": 0.24840104579925537, + "learning_rate": 6.887527420205686e-06, + "loss": 0.9371, + "step": 9402 + }, + { + "epoch": 2.54444594777432, + "grad_norm": 0.23990055918693542, + "learning_rate": 6.871590823169316e-06, + "loss": 0.9029, + "step": 9404 + }, + { + "epoch": 2.544987146529563, + "grad_norm": 0.22422128915786743, + "learning_rate": 6.855671324282697e-06, + "loss": 0.9268, + "step": 9406 + }, + { + "epoch": 2.545528345284806, + "grad_norm": 0.21258831024169922, + "learning_rate": 6.83976892985706e-06, + "loss": 0.9384, + "step": 9408 + }, + { + "epoch": 2.5460695440400487, + "grad_norm": 0.222090944647789, + "learning_rate": 6.823883646196877e-06, + "loss": 0.9195, + "step": 9410 + }, + { + "epoch": 2.5466107427952913, + "grad_norm": 0.2162269949913025, + "learning_rate": 6.808015479599827e-06, + "loss": 0.9082, + "step": 9412 + }, + { + "epoch": 2.5471519415505344, + "grad_norm": 0.2120511382818222, + "learning_rate": 6.792164436356824e-06, + "loss": 0.9388, + "step": 9414 + }, + { + "epoch": 2.5476931403057774, + "grad_norm": 0.23143614828586578, + "learning_rate": 6.776330522751939e-06, + "loss": 0.9369, + "step": 9416 + }, + { + "epoch": 2.54823433906102, + "grad_norm": 0.22686822712421417, + "learning_rate": 6.760513745062519e-06, + "loss": 0.9552, + "step": 9418 + }, + { + "epoch": 2.548775537816263, + "grad_norm": 0.2194250226020813, + "learning_rate": 6.744714109559064e-06, + "loss": 0.9057, + "step": 9420 + }, + { + "epoch": 2.549316736571506, + "grad_norm": 0.21884877979755402, + "learning_rate": 6.7289316225053075e-06, + "loss": 0.8999, + "step": 9422 + }, + { + "epoch": 2.5498579353267488, + "grad_norm": 0.21890580654144287, + "learning_rate": 6.713166290158179e-06, + "loss": 0.9107, + "step": 9424 + }, + { + "epoch": 2.5503991340819914, + "grad_norm": 0.24606740474700928, + "learning_rate": 6.697418118767806e-06, + "loss": 0.9261, + "step": 9426 + }, + { + "epoch": 2.5509403328372344, + "grad_norm": 0.24582087993621826, + "learning_rate": 6.681687114577518e-06, + "loss": 0.9252, + "step": 9428 + }, + { + "epoch": 2.5514815315924775, + "grad_norm": 0.20682582259178162, + "learning_rate": 6.665973283823812e-06, + "loss": 0.9149, + "step": 9430 + }, + { + "epoch": 2.55202273034772, + "grad_norm": 0.21695150434970856, + "learning_rate": 6.6502766327364185e-06, + "loss": 0.9252, + "step": 9432 + }, + { + "epoch": 2.552563929102963, + "grad_norm": 0.22221341729164124, + "learning_rate": 6.634597167538209e-06, + "loss": 0.9134, + "step": 9434 + }, + { + "epoch": 2.553105127858206, + "grad_norm": 0.21815122663974762, + "learning_rate": 6.6189348944453056e-06, + "loss": 0.9448, + "step": 9436 + }, + { + "epoch": 2.553646326613449, + "grad_norm": 0.21014618873596191, + "learning_rate": 6.6032898196669414e-06, + "loss": 0.9438, + "step": 9438 + }, + { + "epoch": 2.5541875253686914, + "grad_norm": 0.2127705216407776, + "learning_rate": 6.587661949405599e-06, + "loss": 0.9326, + "step": 9440 + }, + { + "epoch": 2.5547287241239345, + "grad_norm": 0.21143341064453125, + "learning_rate": 6.572051289856873e-06, + "loss": 0.9504, + "step": 9442 + }, + { + "epoch": 2.5552699228791775, + "grad_norm": 0.20858249068260193, + "learning_rate": 6.5564578472095965e-06, + "loss": 0.9159, + "step": 9444 + }, + { + "epoch": 2.55581112163442, + "grad_norm": 0.21567776799201965, + "learning_rate": 6.54088162764574e-06, + "loss": 0.8946, + "step": 9446 + }, + { + "epoch": 2.556352320389663, + "grad_norm": 0.2256898432970047, + "learning_rate": 6.52532263734047e-06, + "loss": 0.9339, + "step": 9448 + }, + { + "epoch": 2.556893519144906, + "grad_norm": 0.21769912540912628, + "learning_rate": 6.509780882462091e-06, + "loss": 0.9291, + "step": 9450 + }, + { + "epoch": 2.557434717900149, + "grad_norm": 0.2179538905620575, + "learning_rate": 6.494256369172097e-06, + "loss": 0.9163, + "step": 9452 + }, + { + "epoch": 2.5579759166553915, + "grad_norm": 0.21418409049510956, + "learning_rate": 6.478749103625159e-06, + "loss": 0.9365, + "step": 9454 + }, + { + "epoch": 2.5585171154106345, + "grad_norm": 0.21773137152194977, + "learning_rate": 6.4632590919690594e-06, + "loss": 0.9367, + "step": 9456 + }, + { + "epoch": 2.5590583141658776, + "grad_norm": 0.21577022969722748, + "learning_rate": 6.447786340344819e-06, + "loss": 0.8986, + "step": 9458 + }, + { + "epoch": 2.55959951292112, + "grad_norm": 0.21877475082874298, + "learning_rate": 6.432330854886537e-06, + "loss": 0.9234, + "step": 9460 + }, + { + "epoch": 2.5601407116763633, + "grad_norm": 0.21625058352947235, + "learning_rate": 6.416892641721522e-06, + "loss": 0.9492, + "step": 9462 + }, + { + "epoch": 2.560681910431606, + "grad_norm": 0.2214227169752121, + "learning_rate": 6.401471706970197e-06, + "loss": 0.9299, + "step": 9464 + }, + { + "epoch": 2.561223109186849, + "grad_norm": 0.21283641457557678, + "learning_rate": 6.38606805674617e-06, + "loss": 0.9181, + "step": 9466 + }, + { + "epoch": 2.5617643079420915, + "grad_norm": 0.2216806560754776, + "learning_rate": 6.3706816971561554e-06, + "loss": 0.9299, + "step": 9468 + }, + { + "epoch": 2.5623055066973346, + "grad_norm": 0.21593159437179565, + "learning_rate": 6.3553126343000706e-06, + "loss": 0.9066, + "step": 9470 + }, + { + "epoch": 2.5628467054525776, + "grad_norm": 0.22921022772789001, + "learning_rate": 6.339960874270917e-06, + "loss": 0.9314, + "step": 9472 + }, + { + "epoch": 2.5633879042078203, + "grad_norm": 0.22072766721248627, + "learning_rate": 6.324626423154867e-06, + "loss": 0.9313, + "step": 9474 + }, + { + "epoch": 2.5639291029630633, + "grad_norm": 0.22152520716190338, + "learning_rate": 6.3093092870312346e-06, + "loss": 0.9307, + "step": 9476 + }, + { + "epoch": 2.564470301718306, + "grad_norm": 0.2176951915025711, + "learning_rate": 6.294009471972445e-06, + "loss": 0.9271, + "step": 9478 + }, + { + "epoch": 2.565011500473549, + "grad_norm": 0.21515873074531555, + "learning_rate": 6.278726984044081e-06, + "loss": 0.9302, + "step": 9480 + }, + { + "epoch": 2.5655526992287916, + "grad_norm": 0.21769382059574127, + "learning_rate": 6.263461829304845e-06, + "loss": 0.9362, + "step": 9482 + }, + { + "epoch": 2.5660938979840346, + "grad_norm": 0.22007985413074493, + "learning_rate": 6.248214013806575e-06, + "loss": 0.9243, + "step": 9484 + }, + { + "epoch": 2.5666350967392777, + "grad_norm": 0.2168111354112625, + "learning_rate": 6.232983543594218e-06, + "loss": 0.9111, + "step": 9486 + }, + { + "epoch": 2.5671762954945203, + "grad_norm": 0.21599526703357697, + "learning_rate": 6.217770424705871e-06, + "loss": 0.913, + "step": 9488 + }, + { + "epoch": 2.5677174942497634, + "grad_norm": 0.2112477570772171, + "learning_rate": 6.202574663172706e-06, + "loss": 0.9095, + "step": 9490 + }, + { + "epoch": 2.568258693005006, + "grad_norm": 0.22417528927326202, + "learning_rate": 6.187396265019085e-06, + "loss": 0.8877, + "step": 9492 + }, + { + "epoch": 2.568799891760249, + "grad_norm": 0.24721667170524597, + "learning_rate": 6.17223523626242e-06, + "loss": 0.9214, + "step": 9494 + }, + { + "epoch": 2.5693410905154916, + "grad_norm": 0.22341226041316986, + "learning_rate": 6.157091582913271e-06, + "loss": 0.9446, + "step": 9496 + }, + { + "epoch": 2.5698822892707347, + "grad_norm": 0.22680166363716125, + "learning_rate": 6.1419653109753e-06, + "loss": 0.9108, + "step": 9498 + }, + { + "epoch": 2.5704234880259778, + "grad_norm": 0.21408212184906006, + "learning_rate": 6.126856426445271e-06, + "loss": 0.9341, + "step": 9500 + }, + { + "epoch": 2.5709646867812204, + "grad_norm": 0.21931543946266174, + "learning_rate": 6.11176493531308e-06, + "loss": 0.9243, + "step": 9502 + }, + { + "epoch": 2.5715058855364634, + "grad_norm": 0.22283321619033813, + "learning_rate": 6.0966908435616975e-06, + "loss": 0.9309, + "step": 9504 + }, + { + "epoch": 2.572047084291706, + "grad_norm": 0.21407191455364227, + "learning_rate": 6.081634157167227e-06, + "loss": 0.9191, + "step": 9506 + }, + { + "epoch": 2.572588283046949, + "grad_norm": 0.2145175039768219, + "learning_rate": 6.0665948820988305e-06, + "loss": 0.9134, + "step": 9508 + }, + { + "epoch": 2.5731294818021917, + "grad_norm": 0.2078877091407776, + "learning_rate": 6.051573024318813e-06, + "loss": 0.9168, + "step": 9510 + }, + { + "epoch": 2.5736706805574348, + "grad_norm": 0.22311575710773468, + "learning_rate": 6.0365685897825265e-06, + "loss": 0.9369, + "step": 9512 + }, + { + "epoch": 2.574211879312678, + "grad_norm": 0.21917875111103058, + "learning_rate": 6.0215815844384626e-06, + "loss": 0.9222, + "step": 9514 + }, + { + "epoch": 2.5747530780679204, + "grad_norm": 0.2239609807729721, + "learning_rate": 6.0066120142281746e-06, + "loss": 0.9178, + "step": 9516 + }, + { + "epoch": 2.575294276823163, + "grad_norm": 0.21223671734333038, + "learning_rate": 5.99165988508632e-06, + "loss": 0.9274, + "step": 9518 + }, + { + "epoch": 2.575835475578406, + "grad_norm": 0.21494688093662262, + "learning_rate": 5.9767252029406195e-06, + "loss": 0.9353, + "step": 9520 + }, + { + "epoch": 2.576376674333649, + "grad_norm": 0.2212141454219818, + "learning_rate": 5.961807973711892e-06, + "loss": 0.9404, + "step": 9522 + }, + { + "epoch": 2.5769178730888918, + "grad_norm": 0.2208017259836197, + "learning_rate": 5.946908203314044e-06, + "loss": 0.9496, + "step": 9524 + }, + { + "epoch": 2.577459071844135, + "grad_norm": 0.2084929347038269, + "learning_rate": 5.932025897654053e-06, + "loss": 0.9263, + "step": 9526 + }, + { + "epoch": 2.578000270599378, + "grad_norm": 0.21754911541938782, + "learning_rate": 5.91716106263196e-06, + "loss": 0.9144, + "step": 9528 + }, + { + "epoch": 2.5785414693546205, + "grad_norm": 0.21551008522510529, + "learning_rate": 5.9023137041409e-06, + "loss": 0.9327, + "step": 9530 + }, + { + "epoch": 2.579082668109863, + "grad_norm": 0.22183187305927277, + "learning_rate": 5.887483828067075e-06, + "loss": 0.8993, + "step": 9532 + }, + { + "epoch": 2.579623866865106, + "grad_norm": 0.2130623608827591, + "learning_rate": 5.87267144028974e-06, + "loss": 0.9184, + "step": 9534 + }, + { + "epoch": 2.580165065620349, + "grad_norm": 0.21146562695503235, + "learning_rate": 5.857876546681234e-06, + "loss": 0.9316, + "step": 9536 + }, + { + "epoch": 2.580706264375592, + "grad_norm": 0.2094356119632721, + "learning_rate": 5.843099153106962e-06, + "loss": 0.918, + "step": 9538 + }, + { + "epoch": 2.581247463130835, + "grad_norm": 0.23078614473342896, + "learning_rate": 5.828339265425387e-06, + "loss": 0.9122, + "step": 9540 + }, + { + "epoch": 2.581788661886078, + "grad_norm": 0.2206854373216629, + "learning_rate": 5.81359688948801e-06, + "loss": 0.92, + "step": 9542 + }, + { + "epoch": 2.5823298606413205, + "grad_norm": 0.21436099708080292, + "learning_rate": 5.798872031139435e-06, + "loss": 0.9136, + "step": 9544 + }, + { + "epoch": 2.582871059396563, + "grad_norm": 0.20988523960113525, + "learning_rate": 5.78416469621727e-06, + "loss": 0.9221, + "step": 9546 + }, + { + "epoch": 2.583412258151806, + "grad_norm": 0.2196241170167923, + "learning_rate": 5.769474890552212e-06, + "loss": 0.9197, + "step": 9548 + }, + { + "epoch": 2.5839534569070493, + "grad_norm": 0.2228415459394455, + "learning_rate": 5.7548026199679975e-06, + "loss": 0.9316, + "step": 9550 + }, + { + "epoch": 2.584494655662292, + "grad_norm": 0.22791919112205505, + "learning_rate": 5.740147890281411e-06, + "loss": 0.9253, + "step": 9552 + }, + { + "epoch": 2.585035854417535, + "grad_norm": 0.21699950098991394, + "learning_rate": 5.7255107073022955e-06, + "loss": 0.9276, + "step": 9554 + }, + { + "epoch": 2.585577053172778, + "grad_norm": 0.22852075099945068, + "learning_rate": 5.710891076833508e-06, + "loss": 0.9092, + "step": 9556 + }, + { + "epoch": 2.5861182519280206, + "grad_norm": 0.22590157389640808, + "learning_rate": 5.696289004670979e-06, + "loss": 0.9473, + "step": 9558 + }, + { + "epoch": 2.586659450683263, + "grad_norm": 0.21700870990753174, + "learning_rate": 5.681704496603635e-06, + "loss": 0.927, + "step": 9560 + }, + { + "epoch": 2.5872006494385063, + "grad_norm": 0.21075235307216644, + "learning_rate": 5.667137558413504e-06, + "loss": 0.9257, + "step": 9562 + }, + { + "epoch": 2.5877418481937493, + "grad_norm": 0.21568815410137177, + "learning_rate": 5.652588195875591e-06, + "loss": 0.9372, + "step": 9564 + }, + { + "epoch": 2.588283046948992, + "grad_norm": 0.21554645895957947, + "learning_rate": 5.63805641475797e-06, + "loss": 0.8987, + "step": 9566 + }, + { + "epoch": 2.588824245704235, + "grad_norm": 0.21190887689590454, + "learning_rate": 5.623542220821709e-06, + "loss": 0.9201, + "step": 9568 + }, + { + "epoch": 2.5893654444594776, + "grad_norm": 0.2131725549697876, + "learning_rate": 5.6090456198209295e-06, + "loss": 0.9097, + "step": 9570 + }, + { + "epoch": 2.5899066432147206, + "grad_norm": 0.2159763127565384, + "learning_rate": 5.594566617502778e-06, + "loss": 0.8976, + "step": 9572 + }, + { + "epoch": 2.5904478419699632, + "grad_norm": 0.20846380293369293, + "learning_rate": 5.580105219607429e-06, + "loss": 0.9393, + "step": 9574 + }, + { + "epoch": 2.5909890407252063, + "grad_norm": 0.2086823433637619, + "learning_rate": 5.5656614318680464e-06, + "loss": 0.9187, + "step": 9576 + }, + { + "epoch": 2.5915302394804494, + "grad_norm": 0.21716119349002838, + "learning_rate": 5.551235260010839e-06, + "loss": 0.9178, + "step": 9578 + }, + { + "epoch": 2.592071438235692, + "grad_norm": 0.21530120074748993, + "learning_rate": 5.536826709755044e-06, + "loss": 0.9364, + "step": 9580 + }, + { + "epoch": 2.592612636990935, + "grad_norm": 0.22428621351718903, + "learning_rate": 5.522435786812863e-06, + "loss": 0.8978, + "step": 9582 + }, + { + "epoch": 2.5931538357461776, + "grad_norm": 0.2113945335149765, + "learning_rate": 5.508062496889577e-06, + "loss": 0.917, + "step": 9584 + }, + { + "epoch": 2.5936950345014207, + "grad_norm": 0.21664150059223175, + "learning_rate": 5.4937068456834105e-06, + "loss": 0.9207, + "step": 9586 + }, + { + "epoch": 2.5942362332566633, + "grad_norm": 0.21347230672836304, + "learning_rate": 5.479368838885657e-06, + "loss": 0.9312, + "step": 9588 + }, + { + "epoch": 2.5947774320119064, + "grad_norm": 0.20815637707710266, + "learning_rate": 5.465048482180546e-06, + "loss": 0.9195, + "step": 9590 + }, + { + "epoch": 2.5953186307671494, + "grad_norm": 0.20802514255046844, + "learning_rate": 5.450745781245381e-06, + "loss": 0.9373, + "step": 9592 + }, + { + "epoch": 2.595859829522392, + "grad_norm": 0.21624764800071716, + "learning_rate": 5.4364607417503985e-06, + "loss": 0.9254, + "step": 9594 + }, + { + "epoch": 2.596401028277635, + "grad_norm": 0.22775937616825104, + "learning_rate": 5.422193369358903e-06, + "loss": 0.9331, + "step": 9596 + }, + { + "epoch": 2.5969422270328777, + "grad_norm": 0.2140657901763916, + "learning_rate": 5.407943669727128e-06, + "loss": 0.916, + "step": 9598 + }, + { + "epoch": 2.5974834257881207, + "grad_norm": 0.2225465029478073, + "learning_rate": 5.393711648504346e-06, + "loss": 0.9359, + "step": 9600 + }, + { + "epoch": 2.5980246245433634, + "grad_norm": 0.20806999504566193, + "learning_rate": 5.379497311332815e-06, + "loss": 0.9469, + "step": 9602 + }, + { + "epoch": 2.5985658232986064, + "grad_norm": 0.21351569890975952, + "learning_rate": 5.36530066384775e-06, + "loss": 0.9405, + "step": 9604 + }, + { + "epoch": 2.5991070220538495, + "grad_norm": 0.2128276824951172, + "learning_rate": 5.3511217116773926e-06, + "loss": 0.9372, + "step": 9606 + }, + { + "epoch": 2.599648220809092, + "grad_norm": 0.2124856561422348, + "learning_rate": 5.336960460442947e-06, + "loss": 0.9385, + "step": 9608 + }, + { + "epoch": 2.600189419564335, + "grad_norm": 0.212458997964859, + "learning_rate": 5.322816915758616e-06, + "loss": 0.9333, + "step": 9610 + }, + { + "epoch": 2.6007306183195777, + "grad_norm": 0.2121894210577011, + "learning_rate": 5.308691083231554e-06, + "loss": 0.9208, + "step": 9612 + }, + { + "epoch": 2.601271817074821, + "grad_norm": 0.2149016112089157, + "learning_rate": 5.294582968461936e-06, + "loss": 0.9194, + "step": 9614 + }, + { + "epoch": 2.6018130158300634, + "grad_norm": 0.22723639011383057, + "learning_rate": 5.280492577042851e-06, + "loss": 0.9288, + "step": 9616 + }, + { + "epoch": 2.6023542145853065, + "grad_norm": 0.21222719550132751, + "learning_rate": 5.2664199145604446e-06, + "loss": 0.9306, + "step": 9618 + }, + { + "epoch": 2.6028954133405495, + "grad_norm": 0.22372601926326752, + "learning_rate": 5.252364986593755e-06, + "loss": 0.911, + "step": 9620 + }, + { + "epoch": 2.603436612095792, + "grad_norm": 0.21405529975891113, + "learning_rate": 5.238327798714848e-06, + "loss": 0.9469, + "step": 9622 + }, + { + "epoch": 2.603977810851035, + "grad_norm": 0.2115193009376526, + "learning_rate": 5.224308356488705e-06, + "loss": 0.9188, + "step": 9624 + }, + { + "epoch": 2.604519009606278, + "grad_norm": 0.20777717232704163, + "learning_rate": 5.210306665473319e-06, + "loss": 0.9349, + "step": 9626 + }, + { + "epoch": 2.605060208361521, + "grad_norm": 0.21201135218143463, + "learning_rate": 5.196322731219616e-06, + "loss": 0.9277, + "step": 9628 + }, + { + "epoch": 2.6056014071167635, + "grad_norm": 0.2202913463115692, + "learning_rate": 5.1823565592714895e-06, + "loss": 0.9139, + "step": 9630 + }, + { + "epoch": 2.6061426058720065, + "grad_norm": 0.22009167075157166, + "learning_rate": 5.1684081551658156e-06, + "loss": 0.8854, + "step": 9632 + }, + { + "epoch": 2.6066838046272496, + "grad_norm": 0.21579593420028687, + "learning_rate": 5.154477524432372e-06, + "loss": 0.9047, + "step": 9634 + }, + { + "epoch": 2.607225003382492, + "grad_norm": 0.2174842357635498, + "learning_rate": 5.140564672593951e-06, + "loss": 0.9168, + "step": 9636 + }, + { + "epoch": 2.6077662021377352, + "grad_norm": 0.2194041758775711, + "learning_rate": 5.126669605166246e-06, + "loss": 0.9373, + "step": 9638 + }, + { + "epoch": 2.608307400892978, + "grad_norm": 0.21152830123901367, + "learning_rate": 5.112792327657923e-06, + "loss": 0.9367, + "step": 9640 + }, + { + "epoch": 2.608848599648221, + "grad_norm": 0.20887935161590576, + "learning_rate": 5.098932845570609e-06, + "loss": 0.9235, + "step": 9642 + }, + { + "epoch": 2.6093897984034635, + "grad_norm": 0.20872168242931366, + "learning_rate": 5.085091164398853e-06, + "loss": 0.8964, + "step": 9644 + }, + { + "epoch": 2.6099309971587066, + "grad_norm": 0.2132871001958847, + "learning_rate": 5.07126728963015e-06, + "loss": 0.9132, + "step": 9646 + }, + { + "epoch": 2.6104721959139496, + "grad_norm": 0.21363700926303864, + "learning_rate": 5.057461226744959e-06, + "loss": 0.896, + "step": 9648 + }, + { + "epoch": 2.6110133946691922, + "grad_norm": 0.22881589829921722, + "learning_rate": 5.043672981216618e-06, + "loss": 0.9165, + "step": 9650 + }, + { + "epoch": 2.611554593424435, + "grad_norm": 0.21715807914733887, + "learning_rate": 5.029902558511495e-06, + "loss": 0.9041, + "step": 9652 + }, + { + "epoch": 2.612095792179678, + "grad_norm": 0.22497858107089996, + "learning_rate": 5.016149964088801e-06, + "loss": 0.9072, + "step": 9654 + }, + { + "epoch": 2.612636990934921, + "grad_norm": 0.2185206562280655, + "learning_rate": 5.002415203400729e-06, + "loss": 0.9242, + "step": 9656 + }, + { + "epoch": 2.6131781896901636, + "grad_norm": 0.21675042808055878, + "learning_rate": 4.988698281892407e-06, + "loss": 0.9199, + "step": 9658 + }, + { + "epoch": 2.6137193884454066, + "grad_norm": 0.21377485990524292, + "learning_rate": 4.974999205001846e-06, + "loss": 0.9235, + "step": 9660 + }, + { + "epoch": 2.6142605872006497, + "grad_norm": 0.21371828019618988, + "learning_rate": 4.961317978160024e-06, + "loss": 0.8899, + "step": 9662 + }, + { + "epoch": 2.6148017859558923, + "grad_norm": 0.21455076336860657, + "learning_rate": 4.9476546067908306e-06, + "loss": 0.9283, + "step": 9664 + }, + { + "epoch": 2.615342984711135, + "grad_norm": 0.2115284949541092, + "learning_rate": 4.934009096311082e-06, + "loss": 0.921, + "step": 9666 + }, + { + "epoch": 2.615884183466378, + "grad_norm": 0.21466834843158722, + "learning_rate": 4.920381452130485e-06, + "loss": 0.9308, + "step": 9668 + }, + { + "epoch": 2.616425382221621, + "grad_norm": 0.2045622318983078, + "learning_rate": 4.906771679651712e-06, + "loss": 0.9325, + "step": 9670 + }, + { + "epoch": 2.6169665809768636, + "grad_norm": 0.20819289982318878, + "learning_rate": 4.893179784270302e-06, + "loss": 0.9017, + "step": 9672 + }, + { + "epoch": 2.6175077797321067, + "grad_norm": 0.21677805483341217, + "learning_rate": 4.879605771374729e-06, + "loss": 0.9203, + "step": 9674 + }, + { + "epoch": 2.6180489784873497, + "grad_norm": 0.21292994916439056, + "learning_rate": 4.866049646346388e-06, + "loss": 0.8958, + "step": 9676 + }, + { + "epoch": 2.6185901772425924, + "grad_norm": 0.20883965492248535, + "learning_rate": 4.852511414559574e-06, + "loss": 0.9261, + "step": 9678 + }, + { + "epoch": 2.619131375997835, + "grad_norm": 0.23521077632904053, + "learning_rate": 4.838991081381466e-06, + "loss": 0.9413, + "step": 9680 + }, + { + "epoch": 2.619672574753078, + "grad_norm": 0.5041726231575012, + "learning_rate": 4.825488652172178e-06, + "loss": 0.9292, + "step": 9682 + }, + { + "epoch": 2.620213773508321, + "grad_norm": 0.21232503652572632, + "learning_rate": 4.812004132284714e-06, + "loss": 0.9188, + "step": 9684 + }, + { + "epoch": 2.6207549722635637, + "grad_norm": 0.22552818059921265, + "learning_rate": 4.798537527064973e-06, + "loss": 0.9123, + "step": 9686 + }, + { + "epoch": 2.6212961710188067, + "grad_norm": 0.21286830306053162, + "learning_rate": 4.78508884185177e-06, + "loss": 0.9315, + "step": 9688 + }, + { + "epoch": 2.6218373697740494, + "grad_norm": 0.21211306750774384, + "learning_rate": 4.771658081976788e-06, + "loss": 0.9212, + "step": 9690 + }, + { + "epoch": 2.6223785685292924, + "grad_norm": 0.21798257529735565, + "learning_rate": 4.758245252764632e-06, + "loss": 0.9452, + "step": 9692 + }, + { + "epoch": 2.622919767284535, + "grad_norm": 0.226260706782341, + "learning_rate": 4.744850359532765e-06, + "loss": 0.9324, + "step": 9694 + }, + { + "epoch": 2.623460966039778, + "grad_norm": 0.21312588453292847, + "learning_rate": 4.731473407591575e-06, + "loss": 0.9165, + "step": 9696 + }, + { + "epoch": 2.624002164795021, + "grad_norm": 0.22322995960712433, + "learning_rate": 4.7181144022443115e-06, + "loss": 0.9204, + "step": 9698 + }, + { + "epoch": 2.6245433635502637, + "grad_norm": 0.21204566955566406, + "learning_rate": 4.704773348787134e-06, + "loss": 0.9385, + "step": 9700 + }, + { + "epoch": 2.625084562305507, + "grad_norm": 0.20762857794761658, + "learning_rate": 4.69145025250905e-06, + "loss": 0.9283, + "step": 9702 + }, + { + "epoch": 2.6256257610607494, + "grad_norm": 0.2284674495458603, + "learning_rate": 4.678145118691979e-06, + "loss": 0.9218, + "step": 9704 + }, + { + "epoch": 2.6261669598159925, + "grad_norm": 0.21979452669620514, + "learning_rate": 4.664857952610707e-06, + "loss": 0.9531, + "step": 9706 + }, + { + "epoch": 2.626708158571235, + "grad_norm": 0.22279872000217438, + "learning_rate": 4.651588759532888e-06, + "loss": 0.9153, + "step": 9708 + }, + { + "epoch": 2.627249357326478, + "grad_norm": 0.21282440423965454, + "learning_rate": 4.638337544719079e-06, + "loss": 0.9281, + "step": 9710 + }, + { + "epoch": 2.627790556081721, + "grad_norm": 0.20626047253608704, + "learning_rate": 4.6251043134226735e-06, + "loss": 0.9101, + "step": 9712 + }, + { + "epoch": 2.628331754836964, + "grad_norm": 0.2364240437746048, + "learning_rate": 4.611889070889968e-06, + "loss": 0.9422, + "step": 9714 + }, + { + "epoch": 2.628872953592207, + "grad_norm": 0.2121487408876419, + "learning_rate": 4.5986918223601015e-06, + "loss": 0.9152, + "step": 9716 + }, + { + "epoch": 2.6294141523474495, + "grad_norm": 0.2053249329328537, + "learning_rate": 4.585512573065098e-06, + "loss": 0.9037, + "step": 9718 + }, + { + "epoch": 2.6299553511026925, + "grad_norm": 0.2118249088525772, + "learning_rate": 4.572351328229813e-06, + "loss": 0.9202, + "step": 9720 + }, + { + "epoch": 2.630496549857935, + "grad_norm": 0.21597343683242798, + "learning_rate": 4.559208093072026e-06, + "loss": 0.9363, + "step": 9722 + }, + { + "epoch": 2.631037748613178, + "grad_norm": 0.21382808685302734, + "learning_rate": 4.546082872802316e-06, + "loss": 0.9262, + "step": 9724 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.22571536898612976, + "learning_rate": 4.532975672624157e-06, + "loss": 0.9286, + "step": 9726 + }, + { + "epoch": 2.632120146123664, + "grad_norm": 0.211785688996315, + "learning_rate": 4.519886497733855e-06, + "loss": 0.9478, + "step": 9728 + }, + { + "epoch": 2.632661344878907, + "grad_norm": 0.2123601883649826, + "learning_rate": 4.506815353320576e-06, + "loss": 0.9156, + "step": 9730 + }, + { + "epoch": 2.6332025436341495, + "grad_norm": 0.21944217383861542, + "learning_rate": 4.4937622445663575e-06, + "loss": 0.9238, + "step": 9732 + }, + { + "epoch": 2.6337437423893926, + "grad_norm": 0.2095131129026413, + "learning_rate": 4.4807271766460605e-06, + "loss": 0.9028, + "step": 9734 + }, + { + "epoch": 2.634284941144635, + "grad_norm": 0.21231773495674133, + "learning_rate": 4.467710154727422e-06, + "loss": 0.9026, + "step": 9736 + }, + { + "epoch": 2.6348261398998782, + "grad_norm": 0.20916488766670227, + "learning_rate": 4.454711183970983e-06, + "loss": 0.9098, + "step": 9738 + }, + { + "epoch": 2.6353673386551213, + "grad_norm": 0.20952165126800537, + "learning_rate": 4.4417302695301755e-06, + "loss": 0.9433, + "step": 9740 + }, + { + "epoch": 2.635908537410364, + "grad_norm": 0.2100578397512436, + "learning_rate": 4.428767416551227e-06, + "loss": 0.9195, + "step": 9742 + }, + { + "epoch": 2.636449736165607, + "grad_norm": 0.2157282531261444, + "learning_rate": 4.415822630173255e-06, + "loss": 0.9082, + "step": 9744 + }, + { + "epoch": 2.6369909349208496, + "grad_norm": 0.20804545283317566, + "learning_rate": 4.40289591552816e-06, + "loss": 0.8968, + "step": 9746 + }, + { + "epoch": 2.6375321336760926, + "grad_norm": 0.20871908962726593, + "learning_rate": 4.389987277740737e-06, + "loss": 0.9127, + "step": 9748 + }, + { + "epoch": 2.6380733324313352, + "grad_norm": 0.21653971076011658, + "learning_rate": 4.377096721928553e-06, + "loss": 0.9157, + "step": 9750 + }, + { + "epoch": 2.6386145311865783, + "grad_norm": 0.27845850586891174, + "learning_rate": 4.364224253202048e-06, + "loss": 0.9434, + "step": 9752 + }, + { + "epoch": 2.6391557299418213, + "grad_norm": 0.2142685055732727, + "learning_rate": 4.3513698766644795e-06, + "loss": 0.9213, + "step": 9754 + }, + { + "epoch": 2.639696928697064, + "grad_norm": 0.21633394062519073, + "learning_rate": 4.3385335974119475e-06, + "loss": 0.9354, + "step": 9756 + }, + { + "epoch": 2.640238127452307, + "grad_norm": 0.21480292081832886, + "learning_rate": 4.325715420533338e-06, + "loss": 0.9255, + "step": 9758 + }, + { + "epoch": 2.6407793262075496, + "grad_norm": 0.21260100603103638, + "learning_rate": 4.312915351110397e-06, + "loss": 0.908, + "step": 9760 + }, + { + "epoch": 2.6413205249627927, + "grad_norm": 0.21978868544101715, + "learning_rate": 4.300133394217692e-06, + "loss": 0.9152, + "step": 9762 + }, + { + "epoch": 2.6418617237180353, + "grad_norm": 0.22278951108455658, + "learning_rate": 4.2873695549225655e-06, + "loss": 0.9178, + "step": 9764 + }, + { + "epoch": 2.6424029224732783, + "grad_norm": 0.22018632292747498, + "learning_rate": 4.274623838285247e-06, + "loss": 0.9175, + "step": 9766 + }, + { + "epoch": 2.6429441212285214, + "grad_norm": 0.22219111025333405, + "learning_rate": 4.261896249358716e-06, + "loss": 0.9082, + "step": 9768 + }, + { + "epoch": 2.643485319983764, + "grad_norm": 0.21356992423534393, + "learning_rate": 4.249186793188814e-06, + "loss": 0.9329, + "step": 9770 + }, + { + "epoch": 2.6440265187390066, + "grad_norm": 0.21656227111816406, + "learning_rate": 4.236495474814156e-06, + "loss": 0.9095, + "step": 9772 + }, + { + "epoch": 2.6445677174942497, + "grad_norm": 0.20931150019168854, + "learning_rate": 4.2238222992662e-06, + "loss": 0.9141, + "step": 9774 + }, + { + "epoch": 2.6451089162494927, + "grad_norm": 0.21429456770420074, + "learning_rate": 4.21116727156916e-06, + "loss": 0.9266, + "step": 9776 + }, + { + "epoch": 2.6456501150047353, + "grad_norm": 0.21203023195266724, + "learning_rate": 4.198530396740141e-06, + "loss": 0.9027, + "step": 9778 + }, + { + "epoch": 2.6461913137599784, + "grad_norm": 0.2151024341583252, + "learning_rate": 4.185911679788962e-06, + "loss": 0.9332, + "step": 9780 + }, + { + "epoch": 2.6467325125152215, + "grad_norm": 0.20736394822597504, + "learning_rate": 4.173311125718293e-06, + "loss": 0.9311, + "step": 9782 + }, + { + "epoch": 2.647273711270464, + "grad_norm": 0.21102149784564972, + "learning_rate": 4.1607287395236e-06, + "loss": 0.9151, + "step": 9784 + }, + { + "epoch": 2.6478149100257067, + "grad_norm": 0.21661317348480225, + "learning_rate": 4.148164526193127e-06, + "loss": 0.9282, + "step": 9786 + }, + { + "epoch": 2.6483561087809497, + "grad_norm": 0.4423425793647766, + "learning_rate": 4.135618490707926e-06, + "loss": 0.9154, + "step": 9788 + }, + { + "epoch": 2.648897307536193, + "grad_norm": 0.216521754860878, + "learning_rate": 4.123090638041849e-06, + "loss": 0.9244, + "step": 9790 + }, + { + "epoch": 2.6494385062914354, + "grad_norm": 0.22125104069709778, + "learning_rate": 4.110580973161532e-06, + "loss": 0.9161, + "step": 9792 + }, + { + "epoch": 2.6499797050466785, + "grad_norm": 0.21048854291439056, + "learning_rate": 4.098089501026392e-06, + "loss": 0.9132, + "step": 9794 + }, + { + "epoch": 2.6505209038019215, + "grad_norm": 0.20949687063694, + "learning_rate": 4.085616226588651e-06, + "loss": 0.9297, + "step": 9796 + }, + { + "epoch": 2.651062102557164, + "grad_norm": 0.2204485684633255, + "learning_rate": 4.073161154793293e-06, + "loss": 0.9291, + "step": 9798 + }, + { + "epoch": 2.6516033013124067, + "grad_norm": 0.20806816220283508, + "learning_rate": 4.060724290578111e-06, + "loss": 0.9217, + "step": 9800 + }, + { + "epoch": 2.65214450006765, + "grad_norm": 0.20764431357383728, + "learning_rate": 4.048305638873667e-06, + "loss": 0.9258, + "step": 9802 + }, + { + "epoch": 2.652685698822893, + "grad_norm": 0.21075132489204407, + "learning_rate": 4.035905204603307e-06, + "loss": 0.9303, + "step": 9804 + }, + { + "epoch": 2.6532268975781355, + "grad_norm": 0.2081594169139862, + "learning_rate": 4.023522992683148e-06, + "loss": 0.9115, + "step": 9806 + }, + { + "epoch": 2.6537680963333785, + "grad_norm": 0.2170921117067337, + "learning_rate": 4.011159008022087e-06, + "loss": 0.9212, + "step": 9808 + }, + { + "epoch": 2.6543092950886216, + "grad_norm": 0.21863363683223724, + "learning_rate": 3.9988132555217935e-06, + "loss": 0.9362, + "step": 9810 + }, + { + "epoch": 2.654850493843864, + "grad_norm": 0.20957957208156586, + "learning_rate": 3.986485740076712e-06, + "loss": 0.9372, + "step": 9812 + }, + { + "epoch": 2.655391692599107, + "grad_norm": 0.21519392728805542, + "learning_rate": 3.974176466574065e-06, + "loss": 0.9384, + "step": 9814 + }, + { + "epoch": 2.65593289135435, + "grad_norm": 0.21502479910850525, + "learning_rate": 3.961885439893814e-06, + "loss": 0.9206, + "step": 9816 + }, + { + "epoch": 2.656474090109593, + "grad_norm": 0.2135077863931656, + "learning_rate": 3.949612664908725e-06, + "loss": 0.9412, + "step": 9818 + }, + { + "epoch": 2.6570152888648355, + "grad_norm": 0.21586261689662933, + "learning_rate": 3.937358146484288e-06, + "loss": 0.9316, + "step": 9820 + }, + { + "epoch": 2.6575564876200786, + "grad_norm": 0.2121972143650055, + "learning_rate": 3.925121889478795e-06, + "loss": 0.9367, + "step": 9822 + }, + { + "epoch": 2.658097686375321, + "grad_norm": 0.22273525595664978, + "learning_rate": 3.9129038987432644e-06, + "loss": 0.9406, + "step": 9824 + }, + { + "epoch": 2.6586388851305642, + "grad_norm": 0.21330350637435913, + "learning_rate": 3.900704179121506e-06, + "loss": 0.9268, + "step": 9826 + }, + { + "epoch": 2.659180083885807, + "grad_norm": 0.21365892887115479, + "learning_rate": 3.888522735450051e-06, + "loss": 0.9286, + "step": 9828 + }, + { + "epoch": 2.65972128264105, + "grad_norm": 0.21690934896469116, + "learning_rate": 3.876359572558208e-06, + "loss": 0.9327, + "step": 9830 + }, + { + "epoch": 2.660262481396293, + "grad_norm": 0.2091062366962433, + "learning_rate": 3.8642146952680426e-06, + "loss": 0.9071, + "step": 9832 + }, + { + "epoch": 2.6608036801515356, + "grad_norm": 0.22157010436058044, + "learning_rate": 3.852088108394342e-06, + "loss": 0.9316, + "step": 9834 + }, + { + "epoch": 2.6613448789067786, + "grad_norm": 0.20877839624881744, + "learning_rate": 3.839979816744666e-06, + "loss": 0.925, + "step": 9836 + }, + { + "epoch": 2.6618860776620212, + "grad_norm": 0.2144051045179367, + "learning_rate": 3.827889825119329e-06, + "loss": 0.9233, + "step": 9838 + }, + { + "epoch": 2.6624272764172643, + "grad_norm": 0.2103882133960724, + "learning_rate": 3.815818138311372e-06, + "loss": 0.9179, + "step": 9840 + }, + { + "epoch": 2.662968475172507, + "grad_norm": 0.21600565314292908, + "learning_rate": 3.803764761106582e-06, + "loss": 0.938, + "step": 9842 + }, + { + "epoch": 2.66350967392775, + "grad_norm": 0.21050213277339935, + "learning_rate": 3.791729698283486e-06, + "loss": 0.911, + "step": 9844 + }, + { + "epoch": 2.664050872682993, + "grad_norm": 0.21677501499652863, + "learning_rate": 3.7797129546133626e-06, + "loss": 0.9376, + "step": 9846 + }, + { + "epoch": 2.6645920714382356, + "grad_norm": 0.21668270230293274, + "learning_rate": 3.767714534860223e-06, + "loss": 0.9252, + "step": 9848 + }, + { + "epoch": 2.6651332701934787, + "grad_norm": 0.21813543140888214, + "learning_rate": 3.755734443780795e-06, + "loss": 0.9241, + "step": 9850 + }, + { + "epoch": 2.6656744689487213, + "grad_norm": 0.21688979864120483, + "learning_rate": 3.7437726861245747e-06, + "loss": 0.9191, + "step": 9852 + }, + { + "epoch": 2.6662156677039643, + "grad_norm": 0.21569763123989105, + "learning_rate": 3.7318292666337573e-06, + "loss": 0.9343, + "step": 9854 + }, + { + "epoch": 2.666756866459207, + "grad_norm": 0.20640601217746735, + "learning_rate": 3.7199041900432774e-06, + "loss": 0.9198, + "step": 9856 + }, + { + "epoch": 2.66729806521445, + "grad_norm": 0.21720999479293823, + "learning_rate": 3.70799746108082e-06, + "loss": 0.9267, + "step": 9858 + }, + { + "epoch": 2.667839263969693, + "grad_norm": 0.2146361768245697, + "learning_rate": 3.6961090844667588e-06, + "loss": 0.9178, + "step": 9860 + }, + { + "epoch": 2.6683804627249357, + "grad_norm": 0.22774489223957062, + "learning_rate": 3.6842390649142353e-06, + "loss": 0.9199, + "step": 9862 + }, + { + "epoch": 2.6689216614801787, + "grad_norm": 0.22013112902641296, + "learning_rate": 3.672387407129063e-06, + "loss": 0.9384, + "step": 9864 + }, + { + "epoch": 2.6694628602354213, + "grad_norm": 0.21078211069107056, + "learning_rate": 3.660554115809822e-06, + "loss": 0.9375, + "step": 9866 + }, + { + "epoch": 2.6700040589906644, + "grad_norm": 0.20997470617294312, + "learning_rate": 3.6487391956477603e-06, + "loss": 0.931, + "step": 9868 + }, + { + "epoch": 2.670545257745907, + "grad_norm": 0.2088768184185028, + "learning_rate": 3.636942651326919e-06, + "loss": 0.91, + "step": 9870 + }, + { + "epoch": 2.67108645650115, + "grad_norm": 0.22607386112213135, + "learning_rate": 3.6251644875239687e-06, + "loss": 0.9062, + "step": 9872 + }, + { + "epoch": 2.671627655256393, + "grad_norm": 0.21860456466674805, + "learning_rate": 3.6134047089083633e-06, + "loss": 0.909, + "step": 9874 + }, + { + "epoch": 2.6721688540116357, + "grad_norm": 0.2087193876504898, + "learning_rate": 3.6016633201422123e-06, + "loss": 0.913, + "step": 9876 + }, + { + "epoch": 2.672710052766879, + "grad_norm": 0.21565331518650055, + "learning_rate": 3.58994032588037e-06, + "loss": 0.9283, + "step": 9878 + }, + { + "epoch": 2.6732512515221214, + "grad_norm": 0.21783699095249176, + "learning_rate": 3.578235730770385e-06, + "loss": 0.9181, + "step": 9880 + }, + { + "epoch": 2.6737924502773645, + "grad_norm": 0.21517501771450043, + "learning_rate": 3.5665495394525294e-06, + "loss": 0.9221, + "step": 9882 + }, + { + "epoch": 2.674333649032607, + "grad_norm": 0.2097310721874237, + "learning_rate": 3.554881756559747e-06, + "loss": 0.9281, + "step": 9884 + }, + { + "epoch": 2.67487484778785, + "grad_norm": 0.21278955042362213, + "learning_rate": 3.54323238671771e-06, + "loss": 0.9329, + "step": 9886 + }, + { + "epoch": 2.675416046543093, + "grad_norm": 0.20802854001522064, + "learning_rate": 3.531601434544779e-06, + "loss": 0.9295, + "step": 9888 + }, + { + "epoch": 2.675957245298336, + "grad_norm": 0.21119655668735504, + "learning_rate": 3.519988904652005e-06, + "loss": 0.9352, + "step": 9890 + }, + { + "epoch": 2.676498444053579, + "grad_norm": 0.21427717804908752, + "learning_rate": 3.5083948016431723e-06, + "loss": 0.9142, + "step": 9892 + }, + { + "epoch": 2.6770396428088215, + "grad_norm": 0.21735575795173645, + "learning_rate": 3.496819130114715e-06, + "loss": 0.9151, + "step": 9894 + }, + { + "epoch": 2.6775808415640645, + "grad_norm": 0.2185177057981491, + "learning_rate": 3.485261894655789e-06, + "loss": 0.9178, + "step": 9896 + }, + { + "epoch": 2.678122040319307, + "grad_norm": 0.2142525017261505, + "learning_rate": 3.473723099848214e-06, + "loss": 0.9203, + "step": 9898 + }, + { + "epoch": 2.67866323907455, + "grad_norm": 0.23173563182353973, + "learning_rate": 3.462202750266541e-06, + "loss": 0.9508, + "step": 9900 + }, + { + "epoch": 2.6792044378297932, + "grad_norm": 0.22277452051639557, + "learning_rate": 3.4507008504779504e-06, + "loss": 0.8993, + "step": 9902 + }, + { + "epoch": 2.679745636585036, + "grad_norm": 0.2197294682264328, + "learning_rate": 3.4392174050423774e-06, + "loss": 0.9621, + "step": 9904 + }, + { + "epoch": 2.6802868353402785, + "grad_norm": 0.21560606360435486, + "learning_rate": 3.4277524185123868e-06, + "loss": 0.9161, + "step": 9906 + }, + { + "epoch": 2.6808280340955215, + "grad_norm": 0.2142927348613739, + "learning_rate": 3.416305895433242e-06, + "loss": 0.9225, + "step": 9908 + }, + { + "epoch": 2.6813692328507646, + "grad_norm": 0.21276259422302246, + "learning_rate": 3.4048778403429017e-06, + "loss": 0.9243, + "step": 9910 + }, + { + "epoch": 2.681910431606007, + "grad_norm": 0.2092997133731842, + "learning_rate": 3.393468257771976e-06, + "loss": 0.9242, + "step": 9912 + }, + { + "epoch": 2.6824516303612502, + "grad_norm": 0.21740198135375977, + "learning_rate": 3.382077152243762e-06, + "loss": 0.9144, + "step": 9914 + }, + { + "epoch": 2.6829928291164933, + "grad_norm": 0.21233439445495605, + "learning_rate": 3.370704528274249e-06, + "loss": 0.9389, + "step": 9916 + }, + { + "epoch": 2.683534027871736, + "grad_norm": 0.20915208756923676, + "learning_rate": 3.3593503903720904e-06, + "loss": 0.9076, + "step": 9918 + }, + { + "epoch": 2.6840752266269785, + "grad_norm": 0.219877228140831, + "learning_rate": 3.34801474303858e-06, + "loss": 0.9323, + "step": 9920 + }, + { + "epoch": 2.6846164253822216, + "grad_norm": 0.2228020429611206, + "learning_rate": 3.3366975907677347e-06, + "loss": 0.9244, + "step": 9922 + }, + { + "epoch": 2.6851576241374646, + "grad_norm": 0.21531300246715546, + "learning_rate": 3.3253989380461812e-06, + "loss": 0.9223, + "step": 9924 + }, + { + "epoch": 2.6856988228927072, + "grad_norm": 0.21286436915397644, + "learning_rate": 3.3141187893532712e-06, + "loss": 0.9233, + "step": 9926 + }, + { + "epoch": 2.6862400216479503, + "grad_norm": 0.21415787935256958, + "learning_rate": 3.302857149160976e-06, + "loss": 0.9295, + "step": 9928 + }, + { + "epoch": 2.6867812204031933, + "grad_norm": 0.21482230722904205, + "learning_rate": 3.2916140219339532e-06, + "loss": 0.9196, + "step": 9930 + }, + { + "epoch": 2.687322419158436, + "grad_norm": 0.20880477130413055, + "learning_rate": 3.2803894121294975e-06, + "loss": 0.9232, + "step": 9932 + }, + { + "epoch": 2.6878636179136786, + "grad_norm": 0.21388433873653412, + "learning_rate": 3.2691833241975887e-06, + "loss": 0.9084, + "step": 9934 + }, + { + "epoch": 2.6884048166689216, + "grad_norm": 0.21751351654529572, + "learning_rate": 3.2579957625808512e-06, + "loss": 0.909, + "step": 9936 + }, + { + "epoch": 2.6889460154241647, + "grad_norm": 0.21467554569244385, + "learning_rate": 3.246826731714564e-06, + "loss": 0.934, + "step": 9938 + }, + { + "epoch": 2.6894872141794073, + "grad_norm": 0.21572718024253845, + "learning_rate": 3.2356762360266692e-06, + "loss": 0.935, + "step": 9940 + }, + { + "epoch": 2.6900284129346503, + "grad_norm": 0.21419967710971832, + "learning_rate": 3.2245442799377358e-06, + "loss": 0.9253, + "step": 9942 + }, + { + "epoch": 2.690569611689893, + "grad_norm": 0.21684741973876953, + "learning_rate": 3.2134308678610224e-06, + "loss": 0.9155, + "step": 9944 + }, + { + "epoch": 2.691110810445136, + "grad_norm": 0.21216537058353424, + "learning_rate": 3.2023360042023888e-06, + "loss": 0.9127, + "step": 9946 + }, + { + "epoch": 2.6916520092003786, + "grad_norm": 0.21773020923137665, + "learning_rate": 3.191259693360382e-06, + "loss": 0.9229, + "step": 9948 + }, + { + "epoch": 2.6921932079556217, + "grad_norm": 0.21705667674541473, + "learning_rate": 3.180201939726174e-06, + "loss": 0.9436, + "step": 9950 + }, + { + "epoch": 2.6927344067108647, + "grad_norm": 0.21495117247104645, + "learning_rate": 3.169162747683585e-06, + "loss": 0.9051, + "step": 9952 + }, + { + "epoch": 2.6932756054661073, + "grad_norm": 0.22001990675926208, + "learning_rate": 3.15814212160907e-06, + "loss": 0.9392, + "step": 9954 + }, + { + "epoch": 2.6938168042213504, + "grad_norm": 0.20875225961208344, + "learning_rate": 3.1471400658717344e-06, + "loss": 0.9188, + "step": 9956 + }, + { + "epoch": 2.694358002976593, + "grad_norm": 0.2155345231294632, + "learning_rate": 3.1361565848333163e-06, + "loss": 0.9093, + "step": 9958 + }, + { + "epoch": 2.694899201731836, + "grad_norm": 0.21063363552093506, + "learning_rate": 3.125191682848183e-06, + "loss": 0.9058, + "step": 9960 + }, + { + "epoch": 2.6954404004870787, + "grad_norm": 0.2168024480342865, + "learning_rate": 3.114245364263341e-06, + "loss": 0.9138, + "step": 9962 + }, + { + "epoch": 2.6959815992423217, + "grad_norm": 0.21178032457828522, + "learning_rate": 3.1033176334184346e-06, + "loss": 0.9375, + "step": 9964 + }, + { + "epoch": 2.696522797997565, + "grad_norm": 0.21875621378421783, + "learning_rate": 3.0924084946457486e-06, + "loss": 0.9274, + "step": 9966 + }, + { + "epoch": 2.6970639967528074, + "grad_norm": 0.21629124879837036, + "learning_rate": 3.0815179522701632e-06, + "loss": 0.9161, + "step": 9968 + }, + { + "epoch": 2.6976051955080504, + "grad_norm": 0.20874366164207458, + "learning_rate": 3.070646010609213e-06, + "loss": 0.947, + "step": 9970 + }, + { + "epoch": 2.698146394263293, + "grad_norm": 0.2121841162443161, + "learning_rate": 3.0597926739730498e-06, + "loss": 0.9241, + "step": 9972 + }, + { + "epoch": 2.698687593018536, + "grad_norm": 0.21571554243564606, + "learning_rate": 3.0489579466644657e-06, + "loss": 0.9063, + "step": 9974 + }, + { + "epoch": 2.6992287917737787, + "grad_norm": 0.2162920981645584, + "learning_rate": 3.038141832978847e-06, + "loss": 0.9252, + "step": 9976 + }, + { + "epoch": 2.699769990529022, + "grad_norm": 0.22083249688148499, + "learning_rate": 3.0273443372042243e-06, + "loss": 0.9168, + "step": 9978 + }, + { + "epoch": 2.700311189284265, + "grad_norm": 0.2186659723520279, + "learning_rate": 3.0165654636212235e-06, + "loss": 0.9347, + "step": 9980 + }, + { + "epoch": 2.7008523880395074, + "grad_norm": 0.21560966968536377, + "learning_rate": 3.0058052165031104e-06, + "loss": 0.9192, + "step": 9982 + }, + { + "epoch": 2.7013935867947505, + "grad_norm": 0.2172321081161499, + "learning_rate": 2.9950636001157606e-06, + "loss": 0.9193, + "step": 9984 + }, + { + "epoch": 2.701934785549993, + "grad_norm": 0.215291827917099, + "learning_rate": 2.984340618717657e-06, + "loss": 0.9382, + "step": 9986 + }, + { + "epoch": 2.702475984305236, + "grad_norm": 0.21620896458625793, + "learning_rate": 2.97363627655991e-06, + "loss": 0.9278, + "step": 9988 + }, + { + "epoch": 2.703017183060479, + "grad_norm": 0.21557556092739105, + "learning_rate": 2.9629505778862087e-06, + "loss": 0.9108, + "step": 9990 + }, + { + "epoch": 2.703558381815722, + "grad_norm": 0.21696265041828156, + "learning_rate": 2.952283526932892e-06, + "loss": 0.9202, + "step": 9992 + }, + { + "epoch": 2.704099580570965, + "grad_norm": 0.21361365914344788, + "learning_rate": 2.9416351279288656e-06, + "loss": 0.9312, + "step": 9994 + }, + { + "epoch": 2.7046407793262075, + "grad_norm": 0.21512265503406525, + "learning_rate": 2.9310053850956808e-06, + "loss": 0.9072, + "step": 9996 + }, + { + "epoch": 2.7051819780814506, + "grad_norm": 0.21364180743694305, + "learning_rate": 2.9203943026474558e-06, + "loss": 0.9075, + "step": 9998 + }, + { + "epoch": 2.705723176836693, + "grad_norm": 0.2109646499156952, + "learning_rate": 2.909801884790947e-06, + "loss": 0.9248, + "step": 10000 + }, + { + "epoch": 2.7062643755919362, + "grad_norm": 0.21330474317073822, + "learning_rate": 2.899228135725468e-06, + "loss": 0.9086, + "step": 10002 + }, + { + "epoch": 2.706805574347179, + "grad_norm": 0.2131352722644806, + "learning_rate": 2.8886730596429655e-06, + "loss": 0.9198, + "step": 10004 + }, + { + "epoch": 2.707346773102422, + "grad_norm": 0.21582989394664764, + "learning_rate": 2.8781366607279757e-06, + "loss": 0.9272, + "step": 10006 + }, + { + "epoch": 2.707887971857665, + "grad_norm": 0.21782691776752472, + "learning_rate": 2.867618943157635e-06, + "loss": 0.9187, + "step": 10008 + }, + { + "epoch": 2.7084291706129076, + "grad_norm": 0.21956509351730347, + "learning_rate": 2.8571199111016465e-06, + "loss": 0.9253, + "step": 10010 + }, + { + "epoch": 2.7089703693681506, + "grad_norm": 0.20969277620315552, + "learning_rate": 2.8466395687223424e-06, + "loss": 0.9036, + "step": 10012 + }, + { + "epoch": 2.709511568123393, + "grad_norm": 0.2168881595134735, + "learning_rate": 2.836177920174621e-06, + "loss": 0.9057, + "step": 10014 + }, + { + "epoch": 2.7100527668786363, + "grad_norm": 0.22248847782611847, + "learning_rate": 2.82573496960597e-06, + "loss": 0.9244, + "step": 10016 + }, + { + "epoch": 2.710593965633879, + "grad_norm": 0.2195899784564972, + "learning_rate": 2.8153107211564887e-06, + "loss": 0.9275, + "step": 10018 + }, + { + "epoch": 2.711135164389122, + "grad_norm": 0.2160910665988922, + "learning_rate": 2.8049051789588277e-06, + "loss": 0.9224, + "step": 10020 + }, + { + "epoch": 2.711676363144365, + "grad_norm": 0.22572214901447296, + "learning_rate": 2.794518347138253e-06, + "loss": 0.9316, + "step": 10022 + }, + { + "epoch": 2.7122175618996076, + "grad_norm": 0.21667098999023438, + "learning_rate": 2.784150229812582e-06, + "loss": 0.9224, + "step": 10024 + }, + { + "epoch": 2.71275876065485, + "grad_norm": 0.20611009001731873, + "learning_rate": 2.7738008310922383e-06, + "loss": 0.9188, + "step": 10026 + }, + { + "epoch": 2.7132999594100933, + "grad_norm": 0.2115674912929535, + "learning_rate": 2.7634701550802057e-06, + "loss": 0.9252, + "step": 10028 + }, + { + "epoch": 2.7138411581653363, + "grad_norm": 0.21092501282691956, + "learning_rate": 2.7531582058720696e-06, + "loss": 0.9306, + "step": 10030 + }, + { + "epoch": 2.714382356920579, + "grad_norm": 0.20970696210861206, + "learning_rate": 2.7428649875559655e-06, + "loss": 0.9242, + "step": 10032 + }, + { + "epoch": 2.714923555675822, + "grad_norm": 0.21444176137447357, + "learning_rate": 2.7325905042126177e-06, + "loss": 0.9289, + "step": 10034 + }, + { + "epoch": 2.715464754431065, + "grad_norm": 0.2261998951435089, + "learning_rate": 2.7223347599153294e-06, + "loss": 0.9079, + "step": 10036 + }, + { + "epoch": 2.7160059531863077, + "grad_norm": 0.20935611426830292, + "learning_rate": 2.7120977587299425e-06, + "loss": 0.9128, + "step": 10038 + }, + { + "epoch": 2.7165471519415503, + "grad_norm": 0.20867066085338593, + "learning_rate": 2.701879504714905e-06, + "loss": 0.928, + "step": 10040 + }, + { + "epoch": 2.7170883506967933, + "grad_norm": 0.2132885456085205, + "learning_rate": 2.6916800019212095e-06, + "loss": 0.9224, + "step": 10042 + }, + { + "epoch": 2.7176295494520364, + "grad_norm": 0.2161032259464264, + "learning_rate": 2.6814992543924445e-06, + "loss": 0.9671, + "step": 10044 + }, + { + "epoch": 2.718170748207279, + "grad_norm": 0.2119714915752411, + "learning_rate": 2.671337266164714e-06, + "loss": 0.9269, + "step": 10046 + }, + { + "epoch": 2.718711946962522, + "grad_norm": 0.21657121181488037, + "learning_rate": 2.661194041266735e-06, + "loss": 0.9402, + "step": 10048 + }, + { + "epoch": 2.719253145717765, + "grad_norm": 0.2108713537454605, + "learning_rate": 2.6510695837197398e-06, + "loss": 0.8988, + "step": 10050 + }, + { + "epoch": 2.7197943444730077, + "grad_norm": 0.2185131162405014, + "learning_rate": 2.6409638975375737e-06, + "loss": 0.9225, + "step": 10052 + }, + { + "epoch": 2.7203355432282503, + "grad_norm": 0.21219466626644135, + "learning_rate": 2.630876986726588e-06, + "loss": 0.9463, + "step": 10054 + }, + { + "epoch": 2.7208767419834934, + "grad_norm": 0.21359297633171082, + "learning_rate": 2.620808855285728e-06, + "loss": 0.9071, + "step": 10056 + }, + { + "epoch": 2.7214179407387364, + "grad_norm": 0.21115168929100037, + "learning_rate": 2.6107595072064616e-06, + "loss": 0.9272, + "step": 10058 + }, + { + "epoch": 2.721959139493979, + "grad_norm": 0.21979151666164398, + "learning_rate": 2.6007289464728414e-06, + "loss": 0.9275, + "step": 10060 + }, + { + "epoch": 2.722500338249222, + "grad_norm": 0.21622899174690247, + "learning_rate": 2.5907171770614526e-06, + "loss": 0.9165, + "step": 10062 + }, + { + "epoch": 2.723041537004465, + "grad_norm": 0.2232370525598526, + "learning_rate": 2.580724202941437e-06, + "loss": 0.9088, + "step": 10064 + }, + { + "epoch": 2.7235827357597078, + "grad_norm": 0.20912574231624603, + "learning_rate": 2.570750028074498e-06, + "loss": 0.9086, + "step": 10066 + }, + { + "epoch": 2.7241239345149504, + "grad_norm": 0.21085098385810852, + "learning_rate": 2.56079465641485e-06, + "loss": 0.9197, + "step": 10068 + }, + { + "epoch": 2.7246651332701934, + "grad_norm": 0.2122288942337036, + "learning_rate": 2.550858091909286e-06, + "loss": 0.9126, + "step": 10070 + }, + { + "epoch": 2.7252063320254365, + "grad_norm": 0.20993103086948395, + "learning_rate": 2.5409403384971276e-06, + "loss": 0.9293, + "step": 10072 + }, + { + "epoch": 2.725747530780679, + "grad_norm": 0.22006955742835999, + "learning_rate": 2.5310414001102454e-06, + "loss": 0.9306, + "step": 10074 + }, + { + "epoch": 2.726288729535922, + "grad_norm": 0.2129303365945816, + "learning_rate": 2.521161280673051e-06, + "loss": 0.9063, + "step": 10076 + }, + { + "epoch": 2.7268299282911648, + "grad_norm": 0.20847098529338837, + "learning_rate": 2.5112999841025e-06, + "loss": 0.9137, + "step": 10078 + }, + { + "epoch": 2.727371127046408, + "grad_norm": 0.22190417349338531, + "learning_rate": 2.501457514308064e-06, + "loss": 0.9277, + "step": 10080 + }, + { + "epoch": 2.7279123258016504, + "grad_norm": 0.2094324231147766, + "learning_rate": 2.491633875191768e-06, + "loss": 0.9094, + "step": 10082 + }, + { + "epoch": 2.7284535245568935, + "grad_norm": 0.2182997316122055, + "learning_rate": 2.481829070648173e-06, + "loss": 0.9251, + "step": 10084 + }, + { + "epoch": 2.7289947233121365, + "grad_norm": 0.21566352248191833, + "learning_rate": 2.472043104564381e-06, + "loss": 0.938, + "step": 10086 + }, + { + "epoch": 2.729535922067379, + "grad_norm": 0.2203526496887207, + "learning_rate": 2.462275980819989e-06, + "loss": 0.9048, + "step": 10088 + }, + { + "epoch": 2.730077120822622, + "grad_norm": 0.21169818937778473, + "learning_rate": 2.4525277032871662e-06, + "loss": 0.925, + "step": 10090 + }, + { + "epoch": 2.730618319577865, + "grad_norm": 0.21208597719669342, + "learning_rate": 2.4427982758305932e-06, + "loss": 0.9323, + "step": 10092 + }, + { + "epoch": 2.731159518333108, + "grad_norm": 0.21410343050956726, + "learning_rate": 2.4330877023074684e-06, + "loss": 0.9346, + "step": 10094 + }, + { + "epoch": 2.7317007170883505, + "grad_norm": 0.22480835020542145, + "learning_rate": 2.4233959865675227e-06, + "loss": 0.9231, + "step": 10096 + }, + { + "epoch": 2.7322419158435935, + "grad_norm": 0.2164887636899948, + "learning_rate": 2.413723132453022e-06, + "loss": 0.8748, + "step": 10098 + }, + { + "epoch": 2.7327831145988366, + "grad_norm": 0.21500205993652344, + "learning_rate": 2.4040691437987482e-06, + "loss": 0.9197, + "step": 10100 + }, + { + "epoch": 2.733324313354079, + "grad_norm": 0.21443840861320496, + "learning_rate": 2.394434024431985e-06, + "loss": 0.9348, + "step": 10102 + }, + { + "epoch": 2.7338655121093223, + "grad_norm": 0.21349507570266724, + "learning_rate": 2.3848177781725655e-06, + "loss": 0.9333, + "step": 10104 + }, + { + "epoch": 2.734406710864565, + "grad_norm": 0.20747917890548706, + "learning_rate": 2.3752204088328177e-06, + "loss": 0.9166, + "step": 10106 + }, + { + "epoch": 2.734947909619808, + "grad_norm": 0.2029200941324234, + "learning_rate": 2.365641920217593e-06, + "loss": 0.9249, + "step": 10108 + }, + { + "epoch": 2.7354891083750505, + "grad_norm": 0.2101501077413559, + "learning_rate": 2.3560823161242695e-06, + "loss": 0.8995, + "step": 10110 + }, + { + "epoch": 2.7360303071302936, + "grad_norm": 0.21798282861709595, + "learning_rate": 2.3465416003427223e-06, + "loss": 0.9162, + "step": 10112 + }, + { + "epoch": 2.7365715058855367, + "grad_norm": 0.2158065140247345, + "learning_rate": 2.337019776655347e-06, + "loss": 0.9591, + "step": 10114 + }, + { + "epoch": 2.7371127046407793, + "grad_norm": 0.20708400011062622, + "learning_rate": 2.3275168488370412e-06, + "loss": 0.9435, + "step": 10116 + }, + { + "epoch": 2.7376539033960223, + "grad_norm": 0.21228860318660736, + "learning_rate": 2.3180328206552195e-06, + "loss": 0.9278, + "step": 10118 + }, + { + "epoch": 2.738195102151265, + "grad_norm": 0.2101002037525177, + "learning_rate": 2.3085676958698012e-06, + "loss": 0.9289, + "step": 10120 + }, + { + "epoch": 2.738736300906508, + "grad_norm": 0.21251070499420166, + "learning_rate": 2.2991214782332237e-06, + "loss": 0.9047, + "step": 10122 + }, + { + "epoch": 2.7392774996617506, + "grad_norm": 0.21508239209651947, + "learning_rate": 2.289694171490392e-06, + "loss": 0.9372, + "step": 10124 + }, + { + "epoch": 2.7398186984169937, + "grad_norm": 0.21714644134044647, + "learning_rate": 2.2802857793787603e-06, + "loss": 0.9188, + "step": 10126 + }, + { + "epoch": 2.7403598971722367, + "grad_norm": 0.20984797179698944, + "learning_rate": 2.270896305628245e-06, + "loss": 0.907, + "step": 10128 + }, + { + "epoch": 2.7409010959274793, + "grad_norm": 0.2118617445230484, + "learning_rate": 2.2615257539612854e-06, + "loss": 0.8996, + "step": 10130 + }, + { + "epoch": 2.7414422946827224, + "grad_norm": 0.212440624833107, + "learning_rate": 2.2521741280928153e-06, + "loss": 0.9265, + "step": 10132 + }, + { + "epoch": 2.741983493437965, + "grad_norm": 0.22519421577453613, + "learning_rate": 2.242841431730275e-06, + "loss": 0.9266, + "step": 10134 + }, + { + "epoch": 2.742524692193208, + "grad_norm": 0.2083277851343155, + "learning_rate": 2.2335276685735653e-06, + "loss": 0.9103, + "step": 10136 + }, + { + "epoch": 2.7430658909484507, + "grad_norm": 0.21476352214813232, + "learning_rate": 2.224232842315116e-06, + "loss": 0.9223, + "step": 10138 + }, + { + "epoch": 2.7436070897036937, + "grad_norm": 0.2155662477016449, + "learning_rate": 2.2149569566398407e-06, + "loss": 0.9343, + "step": 10140 + }, + { + "epoch": 2.7441482884589368, + "grad_norm": 0.21446378529071808, + "learning_rate": 2.205700015225126e-06, + "loss": 0.9201, + "step": 10142 + }, + { + "epoch": 2.7446894872141794, + "grad_norm": 0.21759900450706482, + "learning_rate": 2.1964620217408915e-06, + "loss": 0.9148, + "step": 10144 + }, + { + "epoch": 2.7452306859694224, + "grad_norm": 0.2084120512008667, + "learning_rate": 2.1872429798494908e-06, + "loss": 0.9188, + "step": 10146 + }, + { + "epoch": 2.745771884724665, + "grad_norm": 0.21628348529338837, + "learning_rate": 2.178042893205806e-06, + "loss": 0.934, + "step": 10148 + }, + { + "epoch": 2.746313083479908, + "grad_norm": 0.2113163024187088, + "learning_rate": 2.1688617654571697e-06, + "loss": 0.9162, + "step": 10150 + }, + { + "epoch": 2.7468542822351507, + "grad_norm": 0.21270491182804108, + "learning_rate": 2.159699600243442e-06, + "loss": 0.9364, + "step": 10152 + }, + { + "epoch": 2.7473954809903938, + "grad_norm": 0.20905953645706177, + "learning_rate": 2.150556401196907e-06, + "loss": 0.9085, + "step": 10154 + }, + { + "epoch": 2.747936679745637, + "grad_norm": 0.21128450334072113, + "learning_rate": 2.1414321719424036e-06, + "loss": 0.9271, + "step": 10156 + }, + { + "epoch": 2.7484778785008794, + "grad_norm": 0.2195645272731781, + "learning_rate": 2.1323269160971773e-06, + "loss": 0.9227, + "step": 10158 + }, + { + "epoch": 2.749019077256122, + "grad_norm": 0.20804567635059357, + "learning_rate": 2.1232406372709957e-06, + "loss": 0.9055, + "step": 10160 + }, + { + "epoch": 2.749560276011365, + "grad_norm": 0.21035341918468475, + "learning_rate": 2.1141733390661e-06, + "loss": 0.917, + "step": 10162 + }, + { + "epoch": 2.750101474766608, + "grad_norm": 0.2167172133922577, + "learning_rate": 2.1051250250771817e-06, + "loss": 0.9327, + "step": 10164 + }, + { + "epoch": 2.7506426735218508, + "grad_norm": 0.21498580276966095, + "learning_rate": 2.096095698891426e-06, + "loss": 0.9201, + "step": 10166 + }, + { + "epoch": 2.751183872277094, + "grad_norm": 0.21536193788051605, + "learning_rate": 2.0870853640884922e-06, + "loss": 0.9339, + "step": 10168 + }, + { + "epoch": 2.751725071032337, + "grad_norm": 0.21404580771923065, + "learning_rate": 2.0780940242405124e-06, + "loss": 0.929, + "step": 10170 + }, + { + "epoch": 2.7522662697875795, + "grad_norm": 0.21269471943378448, + "learning_rate": 2.0691216829120617e-06, + "loss": 0.9096, + "step": 10172 + }, + { + "epoch": 2.752807468542822, + "grad_norm": 0.21956828236579895, + "learning_rate": 2.060168343660218e-06, + "loss": 0.9244, + "step": 10174 + }, + { + "epoch": 2.753348667298065, + "grad_norm": 0.21703964471817017, + "learning_rate": 2.051234010034492e-06, + "loss": 0.9079, + "step": 10176 + }, + { + "epoch": 2.753889866053308, + "grad_norm": 0.21299919486045837, + "learning_rate": 2.042318685576905e-06, + "loss": 0.91, + "step": 10178 + }, + { + "epoch": 2.754431064808551, + "grad_norm": 0.21364818513393402, + "learning_rate": 2.0334223738218918e-06, + "loss": 0.9268, + "step": 10180 + }, + { + "epoch": 2.754972263563794, + "grad_norm": 0.22158017754554749, + "learning_rate": 2.024545078296386e-06, + "loss": 0.9099, + "step": 10182 + }, + { + "epoch": 2.755513462319037, + "grad_norm": 0.2174145132303238, + "learning_rate": 2.0156868025197617e-06, + "loss": 0.9142, + "step": 10184 + }, + { + "epoch": 2.7560546610742795, + "grad_norm": 0.21465855836868286, + "learning_rate": 2.00684755000386e-06, + "loss": 0.9257, + "step": 10186 + }, + { + "epoch": 2.756595859829522, + "grad_norm": 0.21879304945468903, + "learning_rate": 1.9980273242529823e-06, + "loss": 0.9253, + "step": 10188 + }, + { + "epoch": 2.757137058584765, + "grad_norm": 0.2039138674736023, + "learning_rate": 1.989226128763888e-06, + "loss": 0.8929, + "step": 10190 + }, + { + "epoch": 2.7576782573400083, + "grad_norm": 0.2115360051393509, + "learning_rate": 1.9804439670257914e-06, + "loss": 0.934, + "step": 10192 + }, + { + "epoch": 2.758219456095251, + "grad_norm": 0.20571130514144897, + "learning_rate": 1.9716808425203513e-06, + "loss": 0.9321, + "step": 10194 + }, + { + "epoch": 2.758760654850494, + "grad_norm": 0.21286971867084503, + "learning_rate": 1.9629367587216895e-06, + "loss": 0.9323, + "step": 10196 + }, + { + "epoch": 2.7593018536057365, + "grad_norm": 0.21485485136508942, + "learning_rate": 1.9542117190963662e-06, + "loss": 0.9259, + "step": 10198 + }, + { + "epoch": 2.7598430523609796, + "grad_norm": 0.20296654105186462, + "learning_rate": 1.945505727103425e-06, + "loss": 0.9211, + "step": 10200 + } + ], + "logging_steps": 2, + "max_steps": 11088, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.23970508885201e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}