diff --git "a/l2-7b-eu/checkpoint-1500/trainer_state.json" "b/l2-7b-eu/checkpoint-1500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/l2-7b-eu/checkpoint-1500/trainer_state.json" @@ -0,0 +1,5291 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4058990664321472, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00027059937762143147, + "grad_norm": 4.086390018463135, + "learning_rate": 0.0, + "loss": 3.2754, + "step": 1 + }, + { + "epoch": 0.0005411987552428629, + "grad_norm": 3.758815288543701, + "learning_rate": 9.017132551848513e-08, + "loss": 3.2863, + "step": 2 + }, + { + "epoch": 0.0010823975104857259, + "grad_norm": 3.8250608444213867, + "learning_rate": 2.705139765554554e-07, + "loss": 3.3425, + "step": 4 + }, + { + "epoch": 0.0016235962657285888, + "grad_norm": 3.8092095851898193, + "learning_rate": 4.5085662759242564e-07, + "loss": 3.3165, + "step": 6 + }, + { + "epoch": 0.0021647950209714517, + "grad_norm": 3.7621052265167236, + "learning_rate": 6.311992786293959e-07, + "loss": 3.3295, + "step": 8 + }, + { + "epoch": 0.002705993776214315, + "grad_norm": 3.4136276245117188, + "learning_rate": 8.115419296663661e-07, + "loss": 3.3073, + "step": 10 + }, + { + "epoch": 0.0032471925314571776, + "grad_norm": 2.855100393295288, + "learning_rate": 9.918845807033363e-07, + "loss": 3.3031, + "step": 12 + }, + { + "epoch": 0.0037883912867000408, + "grad_norm": 2.491767406463623, + "learning_rate": 1.1722272317403068e-06, + "loss": 3.2943, + "step": 14 + }, + { + "epoch": 0.0043295900419429035, + "grad_norm": 2.359778642654419, + "learning_rate": 1.3525698827772768e-06, + "loss": 3.2622, + "step": 16 + }, + { + "epoch": 0.004870788797185766, + "grad_norm": 2.037504196166992, + "learning_rate": 1.5329125338142473e-06, + "loss": 3.239, + "step": 18 + }, + { + "epoch": 0.00541198755242863, + "grad_norm": 2.8542497158050537, + "learning_rate": 1.7132551848512173e-06, + "loss": 3.2031, + "step": 20 + }, + { + "epoch": 0.0059531863076714925, + "grad_norm": 2.297046661376953, + "learning_rate": 1.8935978358881876e-06, + "loss": 3.1721, + "step": 22 + }, + { + "epoch": 0.006494385062914355, + "grad_norm": 2.2149112224578857, + "learning_rate": 2.0739404869251576e-06, + "loss": 3.121, + "step": 24 + }, + { + "epoch": 0.007035583818157218, + "grad_norm": 1.8048591613769531, + "learning_rate": 2.254283137962128e-06, + "loss": 3.0857, + "step": 26 + }, + { + "epoch": 0.0075767825734000815, + "grad_norm": 1.7466434240341187, + "learning_rate": 2.4346257889990986e-06, + "loss": 3.0489, + "step": 28 + }, + { + "epoch": 0.008117981328642944, + "grad_norm": 2.1722524166107178, + "learning_rate": 2.6149684400360686e-06, + "loss": 3.0016, + "step": 30 + }, + { + "epoch": 0.008659180083885807, + "grad_norm": 1.364578366279602, + "learning_rate": 2.7953110910730386e-06, + "loss": 2.9587, + "step": 32 + }, + { + "epoch": 0.00920037883912867, + "grad_norm": 1.5823427438735962, + "learning_rate": 2.9756537421100095e-06, + "loss": 2.931, + "step": 34 + }, + { + "epoch": 0.009741577594371532, + "grad_norm": 1.2367908954620361, + "learning_rate": 3.1559963931469796e-06, + "loss": 2.8953, + "step": 36 + }, + { + "epoch": 0.010282776349614395, + "grad_norm": 1.0437366962432861, + "learning_rate": 3.3363390441839496e-06, + "loss": 2.8412, + "step": 38 + }, + { + "epoch": 0.01082397510485726, + "grad_norm": 1.081803798675537, + "learning_rate": 3.5166816952209197e-06, + "loss": 2.7832, + "step": 40 + }, + { + "epoch": 0.011365173860100122, + "grad_norm": 0.9715840220451355, + "learning_rate": 3.69702434625789e-06, + "loss": 2.7729, + "step": 42 + }, + { + "epoch": 0.011906372615342985, + "grad_norm": 0.8603936433792114, + "learning_rate": 3.877366997294861e-06, + "loss": 2.6904, + "step": 44 + }, + { + "epoch": 0.012447571370585848, + "grad_norm": 0.8236231803894043, + "learning_rate": 4.057709648331831e-06, + "loss": 2.6908, + "step": 46 + }, + { + "epoch": 0.01298877012582871, + "grad_norm": 0.7681186199188232, + "learning_rate": 4.2380522993688015e-06, + "loss": 2.6212, + "step": 48 + }, + { + "epoch": 0.013529968881071573, + "grad_norm": 0.8002827167510986, + "learning_rate": 4.4183949504057716e-06, + "loss": 2.6035, + "step": 50 + }, + { + "epoch": 0.014071167636314436, + "grad_norm": 0.6757120490074158, + "learning_rate": 4.598737601442742e-06, + "loss": 2.595, + "step": 52 + }, + { + "epoch": 0.014612366391557299, + "grad_norm": 0.6619369387626648, + "learning_rate": 4.779080252479712e-06, + "loss": 2.5522, + "step": 54 + }, + { + "epoch": 0.015153565146800163, + "grad_norm": 0.6247105598449707, + "learning_rate": 4.959422903516682e-06, + "loss": 2.5079, + "step": 56 + }, + { + "epoch": 0.015694763902043024, + "grad_norm": 0.6559263467788696, + "learning_rate": 5.139765554553652e-06, + "loss": 2.5009, + "step": 58 + }, + { + "epoch": 0.01623596265728589, + "grad_norm": 0.6590877175331116, + "learning_rate": 5.320108205590623e-06, + "loss": 2.4648, + "step": 60 + }, + { + "epoch": 0.01677716141252875, + "grad_norm": 0.6045516133308411, + "learning_rate": 5.500450856627593e-06, + "loss": 2.421, + "step": 62 + }, + { + "epoch": 0.017318360167771614, + "grad_norm": 0.6533932089805603, + "learning_rate": 5.680793507664563e-06, + "loss": 2.3966, + "step": 64 + }, + { + "epoch": 0.01785955892301448, + "grad_norm": 0.6478094458580017, + "learning_rate": 5.861136158701533e-06, + "loss": 2.3903, + "step": 66 + }, + { + "epoch": 0.01840075767825734, + "grad_norm": 0.7349300980567932, + "learning_rate": 6.041478809738504e-06, + "loss": 2.3552, + "step": 68 + }, + { + "epoch": 0.018941956433500204, + "grad_norm": 0.6454821825027466, + "learning_rate": 6.221821460775474e-06, + "loss": 2.3262, + "step": 70 + }, + { + "epoch": 0.019483155188743065, + "grad_norm": 0.7321672439575195, + "learning_rate": 6.402164111812444e-06, + "loss": 2.3197, + "step": 72 + }, + { + "epoch": 0.02002435394398593, + "grad_norm": 0.7664237022399902, + "learning_rate": 6.582506762849414e-06, + "loss": 2.2992, + "step": 74 + }, + { + "epoch": 0.02056555269922879, + "grad_norm": 0.6843811869621277, + "learning_rate": 6.762849413886384e-06, + "loss": 2.2927, + "step": 76 + }, + { + "epoch": 0.021106751454471655, + "grad_norm": 0.7199612259864807, + "learning_rate": 6.9431920649233556e-06, + "loss": 2.2525, + "step": 78 + }, + { + "epoch": 0.02164795020971452, + "grad_norm": 0.778446614742279, + "learning_rate": 7.123534715960326e-06, + "loss": 2.2267, + "step": 80 + }, + { + "epoch": 0.02218914896495738, + "grad_norm": 0.9287930727005005, + "learning_rate": 7.303877366997296e-06, + "loss": 2.2206, + "step": 82 + }, + { + "epoch": 0.022730347720200245, + "grad_norm": 1.033782958984375, + "learning_rate": 7.484220018034266e-06, + "loss": 2.2063, + "step": 84 + }, + { + "epoch": 0.023271546475443106, + "grad_norm": 1.0132615566253662, + "learning_rate": 7.664562669071236e-06, + "loss": 2.1677, + "step": 86 + }, + { + "epoch": 0.02381274523068597, + "grad_norm": 0.9043529033660889, + "learning_rate": 7.844905320108207e-06, + "loss": 2.1696, + "step": 88 + }, + { + "epoch": 0.02435394398592883, + "grad_norm": 0.6718290448188782, + "learning_rate": 8.025247971145176e-06, + "loss": 2.1492, + "step": 90 + }, + { + "epoch": 0.024895142741171696, + "grad_norm": 0.9615944027900696, + "learning_rate": 8.205590622182147e-06, + "loss": 2.1452, + "step": 92 + }, + { + "epoch": 0.02543634149641456, + "grad_norm": 0.9435996413230896, + "learning_rate": 8.385933273219116e-06, + "loss": 2.1098, + "step": 94 + }, + { + "epoch": 0.02597754025165742, + "grad_norm": 0.7614261507987976, + "learning_rate": 8.566275924256087e-06, + "loss": 2.1286, + "step": 96 + }, + { + "epoch": 0.026518739006900285, + "grad_norm": 0.9416339993476868, + "learning_rate": 8.746618575293058e-06, + "loss": 2.1092, + "step": 98 + }, + { + "epoch": 0.027059937762143146, + "grad_norm": 0.9229443073272705, + "learning_rate": 8.926961226330027e-06, + "loss": 2.0932, + "step": 100 + }, + { + "epoch": 0.02760113651738601, + "grad_norm": 0.7135593295097351, + "learning_rate": 9.107303877366998e-06, + "loss": 2.0699, + "step": 102 + }, + { + "epoch": 0.028142335272628872, + "grad_norm": 1.0263723134994507, + "learning_rate": 9.287646528403967e-06, + "loss": 2.0445, + "step": 104 + }, + { + "epoch": 0.028683534027871736, + "grad_norm": 1.0300300121307373, + "learning_rate": 9.467989179440938e-06, + "loss": 2.0463, + "step": 106 + }, + { + "epoch": 0.029224732783114597, + "grad_norm": 0.8331286311149597, + "learning_rate": 9.648331830477909e-06, + "loss": 2.0381, + "step": 108 + }, + { + "epoch": 0.02976593153835746, + "grad_norm": 0.7501435875892639, + "learning_rate": 9.828674481514878e-06, + "loss": 2.0411, + "step": 110 + }, + { + "epoch": 0.030307130293600326, + "grad_norm": 0.6895191073417664, + "learning_rate": 1.0009017132551849e-05, + "loss": 2.0475, + "step": 112 + }, + { + "epoch": 0.030848329048843187, + "grad_norm": 0.95854252576828, + "learning_rate": 1.018935978358882e-05, + "loss": 2.0071, + "step": 114 + }, + { + "epoch": 0.03138952780408605, + "grad_norm": 1.1303929090499878, + "learning_rate": 1.036970243462579e-05, + "loss": 2.0008, + "step": 116 + }, + { + "epoch": 0.031930726559328916, + "grad_norm": 0.7708876729011536, + "learning_rate": 1.055004508566276e-05, + "loss": 2.0061, + "step": 118 + }, + { + "epoch": 0.03247192531457178, + "grad_norm": 0.9773860573768616, + "learning_rate": 1.073038773669973e-05, + "loss": 2.0096, + "step": 120 + }, + { + "epoch": 0.03301312406981464, + "grad_norm": 1.118385910987854, + "learning_rate": 1.09107303877367e-05, + "loss": 1.9939, + "step": 122 + }, + { + "epoch": 0.0335543228250575, + "grad_norm": 0.7215014696121216, + "learning_rate": 1.109107303877367e-05, + "loss": 1.9515, + "step": 124 + }, + { + "epoch": 0.03409552158030037, + "grad_norm": 0.9696834683418274, + "learning_rate": 1.1271415689810642e-05, + "loss": 1.9639, + "step": 126 + }, + { + "epoch": 0.03463672033554323, + "grad_norm": 0.945482611656189, + "learning_rate": 1.1451758340847611e-05, + "loss": 1.9397, + "step": 128 + }, + { + "epoch": 0.03517791909078609, + "grad_norm": 0.7454535365104675, + "learning_rate": 1.1632100991884582e-05, + "loss": 1.9353, + "step": 130 + }, + { + "epoch": 0.03571911784602896, + "grad_norm": 0.7824187278747559, + "learning_rate": 1.1812443642921551e-05, + "loss": 1.9227, + "step": 132 + }, + { + "epoch": 0.03626031660127182, + "grad_norm": 0.7939879894256592, + "learning_rate": 1.1992786293958522e-05, + "loss": 1.9126, + "step": 134 + }, + { + "epoch": 0.03680151535651468, + "grad_norm": 0.7776147723197937, + "learning_rate": 1.2173128944995491e-05, + "loss": 1.9002, + "step": 136 + }, + { + "epoch": 0.03734271411175754, + "grad_norm": 0.6580236554145813, + "learning_rate": 1.2353471596032462e-05, + "loss": 1.9121, + "step": 138 + }, + { + "epoch": 0.03788391286700041, + "grad_norm": 0.7200301289558411, + "learning_rate": 1.2533814247069433e-05, + "loss": 1.8885, + "step": 140 + }, + { + "epoch": 0.03842511162224327, + "grad_norm": 0.7958497405052185, + "learning_rate": 1.2714156898106402e-05, + "loss": 1.9095, + "step": 142 + }, + { + "epoch": 0.03896631037748613, + "grad_norm": 0.9120681881904602, + "learning_rate": 1.2894499549143375e-05, + "loss": 1.884, + "step": 144 + }, + { + "epoch": 0.039507509132729, + "grad_norm": 0.8108247518539429, + "learning_rate": 1.3074842200180342e-05, + "loss": 1.8656, + "step": 146 + }, + { + "epoch": 0.04004870788797186, + "grad_norm": 0.7010449171066284, + "learning_rate": 1.3255184851217315e-05, + "loss": 1.8635, + "step": 148 + }, + { + "epoch": 0.04058990664321472, + "grad_norm": 0.8178524374961853, + "learning_rate": 1.3435527502254284e-05, + "loss": 1.8933, + "step": 150 + }, + { + "epoch": 0.04113110539845758, + "grad_norm": 1.0447405576705933, + "learning_rate": 1.3615870153291255e-05, + "loss": 1.8523, + "step": 152 + }, + { + "epoch": 0.04167230415370045, + "grad_norm": 0.8516271710395813, + "learning_rate": 1.3796212804328224e-05, + "loss": 1.8528, + "step": 154 + }, + { + "epoch": 0.04221350290894331, + "grad_norm": 0.8437328934669495, + "learning_rate": 1.3976555455365195e-05, + "loss": 1.861, + "step": 156 + }, + { + "epoch": 0.04275470166418617, + "grad_norm": 0.851265549659729, + "learning_rate": 1.4156898106402164e-05, + "loss": 1.8315, + "step": 158 + }, + { + "epoch": 0.04329590041942904, + "grad_norm": 0.7337156534194946, + "learning_rate": 1.4337240757439135e-05, + "loss": 1.8354, + "step": 160 + }, + { + "epoch": 0.0438370991746719, + "grad_norm": 0.9754143357276917, + "learning_rate": 1.4517583408476104e-05, + "loss": 1.8252, + "step": 162 + }, + { + "epoch": 0.04437829792991476, + "grad_norm": 0.6172115802764893, + "learning_rate": 1.4697926059513075e-05, + "loss": 1.8094, + "step": 164 + }, + { + "epoch": 0.04491949668515762, + "grad_norm": 0.8304158449172974, + "learning_rate": 1.4878268710550044e-05, + "loss": 1.8078, + "step": 166 + }, + { + "epoch": 0.04546069544040049, + "grad_norm": 0.6388853788375854, + "learning_rate": 1.5058611361587017e-05, + "loss": 1.8106, + "step": 168 + }, + { + "epoch": 0.04600189419564335, + "grad_norm": 0.743231475353241, + "learning_rate": 1.5238954012623984e-05, + "loss": 1.8144, + "step": 170 + }, + { + "epoch": 0.04654309295088621, + "grad_norm": 0.6442289352416992, + "learning_rate": 1.5419296663660955e-05, + "loss": 1.7831, + "step": 172 + }, + { + "epoch": 0.04708429170612908, + "grad_norm": 0.6877187490463257, + "learning_rate": 1.559963931469793e-05, + "loss": 1.8043, + "step": 174 + }, + { + "epoch": 0.04762549046137194, + "grad_norm": 0.9389640688896179, + "learning_rate": 1.5779981965734897e-05, + "loss": 1.7869, + "step": 176 + }, + { + "epoch": 0.0481666892166148, + "grad_norm": 1.0456589460372925, + "learning_rate": 1.5960324616771868e-05, + "loss": 1.7681, + "step": 178 + }, + { + "epoch": 0.04870788797185766, + "grad_norm": 0.9617791175842285, + "learning_rate": 1.614066726780884e-05, + "loss": 1.7668, + "step": 180 + }, + { + "epoch": 0.04924908672710053, + "grad_norm": 0.9334360361099243, + "learning_rate": 1.632100991884581e-05, + "loss": 1.7893, + "step": 182 + }, + { + "epoch": 0.04979028548234339, + "grad_norm": 0.8952531814575195, + "learning_rate": 1.6501352569882777e-05, + "loss": 1.7758, + "step": 184 + }, + { + "epoch": 0.05033148423758625, + "grad_norm": 0.8544924855232239, + "learning_rate": 1.6681695220919748e-05, + "loss": 1.793, + "step": 186 + }, + { + "epoch": 0.05087268299282912, + "grad_norm": 0.7782765030860901, + "learning_rate": 1.686203787195672e-05, + "loss": 1.768, + "step": 188 + }, + { + "epoch": 0.05141388174807198, + "grad_norm": 0.7119695544242859, + "learning_rate": 1.704238052299369e-05, + "loss": 1.7685, + "step": 190 + }, + { + "epoch": 0.05195508050331484, + "grad_norm": 0.9119647145271301, + "learning_rate": 1.7222723174030657e-05, + "loss": 1.7706, + "step": 192 + }, + { + "epoch": 0.0524962792585577, + "grad_norm": 0.6414957642555237, + "learning_rate": 1.7403065825067628e-05, + "loss": 1.7626, + "step": 194 + }, + { + "epoch": 0.05303747801380057, + "grad_norm": 0.8069677352905273, + "learning_rate": 1.75834084761046e-05, + "loss": 1.7423, + "step": 196 + }, + { + "epoch": 0.05357867676904343, + "grad_norm": 0.6549937725067139, + "learning_rate": 1.776375112714157e-05, + "loss": 1.7428, + "step": 198 + }, + { + "epoch": 0.05411987552428629, + "grad_norm": 0.8064024448394775, + "learning_rate": 1.7944093778178538e-05, + "loss": 1.7448, + "step": 200 + }, + { + "epoch": 0.054661074279529154, + "grad_norm": 0.7182701826095581, + "learning_rate": 1.8124436429215512e-05, + "loss": 1.7248, + "step": 202 + }, + { + "epoch": 0.05520227303477202, + "grad_norm": 0.6997919678688049, + "learning_rate": 1.830477908025248e-05, + "loss": 1.7281, + "step": 204 + }, + { + "epoch": 0.05574347179001488, + "grad_norm": 0.7071277499198914, + "learning_rate": 1.848512173128945e-05, + "loss": 1.714, + "step": 206 + }, + { + "epoch": 0.056284670545257744, + "grad_norm": 0.6344273090362549, + "learning_rate": 1.866546438232642e-05, + "loss": 1.7463, + "step": 208 + }, + { + "epoch": 0.05682586930050061, + "grad_norm": 0.7192733883857727, + "learning_rate": 1.8845807033363392e-05, + "loss": 1.737, + "step": 210 + }, + { + "epoch": 0.05736706805574347, + "grad_norm": 0.7418521642684937, + "learning_rate": 1.9026149684400363e-05, + "loss": 1.7197, + "step": 212 + }, + { + "epoch": 0.057908266810986334, + "grad_norm": 0.875845730304718, + "learning_rate": 1.920649233543733e-05, + "loss": 1.6968, + "step": 214 + }, + { + "epoch": 0.058449465566229195, + "grad_norm": 0.7394037842750549, + "learning_rate": 1.9386834986474305e-05, + "loss": 1.7051, + "step": 216 + }, + { + "epoch": 0.05899066432147206, + "grad_norm": 0.6689572930335999, + "learning_rate": 1.9567177637511272e-05, + "loss": 1.7152, + "step": 218 + }, + { + "epoch": 0.05953186307671492, + "grad_norm": 0.7955539226531982, + "learning_rate": 1.9747520288548243e-05, + "loss": 1.7136, + "step": 220 + }, + { + "epoch": 0.060073061831957784, + "grad_norm": 0.7005388140678406, + "learning_rate": 1.9927862939585214e-05, + "loss": 1.7152, + "step": 222 + }, + { + "epoch": 0.06061426058720065, + "grad_norm": 0.6205731630325317, + "learning_rate": 2.0108205590622185e-05, + "loss": 1.6901, + "step": 224 + }, + { + "epoch": 0.06115545934244351, + "grad_norm": 0.7079929709434509, + "learning_rate": 2.0288548241659152e-05, + "loss": 1.6905, + "step": 226 + }, + { + "epoch": 0.061696658097686374, + "grad_norm": 0.6871302723884583, + "learning_rate": 2.0468890892696123e-05, + "loss": 1.6867, + "step": 228 + }, + { + "epoch": 0.062237856852929235, + "grad_norm": 0.7172162532806396, + "learning_rate": 2.0649233543733094e-05, + "loss": 1.685, + "step": 230 + }, + { + "epoch": 0.0627790556081721, + "grad_norm": 0.6729004979133606, + "learning_rate": 2.0829576194770065e-05, + "loss": 1.6961, + "step": 232 + }, + { + "epoch": 0.06332025436341496, + "grad_norm": 0.7335099577903748, + "learning_rate": 2.1009918845807033e-05, + "loss": 1.6797, + "step": 234 + }, + { + "epoch": 0.06386145311865783, + "grad_norm": 0.6398060321807861, + "learning_rate": 2.1190261496844003e-05, + "loss": 1.7037, + "step": 236 + }, + { + "epoch": 0.0644026518739007, + "grad_norm": 0.7026365399360657, + "learning_rate": 2.1370604147880974e-05, + "loss": 1.6698, + "step": 238 + }, + { + "epoch": 0.06494385062914355, + "grad_norm": 0.7972332239151001, + "learning_rate": 2.1550946798917945e-05, + "loss": 1.6866, + "step": 240 + }, + { + "epoch": 0.06548504938438642, + "grad_norm": 0.7363021969795227, + "learning_rate": 2.1731289449954913e-05, + "loss": 1.6879, + "step": 242 + }, + { + "epoch": 0.06602624813962928, + "grad_norm": 0.7071017026901245, + "learning_rate": 2.1911632100991887e-05, + "loss": 1.6898, + "step": 244 + }, + { + "epoch": 0.06656744689487214, + "grad_norm": 0.8030880093574524, + "learning_rate": 2.2091974752028858e-05, + "loss": 1.6734, + "step": 246 + }, + { + "epoch": 0.067108645650115, + "grad_norm": 0.7429569363594055, + "learning_rate": 2.2272317403065825e-05, + "loss": 1.6722, + "step": 248 + }, + { + "epoch": 0.06764984440535787, + "grad_norm": 0.6807804107666016, + "learning_rate": 2.2452660054102796e-05, + "loss": 1.6697, + "step": 250 + }, + { + "epoch": 0.06819104316060073, + "grad_norm": 0.6632562875747681, + "learning_rate": 2.2633002705139767e-05, + "loss": 1.6453, + "step": 252 + }, + { + "epoch": 0.0687322419158436, + "grad_norm": 0.6661680340766907, + "learning_rate": 2.2813345356176738e-05, + "loss": 1.6701, + "step": 254 + }, + { + "epoch": 0.06927344067108646, + "grad_norm": 0.6747105121612549, + "learning_rate": 2.2993688007213706e-05, + "loss": 1.6729, + "step": 256 + }, + { + "epoch": 0.06981463942632932, + "grad_norm": 0.7698473334312439, + "learning_rate": 2.317403065825068e-05, + "loss": 1.6528, + "step": 258 + }, + { + "epoch": 0.07035583818157218, + "grad_norm": 0.6111325621604919, + "learning_rate": 2.3354373309287647e-05, + "loss": 1.6412, + "step": 260 + }, + { + "epoch": 0.07089703693681504, + "grad_norm": 0.7405019998550415, + "learning_rate": 2.3534715960324618e-05, + "loss": 1.6564, + "step": 262 + }, + { + "epoch": 0.07143823569205791, + "grad_norm": 0.6702501773834229, + "learning_rate": 2.371505861136159e-05, + "loss": 1.654, + "step": 264 + }, + { + "epoch": 0.07197943444730077, + "grad_norm": 0.7076373100280762, + "learning_rate": 2.389540126239856e-05, + "loss": 1.6301, + "step": 266 + }, + { + "epoch": 0.07252063320254364, + "grad_norm": 0.7239627242088318, + "learning_rate": 2.4075743913435528e-05, + "loss": 1.6575, + "step": 268 + }, + { + "epoch": 0.0730618319577865, + "grad_norm": 0.753480076789856, + "learning_rate": 2.42560865644725e-05, + "loss": 1.6603, + "step": 270 + }, + { + "epoch": 0.07360303071302936, + "grad_norm": 0.7261641025543213, + "learning_rate": 2.443642921550947e-05, + "loss": 1.6449, + "step": 272 + }, + { + "epoch": 0.07414422946827222, + "grad_norm": 0.6315119862556458, + "learning_rate": 2.461677186654644e-05, + "loss": 1.6538, + "step": 274 + }, + { + "epoch": 0.07468542822351508, + "grad_norm": 0.5698412656784058, + "learning_rate": 2.4797114517583408e-05, + "loss": 1.6663, + "step": 276 + }, + { + "epoch": 0.07522662697875795, + "grad_norm": 0.5968983173370361, + "learning_rate": 2.497745716862038e-05, + "loss": 1.643, + "step": 278 + }, + { + "epoch": 0.07576782573400082, + "grad_norm": 0.561126172542572, + "learning_rate": 2.5157799819657353e-05, + "loss": 1.6301, + "step": 280 + }, + { + "epoch": 0.07630902448924368, + "grad_norm": 0.7290865778923035, + "learning_rate": 2.533814247069432e-05, + "loss": 1.6412, + "step": 282 + }, + { + "epoch": 0.07685022324448654, + "grad_norm": 0.7629122138023376, + "learning_rate": 2.5518485121731288e-05, + "loss": 1.6335, + "step": 284 + }, + { + "epoch": 0.0773914219997294, + "grad_norm": 0.5383496284484863, + "learning_rate": 2.5698827772768262e-05, + "loss": 1.6226, + "step": 286 + }, + { + "epoch": 0.07793262075497226, + "grad_norm": 0.7778373956680298, + "learning_rate": 2.5879170423805233e-05, + "loss": 1.6333, + "step": 288 + }, + { + "epoch": 0.07847381951021512, + "grad_norm": 0.6851366758346558, + "learning_rate": 2.60595130748422e-05, + "loss": 1.6251, + "step": 290 + }, + { + "epoch": 0.079015018265458, + "grad_norm": 0.5947225689888, + "learning_rate": 2.623985572587917e-05, + "loss": 1.6298, + "step": 292 + }, + { + "epoch": 0.07955621702070086, + "grad_norm": 0.9742544889450073, + "learning_rate": 2.6420198376916146e-05, + "loss": 1.6252, + "step": 294 + }, + { + "epoch": 0.08009741577594372, + "grad_norm": 1.2064323425292969, + "learning_rate": 2.6600541027953113e-05, + "loss": 1.6152, + "step": 296 + }, + { + "epoch": 0.08063861453118658, + "grad_norm": 1.0506716966629028, + "learning_rate": 2.678088367899008e-05, + "loss": 1.6351, + "step": 298 + }, + { + "epoch": 0.08117981328642944, + "grad_norm": 1.2992738485336304, + "learning_rate": 2.696122633002705e-05, + "loss": 1.6193, + "step": 300 + }, + { + "epoch": 0.0817210120416723, + "grad_norm": 1.0616599321365356, + "learning_rate": 2.7141568981064026e-05, + "loss": 1.6135, + "step": 302 + }, + { + "epoch": 0.08226221079691516, + "grad_norm": 1.037997841835022, + "learning_rate": 2.7321911632100993e-05, + "loss": 1.6344, + "step": 304 + }, + { + "epoch": 0.08280340955215804, + "grad_norm": 0.8937569856643677, + "learning_rate": 2.7502254283137964e-05, + "loss": 1.6077, + "step": 306 + }, + { + "epoch": 0.0833446083074009, + "grad_norm": 1.1334234476089478, + "learning_rate": 2.7682596934174932e-05, + "loss": 1.6193, + "step": 308 + }, + { + "epoch": 0.08388580706264376, + "grad_norm": 0.8336219191551208, + "learning_rate": 2.7862939585211906e-05, + "loss": 1.5948, + "step": 310 + }, + { + "epoch": 0.08442700581788662, + "grad_norm": 1.1825398206710815, + "learning_rate": 2.8043282236248874e-05, + "loss": 1.6239, + "step": 312 + }, + { + "epoch": 0.08496820457312948, + "grad_norm": 0.7945433259010315, + "learning_rate": 2.8223624887285844e-05, + "loss": 1.6119, + "step": 314 + }, + { + "epoch": 0.08550940332837234, + "grad_norm": 0.6971009969711304, + "learning_rate": 2.8403967538322812e-05, + "loss": 1.5822, + "step": 316 + }, + { + "epoch": 0.0860506020836152, + "grad_norm": 0.6050766706466675, + "learning_rate": 2.8584310189359786e-05, + "loss": 1.6161, + "step": 318 + }, + { + "epoch": 0.08659180083885808, + "grad_norm": 0.6123189330101013, + "learning_rate": 2.8764652840396754e-05, + "loss": 1.5941, + "step": 320 + }, + { + "epoch": 0.08713299959410094, + "grad_norm": 0.5471253395080566, + "learning_rate": 2.8944995491433725e-05, + "loss": 1.603, + "step": 322 + }, + { + "epoch": 0.0876741983493438, + "grad_norm": 0.5793882608413696, + "learning_rate": 2.91253381424707e-05, + "loss": 1.6076, + "step": 324 + }, + { + "epoch": 0.08821539710458666, + "grad_norm": 0.5409413576126099, + "learning_rate": 2.9305680793507666e-05, + "loss": 1.5825, + "step": 326 + }, + { + "epoch": 0.08875659585982952, + "grad_norm": 6.757148265838623, + "learning_rate": 2.9486023444544637e-05, + "loss": 1.5942, + "step": 328 + }, + { + "epoch": 0.08929779461507238, + "grad_norm": 1.3357856273651123, + "learning_rate": 2.9666366095581605e-05, + "loss": 1.642, + "step": 330 + }, + { + "epoch": 0.08983899337031524, + "grad_norm": 0.8245829939842224, + "learning_rate": 2.984670874661858e-05, + "loss": 1.6062, + "step": 332 + }, + { + "epoch": 0.09038019212555812, + "grad_norm": 0.8888993263244629, + "learning_rate": 3.0027051397655547e-05, + "loss": 1.5952, + "step": 334 + }, + { + "epoch": 0.09092139088080098, + "grad_norm": 0.8923915028572083, + "learning_rate": 3.0207394048692517e-05, + "loss": 1.5977, + "step": 336 + }, + { + "epoch": 0.09146258963604384, + "grad_norm": 0.7443459033966064, + "learning_rate": 3.0387736699729485e-05, + "loss": 1.5738, + "step": 338 + }, + { + "epoch": 0.0920037883912867, + "grad_norm": 0.7297430038452148, + "learning_rate": 3.056807935076646e-05, + "loss": 1.5907, + "step": 340 + }, + { + "epoch": 0.09254498714652956, + "grad_norm": 0.6882812976837158, + "learning_rate": 3.074842200180343e-05, + "loss": 1.5767, + "step": 342 + }, + { + "epoch": 0.09308618590177242, + "grad_norm": 0.6150392889976501, + "learning_rate": 3.0928764652840394e-05, + "loss": 1.5747, + "step": 344 + }, + { + "epoch": 0.09362738465701528, + "grad_norm": 0.6230599284172058, + "learning_rate": 3.110910730387737e-05, + "loss": 1.583, + "step": 346 + }, + { + "epoch": 0.09416858341225816, + "grad_norm": 0.6081874966621399, + "learning_rate": 3.128944995491434e-05, + "loss": 1.5875, + "step": 348 + }, + { + "epoch": 0.09470978216750102, + "grad_norm": 0.5467821955680847, + "learning_rate": 3.146979260595131e-05, + "loss": 1.575, + "step": 350 + }, + { + "epoch": 0.09525098092274388, + "grad_norm": 0.5629361271858215, + "learning_rate": 3.165013525698828e-05, + "loss": 1.5828, + "step": 352 + }, + { + "epoch": 0.09579217967798674, + "grad_norm": 0.5995283126831055, + "learning_rate": 3.1830477908025245e-05, + "loss": 1.5872, + "step": 354 + }, + { + "epoch": 0.0963333784332296, + "grad_norm": 0.556450366973877, + "learning_rate": 3.201082055906222e-05, + "loss": 1.553, + "step": 356 + }, + { + "epoch": 0.09687457718847246, + "grad_norm": 0.6498537063598633, + "learning_rate": 3.219116321009919e-05, + "loss": 1.5667, + "step": 358 + }, + { + "epoch": 0.09741577594371532, + "grad_norm": 0.5891172885894775, + "learning_rate": 3.237150586113616e-05, + "loss": 1.5818, + "step": 360 + }, + { + "epoch": 0.0979569746989582, + "grad_norm": 0.6487797498703003, + "learning_rate": 3.2551848512173136e-05, + "loss": 1.5582, + "step": 362 + }, + { + "epoch": 0.09849817345420106, + "grad_norm": 0.5860658884048462, + "learning_rate": 3.27321911632101e-05, + "loss": 1.5725, + "step": 364 + }, + { + "epoch": 0.09903937220944392, + "grad_norm": 0.5619581937789917, + "learning_rate": 3.291253381424707e-05, + "loss": 1.5779, + "step": 366 + }, + { + "epoch": 0.09958057096468678, + "grad_norm": 0.7147429585456848, + "learning_rate": 3.309287646528404e-05, + "loss": 1.5766, + "step": 368 + }, + { + "epoch": 0.10012176971992964, + "grad_norm": 0.5840562582015991, + "learning_rate": 3.327321911632101e-05, + "loss": 1.5609, + "step": 370 + }, + { + "epoch": 0.1006629684751725, + "grad_norm": 0.6277860403060913, + "learning_rate": 3.345356176735798e-05, + "loss": 1.5645, + "step": 372 + }, + { + "epoch": 0.10120416723041536, + "grad_norm": 0.6395567655563354, + "learning_rate": 3.3633904418394954e-05, + "loss": 1.545, + "step": 374 + }, + { + "epoch": 0.10174536598565824, + "grad_norm": 0.6651553511619568, + "learning_rate": 3.381424706943192e-05, + "loss": 1.5643, + "step": 376 + }, + { + "epoch": 0.1022865647409011, + "grad_norm": 0.6691033244132996, + "learning_rate": 3.3994589720468896e-05, + "loss": 1.5705, + "step": 378 + }, + { + "epoch": 0.10282776349614396, + "grad_norm": 0.5426511764526367, + "learning_rate": 3.4174932371505863e-05, + "loss": 1.536, + "step": 380 + }, + { + "epoch": 0.10336896225138682, + "grad_norm": 0.6677694916725159, + "learning_rate": 3.435527502254283e-05, + "loss": 1.5664, + "step": 382 + }, + { + "epoch": 0.10391016100662968, + "grad_norm": 0.5283762216567993, + "learning_rate": 3.45356176735798e-05, + "loss": 1.5474, + "step": 384 + }, + { + "epoch": 0.10445135976187254, + "grad_norm": 0.652812659740448, + "learning_rate": 3.471596032461677e-05, + "loss": 1.5509, + "step": 386 + }, + { + "epoch": 0.1049925585171154, + "grad_norm": 0.8639987111091614, + "learning_rate": 3.489630297565375e-05, + "loss": 1.5563, + "step": 388 + }, + { + "epoch": 0.10553375727235827, + "grad_norm": 0.7726946473121643, + "learning_rate": 3.5076645626690715e-05, + "loss": 1.5682, + "step": 390 + }, + { + "epoch": 0.10607495602760114, + "grad_norm": 0.6511155962944031, + "learning_rate": 3.525698827772768e-05, + "loss": 1.5571, + "step": 392 + }, + { + "epoch": 0.106616154782844, + "grad_norm": 0.6578395962715149, + "learning_rate": 3.5437330928764656e-05, + "loss": 1.5452, + "step": 394 + }, + { + "epoch": 0.10715735353808686, + "grad_norm": 0.642919659614563, + "learning_rate": 3.5617673579801624e-05, + "loss": 1.5508, + "step": 396 + }, + { + "epoch": 0.10769855229332972, + "grad_norm": 0.5190348029136658, + "learning_rate": 3.579801623083859e-05, + "loss": 1.5432, + "step": 398 + }, + { + "epoch": 0.10823975104857259, + "grad_norm": 0.48932549357414246, + "learning_rate": 3.5978358881875566e-05, + "loss": 1.5544, + "step": 400 + }, + { + "epoch": 0.10878094980381545, + "grad_norm": 0.5018340945243835, + "learning_rate": 3.615870153291254e-05, + "loss": 1.5322, + "step": 402 + }, + { + "epoch": 0.10932214855905831, + "grad_norm": 0.5701499581336975, + "learning_rate": 3.633904418394951e-05, + "loss": 1.5288, + "step": 404 + }, + { + "epoch": 0.10986334731430118, + "grad_norm": 0.6049205660820007, + "learning_rate": 3.6519386834986475e-05, + "loss": 1.5627, + "step": 406 + }, + { + "epoch": 0.11040454606954404, + "grad_norm": 0.5781517028808594, + "learning_rate": 3.669972948602345e-05, + "loss": 1.542, + "step": 408 + }, + { + "epoch": 0.1109457448247869, + "grad_norm": 0.5594660043716431, + "learning_rate": 3.688007213706042e-05, + "loss": 1.5461, + "step": 410 + }, + { + "epoch": 0.11148694358002977, + "grad_norm": 0.5319619178771973, + "learning_rate": 3.7060414788097384e-05, + "loss": 1.5668, + "step": 412 + }, + { + "epoch": 0.11202814233527263, + "grad_norm": 0.5311123728752136, + "learning_rate": 3.724075743913435e-05, + "loss": 1.528, + "step": 414 + }, + { + "epoch": 0.11256934109051549, + "grad_norm": 0.5555101633071899, + "learning_rate": 3.7421100090171326e-05, + "loss": 1.5392, + "step": 416 + }, + { + "epoch": 0.11311053984575835, + "grad_norm": 0.5486223101615906, + "learning_rate": 3.76014427412083e-05, + "loss": 1.5337, + "step": 418 + }, + { + "epoch": 0.11365173860100122, + "grad_norm": 0.5156669020652771, + "learning_rate": 3.778178539224527e-05, + "loss": 1.5105, + "step": 420 + }, + { + "epoch": 0.11419293735624408, + "grad_norm": 0.49596554040908813, + "learning_rate": 3.7962128043282235e-05, + "loss": 1.515, + "step": 422 + }, + { + "epoch": 0.11473413611148695, + "grad_norm": 0.641333281993866, + "learning_rate": 3.814247069431921e-05, + "loss": 1.5328, + "step": 424 + }, + { + "epoch": 0.1152753348667298, + "grad_norm": 0.6106113195419312, + "learning_rate": 3.832281334535618e-05, + "loss": 1.5189, + "step": 426 + }, + { + "epoch": 0.11581653362197267, + "grad_norm": 0.5619134306907654, + "learning_rate": 3.8503155996393145e-05, + "loss": 1.5295, + "step": 428 + }, + { + "epoch": 0.11635773237721553, + "grad_norm": 0.5396978259086609, + "learning_rate": 3.868349864743012e-05, + "loss": 1.5173, + "step": 430 + }, + { + "epoch": 0.11689893113245839, + "grad_norm": 0.5466894507408142, + "learning_rate": 3.886384129846709e-05, + "loss": 1.5191, + "step": 432 + }, + { + "epoch": 0.11744012988770126, + "grad_norm": 0.5601218342781067, + "learning_rate": 3.904418394950406e-05, + "loss": 1.5285, + "step": 434 + }, + { + "epoch": 0.11798132864294412, + "grad_norm": 0.6620492935180664, + "learning_rate": 3.922452660054103e-05, + "loss": 1.4946, + "step": 436 + }, + { + "epoch": 0.11852252739818699, + "grad_norm": 0.49140048027038574, + "learning_rate": 3.9404869251578e-05, + "loss": 1.512, + "step": 438 + }, + { + "epoch": 0.11906372615342985, + "grad_norm": 0.5824118256568909, + "learning_rate": 3.958521190261497e-05, + "loss": 1.5244, + "step": 440 + }, + { + "epoch": 0.11960492490867271, + "grad_norm": 0.4967150092124939, + "learning_rate": 3.976555455365194e-05, + "loss": 1.5273, + "step": 442 + }, + { + "epoch": 0.12014612366391557, + "grad_norm": 0.5089767575263977, + "learning_rate": 3.994589720468891e-05, + "loss": 1.5119, + "step": 444 + }, + { + "epoch": 0.12068732241915843, + "grad_norm": 0.5404312014579773, + "learning_rate": 4.0126239855725886e-05, + "loss": 1.5072, + "step": 446 + }, + { + "epoch": 0.1212285211744013, + "grad_norm": 0.5239550471305847, + "learning_rate": 4.0306582506762853e-05, + "loss": 1.5336, + "step": 448 + }, + { + "epoch": 0.12176971992964417, + "grad_norm": 0.4974781274795532, + "learning_rate": 4.048692515779982e-05, + "loss": 1.5225, + "step": 450 + }, + { + "epoch": 0.12231091868488703, + "grad_norm": 0.5363791584968567, + "learning_rate": 4.066726780883679e-05, + "loss": 1.5176, + "step": 452 + }, + { + "epoch": 0.12285211744012989, + "grad_norm": 0.5095157027244568, + "learning_rate": 4.084761045987376e-05, + "loss": 1.4936, + "step": 454 + }, + { + "epoch": 0.12339331619537275, + "grad_norm": 0.4920356869697571, + "learning_rate": 4.102795311091073e-05, + "loss": 1.5269, + "step": 456 + }, + { + "epoch": 0.12393451495061561, + "grad_norm": 0.4940793514251709, + "learning_rate": 4.1208295761947705e-05, + "loss": 1.5072, + "step": 458 + }, + { + "epoch": 0.12447571370585847, + "grad_norm": 0.4805227220058441, + "learning_rate": 4.138863841298467e-05, + "loss": 1.4987, + "step": 460 + }, + { + "epoch": 0.12501691246110133, + "grad_norm": 0.49683934450149536, + "learning_rate": 4.1568981064021646e-05, + "loss": 1.5008, + "step": 462 + }, + { + "epoch": 0.1255581112163442, + "grad_norm": 0.5283801555633545, + "learning_rate": 4.1749323715058614e-05, + "loss": 1.5177, + "step": 464 + }, + { + "epoch": 0.12609930997158705, + "grad_norm": 0.5395119190216064, + "learning_rate": 4.192966636609558e-05, + "loss": 1.5106, + "step": 466 + }, + { + "epoch": 0.12664050872682991, + "grad_norm": 0.5403693914413452, + "learning_rate": 4.211000901713255e-05, + "loss": 1.4854, + "step": 468 + }, + { + "epoch": 0.1271817074820728, + "grad_norm": 0.4690951406955719, + "learning_rate": 4.229035166816952e-05, + "loss": 1.5079, + "step": 470 + }, + { + "epoch": 0.12772290623731566, + "grad_norm": 0.5077293515205383, + "learning_rate": 4.24706943192065e-05, + "loss": 1.4953, + "step": 472 + }, + { + "epoch": 0.12826410499255853, + "grad_norm": 0.440019816160202, + "learning_rate": 4.2651036970243465e-05, + "loss": 1.4864, + "step": 474 + }, + { + "epoch": 0.1288053037478014, + "grad_norm": 0.48672759532928467, + "learning_rate": 4.283137962128044e-05, + "loss": 1.5205, + "step": 476 + }, + { + "epoch": 0.12934650250304425, + "grad_norm": 0.4732811450958252, + "learning_rate": 4.301172227231741e-05, + "loss": 1.4998, + "step": 478 + }, + { + "epoch": 0.1298877012582871, + "grad_norm": 0.46713048219680786, + "learning_rate": 4.3192064923354374e-05, + "loss": 1.4893, + "step": 480 + }, + { + "epoch": 0.13042890001352997, + "grad_norm": 0.502356231212616, + "learning_rate": 4.337240757439134e-05, + "loss": 1.5125, + "step": 482 + }, + { + "epoch": 0.13097009876877283, + "grad_norm": 0.45067864656448364, + "learning_rate": 4.3552750225428316e-05, + "loss": 1.4978, + "step": 484 + }, + { + "epoch": 0.1315112975240157, + "grad_norm": 0.46964120864868164, + "learning_rate": 4.373309287646529e-05, + "loss": 1.5006, + "step": 486 + }, + { + "epoch": 0.13205249627925855, + "grad_norm": 0.47723180055618286, + "learning_rate": 4.391343552750226e-05, + "loss": 1.513, + "step": 488 + }, + { + "epoch": 0.1325936950345014, + "grad_norm": 0.5100542306900024, + "learning_rate": 4.4093778178539225e-05, + "loss": 1.5279, + "step": 490 + }, + { + "epoch": 0.13313489378974427, + "grad_norm": 0.5344257354736328, + "learning_rate": 4.42741208295762e-05, + "loss": 1.5193, + "step": 492 + }, + { + "epoch": 0.13367609254498714, + "grad_norm": 0.5867893695831299, + "learning_rate": 4.445446348061317e-05, + "loss": 1.512, + "step": 494 + }, + { + "epoch": 0.13421729130023, + "grad_norm": 0.7811394929885864, + "learning_rate": 4.4634806131650134e-05, + "loss": 1.5038, + "step": 496 + }, + { + "epoch": 0.13475849005547288, + "grad_norm": 0.8505339622497559, + "learning_rate": 4.48151487826871e-05, + "loss": 1.5169, + "step": 498 + }, + { + "epoch": 0.13529968881071575, + "grad_norm": 0.6337641477584839, + "learning_rate": 4.4995491433724076e-05, + "loss": 1.4951, + "step": 500 + }, + { + "epoch": 0.1358408875659586, + "grad_norm": 0.7979961633682251, + "learning_rate": 4.517583408476105e-05, + "loss": 1.5031, + "step": 502 + }, + { + "epoch": 0.13638208632120147, + "grad_norm": 0.6946894526481628, + "learning_rate": 4.535617673579802e-05, + "loss": 1.501, + "step": 504 + }, + { + "epoch": 0.13692328507644433, + "grad_norm": 0.6830259561538696, + "learning_rate": 4.5536519386834986e-05, + "loss": 1.4896, + "step": 506 + }, + { + "epoch": 0.1374644838316872, + "grad_norm": 0.5908662676811218, + "learning_rate": 4.571686203787196e-05, + "loss": 1.4992, + "step": 508 + }, + { + "epoch": 0.13800568258693005, + "grad_norm": 0.7655865550041199, + "learning_rate": 4.589720468890893e-05, + "loss": 1.4911, + "step": 510 + }, + { + "epoch": 0.1385468813421729, + "grad_norm": 0.5924785733222961, + "learning_rate": 4.6077547339945895e-05, + "loss": 1.4719, + "step": 512 + }, + { + "epoch": 0.13908808009741577, + "grad_norm": 0.6654263138771057, + "learning_rate": 4.625788999098287e-05, + "loss": 1.5109, + "step": 514 + }, + { + "epoch": 0.13962927885265863, + "grad_norm": 0.5296297073364258, + "learning_rate": 4.6438232642019843e-05, + "loss": 1.4934, + "step": 516 + }, + { + "epoch": 0.1401704776079015, + "grad_norm": 0.5698690414428711, + "learning_rate": 4.661857529305681e-05, + "loss": 1.4954, + "step": 518 + }, + { + "epoch": 0.14071167636314436, + "grad_norm": 0.5790325403213501, + "learning_rate": 4.679891794409378e-05, + "loss": 1.4673, + "step": 520 + }, + { + "epoch": 0.14125287511838722, + "grad_norm": 0.551480770111084, + "learning_rate": 4.697926059513075e-05, + "loss": 1.476, + "step": 522 + }, + { + "epoch": 0.14179407387363008, + "grad_norm": 0.5201780796051025, + "learning_rate": 4.715960324616772e-05, + "loss": 1.4701, + "step": 524 + }, + { + "epoch": 0.14233527262887297, + "grad_norm": 0.46442562341690063, + "learning_rate": 4.733994589720469e-05, + "loss": 1.4831, + "step": 526 + }, + { + "epoch": 0.14287647138411583, + "grad_norm": 0.5558522939682007, + "learning_rate": 4.752028854824166e-05, + "loss": 1.4729, + "step": 528 + }, + { + "epoch": 0.1434176701393587, + "grad_norm": 0.48511791229248047, + "learning_rate": 4.7700631199278636e-05, + "loss": 1.4742, + "step": 530 + }, + { + "epoch": 0.14395886889460155, + "grad_norm": 0.5244829058647156, + "learning_rate": 4.7880973850315604e-05, + "loss": 1.4928, + "step": 532 + }, + { + "epoch": 0.1445000676498444, + "grad_norm": 0.48878946900367737, + "learning_rate": 4.806131650135257e-05, + "loss": 1.4921, + "step": 534 + }, + { + "epoch": 0.14504126640508727, + "grad_norm": 0.5348760485649109, + "learning_rate": 4.824165915238954e-05, + "loss": 1.4917, + "step": 536 + }, + { + "epoch": 0.14558246516033013, + "grad_norm": 0.5444923639297485, + "learning_rate": 4.842200180342651e-05, + "loss": 1.4546, + "step": 538 + }, + { + "epoch": 0.146123663915573, + "grad_norm": 0.494761198759079, + "learning_rate": 4.860234445446348e-05, + "loss": 1.4751, + "step": 540 + }, + { + "epoch": 0.14666486267081585, + "grad_norm": 0.4921441674232483, + "learning_rate": 4.8782687105500455e-05, + "loss": 1.4767, + "step": 542 + }, + { + "epoch": 0.14720606142605872, + "grad_norm": 0.48382577300071716, + "learning_rate": 4.896302975653742e-05, + "loss": 1.485, + "step": 544 + }, + { + "epoch": 0.14774726018130158, + "grad_norm": 0.4616708755493164, + "learning_rate": 4.9143372407574397e-05, + "loss": 1.4732, + "step": 546 + }, + { + "epoch": 0.14828845893654444, + "grad_norm": 0.5030043125152588, + "learning_rate": 4.9323715058611364e-05, + "loss": 1.4799, + "step": 548 + }, + { + "epoch": 0.1488296576917873, + "grad_norm": 0.467230886220932, + "learning_rate": 4.950405770964833e-05, + "loss": 1.4594, + "step": 550 + }, + { + "epoch": 0.14937085644703016, + "grad_norm": 0.42864304780960083, + "learning_rate": 4.9684400360685306e-05, + "loss": 1.4748, + "step": 552 + }, + { + "epoch": 0.14991205520227305, + "grad_norm": 0.43733683228492737, + "learning_rate": 4.986474301172227e-05, + "loss": 1.462, + "step": 554 + }, + { + "epoch": 0.1504532539575159, + "grad_norm": 0.45550286769866943, + "learning_rate": 5.004508566275925e-05, + "loss": 1.475, + "step": 556 + }, + { + "epoch": 0.15099445271275877, + "grad_norm": 0.44999995827674866, + "learning_rate": 5.022542831379622e-05, + "loss": 1.4794, + "step": 558 + }, + { + "epoch": 0.15153565146800163, + "grad_norm": 0.5035279989242554, + "learning_rate": 5.040577096483319e-05, + "loss": 1.471, + "step": 560 + }, + { + "epoch": 0.1520768502232445, + "grad_norm": 0.44605591893196106, + "learning_rate": 5.058611361587016e-05, + "loss": 1.4461, + "step": 562 + }, + { + "epoch": 0.15261804897848735, + "grad_norm": 0.5482723712921143, + "learning_rate": 5.0766456266907124e-05, + "loss": 1.4597, + "step": 564 + }, + { + "epoch": 0.1531592477337302, + "grad_norm": 0.5323627591133118, + "learning_rate": 5.094679891794409e-05, + "loss": 1.4743, + "step": 566 + }, + { + "epoch": 0.15370044648897307, + "grad_norm": 0.5289944410324097, + "learning_rate": 5.1127141568981066e-05, + "loss": 1.5, + "step": 568 + }, + { + "epoch": 0.15424164524421594, + "grad_norm": 0.5446243286132812, + "learning_rate": 5.1307484220018034e-05, + "loss": 1.4751, + "step": 570 + }, + { + "epoch": 0.1547828439994588, + "grad_norm": 0.525830090045929, + "learning_rate": 5.1487826871055015e-05, + "loss": 1.4639, + "step": 572 + }, + { + "epoch": 0.15532404275470166, + "grad_norm": 0.48129191994667053, + "learning_rate": 5.166816952209198e-05, + "loss": 1.4652, + "step": 574 + }, + { + "epoch": 0.15586524150994452, + "grad_norm": 0.47915297746658325, + "learning_rate": 5.184851217312895e-05, + "loss": 1.4627, + "step": 576 + }, + { + "epoch": 0.15640644026518738, + "grad_norm": 0.5229325294494629, + "learning_rate": 5.202885482416592e-05, + "loss": 1.4525, + "step": 578 + }, + { + "epoch": 0.15694763902043024, + "grad_norm": 0.5452600121498108, + "learning_rate": 5.2209197475202885e-05, + "loss": 1.458, + "step": 580 + }, + { + "epoch": 0.15748883777567313, + "grad_norm": 0.427432656288147, + "learning_rate": 5.238954012623985e-05, + "loss": 1.4773, + "step": 582 + }, + { + "epoch": 0.158030036530916, + "grad_norm": 0.450712114572525, + "learning_rate": 5.2569882777276827e-05, + "loss": 1.469, + "step": 584 + }, + { + "epoch": 0.15857123528615885, + "grad_norm": 0.5500516891479492, + "learning_rate": 5.27502254283138e-05, + "loss": 1.4603, + "step": 586 + }, + { + "epoch": 0.1591124340414017, + "grad_norm": 0.457157164812088, + "learning_rate": 5.2930568079350775e-05, + "loss": 1.4785, + "step": 588 + }, + { + "epoch": 0.15965363279664457, + "grad_norm": 0.49750396609306335, + "learning_rate": 5.311091073038774e-05, + "loss": 1.4603, + "step": 590 + }, + { + "epoch": 0.16019483155188743, + "grad_norm": 0.5720525979995728, + "learning_rate": 5.329125338142471e-05, + "loss": 1.4753, + "step": 592 + }, + { + "epoch": 0.1607360303071303, + "grad_norm": 0.4425548315048218, + "learning_rate": 5.347159603246168e-05, + "loss": 1.462, + "step": 594 + }, + { + "epoch": 0.16127722906237316, + "grad_norm": 0.5064132809638977, + "learning_rate": 5.3651938683498645e-05, + "loss": 1.4596, + "step": 596 + }, + { + "epoch": 0.16181842781761602, + "grad_norm": 0.518460750579834, + "learning_rate": 5.383228133453562e-05, + "loss": 1.4763, + "step": 598 + }, + { + "epoch": 0.16235962657285888, + "grad_norm": 0.4613576829433441, + "learning_rate": 5.401262398557259e-05, + "loss": 1.4487, + "step": 600 + }, + { + "epoch": 0.16290082532810174, + "grad_norm": 0.7046213746070862, + "learning_rate": 5.419296663660957e-05, + "loss": 1.472, + "step": 602 + }, + { + "epoch": 0.1634420240833446, + "grad_norm": 0.6164196133613586, + "learning_rate": 5.4373309287646535e-05, + "loss": 1.4424, + "step": 604 + }, + { + "epoch": 0.16398322283858746, + "grad_norm": 0.5106020569801331, + "learning_rate": 5.45536519386835e-05, + "loss": 1.4567, + "step": 606 + }, + { + "epoch": 0.16452442159383032, + "grad_norm": 0.4291236400604248, + "learning_rate": 5.473399458972047e-05, + "loss": 1.4514, + "step": 608 + }, + { + "epoch": 0.16506562034907318, + "grad_norm": 0.46577414870262146, + "learning_rate": 5.491433724075744e-05, + "loss": 1.4408, + "step": 610 + }, + { + "epoch": 0.16560681910431607, + "grad_norm": 0.4729917049407959, + "learning_rate": 5.509467989179441e-05, + "loss": 1.4493, + "step": 612 + }, + { + "epoch": 0.16614801785955893, + "grad_norm": 0.4651925563812256, + "learning_rate": 5.527502254283138e-05, + "loss": 1.465, + "step": 614 + }, + { + "epoch": 0.1666892166148018, + "grad_norm": 0.4756859540939331, + "learning_rate": 5.545536519386835e-05, + "loss": 1.4641, + "step": 616 + }, + { + "epoch": 0.16723041537004465, + "grad_norm": 0.42555975914001465, + "learning_rate": 5.563570784490533e-05, + "loss": 1.4569, + "step": 618 + }, + { + "epoch": 0.16777161412528752, + "grad_norm": 0.5162522196769714, + "learning_rate": 5.5816050495942296e-05, + "loss": 1.4344, + "step": 620 + }, + { + "epoch": 0.16831281288053038, + "grad_norm": 0.5867063999176025, + "learning_rate": 5.599639314697926e-05, + "loss": 1.4647, + "step": 622 + }, + { + "epoch": 0.16885401163577324, + "grad_norm": 0.6629165410995483, + "learning_rate": 5.617673579801623e-05, + "loss": 1.473, + "step": 624 + }, + { + "epoch": 0.1693952103910161, + "grad_norm": 0.5905330777168274, + "learning_rate": 5.6357078449053205e-05, + "loss": 1.4459, + "step": 626 + }, + { + "epoch": 0.16993640914625896, + "grad_norm": 0.7457858324050903, + "learning_rate": 5.653742110009017e-05, + "loss": 1.4603, + "step": 628 + }, + { + "epoch": 0.17047760790150182, + "grad_norm": 0.5977684855461121, + "learning_rate": 5.671776375112714e-05, + "loss": 1.4621, + "step": 630 + }, + { + "epoch": 0.17101880665674468, + "grad_norm": 0.7097992897033691, + "learning_rate": 5.689810640216412e-05, + "loss": 1.4646, + "step": 632 + }, + { + "epoch": 0.17156000541198754, + "grad_norm": 0.5895450711250305, + "learning_rate": 5.707844905320109e-05, + "loss": 1.4338, + "step": 634 + }, + { + "epoch": 0.1721012041672304, + "grad_norm": 0.576877772808075, + "learning_rate": 5.7258791704238056e-05, + "loss": 1.4666, + "step": 636 + }, + { + "epoch": 0.17264240292247326, + "grad_norm": 0.541110098361969, + "learning_rate": 5.7439134355275024e-05, + "loss": 1.4624, + "step": 638 + }, + { + "epoch": 0.17318360167771615, + "grad_norm": 0.5172320604324341, + "learning_rate": 5.7619477006312e-05, + "loss": 1.473, + "step": 640 + }, + { + "epoch": 0.17372480043295901, + "grad_norm": 0.47511357069015503, + "learning_rate": 5.7799819657348965e-05, + "loss": 1.446, + "step": 642 + }, + { + "epoch": 0.17426599918820188, + "grad_norm": 0.48614808917045593, + "learning_rate": 5.798016230838593e-05, + "loss": 1.4394, + "step": 644 + }, + { + "epoch": 0.17480719794344474, + "grad_norm": 0.4435577094554901, + "learning_rate": 5.81605049594229e-05, + "loss": 1.43, + "step": 646 + }, + { + "epoch": 0.1753483966986876, + "grad_norm": 0.4458653926849365, + "learning_rate": 5.834084761045988e-05, + "loss": 1.46, + "step": 648 + }, + { + "epoch": 0.17588959545393046, + "grad_norm": 0.40675726532936096, + "learning_rate": 5.852119026149685e-05, + "loss": 1.4565, + "step": 650 + }, + { + "epoch": 0.17643079420917332, + "grad_norm": 0.4132504165172577, + "learning_rate": 5.8701532912533817e-05, + "loss": 1.4522, + "step": 652 + }, + { + "epoch": 0.17697199296441618, + "grad_norm": 0.40881386399269104, + "learning_rate": 5.888187556357079e-05, + "loss": 1.4232, + "step": 654 + }, + { + "epoch": 0.17751319171965904, + "grad_norm": 0.40527868270874023, + "learning_rate": 5.906221821460776e-05, + "loss": 1.441, + "step": 656 + }, + { + "epoch": 0.1780543904749019, + "grad_norm": 0.40227004885673523, + "learning_rate": 5.9242560865644726e-05, + "loss": 1.4259, + "step": 658 + }, + { + "epoch": 0.17859558923014476, + "grad_norm": 0.4043656289577484, + "learning_rate": 5.942290351668169e-05, + "loss": 1.4298, + "step": 660 + }, + { + "epoch": 0.17913678798538762, + "grad_norm": 0.4288482666015625, + "learning_rate": 5.9603246167718674e-05, + "loss": 1.4439, + "step": 662 + }, + { + "epoch": 0.17967798674063049, + "grad_norm": 0.4385060966014862, + "learning_rate": 5.978358881875564e-05, + "loss": 1.4237, + "step": 664 + }, + { + "epoch": 0.18021918549587335, + "grad_norm": 0.396980345249176, + "learning_rate": 5.996393146979261e-05, + "loss": 1.4174, + "step": 666 + }, + { + "epoch": 0.18076038425111624, + "grad_norm": 0.4060603678226471, + "learning_rate": 6.014427412082958e-05, + "loss": 1.4479, + "step": 668 + }, + { + "epoch": 0.1813015830063591, + "grad_norm": 0.4485025703907013, + "learning_rate": 6.032461677186655e-05, + "loss": 1.4493, + "step": 670 + }, + { + "epoch": 0.18184278176160196, + "grad_norm": 0.44034305214881897, + "learning_rate": 6.050495942290352e-05, + "loss": 1.4461, + "step": 672 + }, + { + "epoch": 0.18238398051684482, + "grad_norm": 0.418074369430542, + "learning_rate": 6.0685302073940486e-05, + "loss": 1.4287, + "step": 674 + }, + { + "epoch": 0.18292517927208768, + "grad_norm": 0.41937318444252014, + "learning_rate": 6.0865644724977454e-05, + "loss": 1.4338, + "step": 676 + }, + { + "epoch": 0.18346637802733054, + "grad_norm": 0.4103530943393707, + "learning_rate": 6.104598737601444e-05, + "loss": 1.4391, + "step": 678 + }, + { + "epoch": 0.1840075767825734, + "grad_norm": 0.4066039025783539, + "learning_rate": 6.122633002705141e-05, + "loss": 1.4357, + "step": 680 + }, + { + "epoch": 0.18454877553781626, + "grad_norm": 0.36903437972068787, + "learning_rate": 6.140667267808838e-05, + "loss": 1.4111, + "step": 682 + }, + { + "epoch": 0.18508997429305912, + "grad_norm": 0.37125757336616516, + "learning_rate": 6.158701532912534e-05, + "loss": 1.4233, + "step": 684 + }, + { + "epoch": 0.18563117304830198, + "grad_norm": 0.44102513790130615, + "learning_rate": 6.176735798016231e-05, + "loss": 1.4437, + "step": 686 + }, + { + "epoch": 0.18617237180354484, + "grad_norm": 0.4337277114391327, + "learning_rate": 6.194770063119928e-05, + "loss": 1.4425, + "step": 688 + }, + { + "epoch": 0.1867135705587877, + "grad_norm": 0.37394315004348755, + "learning_rate": 6.212804328223625e-05, + "loss": 1.4452, + "step": 690 + }, + { + "epoch": 0.18725476931403057, + "grad_norm": 0.41764944791793823, + "learning_rate": 6.230838593327321e-05, + "loss": 1.4535, + "step": 692 + }, + { + "epoch": 0.18779596806927343, + "grad_norm": 0.4214741289615631, + "learning_rate": 6.24887285843102e-05, + "loss": 1.4391, + "step": 694 + }, + { + "epoch": 0.18833716682451632, + "grad_norm": 0.4159027338027954, + "learning_rate": 6.266907123534716e-05, + "loss": 1.4197, + "step": 696 + }, + { + "epoch": 0.18887836557975918, + "grad_norm": 0.38865673542022705, + "learning_rate": 6.284941388638413e-05, + "loss": 1.4329, + "step": 698 + }, + { + "epoch": 0.18941956433500204, + "grad_norm": 0.43646490573883057, + "learning_rate": 6.30297565374211e-05, + "loss": 1.4147, + "step": 700 + }, + { + "epoch": 0.1899607630902449, + "grad_norm": 0.41997334361076355, + "learning_rate": 6.321009918845807e-05, + "loss": 1.4275, + "step": 702 + }, + { + "epoch": 0.19050196184548776, + "grad_norm": 0.38556602597236633, + "learning_rate": 6.339044183949505e-05, + "loss": 1.4258, + "step": 704 + }, + { + "epoch": 0.19104316060073062, + "grad_norm": 0.42955082654953003, + "learning_rate": 6.357078449053201e-05, + "loss": 1.4201, + "step": 706 + }, + { + "epoch": 0.19158435935597348, + "grad_norm": 0.3844427764415741, + "learning_rate": 6.3751127141569e-05, + "loss": 1.4448, + "step": 708 + }, + { + "epoch": 0.19212555811121634, + "grad_norm": 0.4312956929206848, + "learning_rate": 6.393146979260596e-05, + "loss": 1.4051, + "step": 710 + }, + { + "epoch": 0.1926667568664592, + "grad_norm": 0.4556865394115448, + "learning_rate": 6.411181244364293e-05, + "loss": 1.4305, + "step": 712 + }, + { + "epoch": 0.19320795562170207, + "grad_norm": 0.37053731083869934, + "learning_rate": 6.42921550946799e-05, + "loss": 1.4301, + "step": 714 + }, + { + "epoch": 0.19374915437694493, + "grad_norm": 0.3996010720729828, + "learning_rate": 6.447249774571686e-05, + "loss": 1.4282, + "step": 716 + }, + { + "epoch": 0.1942903531321878, + "grad_norm": 0.37610816955566406, + "learning_rate": 6.465284039675383e-05, + "loss": 1.4277, + "step": 718 + }, + { + "epoch": 0.19483155188743065, + "grad_norm": 0.3677166998386383, + "learning_rate": 6.48331830477908e-05, + "loss": 1.4029, + "step": 720 + }, + { + "epoch": 0.1953727506426735, + "grad_norm": 0.3841564357280731, + "learning_rate": 6.501352569882777e-05, + "loss": 1.4144, + "step": 722 + }, + { + "epoch": 0.1959139493979164, + "grad_norm": 0.3687719404697418, + "learning_rate": 6.519386834986475e-05, + "loss": 1.4079, + "step": 724 + }, + { + "epoch": 0.19645514815315926, + "grad_norm": 0.38350847363471985, + "learning_rate": 6.537421100090172e-05, + "loss": 1.4269, + "step": 726 + }, + { + "epoch": 0.19699634690840212, + "grad_norm": 0.39060813188552856, + "learning_rate": 6.555455365193868e-05, + "loss": 1.4265, + "step": 728 + }, + { + "epoch": 0.19753754566364498, + "grad_norm": 0.36068469285964966, + "learning_rate": 6.573489630297565e-05, + "loss": 1.4325, + "step": 730 + }, + { + "epoch": 0.19807874441888784, + "grad_norm": 0.41185086965560913, + "learning_rate": 6.591523895401263e-05, + "loss": 1.4348, + "step": 732 + }, + { + "epoch": 0.1986199431741307, + "grad_norm": 0.4441224932670593, + "learning_rate": 6.60955816050496e-05, + "loss": 1.4103, + "step": 734 + }, + { + "epoch": 0.19916114192937356, + "grad_norm": 0.3727317452430725, + "learning_rate": 6.627592425608657e-05, + "loss": 1.4188, + "step": 736 + }, + { + "epoch": 0.19970234068461643, + "grad_norm": 0.394972562789917, + "learning_rate": 6.645626690712355e-05, + "loss": 1.4095, + "step": 738 + }, + { + "epoch": 0.20024353943985929, + "grad_norm": 0.40716880559921265, + "learning_rate": 6.663660955816052e-05, + "loss": 1.4127, + "step": 740 + }, + { + "epoch": 0.20078473819510215, + "grad_norm": 0.4156644344329834, + "learning_rate": 6.681695220919748e-05, + "loss": 1.4189, + "step": 742 + }, + { + "epoch": 0.201325936950345, + "grad_norm": 0.3787958323955536, + "learning_rate": 6.699729486023445e-05, + "loss": 1.4221, + "step": 744 + }, + { + "epoch": 0.20186713570558787, + "grad_norm": 0.42427608370780945, + "learning_rate": 6.717763751127142e-05, + "loss": 1.4192, + "step": 746 + }, + { + "epoch": 0.20240833446083073, + "grad_norm": 0.4778277277946472, + "learning_rate": 6.735798016230839e-05, + "loss": 1.4024, + "step": 748 + }, + { + "epoch": 0.2029495332160736, + "grad_norm": 0.44801151752471924, + "learning_rate": 6.753832281334535e-05, + "loss": 1.4222, + "step": 750 + }, + { + "epoch": 0.20349073197131648, + "grad_norm": 0.46737611293792725, + "learning_rate": 6.771866546438232e-05, + "loss": 1.4117, + "step": 752 + }, + { + "epoch": 0.20403193072655934, + "grad_norm": 0.4184872806072235, + "learning_rate": 6.78990081154193e-05, + "loss": 1.4066, + "step": 754 + }, + { + "epoch": 0.2045731294818022, + "grad_norm": 0.40458211302757263, + "learning_rate": 6.807935076645627e-05, + "loss": 1.4274, + "step": 756 + }, + { + "epoch": 0.20511432823704506, + "grad_norm": 0.43926185369491577, + "learning_rate": 6.825969341749324e-05, + "loss": 1.4231, + "step": 758 + }, + { + "epoch": 0.20565552699228792, + "grad_norm": 0.4434867203235626, + "learning_rate": 6.844003606853022e-05, + "loss": 1.4121, + "step": 760 + }, + { + "epoch": 0.20619672574753078, + "grad_norm": 0.4500143826007843, + "learning_rate": 6.862037871956719e-05, + "loss": 1.4179, + "step": 762 + }, + { + "epoch": 0.20673792450277365, + "grad_norm": 0.45456650853157043, + "learning_rate": 6.880072137060415e-05, + "loss": 1.3912, + "step": 764 + }, + { + "epoch": 0.2072791232580165, + "grad_norm": 0.4214187264442444, + "learning_rate": 6.898106402164112e-05, + "loss": 1.3962, + "step": 766 + }, + { + "epoch": 0.20782032201325937, + "grad_norm": 0.427682101726532, + "learning_rate": 6.916140667267809e-05, + "loss": 1.4316, + "step": 768 + }, + { + "epoch": 0.20836152076850223, + "grad_norm": 0.44491469860076904, + "learning_rate": 6.934174932371507e-05, + "loss": 1.4218, + "step": 770 + }, + { + "epoch": 0.2089027195237451, + "grad_norm": 0.42736080288887024, + "learning_rate": 6.952209197475204e-05, + "loss": 1.3931, + "step": 772 + }, + { + "epoch": 0.20944391827898795, + "grad_norm": 0.4041571021080017, + "learning_rate": 6.9702434625789e-05, + "loss": 1.4201, + "step": 774 + }, + { + "epoch": 0.2099851170342308, + "grad_norm": 0.4250961244106293, + "learning_rate": 6.988277727682597e-05, + "loss": 1.4299, + "step": 776 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.4335261881351471, + "learning_rate": 7.006311992786294e-05, + "loss": 1.4125, + "step": 778 + }, + { + "epoch": 0.21106751454471653, + "grad_norm": 0.42000851035118103, + "learning_rate": 7.02434625788999e-05, + "loss": 1.3969, + "step": 780 + }, + { + "epoch": 0.21160871329995942, + "grad_norm": 0.38111838698387146, + "learning_rate": 7.042380522993687e-05, + "loss": 1.3795, + "step": 782 + }, + { + "epoch": 0.21214991205520228, + "grad_norm": 0.38366812467575073, + "learning_rate": 7.060414788097385e-05, + "loss": 1.4041, + "step": 784 + }, + { + "epoch": 0.21269111081044514, + "grad_norm": 0.4334602355957031, + "learning_rate": 7.078449053201082e-05, + "loss": 1.415, + "step": 786 + }, + { + "epoch": 0.213232309565688, + "grad_norm": 0.40296411514282227, + "learning_rate": 7.096483318304779e-05, + "loss": 1.4052, + "step": 788 + }, + { + "epoch": 0.21377350832093087, + "grad_norm": 0.4197232723236084, + "learning_rate": 7.114517583408477e-05, + "loss": 1.4205, + "step": 790 + }, + { + "epoch": 0.21431470707617373, + "grad_norm": 0.40287715196609497, + "learning_rate": 7.132551848512174e-05, + "loss": 1.4047, + "step": 792 + }, + { + "epoch": 0.2148559058314166, + "grad_norm": 0.37324196100234985, + "learning_rate": 7.15058611361587e-05, + "loss": 1.4398, + "step": 794 + }, + { + "epoch": 0.21539710458665945, + "grad_norm": 0.4409985840320587, + "learning_rate": 7.168620378719567e-05, + "loss": 1.3873, + "step": 796 + }, + { + "epoch": 0.2159383033419023, + "grad_norm": 0.41441893577575684, + "learning_rate": 7.186654643823264e-05, + "loss": 1.4174, + "step": 798 + }, + { + "epoch": 0.21647950209714517, + "grad_norm": 0.4271719455718994, + "learning_rate": 7.204688908926962e-05, + "loss": 1.3987, + "step": 800 + }, + { + "epoch": 0.21702070085238803, + "grad_norm": 0.4969992935657501, + "learning_rate": 7.222723174030659e-05, + "loss": 1.4049, + "step": 802 + }, + { + "epoch": 0.2175618996076309, + "grad_norm": 0.45711180567741394, + "learning_rate": 7.240757439134356e-05, + "loss": 1.4061, + "step": 804 + }, + { + "epoch": 0.21810309836287375, + "grad_norm": 0.4479979872703552, + "learning_rate": 7.258791704238052e-05, + "loss": 1.4049, + "step": 806 + }, + { + "epoch": 0.21864429711811662, + "grad_norm": 0.4708006978034973, + "learning_rate": 7.276825969341749e-05, + "loss": 1.3971, + "step": 808 + }, + { + "epoch": 0.2191854958733595, + "grad_norm": 0.4387456774711609, + "learning_rate": 7.294860234445446e-05, + "loss": 1.4272, + "step": 810 + }, + { + "epoch": 0.21972669462860236, + "grad_norm": 0.5285756587982178, + "learning_rate": 7.312894499549143e-05, + "loss": 1.3902, + "step": 812 + }, + { + "epoch": 0.22026789338384523, + "grad_norm": 0.5111876726150513, + "learning_rate": 7.330928764652841e-05, + "loss": 1.4176, + "step": 814 + }, + { + "epoch": 0.2208090921390881, + "grad_norm": 0.4643821716308594, + "learning_rate": 7.348963029756538e-05, + "loss": 1.4216, + "step": 816 + }, + { + "epoch": 0.22135029089433095, + "grad_norm": 0.5162214040756226, + "learning_rate": 7.366997294860236e-05, + "loss": 1.4025, + "step": 818 + }, + { + "epoch": 0.2218914896495738, + "grad_norm": 0.4296860992908478, + "learning_rate": 7.385031559963932e-05, + "loss": 1.3919, + "step": 820 + }, + { + "epoch": 0.22243268840481667, + "grad_norm": 0.4449775815010071, + "learning_rate": 7.403065825067629e-05, + "loss": 1.4002, + "step": 822 + }, + { + "epoch": 0.22297388716005953, + "grad_norm": 0.39713212847709656, + "learning_rate": 7.421100090171326e-05, + "loss": 1.4012, + "step": 824 + }, + { + "epoch": 0.2235150859153024, + "grad_norm": 0.41655346751213074, + "learning_rate": 7.439134355275023e-05, + "loss": 1.4155, + "step": 826 + }, + { + "epoch": 0.22405628467054525, + "grad_norm": 0.3751365542411804, + "learning_rate": 7.45716862037872e-05, + "loss": 1.4021, + "step": 828 + }, + { + "epoch": 0.2245974834257881, + "grad_norm": 0.41483408212661743, + "learning_rate": 7.475202885482417e-05, + "loss": 1.4207, + "step": 830 + }, + { + "epoch": 0.22513868218103097, + "grad_norm": 0.397360235452652, + "learning_rate": 7.493237150586114e-05, + "loss": 1.392, + "step": 832 + }, + { + "epoch": 0.22567988093627384, + "grad_norm": 0.3874877691268921, + "learning_rate": 7.511271415689811e-05, + "loss": 1.4143, + "step": 834 + }, + { + "epoch": 0.2262210796915167, + "grad_norm": 0.4382254481315613, + "learning_rate": 7.529305680793508e-05, + "loss": 1.4109, + "step": 836 + }, + { + "epoch": 0.22676227844675959, + "grad_norm": 0.3728530704975128, + "learning_rate": 7.547339945897204e-05, + "loss": 1.4215, + "step": 838 + }, + { + "epoch": 0.22730347720200245, + "grad_norm": 0.41155338287353516, + "learning_rate": 7.565374211000901e-05, + "loss": 1.3963, + "step": 840 + }, + { + "epoch": 0.2278446759572453, + "grad_norm": 0.3550320267677307, + "learning_rate": 7.5834084761046e-05, + "loss": 1.3998, + "step": 842 + }, + { + "epoch": 0.22838587471248817, + "grad_norm": 0.3858035206794739, + "learning_rate": 7.601442741208296e-05, + "loss": 1.387, + "step": 844 + }, + { + "epoch": 0.22892707346773103, + "grad_norm": 0.38636457920074463, + "learning_rate": 7.619477006311994e-05, + "loss": 1.387, + "step": 846 + }, + { + "epoch": 0.2294682722229739, + "grad_norm": 0.41915518045425415, + "learning_rate": 7.637511271415691e-05, + "loss": 1.3917, + "step": 848 + }, + { + "epoch": 0.23000947097821675, + "grad_norm": 0.35796865820884705, + "learning_rate": 7.655545536519388e-05, + "loss": 1.406, + "step": 850 + }, + { + "epoch": 0.2305506697334596, + "grad_norm": 0.35221853852272034, + "learning_rate": 7.673579801623084e-05, + "loss": 1.3892, + "step": 852 + }, + { + "epoch": 0.23109186848870247, + "grad_norm": 0.3815077245235443, + "learning_rate": 7.691614066726781e-05, + "loss": 1.3845, + "step": 854 + }, + { + "epoch": 0.23163306724394533, + "grad_norm": 0.3554491400718689, + "learning_rate": 7.709648331830478e-05, + "loss": 1.3644, + "step": 856 + }, + { + "epoch": 0.2321742659991882, + "grad_norm": 0.3762814998626709, + "learning_rate": 7.727682596934175e-05, + "loss": 1.3976, + "step": 858 + }, + { + "epoch": 0.23271546475443106, + "grad_norm": 0.34575173258781433, + "learning_rate": 7.745716862037873e-05, + "loss": 1.3925, + "step": 860 + }, + { + "epoch": 0.23325666350967392, + "grad_norm": 0.37864556908607483, + "learning_rate": 7.76375112714157e-05, + "loss": 1.3993, + "step": 862 + }, + { + "epoch": 0.23379786226491678, + "grad_norm": 0.34448474645614624, + "learning_rate": 7.781785392245266e-05, + "loss": 1.3855, + "step": 864 + }, + { + "epoch": 0.23433906102015967, + "grad_norm": 0.40932390093803406, + "learning_rate": 7.799819657348963e-05, + "loss": 1.395, + "step": 866 + }, + { + "epoch": 0.23488025977540253, + "grad_norm": 0.3737650513648987, + "learning_rate": 7.81785392245266e-05, + "loss": 1.3918, + "step": 868 + }, + { + "epoch": 0.2354214585306454, + "grad_norm": 0.42988118529319763, + "learning_rate": 7.835888187556357e-05, + "loss": 1.3837, + "step": 870 + }, + { + "epoch": 0.23596265728588825, + "grad_norm": 0.3865496814250946, + "learning_rate": 7.853922452660055e-05, + "loss": 1.3976, + "step": 872 + }, + { + "epoch": 0.2365038560411311, + "grad_norm": 0.3682670295238495, + "learning_rate": 7.871956717763751e-05, + "loss": 1.3792, + "step": 874 + }, + { + "epoch": 0.23704505479637397, + "grad_norm": 0.4236462712287903, + "learning_rate": 7.88999098286745e-05, + "loss": 1.4032, + "step": 876 + }, + { + "epoch": 0.23758625355161683, + "grad_norm": 0.3742213249206543, + "learning_rate": 7.908025247971146e-05, + "loss": 1.3709, + "step": 878 + }, + { + "epoch": 0.2381274523068597, + "grad_norm": 0.38234424591064453, + "learning_rate": 7.926059513074843e-05, + "loss": 1.3862, + "step": 880 + }, + { + "epoch": 0.23866865106210255, + "grad_norm": 0.37414151430130005, + "learning_rate": 7.94409377817854e-05, + "loss": 1.3751, + "step": 882 + }, + { + "epoch": 0.23920984981734542, + "grad_norm": 0.3838132619857788, + "learning_rate": 7.962128043282237e-05, + "loss": 1.3805, + "step": 884 + }, + { + "epoch": 0.23975104857258828, + "grad_norm": 0.3818622827529907, + "learning_rate": 7.980162308385933e-05, + "loss": 1.3735, + "step": 886 + }, + { + "epoch": 0.24029224732783114, + "grad_norm": 0.38791927695274353, + "learning_rate": 7.99819657348963e-05, + "loss": 1.3958, + "step": 888 + }, + { + "epoch": 0.240833446083074, + "grad_norm": 0.4164978861808777, + "learning_rate": 8.016230838593328e-05, + "loss": 1.421, + "step": 890 + }, + { + "epoch": 0.24137464483831686, + "grad_norm": 0.3721414804458618, + "learning_rate": 8.034265103697025e-05, + "loss": 1.3977, + "step": 892 + }, + { + "epoch": 0.24191584359355975, + "grad_norm": 0.37698984146118164, + "learning_rate": 8.052299368800722e-05, + "loss": 1.3854, + "step": 894 + }, + { + "epoch": 0.2424570423488026, + "grad_norm": 0.3553116023540497, + "learning_rate": 8.070333633904418e-05, + "loss": 1.3925, + "step": 896 + }, + { + "epoch": 0.24299824110404547, + "grad_norm": 0.37809059023857117, + "learning_rate": 8.088367899008115e-05, + "loss": 1.368, + "step": 898 + }, + { + "epoch": 0.24353943985928833, + "grad_norm": 0.3835943043231964, + "learning_rate": 8.106402164111813e-05, + "loss": 1.3992, + "step": 900 + }, + { + "epoch": 0.2440806386145312, + "grad_norm": 0.4013379216194153, + "learning_rate": 8.12443642921551e-05, + "loss": 1.3912, + "step": 902 + }, + { + "epoch": 0.24462183736977405, + "grad_norm": 0.37845560908317566, + "learning_rate": 8.142470694319207e-05, + "loss": 1.3934, + "step": 904 + }, + { + "epoch": 0.24516303612501691, + "grad_norm": 0.39762255549430847, + "learning_rate": 8.160504959422905e-05, + "loss": 1.3782, + "step": 906 + }, + { + "epoch": 0.24570423488025978, + "grad_norm": 0.36652496457099915, + "learning_rate": 8.178539224526602e-05, + "loss": 1.3787, + "step": 908 + }, + { + "epoch": 0.24624543363550264, + "grad_norm": 0.39953047037124634, + "learning_rate": 8.196573489630298e-05, + "loss": 1.3752, + "step": 910 + }, + { + "epoch": 0.2467866323907455, + "grad_norm": 0.35875022411346436, + "learning_rate": 8.214607754733995e-05, + "loss": 1.3768, + "step": 912 + }, + { + "epoch": 0.24732783114598836, + "grad_norm": 0.3617067337036133, + "learning_rate": 8.232642019837692e-05, + "loss": 1.3859, + "step": 914 + }, + { + "epoch": 0.24786902990123122, + "grad_norm": 0.38250839710235596, + "learning_rate": 8.250676284941389e-05, + "loss": 1.3897, + "step": 916 + }, + { + "epoch": 0.24841022865647408, + "grad_norm": 0.3404116928577423, + "learning_rate": 8.268710550045085e-05, + "loss": 1.3933, + "step": 918 + }, + { + "epoch": 0.24895142741171694, + "grad_norm": 0.3547706604003906, + "learning_rate": 8.286744815148782e-05, + "loss": 1.3787, + "step": 920 + }, + { + "epoch": 0.2494926261669598, + "grad_norm": 0.32752275466918945, + "learning_rate": 8.30477908025248e-05, + "loss": 1.3905, + "step": 922 + }, + { + "epoch": 0.25003382492220266, + "grad_norm": 0.3413980007171631, + "learning_rate": 8.322813345356177e-05, + "loss": 1.385, + "step": 924 + }, + { + "epoch": 0.25057502367744555, + "grad_norm": 0.5574982762336731, + "learning_rate": 8.340847610459874e-05, + "loss": 1.3869, + "step": 926 + }, + { + "epoch": 0.2511162224326884, + "grad_norm": 0.41128844022750854, + "learning_rate": 8.358881875563572e-05, + "loss": 1.3583, + "step": 928 + }, + { + "epoch": 0.2516574211879313, + "grad_norm": 0.3476073145866394, + "learning_rate": 8.376916140667269e-05, + "loss": 1.3832, + "step": 930 + }, + { + "epoch": 0.2521986199431741, + "grad_norm": 0.34838998317718506, + "learning_rate": 8.394950405770965e-05, + "loss": 1.3748, + "step": 932 + }, + { + "epoch": 0.252739818698417, + "grad_norm": 0.3552824556827545, + "learning_rate": 8.412984670874662e-05, + "loss": 1.3936, + "step": 934 + }, + { + "epoch": 0.25328101745365983, + "grad_norm": 0.34918278455734253, + "learning_rate": 8.43101893597836e-05, + "loss": 1.3733, + "step": 936 + }, + { + "epoch": 0.2538222162089027, + "grad_norm": 0.431455135345459, + "learning_rate": 8.449053201082057e-05, + "loss": 1.3924, + "step": 938 + }, + { + "epoch": 0.2543634149641456, + "grad_norm": 0.37811046838760376, + "learning_rate": 8.467087466185754e-05, + "loss": 1.3861, + "step": 940 + }, + { + "epoch": 0.25490461371938844, + "grad_norm": 0.35659778118133545, + "learning_rate": 8.48512173128945e-05, + "loss": 1.3736, + "step": 942 + }, + { + "epoch": 0.25544581247463133, + "grad_norm": 0.4327319264411926, + "learning_rate": 8.503155996393147e-05, + "loss": 1.3883, + "step": 944 + }, + { + "epoch": 0.25598701122987416, + "grad_norm": 0.39134231209754944, + "learning_rate": 8.521190261496844e-05, + "loss": 1.3704, + "step": 946 + }, + { + "epoch": 0.25652820998511705, + "grad_norm": 0.39573270082473755, + "learning_rate": 8.53922452660054e-05, + "loss": 1.4047, + "step": 948 + }, + { + "epoch": 0.2570694087403599, + "grad_norm": 0.3299993872642517, + "learning_rate": 8.557258791704237e-05, + "loss": 1.3778, + "step": 950 + }, + { + "epoch": 0.2576106074956028, + "grad_norm": 0.3559456765651703, + "learning_rate": 8.575293056807936e-05, + "loss": 1.3794, + "step": 952 + }, + { + "epoch": 0.2581518062508456, + "grad_norm": 0.36347028613090515, + "learning_rate": 8.593327321911632e-05, + "loss": 1.3817, + "step": 954 + }, + { + "epoch": 0.2586930050060885, + "grad_norm": 0.39882585406303406, + "learning_rate": 8.611361587015329e-05, + "loss": 1.3565, + "step": 956 + }, + { + "epoch": 0.2592342037613313, + "grad_norm": 0.3932117223739624, + "learning_rate": 8.629395852119027e-05, + "loss": 1.396, + "step": 958 + }, + { + "epoch": 0.2597754025165742, + "grad_norm": 0.3526294231414795, + "learning_rate": 8.647430117222724e-05, + "loss": 1.3624, + "step": 960 + }, + { + "epoch": 0.26031660127181705, + "grad_norm": 0.3804738223552704, + "learning_rate": 8.66546438232642e-05, + "loss": 1.3616, + "step": 962 + }, + { + "epoch": 0.26085780002705994, + "grad_norm": 0.36557725071907043, + "learning_rate": 8.683498647430117e-05, + "loss": 1.3997, + "step": 964 + }, + { + "epoch": 0.2613989987823028, + "grad_norm": 0.3574380874633789, + "learning_rate": 8.701532912533815e-05, + "loss": 1.3901, + "step": 966 + }, + { + "epoch": 0.26194019753754566, + "grad_norm": 0.4025056064128876, + "learning_rate": 8.719567177637512e-05, + "loss": 1.3707, + "step": 968 + }, + { + "epoch": 0.26248139629278855, + "grad_norm": 0.3687063157558441, + "learning_rate": 8.737601442741209e-05, + "loss": 1.3679, + "step": 970 + }, + { + "epoch": 0.2630225950480314, + "grad_norm": 0.3697878420352936, + "learning_rate": 8.755635707844906e-05, + "loss": 1.3981, + "step": 972 + }, + { + "epoch": 0.26356379380327427, + "grad_norm": 0.34241798520088196, + "learning_rate": 8.773669972948602e-05, + "loss": 1.3728, + "step": 974 + }, + { + "epoch": 0.2641049925585171, + "grad_norm": 0.40002745389938354, + "learning_rate": 8.791704238052299e-05, + "loss": 1.3732, + "step": 976 + }, + { + "epoch": 0.26464619131376, + "grad_norm": 0.42943906784057617, + "learning_rate": 8.809738503155996e-05, + "loss": 1.3731, + "step": 978 + }, + { + "epoch": 0.2651873900690028, + "grad_norm": 0.37437063455581665, + "learning_rate": 8.827772768259693e-05, + "loss": 1.372, + "step": 980 + }, + { + "epoch": 0.2657285888242457, + "grad_norm": 0.3378891944885254, + "learning_rate": 8.845807033363391e-05, + "loss": 1.3777, + "step": 982 + }, + { + "epoch": 0.26626978757948855, + "grad_norm": 0.32884734869003296, + "learning_rate": 8.863841298467088e-05, + "loss": 1.3639, + "step": 984 + }, + { + "epoch": 0.26681098633473144, + "grad_norm": 0.3945903480052948, + "learning_rate": 8.881875563570786e-05, + "loss": 1.3722, + "step": 986 + }, + { + "epoch": 0.26735218508997427, + "grad_norm": 0.39569205045700073, + "learning_rate": 8.899909828674482e-05, + "loss": 1.376, + "step": 988 + }, + { + "epoch": 0.26789338384521716, + "grad_norm": 0.31659135222435, + "learning_rate": 8.917944093778179e-05, + "loss": 1.3807, + "step": 990 + }, + { + "epoch": 0.26843458260046, + "grad_norm": 0.44032666087150574, + "learning_rate": 8.935978358881876e-05, + "loss": 1.3986, + "step": 992 + }, + { + "epoch": 0.2689757813557029, + "grad_norm": 0.3445993661880493, + "learning_rate": 8.954012623985573e-05, + "loss": 1.3589, + "step": 994 + }, + { + "epoch": 0.26951698011094577, + "grad_norm": 0.3693557679653168, + "learning_rate": 8.97204688908927e-05, + "loss": 1.3593, + "step": 996 + }, + { + "epoch": 0.2700581788661886, + "grad_norm": 0.3965442478656769, + "learning_rate": 8.990081154192968e-05, + "loss": 1.3909, + "step": 998 + }, + { + "epoch": 0.2705993776214315, + "grad_norm": 0.4038390815258026, + "learning_rate": 9.008115419296664e-05, + "loss": 1.3629, + "step": 1000 + }, + { + "epoch": 0.2711405763766743, + "grad_norm": 0.36394256353378296, + "learning_rate": 9.026149684400361e-05, + "loss": 1.3812, + "step": 1002 + }, + { + "epoch": 0.2716817751319172, + "grad_norm": 0.4527181386947632, + "learning_rate": 9.044183949504058e-05, + "loss": 1.3692, + "step": 1004 + }, + { + "epoch": 0.27222297388716005, + "grad_norm": 0.37700143456459045, + "learning_rate": 9.062218214607755e-05, + "loss": 1.3652, + "step": 1006 + }, + { + "epoch": 0.27276417264240294, + "grad_norm": 0.45016244053840637, + "learning_rate": 9.080252479711451e-05, + "loss": 1.3657, + "step": 1008 + }, + { + "epoch": 0.27330537139764577, + "grad_norm": 0.42159709334373474, + "learning_rate": 9.09828674481515e-05, + "loss": 1.3702, + "step": 1010 + }, + { + "epoch": 0.27384657015288866, + "grad_norm": 0.3884572982788086, + "learning_rate": 9.116321009918846e-05, + "loss": 1.3535, + "step": 1012 + }, + { + "epoch": 0.2743877689081315, + "grad_norm": 0.37507420778274536, + "learning_rate": 9.134355275022544e-05, + "loss": 1.3659, + "step": 1014 + }, + { + "epoch": 0.2749289676633744, + "grad_norm": 0.35269656777381897, + "learning_rate": 9.152389540126241e-05, + "loss": 1.3623, + "step": 1016 + }, + { + "epoch": 0.2754701664186172, + "grad_norm": 0.3543412387371063, + "learning_rate": 9.170423805229938e-05, + "loss": 1.3695, + "step": 1018 + }, + { + "epoch": 0.2760113651738601, + "grad_norm": 0.3173674941062927, + "learning_rate": 9.188458070333635e-05, + "loss": 1.3572, + "step": 1020 + }, + { + "epoch": 0.276552563929103, + "grad_norm": 0.3729746341705322, + "learning_rate": 9.206492335437331e-05, + "loss": 1.3888, + "step": 1022 + }, + { + "epoch": 0.2770937626843458, + "grad_norm": 0.33210429549217224, + "learning_rate": 9.224526600541028e-05, + "loss": 1.3395, + "step": 1024 + }, + { + "epoch": 0.2776349614395887, + "grad_norm": 0.338366836309433, + "learning_rate": 9.242560865644725e-05, + "loss": 1.3498, + "step": 1026 + }, + { + "epoch": 0.27817616019483155, + "grad_norm": 0.3367864191532135, + "learning_rate": 9.260595130748423e-05, + "loss": 1.3548, + "step": 1028 + }, + { + "epoch": 0.27871735895007443, + "grad_norm": 0.40313002467155457, + "learning_rate": 9.27862939585212e-05, + "loss": 1.4059, + "step": 1030 + }, + { + "epoch": 0.27925855770531727, + "grad_norm": 0.3434394299983978, + "learning_rate": 9.296663660955816e-05, + "loss": 1.3522, + "step": 1032 + }, + { + "epoch": 0.27979975646056016, + "grad_norm": 0.35454580187797546, + "learning_rate": 9.314697926059513e-05, + "loss": 1.3838, + "step": 1034 + }, + { + "epoch": 0.280340955215803, + "grad_norm": 0.3280038833618164, + "learning_rate": 9.33273219116321e-05, + "loss": 1.3753, + "step": 1036 + }, + { + "epoch": 0.2808821539710459, + "grad_norm": 0.4306875169277191, + "learning_rate": 9.350766456266907e-05, + "loss": 1.3807, + "step": 1038 + }, + { + "epoch": 0.2814233527262887, + "grad_norm": 0.3500923812389374, + "learning_rate": 9.368800721370605e-05, + "loss": 1.36, + "step": 1040 + }, + { + "epoch": 0.2819645514815316, + "grad_norm": 0.3702130913734436, + "learning_rate": 9.386834986474301e-05, + "loss": 1.3919, + "step": 1042 + }, + { + "epoch": 0.28250575023677443, + "grad_norm": 0.3651416599750519, + "learning_rate": 9.404869251578e-05, + "loss": 1.3805, + "step": 1044 + }, + { + "epoch": 0.2830469489920173, + "grad_norm": 0.35927796363830566, + "learning_rate": 9.422903516681696e-05, + "loss": 1.3507, + "step": 1046 + }, + { + "epoch": 0.28358814774726016, + "grad_norm": 0.36750975251197815, + "learning_rate": 9.440937781785393e-05, + "loss": 1.3475, + "step": 1048 + }, + { + "epoch": 0.28412934650250304, + "grad_norm": 0.31946998834609985, + "learning_rate": 9.45897204688909e-05, + "loss": 1.3708, + "step": 1050 + }, + { + "epoch": 0.28467054525774593, + "grad_norm": 0.3447932302951813, + "learning_rate": 9.477006311992787e-05, + "loss": 1.3519, + "step": 1052 + }, + { + "epoch": 0.28521174401298877, + "grad_norm": 0.31405511498451233, + "learning_rate": 9.495040577096483e-05, + "loss": 1.3806, + "step": 1054 + }, + { + "epoch": 0.28575294276823165, + "grad_norm": 0.3198442757129669, + "learning_rate": 9.51307484220018e-05, + "loss": 1.368, + "step": 1056 + }, + { + "epoch": 0.2862941415234745, + "grad_norm": 0.33328956365585327, + "learning_rate": 9.531109107303878e-05, + "loss": 1.3429, + "step": 1058 + }, + { + "epoch": 0.2868353402787174, + "grad_norm": 0.29432907700538635, + "learning_rate": 9.549143372407575e-05, + "loss": 1.3698, + "step": 1060 + }, + { + "epoch": 0.2873765390339602, + "grad_norm": 0.3468937575817108, + "learning_rate": 9.567177637511272e-05, + "loss": 1.356, + "step": 1062 + }, + { + "epoch": 0.2879177377892031, + "grad_norm": 0.3619658350944519, + "learning_rate": 9.585211902614968e-05, + "loss": 1.3596, + "step": 1064 + }, + { + "epoch": 0.28845893654444593, + "grad_norm": 0.3384917378425598, + "learning_rate": 9.603246167718665e-05, + "loss": 1.3693, + "step": 1066 + }, + { + "epoch": 0.2890001352996888, + "grad_norm": 0.3724029064178467, + "learning_rate": 9.621280432822363e-05, + "loss": 1.3639, + "step": 1068 + }, + { + "epoch": 0.28954133405493165, + "grad_norm": 0.7029115557670593, + "learning_rate": 9.63931469792606e-05, + "loss": 1.3557, + "step": 1070 + }, + { + "epoch": 0.29008253281017454, + "grad_norm": 0.5529230833053589, + "learning_rate": 9.657348963029757e-05, + "loss": 1.3657, + "step": 1072 + }, + { + "epoch": 0.2906237315654174, + "grad_norm": 0.4254820644855499, + "learning_rate": 9.675383228133455e-05, + "loss": 1.3633, + "step": 1074 + }, + { + "epoch": 0.29116493032066026, + "grad_norm": 0.4930615723133087, + "learning_rate": 9.693417493237152e-05, + "loss": 1.3714, + "step": 1076 + }, + { + "epoch": 0.2917061290759031, + "grad_norm": 0.4455857574939728, + "learning_rate": 9.711451758340848e-05, + "loss": 1.3615, + "step": 1078 + }, + { + "epoch": 0.292247327831146, + "grad_norm": 0.4171796441078186, + "learning_rate": 9.729486023444545e-05, + "loss": 1.3673, + "step": 1080 + }, + { + "epoch": 0.2927885265863889, + "grad_norm": 0.37810683250427246, + "learning_rate": 9.747520288548242e-05, + "loss": 1.3683, + "step": 1082 + }, + { + "epoch": 0.2933297253416317, + "grad_norm": 0.4057900905609131, + "learning_rate": 9.765554553651939e-05, + "loss": 1.3674, + "step": 1084 + }, + { + "epoch": 0.2938709240968746, + "grad_norm": 0.40583640336990356, + "learning_rate": 9.783588818755635e-05, + "loss": 1.3566, + "step": 1086 + }, + { + "epoch": 0.29441212285211743, + "grad_norm": 0.39454150199890137, + "learning_rate": 9.801623083859334e-05, + "loss": 1.3611, + "step": 1088 + }, + { + "epoch": 0.2949533216073603, + "grad_norm": 0.42229679226875305, + "learning_rate": 9.81965734896303e-05, + "loss": 1.3726, + "step": 1090 + }, + { + "epoch": 0.29549452036260315, + "grad_norm": 0.3274170160293579, + "learning_rate": 9.837691614066727e-05, + "loss": 1.3375, + "step": 1092 + }, + { + "epoch": 0.29603571911784604, + "grad_norm": 0.40999388694763184, + "learning_rate": 9.855725879170424e-05, + "loss": 1.3548, + "step": 1094 + }, + { + "epoch": 0.2965769178730889, + "grad_norm": 0.33515796065330505, + "learning_rate": 9.873760144274122e-05, + "loss": 1.3903, + "step": 1096 + }, + { + "epoch": 0.29711811662833176, + "grad_norm": 0.3834095597267151, + "learning_rate": 9.891794409377819e-05, + "loss": 1.3653, + "step": 1098 + }, + { + "epoch": 0.2976593153835746, + "grad_norm": 0.34850651025772095, + "learning_rate": 9.909828674481515e-05, + "loss": 1.3573, + "step": 1100 + }, + { + "epoch": 0.2982005141388175, + "grad_norm": 0.3811749815940857, + "learning_rate": 9.927862939585212e-05, + "loss": 1.3843, + "step": 1102 + }, + { + "epoch": 0.2987417128940603, + "grad_norm": 0.3308597803115845, + "learning_rate": 9.94589720468891e-05, + "loss": 1.3492, + "step": 1104 + }, + { + "epoch": 0.2992829116493032, + "grad_norm": 0.31952470541000366, + "learning_rate": 9.963931469792607e-05, + "loss": 1.3586, + "step": 1106 + }, + { + "epoch": 0.2998241104045461, + "grad_norm": 0.3433592915534973, + "learning_rate": 9.981965734896304e-05, + "loss": 1.3524, + "step": 1108 + }, + { + "epoch": 0.30036530915978893, + "grad_norm": 0.4547680914402008, + "learning_rate": 0.0001, + "loss": 1.3562, + "step": 1110 + }, + { + "epoch": 0.3009065079150318, + "grad_norm": 0.4963592290878296, + "learning_rate": 9.999999008881264e-05, + "loss": 1.3452, + "step": 1112 + }, + { + "epoch": 0.30144770667027465, + "grad_norm": 1.1111193895339966, + "learning_rate": 9.999996035525452e-05, + "loss": 1.3732, + "step": 1114 + }, + { + "epoch": 0.30198890542551754, + "grad_norm": 0.6860964298248291, + "learning_rate": 9.999991079933739e-05, + "loss": 1.3689, + "step": 1116 + }, + { + "epoch": 0.3025301041807604, + "grad_norm": 0.7344204783439636, + "learning_rate": 9.999984142108093e-05, + "loss": 1.3575, + "step": 1118 + }, + { + "epoch": 0.30307130293600326, + "grad_norm": 0.6534725427627563, + "learning_rate": 9.999975222051263e-05, + "loss": 1.376, + "step": 1120 + }, + { + "epoch": 0.3036125016912461, + "grad_norm": 0.5108229517936707, + "learning_rate": 9.999964319766785e-05, + "loss": 1.3741, + "step": 1122 + }, + { + "epoch": 0.304153700446489, + "grad_norm": 0.4888688325881958, + "learning_rate": 9.99995143525898e-05, + "loss": 1.3555, + "step": 1124 + }, + { + "epoch": 0.3046948992017318, + "grad_norm": 0.42808806896209717, + "learning_rate": 9.999936568532962e-05, + "loss": 1.3548, + "step": 1126 + }, + { + "epoch": 0.3052360979569747, + "grad_norm": 0.3921727240085602, + "learning_rate": 9.999919719594617e-05, + "loss": 1.3559, + "step": 1128 + }, + { + "epoch": 0.30577729671221754, + "grad_norm": 0.3473529517650604, + "learning_rate": 9.999900888450628e-05, + "loss": 1.3603, + "step": 1130 + }, + { + "epoch": 0.3063184954674604, + "grad_norm": 0.3337381184101105, + "learning_rate": 9.999880075108464e-05, + "loss": 1.3642, + "step": 1132 + }, + { + "epoch": 0.30685969422270326, + "grad_norm": 0.3363231122493744, + "learning_rate": 9.99985727957637e-05, + "loss": 1.3606, + "step": 1134 + }, + { + "epoch": 0.30740089297794615, + "grad_norm": 0.32726484537124634, + "learning_rate": 9.999832501863386e-05, + "loss": 1.3493, + "step": 1136 + }, + { + "epoch": 0.30794209173318904, + "grad_norm": 0.3190646767616272, + "learning_rate": 9.999805741979338e-05, + "loss": 1.3518, + "step": 1138 + }, + { + "epoch": 0.30848329048843187, + "grad_norm": 0.31244540214538574, + "learning_rate": 9.999776999934831e-05, + "loss": 1.3495, + "step": 1140 + }, + { + "epoch": 0.30902448924367476, + "grad_norm": 0.3286384344100952, + "learning_rate": 9.999746275741261e-05, + "loss": 1.3517, + "step": 1142 + }, + { + "epoch": 0.3095656879989176, + "grad_norm": 0.3630046546459198, + "learning_rate": 9.99971356941081e-05, + "loss": 1.3641, + "step": 1144 + }, + { + "epoch": 0.3101068867541605, + "grad_norm": 0.30771151185035706, + "learning_rate": 9.999678880956443e-05, + "loss": 1.3571, + "step": 1146 + }, + { + "epoch": 0.3106480855094033, + "grad_norm": 0.30026301741600037, + "learning_rate": 9.99964221039191e-05, + "loss": 1.3541, + "step": 1148 + }, + { + "epoch": 0.3111892842646462, + "grad_norm": 0.3128298223018646, + "learning_rate": 9.999603557731754e-05, + "loss": 1.3556, + "step": 1150 + }, + { + "epoch": 0.31173048301988904, + "grad_norm": 0.30185452103614807, + "learning_rate": 9.999562922991293e-05, + "loss": 1.3484, + "step": 1152 + }, + { + "epoch": 0.3122716817751319, + "grad_norm": 0.3274635076522827, + "learning_rate": 9.99952030618664e-05, + "loss": 1.3729, + "step": 1154 + }, + { + "epoch": 0.31281288053037476, + "grad_norm": 0.30549076199531555, + "learning_rate": 9.999475707334692e-05, + "loss": 1.3642, + "step": 1156 + }, + { + "epoch": 0.31335407928561765, + "grad_norm": 0.3147718906402588, + "learning_rate": 9.999429126453126e-05, + "loss": 1.3493, + "step": 1158 + }, + { + "epoch": 0.3138952780408605, + "grad_norm": 0.6205586791038513, + "learning_rate": 9.99938056356041e-05, + "loss": 1.3623, + "step": 1160 + }, + { + "epoch": 0.31443647679610337, + "grad_norm": 0.3471706211566925, + "learning_rate": 9.999330018675798e-05, + "loss": 1.3533, + "step": 1162 + }, + { + "epoch": 0.31497767555134626, + "grad_norm": 1.3515815734863281, + "learning_rate": 9.999277491819328e-05, + "loss": 1.3565, + "step": 1164 + }, + { + "epoch": 0.3155188743065891, + "grad_norm": 733.9155883789062, + "learning_rate": 9.999222983011824e-05, + "loss": 5.2143, + "step": 1166 + }, + { + "epoch": 0.316060073061832, + "grad_norm": 2.9439170360565186, + "learning_rate": 9.999166492274894e-05, + "loss": 1.4438, + "step": 1168 + }, + { + "epoch": 0.3166012718170748, + "grad_norm": 1.5871142148971558, + "learning_rate": 9.999108019630938e-05, + "loss": 1.4426, + "step": 1170 + }, + { + "epoch": 0.3171424705723177, + "grad_norm": 711.9217529296875, + "learning_rate": 9.999047565103132e-05, + "loss": 3.6935, + "step": 1172 + }, + { + "epoch": 0.31768366932756054, + "grad_norm": 100.76264953613281, + "learning_rate": 9.998985128715448e-05, + "loss": 4.2396, + "step": 1174 + }, + { + "epoch": 0.3182248680828034, + "grad_norm": 108.88189697265625, + "learning_rate": 9.998920710492634e-05, + "loss": 4.9929, + "step": 1176 + }, + { + "epoch": 0.31876606683804626, + "grad_norm": 72.18595123291016, + "learning_rate": 9.998854310460233e-05, + "loss": 6.0375, + "step": 1178 + }, + { + "epoch": 0.31930726559328915, + "grad_norm": 59.48538589477539, + "learning_rate": 9.998785928644567e-05, + "loss": 5.8932, + "step": 1180 + }, + { + "epoch": 0.319848464348532, + "grad_norm": 36.32703399658203, + "learning_rate": 9.998715565072744e-05, + "loss": 6.5369, + "step": 1182 + }, + { + "epoch": 0.32038966310377487, + "grad_norm": 18.565351486206055, + "learning_rate": 9.998643219772664e-05, + "loss": 6.1671, + "step": 1184 + }, + { + "epoch": 0.3209308618590177, + "grad_norm": 45.84898376464844, + "learning_rate": 9.998568892773003e-05, + "loss": 5.9379, + "step": 1186 + }, + { + "epoch": 0.3214720606142606, + "grad_norm": 66.2480239868164, + "learning_rate": 9.998492584103232e-05, + "loss": 5.7071, + "step": 1188 + }, + { + "epoch": 0.3220132593695034, + "grad_norm": 41.693092346191406, + "learning_rate": 9.998414293793599e-05, + "loss": 6.3198, + "step": 1190 + }, + { + "epoch": 0.3225544581247463, + "grad_norm": 19.323413848876953, + "learning_rate": 9.998334021875147e-05, + "loss": 5.377, + "step": 1192 + }, + { + "epoch": 0.3230956568799892, + "grad_norm": 15.907301902770996, + "learning_rate": 9.998251768379696e-05, + "loss": 4.5293, + "step": 1194 + }, + { + "epoch": 0.32363685563523203, + "grad_norm": 80.1374740600586, + "learning_rate": 9.998167533339857e-05, + "loss": 4.3471, + "step": 1196 + }, + { + "epoch": 0.3241780543904749, + "grad_norm": 23.298336029052734, + "learning_rate": 9.998081316789024e-05, + "loss": 3.7461, + "step": 1198 + }, + { + "epoch": 0.32471925314571776, + "grad_norm": 82.48027801513672, + "learning_rate": 9.997993118761378e-05, + "loss": 4.1647, + "step": 1200 + }, + { + "epoch": 0.32526045190096065, + "grad_norm": 27.916913986206055, + "learning_rate": 9.997902939291883e-05, + "loss": 3.9092, + "step": 1202 + }, + { + "epoch": 0.3258016506562035, + "grad_norm": 15.70148754119873, + "learning_rate": 9.997810778416293e-05, + "loss": 3.1628, + "step": 1204 + }, + { + "epoch": 0.32634284941144637, + "grad_norm": 18.33330535888672, + "learning_rate": 9.997716636171142e-05, + "loss": 2.8777, + "step": 1206 + }, + { + "epoch": 0.3268840481666892, + "grad_norm": 10.6620512008667, + "learning_rate": 9.997620512593755e-05, + "loss": 2.3009, + "step": 1208 + }, + { + "epoch": 0.3274252469219321, + "grad_norm": 32.01799011230469, + "learning_rate": 9.99752240772224e-05, + "loss": 1.9617, + "step": 1210 + }, + { + "epoch": 0.3279664456771749, + "grad_norm": 5.677090644836426, + "learning_rate": 9.997422321595488e-05, + "loss": 1.8401, + "step": 1212 + }, + { + "epoch": 0.3285076444324178, + "grad_norm": 8.914667129516602, + "learning_rate": 9.997320254253179e-05, + "loss": 1.6707, + "step": 1214 + }, + { + "epoch": 0.32904884318766064, + "grad_norm": 2.3725008964538574, + "learning_rate": 9.997216205735779e-05, + "loss": 1.5757, + "step": 1216 + }, + { + "epoch": 0.32959004194290353, + "grad_norm": 2.418389320373535, + "learning_rate": 9.997110176084538e-05, + "loss": 1.5154, + "step": 1218 + }, + { + "epoch": 0.33013124069814637, + "grad_norm": 2.802185297012329, + "learning_rate": 9.997002165341487e-05, + "loss": 1.4883, + "step": 1220 + }, + { + "epoch": 0.33067243945338926, + "grad_norm": 2.1769211292266846, + "learning_rate": 9.996892173549452e-05, + "loss": 1.445, + "step": 1222 + }, + { + "epoch": 0.33121363820863214, + "grad_norm": 1.799670934677124, + "learning_rate": 9.996780200752035e-05, + "loss": 1.4276, + "step": 1224 + }, + { + "epoch": 0.331754836963875, + "grad_norm": 3.2545313835144043, + "learning_rate": 9.996666246993627e-05, + "loss": 1.4394, + "step": 1226 + }, + { + "epoch": 0.33229603571911787, + "grad_norm": 1.1922351121902466, + "learning_rate": 9.996550312319408e-05, + "loss": 1.4359, + "step": 1228 + }, + { + "epoch": 0.3328372344743607, + "grad_norm": 2.6813228130340576, + "learning_rate": 9.996432396775339e-05, + "loss": 1.4229, + "step": 1230 + }, + { + "epoch": 0.3333784332296036, + "grad_norm": 1.6968843936920166, + "learning_rate": 9.996312500408165e-05, + "loss": 1.4281, + "step": 1232 + }, + { + "epoch": 0.3339196319848464, + "grad_norm": 1.3502254486083984, + "learning_rate": 9.996190623265421e-05, + "loss": 1.408, + "step": 1234 + }, + { + "epoch": 0.3344608307400893, + "grad_norm": 1.2809518575668335, + "learning_rate": 9.996066765395424e-05, + "loss": 1.4176, + "step": 1236 + }, + { + "epoch": 0.33500202949533214, + "grad_norm": 1.0455057621002197, + "learning_rate": 9.995940926847279e-05, + "loss": 1.4056, + "step": 1238 + }, + { + "epoch": 0.33554322825057503, + "grad_norm": 1.3292824029922485, + "learning_rate": 9.99581310767087e-05, + "loss": 1.4033, + "step": 1240 + }, + { + "epoch": 0.33608442700581787, + "grad_norm": 1.5960067510604858, + "learning_rate": 9.995683307916875e-05, + "loss": 1.379, + "step": 1242 + }, + { + "epoch": 0.33662562576106075, + "grad_norm": 1.0471105575561523, + "learning_rate": 9.99555152763675e-05, + "loss": 1.3823, + "step": 1244 + }, + { + "epoch": 0.3371668245163036, + "grad_norm": 2.339273452758789, + "learning_rate": 9.99541776688274e-05, + "loss": 1.3698, + "step": 1246 + }, + { + "epoch": 0.3377080232715465, + "grad_norm": 0.81674724817276, + "learning_rate": 9.995282025707875e-05, + "loss": 1.4154, + "step": 1248 + }, + { + "epoch": 0.33824922202678936, + "grad_norm": 0.6240290999412537, + "learning_rate": 9.995144304165968e-05, + "loss": 1.4035, + "step": 1250 + }, + { + "epoch": 0.3387904207820322, + "grad_norm": 2.281787872314453, + "learning_rate": 9.995004602311619e-05, + "loss": 1.3906, + "step": 1252 + }, + { + "epoch": 0.3393316195372751, + "grad_norm": 0.6818395853042603, + "learning_rate": 9.99486292020021e-05, + "loss": 1.3853, + "step": 1254 + }, + { + "epoch": 0.3398728182925179, + "grad_norm": 6.299881935119629, + "learning_rate": 9.994719257887915e-05, + "loss": 1.3856, + "step": 1256 + }, + { + "epoch": 0.3404140170477608, + "grad_norm": 0.8173750638961792, + "learning_rate": 9.994573615431686e-05, + "loss": 1.3871, + "step": 1258 + }, + { + "epoch": 0.34095521580300364, + "grad_norm": 2.155395746231079, + "learning_rate": 9.994425992889262e-05, + "loss": 1.3382, + "step": 1260 + }, + { + "epoch": 0.34149641455824653, + "grad_norm": 0.5846114754676819, + "learning_rate": 9.99427639031917e-05, + "loss": 1.3978, + "step": 1262 + }, + { + "epoch": 0.34203761331348936, + "grad_norm": 0.6624069213867188, + "learning_rate": 9.994124807780717e-05, + "loss": 1.3792, + "step": 1264 + }, + { + "epoch": 0.34257881206873225, + "grad_norm": 0.5708588361740112, + "learning_rate": 9.993971245333998e-05, + "loss": 1.3677, + "step": 1266 + }, + { + "epoch": 0.3431200108239751, + "grad_norm": 0.5245474576950073, + "learning_rate": 9.993815703039894e-05, + "loss": 1.3672, + "step": 1268 + }, + { + "epoch": 0.343661209579218, + "grad_norm": 0.501871645450592, + "learning_rate": 9.993658180960069e-05, + "loss": 1.3674, + "step": 1270 + }, + { + "epoch": 0.3442024083344608, + "grad_norm": 0.5990382432937622, + "learning_rate": 9.993498679156969e-05, + "loss": 1.3804, + "step": 1272 + }, + { + "epoch": 0.3447436070897037, + "grad_norm": 0.42392146587371826, + "learning_rate": 9.993337197693833e-05, + "loss": 1.3628, + "step": 1274 + }, + { + "epoch": 0.34528480584494653, + "grad_norm": 0.46936917304992676, + "learning_rate": 9.993173736634676e-05, + "loss": 1.3696, + "step": 1276 + }, + { + "epoch": 0.3458260046001894, + "grad_norm": 0.52222740650177, + "learning_rate": 9.993008296044304e-05, + "loss": 1.3697, + "step": 1278 + }, + { + "epoch": 0.3463672033554323, + "grad_norm": 0.3582518398761749, + "learning_rate": 9.992840875988305e-05, + "loss": 1.3825, + "step": 1280 + }, + { + "epoch": 0.34690840211067514, + "grad_norm": 0.3533988296985626, + "learning_rate": 9.99267147653305e-05, + "loss": 1.361, + "step": 1282 + }, + { + "epoch": 0.34744960086591803, + "grad_norm": 0.35905274748802185, + "learning_rate": 9.992500097745702e-05, + "loss": 1.3721, + "step": 1284 + }, + { + "epoch": 0.34799079962116086, + "grad_norm": 0.3057416081428528, + "learning_rate": 9.9923267396942e-05, + "loss": 1.369, + "step": 1286 + }, + { + "epoch": 0.34853199837640375, + "grad_norm": 0.3299311101436615, + "learning_rate": 9.992151402447272e-05, + "loss": 1.358, + "step": 1288 + }, + { + "epoch": 0.3490731971316466, + "grad_norm": 0.3086453080177307, + "learning_rate": 9.99197408607443e-05, + "loss": 1.3534, + "step": 1290 + }, + { + "epoch": 0.3496143958868895, + "grad_norm": 0.3111782968044281, + "learning_rate": 9.991794790645969e-05, + "loss": 1.3605, + "step": 1292 + }, + { + "epoch": 0.3501555946421323, + "grad_norm": 0.3231568932533264, + "learning_rate": 9.991613516232974e-05, + "loss": 1.3543, + "step": 1294 + }, + { + "epoch": 0.3506967933973752, + "grad_norm": 0.3288814425468445, + "learning_rate": 9.991430262907309e-05, + "loss": 1.3521, + "step": 1296 + }, + { + "epoch": 0.35123799215261803, + "grad_norm": 0.3239436745643616, + "learning_rate": 9.991245030741622e-05, + "loss": 1.3335, + "step": 1298 + }, + { + "epoch": 0.3517791909078609, + "grad_norm": 0.3560773730278015, + "learning_rate": 9.991057819809353e-05, + "loss": 1.3487, + "step": 1300 + }, + { + "epoch": 0.35232038966310375, + "grad_norm": 0.4387347400188446, + "learning_rate": 9.990868630184716e-05, + "loss": 1.3548, + "step": 1302 + }, + { + "epoch": 0.35286158841834664, + "grad_norm": 0.32067278027534485, + "learning_rate": 9.990677461942717e-05, + "loss": 1.3471, + "step": 1304 + }, + { + "epoch": 0.3534027871735895, + "grad_norm": 0.4399580955505371, + "learning_rate": 9.990484315159146e-05, + "loss": 1.3588, + "step": 1306 + }, + { + "epoch": 0.35394398592883236, + "grad_norm": 0.9175602793693542, + "learning_rate": 9.990289189910571e-05, + "loss": 1.3432, + "step": 1308 + }, + { + "epoch": 0.35448518468407525, + "grad_norm": 0.45273318886756897, + "learning_rate": 9.990092086274352e-05, + "loss": 1.3434, + "step": 1310 + }, + { + "epoch": 0.3550263834393181, + "grad_norm": 0.3346487879753113, + "learning_rate": 9.989893004328632e-05, + "loss": 1.3339, + "step": 1312 + }, + { + "epoch": 0.35556758219456097, + "grad_norm": 0.4779951870441437, + "learning_rate": 9.989691944152333e-05, + "loss": 1.3561, + "step": 1314 + }, + { + "epoch": 0.3561087809498038, + "grad_norm": 0.6359366774559021, + "learning_rate": 9.989488905825166e-05, + "loss": 1.3499, + "step": 1316 + }, + { + "epoch": 0.3566499797050467, + "grad_norm": 0.5867050290107727, + "learning_rate": 9.989283889427625e-05, + "loss": 1.3791, + "step": 1318 + }, + { + "epoch": 0.3571911784602895, + "grad_norm": 1.869691014289856, + "learning_rate": 9.989076895040989e-05, + "loss": 1.3663, + "step": 1320 + }, + { + "epoch": 0.3577323772155324, + "grad_norm": 2.7147843837738037, + "learning_rate": 9.98886792274732e-05, + "loss": 1.358, + "step": 1322 + }, + { + "epoch": 0.35827357597077525, + "grad_norm": 0.8717885613441467, + "learning_rate": 9.988656972629465e-05, + "loss": 1.34, + "step": 1324 + }, + { + "epoch": 0.35881477472601814, + "grad_norm": 0.7126337885856628, + "learning_rate": 9.988444044771054e-05, + "loss": 1.3281, + "step": 1326 + }, + { + "epoch": 0.35935597348126097, + "grad_norm": 0.7409217357635498, + "learning_rate": 9.988229139256502e-05, + "loss": 1.3571, + "step": 1328 + }, + { + "epoch": 0.35989717223650386, + "grad_norm": 0.5892549157142639, + "learning_rate": 9.988012256171006e-05, + "loss": 1.3269, + "step": 1330 + }, + { + "epoch": 0.3604383709917467, + "grad_norm": 0.4858717620372772, + "learning_rate": 9.98779339560055e-05, + "loss": 1.3506, + "step": 1332 + }, + { + "epoch": 0.3609795697469896, + "grad_norm": 0.37409740686416626, + "learning_rate": 9.987572557631903e-05, + "loss": 1.3339, + "step": 1334 + }, + { + "epoch": 0.36152076850223247, + "grad_norm": 0.38315168023109436, + "learning_rate": 9.987349742352611e-05, + "loss": 1.3404, + "step": 1336 + }, + { + "epoch": 0.3620619672574753, + "grad_norm": 0.32702726125717163, + "learning_rate": 9.987124949851014e-05, + "loss": 1.3595, + "step": 1338 + }, + { + "epoch": 0.3626031660127182, + "grad_norm": 0.3133656680583954, + "learning_rate": 9.986898180216226e-05, + "loss": 1.3428, + "step": 1340 + }, + { + "epoch": 0.363144364767961, + "grad_norm": 0.2916230857372284, + "learning_rate": 9.986669433538152e-05, + "loss": 1.3381, + "step": 1342 + }, + { + "epoch": 0.3636855635232039, + "grad_norm": 0.28036215901374817, + "learning_rate": 9.986438709907476e-05, + "loss": 1.3447, + "step": 1344 + }, + { + "epoch": 0.36422676227844675, + "grad_norm": 0.30352699756622314, + "learning_rate": 9.98620600941567e-05, + "loss": 1.3427, + "step": 1346 + }, + { + "epoch": 0.36476796103368964, + "grad_norm": 0.3100769519805908, + "learning_rate": 9.985971332154984e-05, + "loss": 1.3603, + "step": 1348 + }, + { + "epoch": 0.36530915978893247, + "grad_norm": 0.2933647930622101, + "learning_rate": 9.98573467821846e-05, + "loss": 1.3646, + "step": 1350 + }, + { + "epoch": 0.36585035854417536, + "grad_norm": 0.2938663959503174, + "learning_rate": 9.985496047699916e-05, + "loss": 1.3763, + "step": 1352 + }, + { + "epoch": 0.3663915572994182, + "grad_norm": 0.2916519343852997, + "learning_rate": 9.985255440693955e-05, + "loss": 1.3431, + "step": 1354 + }, + { + "epoch": 0.3669327560546611, + "grad_norm": 0.2954147756099701, + "learning_rate": 9.985012857295968e-05, + "loss": 1.338, + "step": 1356 + }, + { + "epoch": 0.3674739548099039, + "grad_norm": 0.2839341163635254, + "learning_rate": 9.984768297602125e-05, + "loss": 1.3653, + "step": 1358 + }, + { + "epoch": 0.3680151535651468, + "grad_norm": 0.2878473699092865, + "learning_rate": 9.984521761709382e-05, + "loss": 1.3302, + "step": 1360 + }, + { + "epoch": 0.3685563523203897, + "grad_norm": 0.2859325408935547, + "learning_rate": 9.984273249715478e-05, + "loss": 1.3273, + "step": 1362 + }, + { + "epoch": 0.3690975510756325, + "grad_norm": 0.28399959206581116, + "learning_rate": 9.984022761718933e-05, + "loss": 1.3516, + "step": 1364 + }, + { + "epoch": 0.3696387498308754, + "grad_norm": 0.29740169644355774, + "learning_rate": 9.983770297819052e-05, + "loss": 1.3389, + "step": 1366 + }, + { + "epoch": 0.37017994858611825, + "grad_norm": 0.3143361806869507, + "learning_rate": 9.983515858115928e-05, + "loss": 1.3557, + "step": 1368 + }, + { + "epoch": 0.37072114734136113, + "grad_norm": 0.30783936381340027, + "learning_rate": 9.983259442710429e-05, + "loss": 1.3498, + "step": 1370 + }, + { + "epoch": 0.37126234609660397, + "grad_norm": 0.297091543674469, + "learning_rate": 9.983001051704211e-05, + "loss": 1.3308, + "step": 1372 + }, + { + "epoch": 0.37180354485184686, + "grad_norm": 0.3118893504142761, + "learning_rate": 9.982740685199712e-05, + "loss": 1.3372, + "step": 1374 + }, + { + "epoch": 0.3723447436070897, + "grad_norm": 0.2826865017414093, + "learning_rate": 9.982478343300155e-05, + "loss": 1.3488, + "step": 1376 + }, + { + "epoch": 0.3728859423623326, + "grad_norm": 0.2829175889492035, + "learning_rate": 9.982214026109544e-05, + "loss": 1.3693, + "step": 1378 + }, + { + "epoch": 0.3734271411175754, + "grad_norm": 0.3026389479637146, + "learning_rate": 9.981947733732668e-05, + "loss": 1.3276, + "step": 1380 + }, + { + "epoch": 0.3739683398728183, + "grad_norm": 0.30112889409065247, + "learning_rate": 9.981679466275096e-05, + "loss": 1.3441, + "step": 1382 + }, + { + "epoch": 0.37450953862806113, + "grad_norm": 0.27241262793540955, + "learning_rate": 9.981409223843183e-05, + "loss": 1.3373, + "step": 1384 + }, + { + "epoch": 0.375050737383304, + "grad_norm": 0.2804114520549774, + "learning_rate": 9.981137006544066e-05, + "loss": 1.344, + "step": 1386 + }, + { + "epoch": 0.37559193613854686, + "grad_norm": 0.27698764204978943, + "learning_rate": 9.980862814485665e-05, + "loss": 1.3543, + "step": 1388 + }, + { + "epoch": 0.37613313489378974, + "grad_norm": 0.29283177852630615, + "learning_rate": 9.980586647776681e-05, + "loss": 1.3332, + "step": 1390 + }, + { + "epoch": 0.37667433364903263, + "grad_norm": 0.2896028459072113, + "learning_rate": 9.980308506526604e-05, + "loss": 1.3392, + "step": 1392 + }, + { + "epoch": 0.37721553240427547, + "grad_norm": 0.27882838249206543, + "learning_rate": 9.980028390845697e-05, + "loss": 1.336, + "step": 1394 + }, + { + "epoch": 0.37775673115951836, + "grad_norm": 0.2886262834072113, + "learning_rate": 9.979746300845015e-05, + "loss": 1.3331, + "step": 1396 + }, + { + "epoch": 0.3782979299147612, + "grad_norm": 0.3085189163684845, + "learning_rate": 9.97946223663639e-05, + "loss": 1.3296, + "step": 1398 + }, + { + "epoch": 0.3788391286700041, + "grad_norm": 0.3342386484146118, + "learning_rate": 9.97917619833244e-05, + "loss": 1.351, + "step": 1400 + }, + { + "epoch": 0.3793803274252469, + "grad_norm": 0.3263756036758423, + "learning_rate": 9.978888186046562e-05, + "loss": 1.3526, + "step": 1402 + }, + { + "epoch": 0.3799215261804898, + "grad_norm": 0.292346715927124, + "learning_rate": 9.97859819989294e-05, + "loss": 1.3498, + "step": 1404 + }, + { + "epoch": 0.38046272493573263, + "grad_norm": 0.29072263836860657, + "learning_rate": 9.978306239986536e-05, + "loss": 1.3423, + "step": 1406 + }, + { + "epoch": 0.3810039236909755, + "grad_norm": 0.3350834548473358, + "learning_rate": 9.978012306443101e-05, + "loss": 1.3559, + "step": 1408 + }, + { + "epoch": 0.38154512244621835, + "grad_norm": 0.28721559047698975, + "learning_rate": 9.977716399379157e-05, + "loss": 1.3294, + "step": 1410 + }, + { + "epoch": 0.38208632120146124, + "grad_norm": 0.3062276244163513, + "learning_rate": 9.977418518912023e-05, + "loss": 1.3457, + "step": 1412 + }, + { + "epoch": 0.3826275199567041, + "grad_norm": 0.30255332589149475, + "learning_rate": 9.977118665159791e-05, + "loss": 1.3371, + "step": 1414 + }, + { + "epoch": 0.38316871871194697, + "grad_norm": 0.2800199091434479, + "learning_rate": 9.976816838241334e-05, + "loss": 1.3439, + "step": 1416 + }, + { + "epoch": 0.3837099174671898, + "grad_norm": 0.2754746675491333, + "learning_rate": 9.976513038276312e-05, + "loss": 1.3303, + "step": 1418 + }, + { + "epoch": 0.3842511162224327, + "grad_norm": 0.29933616518974304, + "learning_rate": 9.976207265385168e-05, + "loss": 1.3365, + "step": 1420 + }, + { + "epoch": 0.3847923149776756, + "grad_norm": 0.3023386001586914, + "learning_rate": 9.975899519689122e-05, + "loss": 1.3164, + "step": 1422 + }, + { + "epoch": 0.3853335137329184, + "grad_norm": 0.2901383936405182, + "learning_rate": 9.975589801310181e-05, + "loss": 1.3209, + "step": 1424 + }, + { + "epoch": 0.3858747124881613, + "grad_norm": 0.28566035628318787, + "learning_rate": 9.975278110371131e-05, + "loss": 1.3301, + "step": 1426 + }, + { + "epoch": 0.38641591124340413, + "grad_norm": 0.3010505735874176, + "learning_rate": 9.974964446995543e-05, + "loss": 1.319, + "step": 1428 + }, + { + "epoch": 0.386957109998647, + "grad_norm": 0.2977135479450226, + "learning_rate": 9.974648811307766e-05, + "loss": 1.3311, + "step": 1430 + }, + { + "epoch": 0.38749830875388985, + "grad_norm": 0.28914034366607666, + "learning_rate": 9.974331203432932e-05, + "loss": 1.343, + "step": 1432 + }, + { + "epoch": 0.38803950750913274, + "grad_norm": 0.2842980623245239, + "learning_rate": 9.974011623496958e-05, + "loss": 1.3162, + "step": 1434 + }, + { + "epoch": 0.3885807062643756, + "grad_norm": 0.3048929274082184, + "learning_rate": 9.97369007162654e-05, + "loss": 1.3166, + "step": 1436 + }, + { + "epoch": 0.38912190501961846, + "grad_norm": 0.3024531304836273, + "learning_rate": 9.973366547949157e-05, + "loss": 1.3156, + "step": 1438 + }, + { + "epoch": 0.3896631037748613, + "grad_norm": 0.2911103367805481, + "learning_rate": 9.973041052593068e-05, + "loss": 1.3314, + "step": 1440 + }, + { + "epoch": 0.3902043025301042, + "grad_norm": 0.30932334065437317, + "learning_rate": 9.972713585687317e-05, + "loss": 1.3144, + "step": 1442 + }, + { + "epoch": 0.390745501285347, + "grad_norm": 0.302971750497818, + "learning_rate": 9.972384147361725e-05, + "loss": 1.3431, + "step": 1444 + }, + { + "epoch": 0.3912867000405899, + "grad_norm": 0.32412296533584595, + "learning_rate": 9.972052737746898e-05, + "loss": 1.3167, + "step": 1446 + }, + { + "epoch": 0.3918278987958328, + "grad_norm": 0.4637945890426636, + "learning_rate": 9.97171935697422e-05, + "loss": 1.3433, + "step": 1448 + }, + { + "epoch": 0.39236909755107563, + "grad_norm": 0.32690081000328064, + "learning_rate": 9.971384005175864e-05, + "loss": 1.3327, + "step": 1450 + }, + { + "epoch": 0.3929102963063185, + "grad_norm": 0.3049994111061096, + "learning_rate": 9.971046682484776e-05, + "loss": 1.3401, + "step": 1452 + }, + { + "epoch": 0.39345149506156135, + "grad_norm": 0.306095689535141, + "learning_rate": 9.970707389034688e-05, + "loss": 1.3205, + "step": 1454 + }, + { + "epoch": 0.39399269381680424, + "grad_norm": 0.3375592529773712, + "learning_rate": 9.970366124960111e-05, + "loss": 1.3243, + "step": 1456 + }, + { + "epoch": 0.3945338925720471, + "grad_norm": 0.30508387088775635, + "learning_rate": 9.970022890396338e-05, + "loss": 1.3342, + "step": 1458 + }, + { + "epoch": 0.39507509132728996, + "grad_norm": 0.2996918261051178, + "learning_rate": 9.969677685479444e-05, + "loss": 1.3457, + "step": 1460 + }, + { + "epoch": 0.3956162900825328, + "grad_norm": 0.29500269889831543, + "learning_rate": 9.969330510346286e-05, + "loss": 1.3306, + "step": 1462 + }, + { + "epoch": 0.3961574888377757, + "grad_norm": 0.28392598032951355, + "learning_rate": 9.9689813651345e-05, + "loss": 1.3347, + "step": 1464 + }, + { + "epoch": 0.3966986875930185, + "grad_norm": 0.2859434485435486, + "learning_rate": 9.968630249982503e-05, + "loss": 1.3342, + "step": 1466 + }, + { + "epoch": 0.3972398863482614, + "grad_norm": 0.3038876950740814, + "learning_rate": 9.968277165029494e-05, + "loss": 1.3248, + "step": 1468 + }, + { + "epoch": 0.39778108510350424, + "grad_norm": 0.3060581088066101, + "learning_rate": 9.967922110415454e-05, + "loss": 1.3403, + "step": 1470 + }, + { + "epoch": 0.39832228385874713, + "grad_norm": 0.30475133657455444, + "learning_rate": 9.96756508628114e-05, + "loss": 1.3338, + "step": 1472 + }, + { + "epoch": 0.39886348261398996, + "grad_norm": 0.33263343572616577, + "learning_rate": 9.967206092768095e-05, + "loss": 1.3209, + "step": 1474 + }, + { + "epoch": 0.39940468136923285, + "grad_norm": 0.2895435094833374, + "learning_rate": 9.966845130018645e-05, + "loss": 1.3352, + "step": 1476 + }, + { + "epoch": 0.39994588012447574, + "grad_norm": 0.27237775921821594, + "learning_rate": 9.966482198175886e-05, + "loss": 1.3239, + "step": 1478 + }, + { + "epoch": 0.40048707887971857, + "grad_norm": 0.2740168571472168, + "learning_rate": 9.966117297383707e-05, + "loss": 1.3371, + "step": 1480 + }, + { + "epoch": 0.40102827763496146, + "grad_norm": 0.30601269006729126, + "learning_rate": 9.965750427786768e-05, + "loss": 1.343, + "step": 1482 + }, + { + "epoch": 0.4015694763902043, + "grad_norm": 0.28768840432167053, + "learning_rate": 9.965381589530518e-05, + "loss": 1.3442, + "step": 1484 + }, + { + "epoch": 0.4021106751454472, + "grad_norm": 0.28244882822036743, + "learning_rate": 9.965010782761177e-05, + "loss": 1.3336, + "step": 1486 + }, + { + "epoch": 0.40265187390069, + "grad_norm": 0.2694818079471588, + "learning_rate": 9.964638007625754e-05, + "loss": 1.3448, + "step": 1488 + }, + { + "epoch": 0.4031930726559329, + "grad_norm": 0.29507288336753845, + "learning_rate": 9.964263264272033e-05, + "loss": 1.327, + "step": 1490 + }, + { + "epoch": 0.40373427141117574, + "grad_norm": 0.3036315143108368, + "learning_rate": 9.963886552848581e-05, + "loss": 1.3289, + "step": 1492 + }, + { + "epoch": 0.4042754701664186, + "grad_norm": 0.2737107574939728, + "learning_rate": 9.963507873504744e-05, + "loss": 1.3281, + "step": 1494 + }, + { + "epoch": 0.40481666892166146, + "grad_norm": 0.29833105206489563, + "learning_rate": 9.963127226390647e-05, + "loss": 1.3378, + "step": 1496 + }, + { + "epoch": 0.40535786767690435, + "grad_norm": 0.32203689217567444, + "learning_rate": 9.9627446116572e-05, + "loss": 1.3158, + "step": 1498 + }, + { + "epoch": 0.4058990664321472, + "grad_norm": 0.27837038040161133, + "learning_rate": 9.962360029456086e-05, + "loss": 1.3051, + "step": 1500 + } + ], + "logging_steps": 2, + "max_steps": 11088, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.235471745541734e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}