{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5422811527533486, "eval_steps": 500, "global_step": 5700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00027059937762143147, "grad_norm": 4.086390018463135, "learning_rate": 0.0, "loss": 3.2754, "step": 1 }, { "epoch": 0.0005411987552428629, "grad_norm": 3.758815288543701, "learning_rate": 9.017132551848513e-08, "loss": 3.2863, "step": 2 }, { "epoch": 0.0010823975104857259, "grad_norm": 3.8250608444213867, "learning_rate": 2.705139765554554e-07, "loss": 3.3425, "step": 4 }, { "epoch": 0.0016235962657285888, "grad_norm": 3.8092095851898193, "learning_rate": 4.5085662759242564e-07, "loss": 3.3165, "step": 6 }, { "epoch": 0.0021647950209714517, "grad_norm": 3.7621052265167236, "learning_rate": 6.311992786293959e-07, "loss": 3.3295, "step": 8 }, { "epoch": 0.002705993776214315, "grad_norm": 3.4136276245117188, "learning_rate": 8.115419296663661e-07, "loss": 3.3073, "step": 10 }, { "epoch": 0.0032471925314571776, "grad_norm": 2.855100393295288, "learning_rate": 9.918845807033363e-07, "loss": 3.3031, "step": 12 }, { "epoch": 0.0037883912867000408, "grad_norm": 2.491767406463623, "learning_rate": 1.1722272317403068e-06, "loss": 3.2943, "step": 14 }, { "epoch": 0.0043295900419429035, "grad_norm": 2.359778642654419, "learning_rate": 1.3525698827772768e-06, "loss": 3.2622, "step": 16 }, { "epoch": 0.004870788797185766, "grad_norm": 2.037504196166992, "learning_rate": 1.5329125338142473e-06, "loss": 3.239, "step": 18 }, { "epoch": 0.00541198755242863, "grad_norm": 2.8542497158050537, "learning_rate": 1.7132551848512173e-06, "loss": 3.2031, "step": 20 }, { "epoch": 0.0059531863076714925, "grad_norm": 2.297046661376953, "learning_rate": 1.8935978358881876e-06, "loss": 3.1721, "step": 22 }, { "epoch": 0.006494385062914355, "grad_norm": 2.2149112224578857, "learning_rate": 2.0739404869251576e-06, "loss": 3.121, "step": 24 }, { "epoch": 0.007035583818157218, "grad_norm": 1.8048591613769531, "learning_rate": 2.254283137962128e-06, "loss": 3.0857, "step": 26 }, { "epoch": 0.0075767825734000815, "grad_norm": 1.7466434240341187, "learning_rate": 2.4346257889990986e-06, "loss": 3.0489, "step": 28 }, { "epoch": 0.008117981328642944, "grad_norm": 2.1722524166107178, "learning_rate": 2.6149684400360686e-06, "loss": 3.0016, "step": 30 }, { "epoch": 0.008659180083885807, "grad_norm": 1.364578366279602, "learning_rate": 2.7953110910730386e-06, "loss": 2.9587, "step": 32 }, { "epoch": 0.00920037883912867, "grad_norm": 1.5823427438735962, "learning_rate": 2.9756537421100095e-06, "loss": 2.931, "step": 34 }, { "epoch": 0.009741577594371532, "grad_norm": 1.2367908954620361, "learning_rate": 3.1559963931469796e-06, "loss": 2.8953, "step": 36 }, { "epoch": 0.010282776349614395, "grad_norm": 1.0437366962432861, "learning_rate": 3.3363390441839496e-06, "loss": 2.8412, "step": 38 }, { "epoch": 0.01082397510485726, "grad_norm": 1.081803798675537, "learning_rate": 3.5166816952209197e-06, "loss": 2.7832, "step": 40 }, { "epoch": 0.011365173860100122, "grad_norm": 0.9715840220451355, "learning_rate": 3.69702434625789e-06, "loss": 2.7729, "step": 42 }, { "epoch": 0.011906372615342985, "grad_norm": 0.8603936433792114, "learning_rate": 3.877366997294861e-06, "loss": 2.6904, "step": 44 }, { "epoch": 0.012447571370585848, "grad_norm": 0.8236231803894043, "learning_rate": 4.057709648331831e-06, "loss": 2.6908, "step": 46 }, { "epoch": 0.01298877012582871, "grad_norm": 0.7681186199188232, "learning_rate": 4.2380522993688015e-06, "loss": 2.6212, "step": 48 }, { "epoch": 0.013529968881071573, "grad_norm": 0.8002827167510986, "learning_rate": 4.4183949504057716e-06, "loss": 2.6035, "step": 50 }, { "epoch": 0.014071167636314436, "grad_norm": 0.6757120490074158, "learning_rate": 4.598737601442742e-06, "loss": 2.595, "step": 52 }, { "epoch": 0.014612366391557299, "grad_norm": 0.6619369387626648, "learning_rate": 4.779080252479712e-06, "loss": 2.5522, "step": 54 }, { "epoch": 0.015153565146800163, "grad_norm": 0.6247105598449707, "learning_rate": 4.959422903516682e-06, "loss": 2.5079, "step": 56 }, { "epoch": 0.015694763902043024, "grad_norm": 0.6559263467788696, "learning_rate": 5.139765554553652e-06, "loss": 2.5009, "step": 58 }, { "epoch": 0.01623596265728589, "grad_norm": 0.6590877175331116, "learning_rate": 5.320108205590623e-06, "loss": 2.4648, "step": 60 }, { "epoch": 0.01677716141252875, "grad_norm": 0.6045516133308411, "learning_rate": 5.500450856627593e-06, "loss": 2.421, "step": 62 }, { "epoch": 0.017318360167771614, "grad_norm": 0.6533932089805603, "learning_rate": 5.680793507664563e-06, "loss": 2.3966, "step": 64 }, { "epoch": 0.01785955892301448, "grad_norm": 0.6478094458580017, "learning_rate": 5.861136158701533e-06, "loss": 2.3903, "step": 66 }, { "epoch": 0.01840075767825734, "grad_norm": 0.7349300980567932, "learning_rate": 6.041478809738504e-06, "loss": 2.3552, "step": 68 }, { "epoch": 0.018941956433500204, "grad_norm": 0.6454821825027466, "learning_rate": 6.221821460775474e-06, "loss": 2.3262, "step": 70 }, { "epoch": 0.019483155188743065, "grad_norm": 0.7321672439575195, "learning_rate": 6.402164111812444e-06, "loss": 2.3197, "step": 72 }, { "epoch": 0.02002435394398593, "grad_norm": 0.7664237022399902, "learning_rate": 6.582506762849414e-06, "loss": 2.2992, "step": 74 }, { "epoch": 0.02056555269922879, "grad_norm": 0.6843811869621277, "learning_rate": 6.762849413886384e-06, "loss": 2.2927, "step": 76 }, { "epoch": 0.021106751454471655, "grad_norm": 0.7199612259864807, "learning_rate": 6.9431920649233556e-06, "loss": 2.2525, "step": 78 }, { "epoch": 0.02164795020971452, "grad_norm": 0.778446614742279, "learning_rate": 7.123534715960326e-06, "loss": 2.2267, "step": 80 }, { "epoch": 0.02218914896495738, "grad_norm": 0.9287930727005005, "learning_rate": 7.303877366997296e-06, "loss": 2.2206, "step": 82 }, { "epoch": 0.022730347720200245, "grad_norm": 1.033782958984375, "learning_rate": 7.484220018034266e-06, "loss": 2.2063, "step": 84 }, { "epoch": 0.023271546475443106, "grad_norm": 1.0132615566253662, "learning_rate": 7.664562669071236e-06, "loss": 2.1677, "step": 86 }, { "epoch": 0.02381274523068597, "grad_norm": 0.9043529033660889, "learning_rate": 7.844905320108207e-06, "loss": 2.1696, "step": 88 }, { "epoch": 0.02435394398592883, "grad_norm": 0.6718290448188782, "learning_rate": 8.025247971145176e-06, "loss": 2.1492, "step": 90 }, { "epoch": 0.024895142741171696, "grad_norm": 0.9615944027900696, "learning_rate": 8.205590622182147e-06, "loss": 2.1452, "step": 92 }, { "epoch": 0.02543634149641456, "grad_norm": 0.9435996413230896, "learning_rate": 8.385933273219116e-06, "loss": 2.1098, "step": 94 }, { "epoch": 0.02597754025165742, "grad_norm": 0.7614261507987976, "learning_rate": 8.566275924256087e-06, "loss": 2.1286, "step": 96 }, { "epoch": 0.026518739006900285, "grad_norm": 0.9416339993476868, "learning_rate": 8.746618575293058e-06, "loss": 2.1092, "step": 98 }, { "epoch": 0.027059937762143146, "grad_norm": 0.9229443073272705, "learning_rate": 8.926961226330027e-06, "loss": 2.0932, "step": 100 }, { "epoch": 0.02760113651738601, "grad_norm": 0.7135593295097351, "learning_rate": 9.107303877366998e-06, "loss": 2.0699, "step": 102 }, { "epoch": 0.028142335272628872, "grad_norm": 1.0263723134994507, "learning_rate": 9.287646528403967e-06, "loss": 2.0445, "step": 104 }, { "epoch": 0.028683534027871736, "grad_norm": 1.0300300121307373, "learning_rate": 9.467989179440938e-06, "loss": 2.0463, "step": 106 }, { "epoch": 0.029224732783114597, "grad_norm": 0.8331286311149597, "learning_rate": 9.648331830477909e-06, "loss": 2.0381, "step": 108 }, { "epoch": 0.02976593153835746, "grad_norm": 0.7501435875892639, "learning_rate": 9.828674481514878e-06, "loss": 2.0411, "step": 110 }, { "epoch": 0.030307130293600326, "grad_norm": 0.6895191073417664, "learning_rate": 1.0009017132551849e-05, "loss": 2.0475, "step": 112 }, { "epoch": 0.030848329048843187, "grad_norm": 0.95854252576828, "learning_rate": 1.018935978358882e-05, "loss": 2.0071, "step": 114 }, { "epoch": 0.03138952780408605, "grad_norm": 1.1303929090499878, "learning_rate": 1.036970243462579e-05, "loss": 2.0008, "step": 116 }, { "epoch": 0.031930726559328916, "grad_norm": 0.7708876729011536, "learning_rate": 1.055004508566276e-05, "loss": 2.0061, "step": 118 }, { "epoch": 0.03247192531457178, "grad_norm": 0.9773860573768616, "learning_rate": 1.073038773669973e-05, "loss": 2.0096, "step": 120 }, { "epoch": 0.03301312406981464, "grad_norm": 1.118385910987854, "learning_rate": 1.09107303877367e-05, "loss": 1.9939, "step": 122 }, { "epoch": 0.0335543228250575, "grad_norm": 0.7215014696121216, "learning_rate": 1.109107303877367e-05, "loss": 1.9515, "step": 124 }, { "epoch": 0.03409552158030037, "grad_norm": 0.9696834683418274, "learning_rate": 1.1271415689810642e-05, "loss": 1.9639, "step": 126 }, { "epoch": 0.03463672033554323, "grad_norm": 0.945482611656189, "learning_rate": 1.1451758340847611e-05, "loss": 1.9397, "step": 128 }, { "epoch": 0.03517791909078609, "grad_norm": 0.7454535365104675, "learning_rate": 1.1632100991884582e-05, "loss": 1.9353, "step": 130 }, { "epoch": 0.03571911784602896, "grad_norm": 0.7824187278747559, "learning_rate": 1.1812443642921551e-05, "loss": 1.9227, "step": 132 }, { "epoch": 0.03626031660127182, "grad_norm": 0.7939879894256592, "learning_rate": 1.1992786293958522e-05, "loss": 1.9126, "step": 134 }, { "epoch": 0.03680151535651468, "grad_norm": 0.7776147723197937, "learning_rate": 1.2173128944995491e-05, "loss": 1.9002, "step": 136 }, { "epoch": 0.03734271411175754, "grad_norm": 0.6580236554145813, "learning_rate": 1.2353471596032462e-05, "loss": 1.9121, "step": 138 }, { "epoch": 0.03788391286700041, "grad_norm": 0.7200301289558411, "learning_rate": 1.2533814247069433e-05, "loss": 1.8885, "step": 140 }, { "epoch": 0.03842511162224327, "grad_norm": 0.7958497405052185, "learning_rate": 1.2714156898106402e-05, "loss": 1.9095, "step": 142 }, { "epoch": 0.03896631037748613, "grad_norm": 0.9120681881904602, "learning_rate": 1.2894499549143375e-05, "loss": 1.884, "step": 144 }, { "epoch": 0.039507509132729, "grad_norm": 0.8108247518539429, "learning_rate": 1.3074842200180342e-05, "loss": 1.8656, "step": 146 }, { "epoch": 0.04004870788797186, "grad_norm": 0.7010449171066284, "learning_rate": 1.3255184851217315e-05, "loss": 1.8635, "step": 148 }, { "epoch": 0.04058990664321472, "grad_norm": 0.8178524374961853, "learning_rate": 1.3435527502254284e-05, "loss": 1.8933, "step": 150 }, { "epoch": 0.04113110539845758, "grad_norm": 1.0447405576705933, "learning_rate": 1.3615870153291255e-05, "loss": 1.8523, "step": 152 }, { "epoch": 0.04167230415370045, "grad_norm": 0.8516271710395813, "learning_rate": 1.3796212804328224e-05, "loss": 1.8528, "step": 154 }, { "epoch": 0.04221350290894331, "grad_norm": 0.8437328934669495, "learning_rate": 1.3976555455365195e-05, "loss": 1.861, "step": 156 }, { "epoch": 0.04275470166418617, "grad_norm": 0.851265549659729, "learning_rate": 1.4156898106402164e-05, "loss": 1.8315, "step": 158 }, { "epoch": 0.04329590041942904, "grad_norm": 0.7337156534194946, "learning_rate": 1.4337240757439135e-05, "loss": 1.8354, "step": 160 }, { "epoch": 0.0438370991746719, "grad_norm": 0.9754143357276917, "learning_rate": 1.4517583408476104e-05, "loss": 1.8252, "step": 162 }, { "epoch": 0.04437829792991476, "grad_norm": 0.6172115802764893, "learning_rate": 1.4697926059513075e-05, "loss": 1.8094, "step": 164 }, { "epoch": 0.04491949668515762, "grad_norm": 0.8304158449172974, "learning_rate": 1.4878268710550044e-05, "loss": 1.8078, "step": 166 }, { "epoch": 0.04546069544040049, "grad_norm": 0.6388853788375854, "learning_rate": 1.5058611361587017e-05, "loss": 1.8106, "step": 168 }, { "epoch": 0.04600189419564335, "grad_norm": 0.743231475353241, "learning_rate": 1.5238954012623984e-05, "loss": 1.8144, "step": 170 }, { "epoch": 0.04654309295088621, "grad_norm": 0.6442289352416992, "learning_rate": 1.5419296663660955e-05, "loss": 1.7831, "step": 172 }, { "epoch": 0.04708429170612908, "grad_norm": 0.6877187490463257, "learning_rate": 1.559963931469793e-05, "loss": 1.8043, "step": 174 }, { "epoch": 0.04762549046137194, "grad_norm": 0.9389640688896179, "learning_rate": 1.5779981965734897e-05, "loss": 1.7869, "step": 176 }, { "epoch": 0.0481666892166148, "grad_norm": 1.0456589460372925, "learning_rate": 1.5960324616771868e-05, "loss": 1.7681, "step": 178 }, { "epoch": 0.04870788797185766, "grad_norm": 0.9617791175842285, "learning_rate": 1.614066726780884e-05, "loss": 1.7668, "step": 180 }, { "epoch": 0.04924908672710053, "grad_norm": 0.9334360361099243, "learning_rate": 1.632100991884581e-05, "loss": 1.7893, "step": 182 }, { "epoch": 0.04979028548234339, "grad_norm": 0.8952531814575195, "learning_rate": 1.6501352569882777e-05, "loss": 1.7758, "step": 184 }, { "epoch": 0.05033148423758625, "grad_norm": 0.8544924855232239, "learning_rate": 1.6681695220919748e-05, "loss": 1.793, "step": 186 }, { "epoch": 0.05087268299282912, "grad_norm": 0.7782765030860901, "learning_rate": 1.686203787195672e-05, "loss": 1.768, "step": 188 }, { "epoch": 0.05141388174807198, "grad_norm": 0.7119695544242859, "learning_rate": 1.704238052299369e-05, "loss": 1.7685, "step": 190 }, { "epoch": 0.05195508050331484, "grad_norm": 0.9119647145271301, "learning_rate": 1.7222723174030657e-05, "loss": 1.7706, "step": 192 }, { "epoch": 0.0524962792585577, "grad_norm": 0.6414957642555237, "learning_rate": 1.7403065825067628e-05, "loss": 1.7626, "step": 194 }, { "epoch": 0.05303747801380057, "grad_norm": 0.8069677352905273, "learning_rate": 1.75834084761046e-05, "loss": 1.7423, "step": 196 }, { "epoch": 0.05357867676904343, "grad_norm": 0.6549937725067139, "learning_rate": 1.776375112714157e-05, "loss": 1.7428, "step": 198 }, { "epoch": 0.05411987552428629, "grad_norm": 0.8064024448394775, "learning_rate": 1.7944093778178538e-05, "loss": 1.7448, "step": 200 }, { "epoch": 0.054661074279529154, "grad_norm": 0.7182701826095581, "learning_rate": 1.8124436429215512e-05, "loss": 1.7248, "step": 202 }, { "epoch": 0.05520227303477202, "grad_norm": 0.6997919678688049, "learning_rate": 1.830477908025248e-05, "loss": 1.7281, "step": 204 }, { "epoch": 0.05574347179001488, "grad_norm": 0.7071277499198914, "learning_rate": 1.848512173128945e-05, "loss": 1.714, "step": 206 }, { "epoch": 0.056284670545257744, "grad_norm": 0.6344273090362549, "learning_rate": 1.866546438232642e-05, "loss": 1.7463, "step": 208 }, { "epoch": 0.05682586930050061, "grad_norm": 0.7192733883857727, "learning_rate": 1.8845807033363392e-05, "loss": 1.737, "step": 210 }, { "epoch": 0.05736706805574347, "grad_norm": 0.7418521642684937, "learning_rate": 1.9026149684400363e-05, "loss": 1.7197, "step": 212 }, { "epoch": 0.057908266810986334, "grad_norm": 0.875845730304718, "learning_rate": 1.920649233543733e-05, "loss": 1.6968, "step": 214 }, { "epoch": 0.058449465566229195, "grad_norm": 0.7394037842750549, "learning_rate": 1.9386834986474305e-05, "loss": 1.7051, "step": 216 }, { "epoch": 0.05899066432147206, "grad_norm": 0.6689572930335999, "learning_rate": 1.9567177637511272e-05, "loss": 1.7152, "step": 218 }, { "epoch": 0.05953186307671492, "grad_norm": 0.7955539226531982, "learning_rate": 1.9747520288548243e-05, "loss": 1.7136, "step": 220 }, { "epoch": 0.060073061831957784, "grad_norm": 0.7005388140678406, "learning_rate": 1.9927862939585214e-05, "loss": 1.7152, "step": 222 }, { "epoch": 0.06061426058720065, "grad_norm": 0.6205731630325317, "learning_rate": 2.0108205590622185e-05, "loss": 1.6901, "step": 224 }, { "epoch": 0.06115545934244351, "grad_norm": 0.7079929709434509, "learning_rate": 2.0288548241659152e-05, "loss": 1.6905, "step": 226 }, { "epoch": 0.061696658097686374, "grad_norm": 0.6871302723884583, "learning_rate": 2.0468890892696123e-05, "loss": 1.6867, "step": 228 }, { "epoch": 0.062237856852929235, "grad_norm": 0.7172162532806396, "learning_rate": 2.0649233543733094e-05, "loss": 1.685, "step": 230 }, { "epoch": 0.0627790556081721, "grad_norm": 0.6729004979133606, "learning_rate": 2.0829576194770065e-05, "loss": 1.6961, "step": 232 }, { "epoch": 0.06332025436341496, "grad_norm": 0.7335099577903748, "learning_rate": 2.1009918845807033e-05, "loss": 1.6797, "step": 234 }, { "epoch": 0.06386145311865783, "grad_norm": 0.6398060321807861, "learning_rate": 2.1190261496844003e-05, "loss": 1.7037, "step": 236 }, { "epoch": 0.0644026518739007, "grad_norm": 0.7026365399360657, "learning_rate": 2.1370604147880974e-05, "loss": 1.6698, "step": 238 }, { "epoch": 0.06494385062914355, "grad_norm": 0.7972332239151001, "learning_rate": 2.1550946798917945e-05, "loss": 1.6866, "step": 240 }, { "epoch": 0.06548504938438642, "grad_norm": 0.7363021969795227, "learning_rate": 2.1731289449954913e-05, "loss": 1.6879, "step": 242 }, { "epoch": 0.06602624813962928, "grad_norm": 0.7071017026901245, "learning_rate": 2.1911632100991887e-05, "loss": 1.6898, "step": 244 }, { "epoch": 0.06656744689487214, "grad_norm": 0.8030880093574524, "learning_rate": 2.2091974752028858e-05, "loss": 1.6734, "step": 246 }, { "epoch": 0.067108645650115, "grad_norm": 0.7429569363594055, "learning_rate": 2.2272317403065825e-05, "loss": 1.6722, "step": 248 }, { "epoch": 0.06764984440535787, "grad_norm": 0.6807804107666016, "learning_rate": 2.2452660054102796e-05, "loss": 1.6697, "step": 250 }, { "epoch": 0.06819104316060073, "grad_norm": 0.6632562875747681, "learning_rate": 2.2633002705139767e-05, "loss": 1.6453, "step": 252 }, { "epoch": 0.0687322419158436, "grad_norm": 0.6661680340766907, "learning_rate": 2.2813345356176738e-05, "loss": 1.6701, "step": 254 }, { "epoch": 0.06927344067108646, "grad_norm": 0.6747105121612549, "learning_rate": 2.2993688007213706e-05, "loss": 1.6729, "step": 256 }, { "epoch": 0.06981463942632932, "grad_norm": 0.7698473334312439, "learning_rate": 2.317403065825068e-05, "loss": 1.6528, "step": 258 }, { "epoch": 0.07035583818157218, "grad_norm": 0.6111325621604919, "learning_rate": 2.3354373309287647e-05, "loss": 1.6412, "step": 260 }, { "epoch": 0.07089703693681504, "grad_norm": 0.7405019998550415, "learning_rate": 2.3534715960324618e-05, "loss": 1.6564, "step": 262 }, { "epoch": 0.07143823569205791, "grad_norm": 0.6702501773834229, "learning_rate": 2.371505861136159e-05, "loss": 1.654, "step": 264 }, { "epoch": 0.07197943444730077, "grad_norm": 0.7076373100280762, "learning_rate": 2.389540126239856e-05, "loss": 1.6301, "step": 266 }, { "epoch": 0.07252063320254364, "grad_norm": 0.7239627242088318, "learning_rate": 2.4075743913435528e-05, "loss": 1.6575, "step": 268 }, { "epoch": 0.0730618319577865, "grad_norm": 0.753480076789856, "learning_rate": 2.42560865644725e-05, "loss": 1.6603, "step": 270 }, { "epoch": 0.07360303071302936, "grad_norm": 0.7261641025543213, "learning_rate": 2.443642921550947e-05, "loss": 1.6449, "step": 272 }, { "epoch": 0.07414422946827222, "grad_norm": 0.6315119862556458, "learning_rate": 2.461677186654644e-05, "loss": 1.6538, "step": 274 }, { "epoch": 0.07468542822351508, "grad_norm": 0.5698412656784058, "learning_rate": 2.4797114517583408e-05, "loss": 1.6663, "step": 276 }, { "epoch": 0.07522662697875795, "grad_norm": 0.5968983173370361, "learning_rate": 2.497745716862038e-05, "loss": 1.643, "step": 278 }, { "epoch": 0.07576782573400082, "grad_norm": 0.561126172542572, "learning_rate": 2.5157799819657353e-05, "loss": 1.6301, "step": 280 }, { "epoch": 0.07630902448924368, "grad_norm": 0.7290865778923035, "learning_rate": 2.533814247069432e-05, "loss": 1.6412, "step": 282 }, { "epoch": 0.07685022324448654, "grad_norm": 0.7629122138023376, "learning_rate": 2.5518485121731288e-05, "loss": 1.6335, "step": 284 }, { "epoch": 0.0773914219997294, "grad_norm": 0.5383496284484863, "learning_rate": 2.5698827772768262e-05, "loss": 1.6226, "step": 286 }, { "epoch": 0.07793262075497226, "grad_norm": 0.7778373956680298, "learning_rate": 2.5879170423805233e-05, "loss": 1.6333, "step": 288 }, { "epoch": 0.07847381951021512, "grad_norm": 0.6851366758346558, "learning_rate": 2.60595130748422e-05, "loss": 1.6251, "step": 290 }, { "epoch": 0.079015018265458, "grad_norm": 0.5947225689888, "learning_rate": 2.623985572587917e-05, "loss": 1.6298, "step": 292 }, { "epoch": 0.07955621702070086, "grad_norm": 0.9742544889450073, "learning_rate": 2.6420198376916146e-05, "loss": 1.6252, "step": 294 }, { "epoch": 0.08009741577594372, "grad_norm": 1.2064323425292969, "learning_rate": 2.6600541027953113e-05, "loss": 1.6152, "step": 296 }, { "epoch": 0.08063861453118658, "grad_norm": 1.0506716966629028, "learning_rate": 2.678088367899008e-05, "loss": 1.6351, "step": 298 }, { "epoch": 0.08117981328642944, "grad_norm": 1.2992738485336304, "learning_rate": 2.696122633002705e-05, "loss": 1.6193, "step": 300 }, { "epoch": 0.0817210120416723, "grad_norm": 1.0616599321365356, "learning_rate": 2.7141568981064026e-05, "loss": 1.6135, "step": 302 }, { "epoch": 0.08226221079691516, "grad_norm": 1.037997841835022, "learning_rate": 2.7321911632100993e-05, "loss": 1.6344, "step": 304 }, { "epoch": 0.08280340955215804, "grad_norm": 0.8937569856643677, "learning_rate": 2.7502254283137964e-05, "loss": 1.6077, "step": 306 }, { "epoch": 0.0833446083074009, "grad_norm": 1.1334234476089478, "learning_rate": 2.7682596934174932e-05, "loss": 1.6193, "step": 308 }, { "epoch": 0.08388580706264376, "grad_norm": 0.8336219191551208, "learning_rate": 2.7862939585211906e-05, "loss": 1.5948, "step": 310 }, { "epoch": 0.08442700581788662, "grad_norm": 1.1825398206710815, "learning_rate": 2.8043282236248874e-05, "loss": 1.6239, "step": 312 }, { "epoch": 0.08496820457312948, "grad_norm": 0.7945433259010315, "learning_rate": 2.8223624887285844e-05, "loss": 1.6119, "step": 314 }, { "epoch": 0.08550940332837234, "grad_norm": 0.6971009969711304, "learning_rate": 2.8403967538322812e-05, "loss": 1.5822, "step": 316 }, { "epoch": 0.0860506020836152, "grad_norm": 0.6050766706466675, "learning_rate": 2.8584310189359786e-05, "loss": 1.6161, "step": 318 }, { "epoch": 0.08659180083885808, "grad_norm": 0.6123189330101013, "learning_rate": 2.8764652840396754e-05, "loss": 1.5941, "step": 320 }, { "epoch": 0.08713299959410094, "grad_norm": 0.5471253395080566, "learning_rate": 2.8944995491433725e-05, "loss": 1.603, "step": 322 }, { "epoch": 0.0876741983493438, "grad_norm": 0.5793882608413696, "learning_rate": 2.91253381424707e-05, "loss": 1.6076, "step": 324 }, { "epoch": 0.08821539710458666, "grad_norm": 0.5409413576126099, "learning_rate": 2.9305680793507666e-05, "loss": 1.5825, "step": 326 }, { "epoch": 0.08875659585982952, "grad_norm": 6.757148265838623, "learning_rate": 2.9486023444544637e-05, "loss": 1.5942, "step": 328 }, { "epoch": 0.08929779461507238, "grad_norm": 1.3357856273651123, "learning_rate": 2.9666366095581605e-05, "loss": 1.642, "step": 330 }, { "epoch": 0.08983899337031524, "grad_norm": 0.8245829939842224, "learning_rate": 2.984670874661858e-05, "loss": 1.6062, "step": 332 }, { "epoch": 0.09038019212555812, "grad_norm": 0.8888993263244629, "learning_rate": 3.0027051397655547e-05, "loss": 1.5952, "step": 334 }, { "epoch": 0.09092139088080098, "grad_norm": 0.8923915028572083, "learning_rate": 3.0207394048692517e-05, "loss": 1.5977, "step": 336 }, { "epoch": 0.09146258963604384, "grad_norm": 0.7443459033966064, "learning_rate": 3.0387736699729485e-05, "loss": 1.5738, "step": 338 }, { "epoch": 0.0920037883912867, "grad_norm": 0.7297430038452148, "learning_rate": 3.056807935076646e-05, "loss": 1.5907, "step": 340 }, { "epoch": 0.09254498714652956, "grad_norm": 0.6882812976837158, "learning_rate": 3.074842200180343e-05, "loss": 1.5767, "step": 342 }, { "epoch": 0.09308618590177242, "grad_norm": 0.6150392889976501, "learning_rate": 3.0928764652840394e-05, "loss": 1.5747, "step": 344 }, { "epoch": 0.09362738465701528, "grad_norm": 0.6230599284172058, "learning_rate": 3.110910730387737e-05, "loss": 1.583, "step": 346 }, { "epoch": 0.09416858341225816, "grad_norm": 0.6081874966621399, "learning_rate": 3.128944995491434e-05, "loss": 1.5875, "step": 348 }, { "epoch": 0.09470978216750102, "grad_norm": 0.5467821955680847, "learning_rate": 3.146979260595131e-05, "loss": 1.575, "step": 350 }, { "epoch": 0.09525098092274388, "grad_norm": 0.5629361271858215, "learning_rate": 3.165013525698828e-05, "loss": 1.5828, "step": 352 }, { "epoch": 0.09579217967798674, "grad_norm": 0.5995283126831055, "learning_rate": 3.1830477908025245e-05, "loss": 1.5872, "step": 354 }, { "epoch": 0.0963333784332296, "grad_norm": 0.556450366973877, "learning_rate": 3.201082055906222e-05, "loss": 1.553, "step": 356 }, { "epoch": 0.09687457718847246, "grad_norm": 0.6498537063598633, "learning_rate": 3.219116321009919e-05, "loss": 1.5667, "step": 358 }, { "epoch": 0.09741577594371532, "grad_norm": 0.5891172885894775, "learning_rate": 3.237150586113616e-05, "loss": 1.5818, "step": 360 }, { "epoch": 0.0979569746989582, "grad_norm": 0.6487797498703003, "learning_rate": 3.2551848512173136e-05, "loss": 1.5582, "step": 362 }, { "epoch": 0.09849817345420106, "grad_norm": 0.5860658884048462, "learning_rate": 3.27321911632101e-05, "loss": 1.5725, "step": 364 }, { "epoch": 0.09903937220944392, "grad_norm": 0.5619581937789917, "learning_rate": 3.291253381424707e-05, "loss": 1.5779, "step": 366 }, { "epoch": 0.09958057096468678, "grad_norm": 0.7147429585456848, "learning_rate": 3.309287646528404e-05, "loss": 1.5766, "step": 368 }, { "epoch": 0.10012176971992964, "grad_norm": 0.5840562582015991, "learning_rate": 3.327321911632101e-05, "loss": 1.5609, "step": 370 }, { "epoch": 0.1006629684751725, "grad_norm": 0.6277860403060913, "learning_rate": 3.345356176735798e-05, "loss": 1.5645, "step": 372 }, { "epoch": 0.10120416723041536, "grad_norm": 0.6395567655563354, "learning_rate": 3.3633904418394954e-05, "loss": 1.545, "step": 374 }, { "epoch": 0.10174536598565824, "grad_norm": 0.6651553511619568, "learning_rate": 3.381424706943192e-05, "loss": 1.5643, "step": 376 }, { "epoch": 0.1022865647409011, "grad_norm": 0.6691033244132996, "learning_rate": 3.3994589720468896e-05, "loss": 1.5705, "step": 378 }, { "epoch": 0.10282776349614396, "grad_norm": 0.5426511764526367, "learning_rate": 3.4174932371505863e-05, "loss": 1.536, "step": 380 }, { "epoch": 0.10336896225138682, "grad_norm": 0.6677694916725159, "learning_rate": 3.435527502254283e-05, "loss": 1.5664, "step": 382 }, { "epoch": 0.10391016100662968, "grad_norm": 0.5283762216567993, "learning_rate": 3.45356176735798e-05, "loss": 1.5474, "step": 384 }, { "epoch": 0.10445135976187254, "grad_norm": 0.652812659740448, "learning_rate": 3.471596032461677e-05, "loss": 1.5509, "step": 386 }, { "epoch": 0.1049925585171154, "grad_norm": 0.8639987111091614, "learning_rate": 3.489630297565375e-05, "loss": 1.5563, "step": 388 }, { "epoch": 0.10553375727235827, "grad_norm": 0.7726946473121643, "learning_rate": 3.5076645626690715e-05, "loss": 1.5682, "step": 390 }, { "epoch": 0.10607495602760114, "grad_norm": 0.6511155962944031, "learning_rate": 3.525698827772768e-05, "loss": 1.5571, "step": 392 }, { "epoch": 0.106616154782844, "grad_norm": 0.6578395962715149, "learning_rate": 3.5437330928764656e-05, "loss": 1.5452, "step": 394 }, { "epoch": 0.10715735353808686, "grad_norm": 0.642919659614563, "learning_rate": 3.5617673579801624e-05, "loss": 1.5508, "step": 396 }, { "epoch": 0.10769855229332972, "grad_norm": 0.5190348029136658, "learning_rate": 3.579801623083859e-05, "loss": 1.5432, "step": 398 }, { "epoch": 0.10823975104857259, "grad_norm": 0.48932549357414246, "learning_rate": 3.5978358881875566e-05, "loss": 1.5544, "step": 400 }, { "epoch": 0.10878094980381545, "grad_norm": 0.5018340945243835, "learning_rate": 3.615870153291254e-05, "loss": 1.5322, "step": 402 }, { "epoch": 0.10932214855905831, "grad_norm": 0.5701499581336975, "learning_rate": 3.633904418394951e-05, "loss": 1.5288, "step": 404 }, { "epoch": 0.10986334731430118, "grad_norm": 0.6049205660820007, "learning_rate": 3.6519386834986475e-05, "loss": 1.5627, "step": 406 }, { "epoch": 0.11040454606954404, "grad_norm": 0.5781517028808594, "learning_rate": 3.669972948602345e-05, "loss": 1.542, "step": 408 }, { "epoch": 0.1109457448247869, "grad_norm": 0.5594660043716431, "learning_rate": 3.688007213706042e-05, "loss": 1.5461, "step": 410 }, { "epoch": 0.11148694358002977, "grad_norm": 0.5319619178771973, "learning_rate": 3.7060414788097384e-05, "loss": 1.5668, "step": 412 }, { "epoch": 0.11202814233527263, "grad_norm": 0.5311123728752136, "learning_rate": 3.724075743913435e-05, "loss": 1.528, "step": 414 }, { "epoch": 0.11256934109051549, "grad_norm": 0.5555101633071899, "learning_rate": 3.7421100090171326e-05, "loss": 1.5392, "step": 416 }, { "epoch": 0.11311053984575835, "grad_norm": 0.5486223101615906, "learning_rate": 3.76014427412083e-05, "loss": 1.5337, "step": 418 }, { "epoch": 0.11365173860100122, "grad_norm": 0.5156669020652771, "learning_rate": 3.778178539224527e-05, "loss": 1.5105, "step": 420 }, { "epoch": 0.11419293735624408, "grad_norm": 0.49596554040908813, "learning_rate": 3.7962128043282235e-05, "loss": 1.515, "step": 422 }, { "epoch": 0.11473413611148695, "grad_norm": 0.641333281993866, "learning_rate": 3.814247069431921e-05, "loss": 1.5328, "step": 424 }, { "epoch": 0.1152753348667298, "grad_norm": 0.6106113195419312, "learning_rate": 3.832281334535618e-05, "loss": 1.5189, "step": 426 }, { "epoch": 0.11581653362197267, "grad_norm": 0.5619134306907654, "learning_rate": 3.8503155996393145e-05, "loss": 1.5295, "step": 428 }, { "epoch": 0.11635773237721553, "grad_norm": 0.5396978259086609, "learning_rate": 3.868349864743012e-05, "loss": 1.5173, "step": 430 }, { "epoch": 0.11689893113245839, "grad_norm": 0.5466894507408142, "learning_rate": 3.886384129846709e-05, "loss": 1.5191, "step": 432 }, { "epoch": 0.11744012988770126, "grad_norm": 0.5601218342781067, "learning_rate": 3.904418394950406e-05, "loss": 1.5285, "step": 434 }, { "epoch": 0.11798132864294412, "grad_norm": 0.6620492935180664, "learning_rate": 3.922452660054103e-05, "loss": 1.4946, "step": 436 }, { "epoch": 0.11852252739818699, "grad_norm": 0.49140048027038574, "learning_rate": 3.9404869251578e-05, "loss": 1.512, "step": 438 }, { "epoch": 0.11906372615342985, "grad_norm": 0.5824118256568909, "learning_rate": 3.958521190261497e-05, "loss": 1.5244, "step": 440 }, { "epoch": 0.11960492490867271, "grad_norm": 0.4967150092124939, "learning_rate": 3.976555455365194e-05, "loss": 1.5273, "step": 442 }, { "epoch": 0.12014612366391557, "grad_norm": 0.5089767575263977, "learning_rate": 3.994589720468891e-05, "loss": 1.5119, "step": 444 }, { "epoch": 0.12068732241915843, "grad_norm": 0.5404312014579773, "learning_rate": 4.0126239855725886e-05, "loss": 1.5072, "step": 446 }, { "epoch": 0.1212285211744013, "grad_norm": 0.5239550471305847, "learning_rate": 4.0306582506762853e-05, "loss": 1.5336, "step": 448 }, { "epoch": 0.12176971992964417, "grad_norm": 0.4974781274795532, "learning_rate": 4.048692515779982e-05, "loss": 1.5225, "step": 450 }, { "epoch": 0.12231091868488703, "grad_norm": 0.5363791584968567, "learning_rate": 4.066726780883679e-05, "loss": 1.5176, "step": 452 }, { "epoch": 0.12285211744012989, "grad_norm": 0.5095157027244568, "learning_rate": 4.084761045987376e-05, "loss": 1.4936, "step": 454 }, { "epoch": 0.12339331619537275, "grad_norm": 0.4920356869697571, "learning_rate": 4.102795311091073e-05, "loss": 1.5269, "step": 456 }, { "epoch": 0.12393451495061561, "grad_norm": 0.4940793514251709, "learning_rate": 4.1208295761947705e-05, "loss": 1.5072, "step": 458 }, { "epoch": 0.12447571370585847, "grad_norm": 0.4805227220058441, "learning_rate": 4.138863841298467e-05, "loss": 1.4987, "step": 460 }, { "epoch": 0.12501691246110133, "grad_norm": 0.49683934450149536, "learning_rate": 4.1568981064021646e-05, "loss": 1.5008, "step": 462 }, { "epoch": 0.1255581112163442, "grad_norm": 0.5283801555633545, "learning_rate": 4.1749323715058614e-05, "loss": 1.5177, "step": 464 }, { "epoch": 0.12609930997158705, "grad_norm": 0.5395119190216064, "learning_rate": 4.192966636609558e-05, "loss": 1.5106, "step": 466 }, { "epoch": 0.12664050872682991, "grad_norm": 0.5403693914413452, "learning_rate": 4.211000901713255e-05, "loss": 1.4854, "step": 468 }, { "epoch": 0.1271817074820728, "grad_norm": 0.4690951406955719, "learning_rate": 4.229035166816952e-05, "loss": 1.5079, "step": 470 }, { "epoch": 0.12772290623731566, "grad_norm": 0.5077293515205383, "learning_rate": 4.24706943192065e-05, "loss": 1.4953, "step": 472 }, { "epoch": 0.12826410499255853, "grad_norm": 0.440019816160202, "learning_rate": 4.2651036970243465e-05, "loss": 1.4864, "step": 474 }, { "epoch": 0.1288053037478014, "grad_norm": 0.48672759532928467, "learning_rate": 4.283137962128044e-05, "loss": 1.5205, "step": 476 }, { "epoch": 0.12934650250304425, "grad_norm": 0.4732811450958252, "learning_rate": 4.301172227231741e-05, "loss": 1.4998, "step": 478 }, { "epoch": 0.1298877012582871, "grad_norm": 0.46713048219680786, "learning_rate": 4.3192064923354374e-05, "loss": 1.4893, "step": 480 }, { "epoch": 0.13042890001352997, "grad_norm": 0.502356231212616, "learning_rate": 4.337240757439134e-05, "loss": 1.5125, "step": 482 }, { "epoch": 0.13097009876877283, "grad_norm": 0.45067864656448364, "learning_rate": 4.3552750225428316e-05, "loss": 1.4978, "step": 484 }, { "epoch": 0.1315112975240157, "grad_norm": 0.46964120864868164, "learning_rate": 4.373309287646529e-05, "loss": 1.5006, "step": 486 }, { "epoch": 0.13205249627925855, "grad_norm": 0.47723180055618286, "learning_rate": 4.391343552750226e-05, "loss": 1.513, "step": 488 }, { "epoch": 0.1325936950345014, "grad_norm": 0.5100542306900024, "learning_rate": 4.4093778178539225e-05, "loss": 1.5279, "step": 490 }, { "epoch": 0.13313489378974427, "grad_norm": 0.5344257354736328, "learning_rate": 4.42741208295762e-05, "loss": 1.5193, "step": 492 }, { "epoch": 0.13367609254498714, "grad_norm": 0.5867893695831299, "learning_rate": 4.445446348061317e-05, "loss": 1.512, "step": 494 }, { "epoch": 0.13421729130023, "grad_norm": 0.7811394929885864, "learning_rate": 4.4634806131650134e-05, "loss": 1.5038, "step": 496 }, { "epoch": 0.13475849005547288, "grad_norm": 0.8505339622497559, "learning_rate": 4.48151487826871e-05, "loss": 1.5169, "step": 498 }, { "epoch": 0.13529968881071575, "grad_norm": 0.6337641477584839, "learning_rate": 4.4995491433724076e-05, "loss": 1.4951, "step": 500 }, { "epoch": 0.1358408875659586, "grad_norm": 0.7979961633682251, "learning_rate": 4.517583408476105e-05, "loss": 1.5031, "step": 502 }, { "epoch": 0.13638208632120147, "grad_norm": 0.6946894526481628, "learning_rate": 4.535617673579802e-05, "loss": 1.501, "step": 504 }, { "epoch": 0.13692328507644433, "grad_norm": 0.6830259561538696, "learning_rate": 4.5536519386834986e-05, "loss": 1.4896, "step": 506 }, { "epoch": 0.1374644838316872, "grad_norm": 0.5908662676811218, "learning_rate": 4.571686203787196e-05, "loss": 1.4992, "step": 508 }, { "epoch": 0.13800568258693005, "grad_norm": 0.7655865550041199, "learning_rate": 4.589720468890893e-05, "loss": 1.4911, "step": 510 }, { "epoch": 0.1385468813421729, "grad_norm": 0.5924785733222961, "learning_rate": 4.6077547339945895e-05, "loss": 1.4719, "step": 512 }, { "epoch": 0.13908808009741577, "grad_norm": 0.6654263138771057, "learning_rate": 4.625788999098287e-05, "loss": 1.5109, "step": 514 }, { "epoch": 0.13962927885265863, "grad_norm": 0.5296297073364258, "learning_rate": 4.6438232642019843e-05, "loss": 1.4934, "step": 516 }, { "epoch": 0.1401704776079015, "grad_norm": 0.5698690414428711, "learning_rate": 4.661857529305681e-05, "loss": 1.4954, "step": 518 }, { "epoch": 0.14071167636314436, "grad_norm": 0.5790325403213501, "learning_rate": 4.679891794409378e-05, "loss": 1.4673, "step": 520 }, { "epoch": 0.14125287511838722, "grad_norm": 0.551480770111084, "learning_rate": 4.697926059513075e-05, "loss": 1.476, "step": 522 }, { "epoch": 0.14179407387363008, "grad_norm": 0.5201780796051025, "learning_rate": 4.715960324616772e-05, "loss": 1.4701, "step": 524 }, { "epoch": 0.14233527262887297, "grad_norm": 0.46442562341690063, "learning_rate": 4.733994589720469e-05, "loss": 1.4831, "step": 526 }, { "epoch": 0.14287647138411583, "grad_norm": 0.5558522939682007, "learning_rate": 4.752028854824166e-05, "loss": 1.4729, "step": 528 }, { "epoch": 0.1434176701393587, "grad_norm": 0.48511791229248047, "learning_rate": 4.7700631199278636e-05, "loss": 1.4742, "step": 530 }, { "epoch": 0.14395886889460155, "grad_norm": 0.5244829058647156, "learning_rate": 4.7880973850315604e-05, "loss": 1.4928, "step": 532 }, { "epoch": 0.1445000676498444, "grad_norm": 0.48878946900367737, "learning_rate": 4.806131650135257e-05, "loss": 1.4921, "step": 534 }, { "epoch": 0.14504126640508727, "grad_norm": 0.5348760485649109, "learning_rate": 4.824165915238954e-05, "loss": 1.4917, "step": 536 }, { "epoch": 0.14558246516033013, "grad_norm": 0.5444923639297485, "learning_rate": 4.842200180342651e-05, "loss": 1.4546, "step": 538 }, { "epoch": 0.146123663915573, "grad_norm": 0.494761198759079, "learning_rate": 4.860234445446348e-05, "loss": 1.4751, "step": 540 }, { "epoch": 0.14666486267081585, "grad_norm": 0.4921441674232483, "learning_rate": 4.8782687105500455e-05, "loss": 1.4767, "step": 542 }, { "epoch": 0.14720606142605872, "grad_norm": 0.48382577300071716, "learning_rate": 4.896302975653742e-05, "loss": 1.485, "step": 544 }, { "epoch": 0.14774726018130158, "grad_norm": 0.4616708755493164, "learning_rate": 4.9143372407574397e-05, "loss": 1.4732, "step": 546 }, { "epoch": 0.14828845893654444, "grad_norm": 0.5030043125152588, "learning_rate": 4.9323715058611364e-05, "loss": 1.4799, "step": 548 }, { "epoch": 0.1488296576917873, "grad_norm": 0.467230886220932, "learning_rate": 4.950405770964833e-05, "loss": 1.4594, "step": 550 }, { "epoch": 0.14937085644703016, "grad_norm": 0.42864304780960083, "learning_rate": 4.9684400360685306e-05, "loss": 1.4748, "step": 552 }, { "epoch": 0.14991205520227305, "grad_norm": 0.43733683228492737, "learning_rate": 4.986474301172227e-05, "loss": 1.462, "step": 554 }, { "epoch": 0.1504532539575159, "grad_norm": 0.45550286769866943, "learning_rate": 5.004508566275925e-05, "loss": 1.475, "step": 556 }, { "epoch": 0.15099445271275877, "grad_norm": 0.44999995827674866, "learning_rate": 5.022542831379622e-05, "loss": 1.4794, "step": 558 }, { "epoch": 0.15153565146800163, "grad_norm": 0.5035279989242554, "learning_rate": 5.040577096483319e-05, "loss": 1.471, "step": 560 }, { "epoch": 0.1520768502232445, "grad_norm": 0.44605591893196106, "learning_rate": 5.058611361587016e-05, "loss": 1.4461, "step": 562 }, { "epoch": 0.15261804897848735, "grad_norm": 0.5482723712921143, "learning_rate": 5.0766456266907124e-05, "loss": 1.4597, "step": 564 }, { "epoch": 0.1531592477337302, "grad_norm": 0.5323627591133118, "learning_rate": 5.094679891794409e-05, "loss": 1.4743, "step": 566 }, { "epoch": 0.15370044648897307, "grad_norm": 0.5289944410324097, "learning_rate": 5.1127141568981066e-05, "loss": 1.5, "step": 568 }, { "epoch": 0.15424164524421594, "grad_norm": 0.5446243286132812, "learning_rate": 5.1307484220018034e-05, "loss": 1.4751, "step": 570 }, { "epoch": 0.1547828439994588, "grad_norm": 0.525830090045929, "learning_rate": 5.1487826871055015e-05, "loss": 1.4639, "step": 572 }, { "epoch": 0.15532404275470166, "grad_norm": 0.48129191994667053, "learning_rate": 5.166816952209198e-05, "loss": 1.4652, "step": 574 }, { "epoch": 0.15586524150994452, "grad_norm": 0.47915297746658325, "learning_rate": 5.184851217312895e-05, "loss": 1.4627, "step": 576 }, { "epoch": 0.15640644026518738, "grad_norm": 0.5229325294494629, "learning_rate": 5.202885482416592e-05, "loss": 1.4525, "step": 578 }, { "epoch": 0.15694763902043024, "grad_norm": 0.5452600121498108, "learning_rate": 5.2209197475202885e-05, "loss": 1.458, "step": 580 }, { "epoch": 0.15748883777567313, "grad_norm": 0.427432656288147, "learning_rate": 5.238954012623985e-05, "loss": 1.4773, "step": 582 }, { "epoch": 0.158030036530916, "grad_norm": 0.450712114572525, "learning_rate": 5.2569882777276827e-05, "loss": 1.469, "step": 584 }, { "epoch": 0.15857123528615885, "grad_norm": 0.5500516891479492, "learning_rate": 5.27502254283138e-05, "loss": 1.4603, "step": 586 }, { "epoch": 0.1591124340414017, "grad_norm": 0.457157164812088, "learning_rate": 5.2930568079350775e-05, "loss": 1.4785, "step": 588 }, { "epoch": 0.15965363279664457, "grad_norm": 0.49750396609306335, "learning_rate": 5.311091073038774e-05, "loss": 1.4603, "step": 590 }, { "epoch": 0.16019483155188743, "grad_norm": 0.5720525979995728, "learning_rate": 5.329125338142471e-05, "loss": 1.4753, "step": 592 }, { "epoch": 0.1607360303071303, "grad_norm": 0.4425548315048218, "learning_rate": 5.347159603246168e-05, "loss": 1.462, "step": 594 }, { "epoch": 0.16127722906237316, "grad_norm": 0.5064132809638977, "learning_rate": 5.3651938683498645e-05, "loss": 1.4596, "step": 596 }, { "epoch": 0.16181842781761602, "grad_norm": 0.518460750579834, "learning_rate": 5.383228133453562e-05, "loss": 1.4763, "step": 598 }, { "epoch": 0.16235962657285888, "grad_norm": 0.4613576829433441, "learning_rate": 5.401262398557259e-05, "loss": 1.4487, "step": 600 }, { "epoch": 0.16290082532810174, "grad_norm": 0.7046213746070862, "learning_rate": 5.419296663660957e-05, "loss": 1.472, "step": 602 }, { "epoch": 0.1634420240833446, "grad_norm": 0.6164196133613586, "learning_rate": 5.4373309287646535e-05, "loss": 1.4424, "step": 604 }, { "epoch": 0.16398322283858746, "grad_norm": 0.5106020569801331, "learning_rate": 5.45536519386835e-05, "loss": 1.4567, "step": 606 }, { "epoch": 0.16452442159383032, "grad_norm": 0.4291236400604248, "learning_rate": 5.473399458972047e-05, "loss": 1.4514, "step": 608 }, { "epoch": 0.16506562034907318, "grad_norm": 0.46577414870262146, "learning_rate": 5.491433724075744e-05, "loss": 1.4408, "step": 610 }, { "epoch": 0.16560681910431607, "grad_norm": 0.4729917049407959, "learning_rate": 5.509467989179441e-05, "loss": 1.4493, "step": 612 }, { "epoch": 0.16614801785955893, "grad_norm": 0.4651925563812256, "learning_rate": 5.527502254283138e-05, "loss": 1.465, "step": 614 }, { "epoch": 0.1666892166148018, "grad_norm": 0.4756859540939331, "learning_rate": 5.545536519386835e-05, "loss": 1.4641, "step": 616 }, { "epoch": 0.16723041537004465, "grad_norm": 0.42555975914001465, "learning_rate": 5.563570784490533e-05, "loss": 1.4569, "step": 618 }, { "epoch": 0.16777161412528752, "grad_norm": 0.5162522196769714, "learning_rate": 5.5816050495942296e-05, "loss": 1.4344, "step": 620 }, { "epoch": 0.16831281288053038, "grad_norm": 0.5867063999176025, "learning_rate": 5.599639314697926e-05, "loss": 1.4647, "step": 622 }, { "epoch": 0.16885401163577324, "grad_norm": 0.6629165410995483, "learning_rate": 5.617673579801623e-05, "loss": 1.473, "step": 624 }, { "epoch": 0.1693952103910161, "grad_norm": 0.5905330777168274, "learning_rate": 5.6357078449053205e-05, "loss": 1.4459, "step": 626 }, { "epoch": 0.16993640914625896, "grad_norm": 0.7457858324050903, "learning_rate": 5.653742110009017e-05, "loss": 1.4603, "step": 628 }, { "epoch": 0.17047760790150182, "grad_norm": 0.5977684855461121, "learning_rate": 5.671776375112714e-05, "loss": 1.4621, "step": 630 }, { "epoch": 0.17101880665674468, "grad_norm": 0.7097992897033691, "learning_rate": 5.689810640216412e-05, "loss": 1.4646, "step": 632 }, { "epoch": 0.17156000541198754, "grad_norm": 0.5895450711250305, "learning_rate": 5.707844905320109e-05, "loss": 1.4338, "step": 634 }, { "epoch": 0.1721012041672304, "grad_norm": 0.576877772808075, "learning_rate": 5.7258791704238056e-05, "loss": 1.4666, "step": 636 }, { "epoch": 0.17264240292247326, "grad_norm": 0.541110098361969, "learning_rate": 5.7439134355275024e-05, "loss": 1.4624, "step": 638 }, { "epoch": 0.17318360167771615, "grad_norm": 0.5172320604324341, "learning_rate": 5.7619477006312e-05, "loss": 1.473, "step": 640 }, { "epoch": 0.17372480043295901, "grad_norm": 0.47511357069015503, "learning_rate": 5.7799819657348965e-05, "loss": 1.446, "step": 642 }, { "epoch": 0.17426599918820188, "grad_norm": 0.48614808917045593, "learning_rate": 5.798016230838593e-05, "loss": 1.4394, "step": 644 }, { "epoch": 0.17480719794344474, "grad_norm": 0.4435577094554901, "learning_rate": 5.81605049594229e-05, "loss": 1.43, "step": 646 }, { "epoch": 0.1753483966986876, "grad_norm": 0.4458653926849365, "learning_rate": 5.834084761045988e-05, "loss": 1.46, "step": 648 }, { "epoch": 0.17588959545393046, "grad_norm": 0.40675726532936096, "learning_rate": 5.852119026149685e-05, "loss": 1.4565, "step": 650 }, { "epoch": 0.17643079420917332, "grad_norm": 0.4132504165172577, "learning_rate": 5.8701532912533817e-05, "loss": 1.4522, "step": 652 }, { "epoch": 0.17697199296441618, "grad_norm": 0.40881386399269104, "learning_rate": 5.888187556357079e-05, "loss": 1.4232, "step": 654 }, { "epoch": 0.17751319171965904, "grad_norm": 0.40527868270874023, "learning_rate": 5.906221821460776e-05, "loss": 1.441, "step": 656 }, { "epoch": 0.1780543904749019, "grad_norm": 0.40227004885673523, "learning_rate": 5.9242560865644726e-05, "loss": 1.4259, "step": 658 }, { "epoch": 0.17859558923014476, "grad_norm": 0.4043656289577484, "learning_rate": 5.942290351668169e-05, "loss": 1.4298, "step": 660 }, { "epoch": 0.17913678798538762, "grad_norm": 0.4288482666015625, "learning_rate": 5.9603246167718674e-05, "loss": 1.4439, "step": 662 }, { "epoch": 0.17967798674063049, "grad_norm": 0.4385060966014862, "learning_rate": 5.978358881875564e-05, "loss": 1.4237, "step": 664 }, { "epoch": 0.18021918549587335, "grad_norm": 0.396980345249176, "learning_rate": 5.996393146979261e-05, "loss": 1.4174, "step": 666 }, { "epoch": 0.18076038425111624, "grad_norm": 0.4060603678226471, "learning_rate": 6.014427412082958e-05, "loss": 1.4479, "step": 668 }, { "epoch": 0.1813015830063591, "grad_norm": 0.4485025703907013, "learning_rate": 6.032461677186655e-05, "loss": 1.4493, "step": 670 }, { "epoch": 0.18184278176160196, "grad_norm": 0.44034305214881897, "learning_rate": 6.050495942290352e-05, "loss": 1.4461, "step": 672 }, { "epoch": 0.18238398051684482, "grad_norm": 0.418074369430542, "learning_rate": 6.0685302073940486e-05, "loss": 1.4287, "step": 674 }, { "epoch": 0.18292517927208768, "grad_norm": 0.41937318444252014, "learning_rate": 6.0865644724977454e-05, "loss": 1.4338, "step": 676 }, { "epoch": 0.18346637802733054, "grad_norm": 0.4103530943393707, "learning_rate": 6.104598737601444e-05, "loss": 1.4391, "step": 678 }, { "epoch": 0.1840075767825734, "grad_norm": 0.4066039025783539, "learning_rate": 6.122633002705141e-05, "loss": 1.4357, "step": 680 }, { "epoch": 0.18454877553781626, "grad_norm": 0.36903437972068787, "learning_rate": 6.140667267808838e-05, "loss": 1.4111, "step": 682 }, { "epoch": 0.18508997429305912, "grad_norm": 0.37125757336616516, "learning_rate": 6.158701532912534e-05, "loss": 1.4233, "step": 684 }, { "epoch": 0.18563117304830198, "grad_norm": 0.44102513790130615, "learning_rate": 6.176735798016231e-05, "loss": 1.4437, "step": 686 }, { "epoch": 0.18617237180354484, "grad_norm": 0.4337277114391327, "learning_rate": 6.194770063119928e-05, "loss": 1.4425, "step": 688 }, { "epoch": 0.1867135705587877, "grad_norm": 0.37394315004348755, "learning_rate": 6.212804328223625e-05, "loss": 1.4452, "step": 690 }, { "epoch": 0.18725476931403057, "grad_norm": 0.41764944791793823, "learning_rate": 6.230838593327321e-05, "loss": 1.4535, "step": 692 }, { "epoch": 0.18779596806927343, "grad_norm": 0.4214741289615631, "learning_rate": 6.24887285843102e-05, "loss": 1.4391, "step": 694 }, { "epoch": 0.18833716682451632, "grad_norm": 0.4159027338027954, "learning_rate": 6.266907123534716e-05, "loss": 1.4197, "step": 696 }, { "epoch": 0.18887836557975918, "grad_norm": 0.38865673542022705, "learning_rate": 6.284941388638413e-05, "loss": 1.4329, "step": 698 }, { "epoch": 0.18941956433500204, "grad_norm": 0.43646490573883057, "learning_rate": 6.30297565374211e-05, "loss": 1.4147, "step": 700 }, { "epoch": 0.1899607630902449, "grad_norm": 0.41997334361076355, "learning_rate": 6.321009918845807e-05, "loss": 1.4275, "step": 702 }, { "epoch": 0.19050196184548776, "grad_norm": 0.38556602597236633, "learning_rate": 6.339044183949505e-05, "loss": 1.4258, "step": 704 }, { "epoch": 0.19104316060073062, "grad_norm": 0.42955082654953003, "learning_rate": 6.357078449053201e-05, "loss": 1.4201, "step": 706 }, { "epoch": 0.19158435935597348, "grad_norm": 0.3844427764415741, "learning_rate": 6.3751127141569e-05, "loss": 1.4448, "step": 708 }, { "epoch": 0.19212555811121634, "grad_norm": 0.4312956929206848, "learning_rate": 6.393146979260596e-05, "loss": 1.4051, "step": 710 }, { "epoch": 0.1926667568664592, "grad_norm": 0.4556865394115448, "learning_rate": 6.411181244364293e-05, "loss": 1.4305, "step": 712 }, { "epoch": 0.19320795562170207, "grad_norm": 0.37053731083869934, "learning_rate": 6.42921550946799e-05, "loss": 1.4301, "step": 714 }, { "epoch": 0.19374915437694493, "grad_norm": 0.3996010720729828, "learning_rate": 6.447249774571686e-05, "loss": 1.4282, "step": 716 }, { "epoch": 0.1942903531321878, "grad_norm": 0.37610816955566406, "learning_rate": 6.465284039675383e-05, "loss": 1.4277, "step": 718 }, { "epoch": 0.19483155188743065, "grad_norm": 0.3677166998386383, "learning_rate": 6.48331830477908e-05, "loss": 1.4029, "step": 720 }, { "epoch": 0.1953727506426735, "grad_norm": 0.3841564357280731, "learning_rate": 6.501352569882777e-05, "loss": 1.4144, "step": 722 }, { "epoch": 0.1959139493979164, "grad_norm": 0.3687719404697418, "learning_rate": 6.519386834986475e-05, "loss": 1.4079, "step": 724 }, { "epoch": 0.19645514815315926, "grad_norm": 0.38350847363471985, "learning_rate": 6.537421100090172e-05, "loss": 1.4269, "step": 726 }, { "epoch": 0.19699634690840212, "grad_norm": 0.39060813188552856, "learning_rate": 6.555455365193868e-05, "loss": 1.4265, "step": 728 }, { "epoch": 0.19753754566364498, "grad_norm": 0.36068469285964966, "learning_rate": 6.573489630297565e-05, "loss": 1.4325, "step": 730 }, { "epoch": 0.19807874441888784, "grad_norm": 0.41185086965560913, "learning_rate": 6.591523895401263e-05, "loss": 1.4348, "step": 732 }, { "epoch": 0.1986199431741307, "grad_norm": 0.4441224932670593, "learning_rate": 6.60955816050496e-05, "loss": 1.4103, "step": 734 }, { "epoch": 0.19916114192937356, "grad_norm": 0.3727317452430725, "learning_rate": 6.627592425608657e-05, "loss": 1.4188, "step": 736 }, { "epoch": 0.19970234068461643, "grad_norm": 0.394972562789917, "learning_rate": 6.645626690712355e-05, "loss": 1.4095, "step": 738 }, { "epoch": 0.20024353943985929, "grad_norm": 0.40716880559921265, "learning_rate": 6.663660955816052e-05, "loss": 1.4127, "step": 740 }, { "epoch": 0.20078473819510215, "grad_norm": 0.4156644344329834, "learning_rate": 6.681695220919748e-05, "loss": 1.4189, "step": 742 }, { "epoch": 0.201325936950345, "grad_norm": 0.3787958323955536, "learning_rate": 6.699729486023445e-05, "loss": 1.4221, "step": 744 }, { "epoch": 0.20186713570558787, "grad_norm": 0.42427608370780945, "learning_rate": 6.717763751127142e-05, "loss": 1.4192, "step": 746 }, { "epoch": 0.20240833446083073, "grad_norm": 0.4778277277946472, "learning_rate": 6.735798016230839e-05, "loss": 1.4024, "step": 748 }, { "epoch": 0.2029495332160736, "grad_norm": 0.44801151752471924, "learning_rate": 6.753832281334535e-05, "loss": 1.4222, "step": 750 }, { "epoch": 0.20349073197131648, "grad_norm": 0.46737611293792725, "learning_rate": 6.771866546438232e-05, "loss": 1.4117, "step": 752 }, { "epoch": 0.20403193072655934, "grad_norm": 0.4184872806072235, "learning_rate": 6.78990081154193e-05, "loss": 1.4066, "step": 754 }, { "epoch": 0.2045731294818022, "grad_norm": 0.40458211302757263, "learning_rate": 6.807935076645627e-05, "loss": 1.4274, "step": 756 }, { "epoch": 0.20511432823704506, "grad_norm": 0.43926185369491577, "learning_rate": 6.825969341749324e-05, "loss": 1.4231, "step": 758 }, { "epoch": 0.20565552699228792, "grad_norm": 0.4434867203235626, "learning_rate": 6.844003606853022e-05, "loss": 1.4121, "step": 760 }, { "epoch": 0.20619672574753078, "grad_norm": 0.4500143826007843, "learning_rate": 6.862037871956719e-05, "loss": 1.4179, "step": 762 }, { "epoch": 0.20673792450277365, "grad_norm": 0.45456650853157043, "learning_rate": 6.880072137060415e-05, "loss": 1.3912, "step": 764 }, { "epoch": 0.2072791232580165, "grad_norm": 0.4214187264442444, "learning_rate": 6.898106402164112e-05, "loss": 1.3962, "step": 766 }, { "epoch": 0.20782032201325937, "grad_norm": 0.427682101726532, "learning_rate": 6.916140667267809e-05, "loss": 1.4316, "step": 768 }, { "epoch": 0.20836152076850223, "grad_norm": 0.44491469860076904, "learning_rate": 6.934174932371507e-05, "loss": 1.4218, "step": 770 }, { "epoch": 0.2089027195237451, "grad_norm": 0.42736080288887024, "learning_rate": 6.952209197475204e-05, "loss": 1.3931, "step": 772 }, { "epoch": 0.20944391827898795, "grad_norm": 0.4041571021080017, "learning_rate": 6.9702434625789e-05, "loss": 1.4201, "step": 774 }, { "epoch": 0.2099851170342308, "grad_norm": 0.4250961244106293, "learning_rate": 6.988277727682597e-05, "loss": 1.4299, "step": 776 }, { "epoch": 0.21052631578947367, "grad_norm": 0.4335261881351471, "learning_rate": 7.006311992786294e-05, "loss": 1.4125, "step": 778 }, { "epoch": 0.21106751454471653, "grad_norm": 0.42000851035118103, "learning_rate": 7.02434625788999e-05, "loss": 1.3969, "step": 780 }, { "epoch": 0.21160871329995942, "grad_norm": 0.38111838698387146, "learning_rate": 7.042380522993687e-05, "loss": 1.3795, "step": 782 }, { "epoch": 0.21214991205520228, "grad_norm": 0.38366812467575073, "learning_rate": 7.060414788097385e-05, "loss": 1.4041, "step": 784 }, { "epoch": 0.21269111081044514, "grad_norm": 0.4334602355957031, "learning_rate": 7.078449053201082e-05, "loss": 1.415, "step": 786 }, { "epoch": 0.213232309565688, "grad_norm": 0.40296411514282227, "learning_rate": 7.096483318304779e-05, "loss": 1.4052, "step": 788 }, { "epoch": 0.21377350832093087, "grad_norm": 0.4197232723236084, "learning_rate": 7.114517583408477e-05, "loss": 1.4205, "step": 790 }, { "epoch": 0.21431470707617373, "grad_norm": 0.40287715196609497, "learning_rate": 7.132551848512174e-05, "loss": 1.4047, "step": 792 }, { "epoch": 0.2148559058314166, "grad_norm": 0.37324196100234985, "learning_rate": 7.15058611361587e-05, "loss": 1.4398, "step": 794 }, { "epoch": 0.21539710458665945, "grad_norm": 0.4409985840320587, "learning_rate": 7.168620378719567e-05, "loss": 1.3873, "step": 796 }, { "epoch": 0.2159383033419023, "grad_norm": 0.41441893577575684, "learning_rate": 7.186654643823264e-05, "loss": 1.4174, "step": 798 }, { "epoch": 0.21647950209714517, "grad_norm": 0.4271719455718994, "learning_rate": 7.204688908926962e-05, "loss": 1.3987, "step": 800 }, { "epoch": 0.21702070085238803, "grad_norm": 0.4969992935657501, "learning_rate": 7.222723174030659e-05, "loss": 1.4049, "step": 802 }, { "epoch": 0.2175618996076309, "grad_norm": 0.45711180567741394, "learning_rate": 7.240757439134356e-05, "loss": 1.4061, "step": 804 }, { "epoch": 0.21810309836287375, "grad_norm": 0.4479979872703552, "learning_rate": 7.258791704238052e-05, "loss": 1.4049, "step": 806 }, { "epoch": 0.21864429711811662, "grad_norm": 0.4708006978034973, "learning_rate": 7.276825969341749e-05, "loss": 1.3971, "step": 808 }, { "epoch": 0.2191854958733595, "grad_norm": 0.4387456774711609, "learning_rate": 7.294860234445446e-05, "loss": 1.4272, "step": 810 }, { "epoch": 0.21972669462860236, "grad_norm": 0.5285756587982178, "learning_rate": 7.312894499549143e-05, "loss": 1.3902, "step": 812 }, { "epoch": 0.22026789338384523, "grad_norm": 0.5111876726150513, "learning_rate": 7.330928764652841e-05, "loss": 1.4176, "step": 814 }, { "epoch": 0.2208090921390881, "grad_norm": 0.4643821716308594, "learning_rate": 7.348963029756538e-05, "loss": 1.4216, "step": 816 }, { "epoch": 0.22135029089433095, "grad_norm": 0.5162214040756226, "learning_rate": 7.366997294860236e-05, "loss": 1.4025, "step": 818 }, { "epoch": 0.2218914896495738, "grad_norm": 0.4296860992908478, "learning_rate": 7.385031559963932e-05, "loss": 1.3919, "step": 820 }, { "epoch": 0.22243268840481667, "grad_norm": 0.4449775815010071, "learning_rate": 7.403065825067629e-05, "loss": 1.4002, "step": 822 }, { "epoch": 0.22297388716005953, "grad_norm": 0.39713212847709656, "learning_rate": 7.421100090171326e-05, "loss": 1.4012, "step": 824 }, { "epoch": 0.2235150859153024, "grad_norm": 0.41655346751213074, "learning_rate": 7.439134355275023e-05, "loss": 1.4155, "step": 826 }, { "epoch": 0.22405628467054525, "grad_norm": 0.3751365542411804, "learning_rate": 7.45716862037872e-05, "loss": 1.4021, "step": 828 }, { "epoch": 0.2245974834257881, "grad_norm": 0.41483408212661743, "learning_rate": 7.475202885482417e-05, "loss": 1.4207, "step": 830 }, { "epoch": 0.22513868218103097, "grad_norm": 0.397360235452652, "learning_rate": 7.493237150586114e-05, "loss": 1.392, "step": 832 }, { "epoch": 0.22567988093627384, "grad_norm": 0.3874877691268921, "learning_rate": 7.511271415689811e-05, "loss": 1.4143, "step": 834 }, { "epoch": 0.2262210796915167, "grad_norm": 0.4382254481315613, "learning_rate": 7.529305680793508e-05, "loss": 1.4109, "step": 836 }, { "epoch": 0.22676227844675959, "grad_norm": 0.3728530704975128, "learning_rate": 7.547339945897204e-05, "loss": 1.4215, "step": 838 }, { "epoch": 0.22730347720200245, "grad_norm": 0.41155338287353516, "learning_rate": 7.565374211000901e-05, "loss": 1.3963, "step": 840 }, { "epoch": 0.2278446759572453, "grad_norm": 0.3550320267677307, "learning_rate": 7.5834084761046e-05, "loss": 1.3998, "step": 842 }, { "epoch": 0.22838587471248817, "grad_norm": 0.3858035206794739, "learning_rate": 7.601442741208296e-05, "loss": 1.387, "step": 844 }, { "epoch": 0.22892707346773103, "grad_norm": 0.38636457920074463, "learning_rate": 7.619477006311994e-05, "loss": 1.387, "step": 846 }, { "epoch": 0.2294682722229739, "grad_norm": 0.41915518045425415, "learning_rate": 7.637511271415691e-05, "loss": 1.3917, "step": 848 }, { "epoch": 0.23000947097821675, "grad_norm": 0.35796865820884705, "learning_rate": 7.655545536519388e-05, "loss": 1.406, "step": 850 }, { "epoch": 0.2305506697334596, "grad_norm": 0.35221853852272034, "learning_rate": 7.673579801623084e-05, "loss": 1.3892, "step": 852 }, { "epoch": 0.23109186848870247, "grad_norm": 0.3815077245235443, "learning_rate": 7.691614066726781e-05, "loss": 1.3845, "step": 854 }, { "epoch": 0.23163306724394533, "grad_norm": 0.3554491400718689, "learning_rate": 7.709648331830478e-05, "loss": 1.3644, "step": 856 }, { "epoch": 0.2321742659991882, "grad_norm": 0.3762814998626709, "learning_rate": 7.727682596934175e-05, "loss": 1.3976, "step": 858 }, { "epoch": 0.23271546475443106, "grad_norm": 0.34575173258781433, "learning_rate": 7.745716862037873e-05, "loss": 1.3925, "step": 860 }, { "epoch": 0.23325666350967392, "grad_norm": 0.37864556908607483, "learning_rate": 7.76375112714157e-05, "loss": 1.3993, "step": 862 }, { "epoch": 0.23379786226491678, "grad_norm": 0.34448474645614624, "learning_rate": 7.781785392245266e-05, "loss": 1.3855, "step": 864 }, { "epoch": 0.23433906102015967, "grad_norm": 0.40932390093803406, "learning_rate": 7.799819657348963e-05, "loss": 1.395, "step": 866 }, { "epoch": 0.23488025977540253, "grad_norm": 0.3737650513648987, "learning_rate": 7.81785392245266e-05, "loss": 1.3918, "step": 868 }, { "epoch": 0.2354214585306454, "grad_norm": 0.42988118529319763, "learning_rate": 7.835888187556357e-05, "loss": 1.3837, "step": 870 }, { "epoch": 0.23596265728588825, "grad_norm": 0.3865496814250946, "learning_rate": 7.853922452660055e-05, "loss": 1.3976, "step": 872 }, { "epoch": 0.2365038560411311, "grad_norm": 0.3682670295238495, "learning_rate": 7.871956717763751e-05, "loss": 1.3792, "step": 874 }, { "epoch": 0.23704505479637397, "grad_norm": 0.4236462712287903, "learning_rate": 7.88999098286745e-05, "loss": 1.4032, "step": 876 }, { "epoch": 0.23758625355161683, "grad_norm": 0.3742213249206543, "learning_rate": 7.908025247971146e-05, "loss": 1.3709, "step": 878 }, { "epoch": 0.2381274523068597, "grad_norm": 0.38234424591064453, "learning_rate": 7.926059513074843e-05, "loss": 1.3862, "step": 880 }, { "epoch": 0.23866865106210255, "grad_norm": 0.37414151430130005, "learning_rate": 7.94409377817854e-05, "loss": 1.3751, "step": 882 }, { "epoch": 0.23920984981734542, "grad_norm": 0.3838132619857788, "learning_rate": 7.962128043282237e-05, "loss": 1.3805, "step": 884 }, { "epoch": 0.23975104857258828, "grad_norm": 0.3818622827529907, "learning_rate": 7.980162308385933e-05, "loss": 1.3735, "step": 886 }, { "epoch": 0.24029224732783114, "grad_norm": 0.38791927695274353, "learning_rate": 7.99819657348963e-05, "loss": 1.3958, "step": 888 }, { "epoch": 0.240833446083074, "grad_norm": 0.4164978861808777, "learning_rate": 8.016230838593328e-05, "loss": 1.421, "step": 890 }, { "epoch": 0.24137464483831686, "grad_norm": 0.3721414804458618, "learning_rate": 8.034265103697025e-05, "loss": 1.3977, "step": 892 }, { "epoch": 0.24191584359355975, "grad_norm": 0.37698984146118164, "learning_rate": 8.052299368800722e-05, "loss": 1.3854, "step": 894 }, { "epoch": 0.2424570423488026, "grad_norm": 0.3553116023540497, "learning_rate": 8.070333633904418e-05, "loss": 1.3925, "step": 896 }, { "epoch": 0.24299824110404547, "grad_norm": 0.37809059023857117, "learning_rate": 8.088367899008115e-05, "loss": 1.368, "step": 898 }, { "epoch": 0.24353943985928833, "grad_norm": 0.3835943043231964, "learning_rate": 8.106402164111813e-05, "loss": 1.3992, "step": 900 }, { "epoch": 0.2440806386145312, "grad_norm": 0.4013379216194153, "learning_rate": 8.12443642921551e-05, "loss": 1.3912, "step": 902 }, { "epoch": 0.24462183736977405, "grad_norm": 0.37845560908317566, "learning_rate": 8.142470694319207e-05, "loss": 1.3934, "step": 904 }, { "epoch": 0.24516303612501691, "grad_norm": 0.39762255549430847, "learning_rate": 8.160504959422905e-05, "loss": 1.3782, "step": 906 }, { "epoch": 0.24570423488025978, "grad_norm": 0.36652496457099915, "learning_rate": 8.178539224526602e-05, "loss": 1.3787, "step": 908 }, { "epoch": 0.24624543363550264, "grad_norm": 0.39953047037124634, "learning_rate": 8.196573489630298e-05, "loss": 1.3752, "step": 910 }, { "epoch": 0.2467866323907455, "grad_norm": 0.35875022411346436, "learning_rate": 8.214607754733995e-05, "loss": 1.3768, "step": 912 }, { "epoch": 0.24732783114598836, "grad_norm": 0.3617067337036133, "learning_rate": 8.232642019837692e-05, "loss": 1.3859, "step": 914 }, { "epoch": 0.24786902990123122, "grad_norm": 0.38250839710235596, "learning_rate": 8.250676284941389e-05, "loss": 1.3897, "step": 916 }, { "epoch": 0.24841022865647408, "grad_norm": 0.3404116928577423, "learning_rate": 8.268710550045085e-05, "loss": 1.3933, "step": 918 }, { "epoch": 0.24895142741171694, "grad_norm": 0.3547706604003906, "learning_rate": 8.286744815148782e-05, "loss": 1.3787, "step": 920 }, { "epoch": 0.2494926261669598, "grad_norm": 0.32752275466918945, "learning_rate": 8.30477908025248e-05, "loss": 1.3905, "step": 922 }, { "epoch": 0.25003382492220266, "grad_norm": 0.3413980007171631, "learning_rate": 8.322813345356177e-05, "loss": 1.385, "step": 924 }, { "epoch": 0.25057502367744555, "grad_norm": 0.5574982762336731, "learning_rate": 8.340847610459874e-05, "loss": 1.3869, "step": 926 }, { "epoch": 0.2511162224326884, "grad_norm": 0.41128844022750854, "learning_rate": 8.358881875563572e-05, "loss": 1.3583, "step": 928 }, { "epoch": 0.2516574211879313, "grad_norm": 0.3476073145866394, "learning_rate": 8.376916140667269e-05, "loss": 1.3832, "step": 930 }, { "epoch": 0.2521986199431741, "grad_norm": 0.34838998317718506, "learning_rate": 8.394950405770965e-05, "loss": 1.3748, "step": 932 }, { "epoch": 0.252739818698417, "grad_norm": 0.3552824556827545, "learning_rate": 8.412984670874662e-05, "loss": 1.3936, "step": 934 }, { "epoch": 0.25328101745365983, "grad_norm": 0.34918278455734253, "learning_rate": 8.43101893597836e-05, "loss": 1.3733, "step": 936 }, { "epoch": 0.2538222162089027, "grad_norm": 0.431455135345459, "learning_rate": 8.449053201082057e-05, "loss": 1.3924, "step": 938 }, { "epoch": 0.2543634149641456, "grad_norm": 0.37811046838760376, "learning_rate": 8.467087466185754e-05, "loss": 1.3861, "step": 940 }, { "epoch": 0.25490461371938844, "grad_norm": 0.35659778118133545, "learning_rate": 8.48512173128945e-05, "loss": 1.3736, "step": 942 }, { "epoch": 0.25544581247463133, "grad_norm": 0.4327319264411926, "learning_rate": 8.503155996393147e-05, "loss": 1.3883, "step": 944 }, { "epoch": 0.25598701122987416, "grad_norm": 0.39134231209754944, "learning_rate": 8.521190261496844e-05, "loss": 1.3704, "step": 946 }, { "epoch": 0.25652820998511705, "grad_norm": 0.39573270082473755, "learning_rate": 8.53922452660054e-05, "loss": 1.4047, "step": 948 }, { "epoch": 0.2570694087403599, "grad_norm": 0.3299993872642517, "learning_rate": 8.557258791704237e-05, "loss": 1.3778, "step": 950 }, { "epoch": 0.2576106074956028, "grad_norm": 0.3559456765651703, "learning_rate": 8.575293056807936e-05, "loss": 1.3794, "step": 952 }, { "epoch": 0.2581518062508456, "grad_norm": 0.36347028613090515, "learning_rate": 8.593327321911632e-05, "loss": 1.3817, "step": 954 }, { "epoch": 0.2586930050060885, "grad_norm": 0.39882585406303406, "learning_rate": 8.611361587015329e-05, "loss": 1.3565, "step": 956 }, { "epoch": 0.2592342037613313, "grad_norm": 0.3932117223739624, "learning_rate": 8.629395852119027e-05, "loss": 1.396, "step": 958 }, { "epoch": 0.2597754025165742, "grad_norm": 0.3526294231414795, "learning_rate": 8.647430117222724e-05, "loss": 1.3624, "step": 960 }, { "epoch": 0.26031660127181705, "grad_norm": 0.3804738223552704, "learning_rate": 8.66546438232642e-05, "loss": 1.3616, "step": 962 }, { "epoch": 0.26085780002705994, "grad_norm": 0.36557725071907043, "learning_rate": 8.683498647430117e-05, "loss": 1.3997, "step": 964 }, { "epoch": 0.2613989987823028, "grad_norm": 0.3574380874633789, "learning_rate": 8.701532912533815e-05, "loss": 1.3901, "step": 966 }, { "epoch": 0.26194019753754566, "grad_norm": 0.4025056064128876, "learning_rate": 8.719567177637512e-05, "loss": 1.3707, "step": 968 }, { "epoch": 0.26248139629278855, "grad_norm": 0.3687063157558441, "learning_rate": 8.737601442741209e-05, "loss": 1.3679, "step": 970 }, { "epoch": 0.2630225950480314, "grad_norm": 0.3697878420352936, "learning_rate": 8.755635707844906e-05, "loss": 1.3981, "step": 972 }, { "epoch": 0.26356379380327427, "grad_norm": 0.34241798520088196, "learning_rate": 8.773669972948602e-05, "loss": 1.3728, "step": 974 }, { "epoch": 0.2641049925585171, "grad_norm": 0.40002745389938354, "learning_rate": 8.791704238052299e-05, "loss": 1.3732, "step": 976 }, { "epoch": 0.26464619131376, "grad_norm": 0.42943906784057617, "learning_rate": 8.809738503155996e-05, "loss": 1.3731, "step": 978 }, { "epoch": 0.2651873900690028, "grad_norm": 0.37437063455581665, "learning_rate": 8.827772768259693e-05, "loss": 1.372, "step": 980 }, { "epoch": 0.2657285888242457, "grad_norm": 0.3378891944885254, "learning_rate": 8.845807033363391e-05, "loss": 1.3777, "step": 982 }, { "epoch": 0.26626978757948855, "grad_norm": 0.32884734869003296, "learning_rate": 8.863841298467088e-05, "loss": 1.3639, "step": 984 }, { "epoch": 0.26681098633473144, "grad_norm": 0.3945903480052948, "learning_rate": 8.881875563570786e-05, "loss": 1.3722, "step": 986 }, { "epoch": 0.26735218508997427, "grad_norm": 0.39569205045700073, "learning_rate": 8.899909828674482e-05, "loss": 1.376, "step": 988 }, { "epoch": 0.26789338384521716, "grad_norm": 0.31659135222435, "learning_rate": 8.917944093778179e-05, "loss": 1.3807, "step": 990 }, { "epoch": 0.26843458260046, "grad_norm": 0.44032666087150574, "learning_rate": 8.935978358881876e-05, "loss": 1.3986, "step": 992 }, { "epoch": 0.2689757813557029, "grad_norm": 0.3445993661880493, "learning_rate": 8.954012623985573e-05, "loss": 1.3589, "step": 994 }, { "epoch": 0.26951698011094577, "grad_norm": 0.3693557679653168, "learning_rate": 8.97204688908927e-05, "loss": 1.3593, "step": 996 }, { "epoch": 0.2700581788661886, "grad_norm": 0.3965442478656769, "learning_rate": 8.990081154192968e-05, "loss": 1.3909, "step": 998 }, { "epoch": 0.2705993776214315, "grad_norm": 0.4038390815258026, "learning_rate": 9.008115419296664e-05, "loss": 1.3629, "step": 1000 }, { "epoch": 0.2711405763766743, "grad_norm": 0.36394256353378296, "learning_rate": 9.026149684400361e-05, "loss": 1.3812, "step": 1002 }, { "epoch": 0.2716817751319172, "grad_norm": 0.4527181386947632, "learning_rate": 9.044183949504058e-05, "loss": 1.3692, "step": 1004 }, { "epoch": 0.27222297388716005, "grad_norm": 0.37700143456459045, "learning_rate": 9.062218214607755e-05, "loss": 1.3652, "step": 1006 }, { "epoch": 0.27276417264240294, "grad_norm": 0.45016244053840637, "learning_rate": 9.080252479711451e-05, "loss": 1.3657, "step": 1008 }, { "epoch": 0.27330537139764577, "grad_norm": 0.42159709334373474, "learning_rate": 9.09828674481515e-05, "loss": 1.3702, "step": 1010 }, { "epoch": 0.27384657015288866, "grad_norm": 0.3884572982788086, "learning_rate": 9.116321009918846e-05, "loss": 1.3535, "step": 1012 }, { "epoch": 0.2743877689081315, "grad_norm": 0.37507420778274536, "learning_rate": 9.134355275022544e-05, "loss": 1.3659, "step": 1014 }, { "epoch": 0.2749289676633744, "grad_norm": 0.35269656777381897, "learning_rate": 9.152389540126241e-05, "loss": 1.3623, "step": 1016 }, { "epoch": 0.2754701664186172, "grad_norm": 0.3543412387371063, "learning_rate": 9.170423805229938e-05, "loss": 1.3695, "step": 1018 }, { "epoch": 0.2760113651738601, "grad_norm": 0.3173674941062927, "learning_rate": 9.188458070333635e-05, "loss": 1.3572, "step": 1020 }, { "epoch": 0.276552563929103, "grad_norm": 0.3729746341705322, "learning_rate": 9.206492335437331e-05, "loss": 1.3888, "step": 1022 }, { "epoch": 0.2770937626843458, "grad_norm": 0.33210429549217224, "learning_rate": 9.224526600541028e-05, "loss": 1.3395, "step": 1024 }, { "epoch": 0.2776349614395887, "grad_norm": 0.338366836309433, "learning_rate": 9.242560865644725e-05, "loss": 1.3498, "step": 1026 }, { "epoch": 0.27817616019483155, "grad_norm": 0.3367864191532135, "learning_rate": 9.260595130748423e-05, "loss": 1.3548, "step": 1028 }, { "epoch": 0.27871735895007443, "grad_norm": 0.40313002467155457, "learning_rate": 9.27862939585212e-05, "loss": 1.4059, "step": 1030 }, { "epoch": 0.27925855770531727, "grad_norm": 0.3434394299983978, "learning_rate": 9.296663660955816e-05, "loss": 1.3522, "step": 1032 }, { "epoch": 0.27979975646056016, "grad_norm": 0.35454580187797546, "learning_rate": 9.314697926059513e-05, "loss": 1.3838, "step": 1034 }, { "epoch": 0.280340955215803, "grad_norm": 0.3280038833618164, "learning_rate": 9.33273219116321e-05, "loss": 1.3753, "step": 1036 }, { "epoch": 0.2808821539710459, "grad_norm": 0.4306875169277191, "learning_rate": 9.350766456266907e-05, "loss": 1.3807, "step": 1038 }, { "epoch": 0.2814233527262887, "grad_norm": 0.3500923812389374, "learning_rate": 9.368800721370605e-05, "loss": 1.36, "step": 1040 }, { "epoch": 0.2819645514815316, "grad_norm": 0.3702130913734436, "learning_rate": 9.386834986474301e-05, "loss": 1.3919, "step": 1042 }, { "epoch": 0.28250575023677443, "grad_norm": 0.3651416599750519, "learning_rate": 9.404869251578e-05, "loss": 1.3805, "step": 1044 }, { "epoch": 0.2830469489920173, "grad_norm": 0.35927796363830566, "learning_rate": 9.422903516681696e-05, "loss": 1.3507, "step": 1046 }, { "epoch": 0.28358814774726016, "grad_norm": 0.36750975251197815, "learning_rate": 9.440937781785393e-05, "loss": 1.3475, "step": 1048 }, { "epoch": 0.28412934650250304, "grad_norm": 0.31946998834609985, "learning_rate": 9.45897204688909e-05, "loss": 1.3708, "step": 1050 }, { "epoch": 0.28467054525774593, "grad_norm": 0.3447932302951813, "learning_rate": 9.477006311992787e-05, "loss": 1.3519, "step": 1052 }, { "epoch": 0.28521174401298877, "grad_norm": 0.31405511498451233, "learning_rate": 9.495040577096483e-05, "loss": 1.3806, "step": 1054 }, { "epoch": 0.28575294276823165, "grad_norm": 0.3198442757129669, "learning_rate": 9.51307484220018e-05, "loss": 1.368, "step": 1056 }, { "epoch": 0.2862941415234745, "grad_norm": 0.33328956365585327, "learning_rate": 9.531109107303878e-05, "loss": 1.3429, "step": 1058 }, { "epoch": 0.2868353402787174, "grad_norm": 0.29432907700538635, "learning_rate": 9.549143372407575e-05, "loss": 1.3698, "step": 1060 }, { "epoch": 0.2873765390339602, "grad_norm": 0.3468937575817108, "learning_rate": 9.567177637511272e-05, "loss": 1.356, "step": 1062 }, { "epoch": 0.2879177377892031, "grad_norm": 0.3619658350944519, "learning_rate": 9.585211902614968e-05, "loss": 1.3596, "step": 1064 }, { "epoch": 0.28845893654444593, "grad_norm": 0.3384917378425598, "learning_rate": 9.603246167718665e-05, "loss": 1.3693, "step": 1066 }, { "epoch": 0.2890001352996888, "grad_norm": 0.3724029064178467, "learning_rate": 9.621280432822363e-05, "loss": 1.3639, "step": 1068 }, { "epoch": 0.28954133405493165, "grad_norm": 0.7029115557670593, "learning_rate": 9.63931469792606e-05, "loss": 1.3557, "step": 1070 }, { "epoch": 0.29008253281017454, "grad_norm": 0.5529230833053589, "learning_rate": 9.657348963029757e-05, "loss": 1.3657, "step": 1072 }, { "epoch": 0.2906237315654174, "grad_norm": 0.4254820644855499, "learning_rate": 9.675383228133455e-05, "loss": 1.3633, "step": 1074 }, { "epoch": 0.29116493032066026, "grad_norm": 0.4930615723133087, "learning_rate": 9.693417493237152e-05, "loss": 1.3714, "step": 1076 }, { "epoch": 0.2917061290759031, "grad_norm": 0.4455857574939728, "learning_rate": 9.711451758340848e-05, "loss": 1.3615, "step": 1078 }, { "epoch": 0.292247327831146, "grad_norm": 0.4171796441078186, "learning_rate": 9.729486023444545e-05, "loss": 1.3673, "step": 1080 }, { "epoch": 0.2927885265863889, "grad_norm": 0.37810683250427246, "learning_rate": 9.747520288548242e-05, "loss": 1.3683, "step": 1082 }, { "epoch": 0.2933297253416317, "grad_norm": 0.4057900905609131, "learning_rate": 9.765554553651939e-05, "loss": 1.3674, "step": 1084 }, { "epoch": 0.2938709240968746, "grad_norm": 0.40583640336990356, "learning_rate": 9.783588818755635e-05, "loss": 1.3566, "step": 1086 }, { "epoch": 0.29441212285211743, "grad_norm": 0.39454150199890137, "learning_rate": 9.801623083859334e-05, "loss": 1.3611, "step": 1088 }, { "epoch": 0.2949533216073603, "grad_norm": 0.42229679226875305, "learning_rate": 9.81965734896303e-05, "loss": 1.3726, "step": 1090 }, { "epoch": 0.29549452036260315, "grad_norm": 0.3274170160293579, "learning_rate": 9.837691614066727e-05, "loss": 1.3375, "step": 1092 }, { "epoch": 0.29603571911784604, "grad_norm": 0.40999388694763184, "learning_rate": 9.855725879170424e-05, "loss": 1.3548, "step": 1094 }, { "epoch": 0.2965769178730889, "grad_norm": 0.33515796065330505, "learning_rate": 9.873760144274122e-05, "loss": 1.3903, "step": 1096 }, { "epoch": 0.29711811662833176, "grad_norm": 0.3834095597267151, "learning_rate": 9.891794409377819e-05, "loss": 1.3653, "step": 1098 }, { "epoch": 0.2976593153835746, "grad_norm": 0.34850651025772095, "learning_rate": 9.909828674481515e-05, "loss": 1.3573, "step": 1100 }, { "epoch": 0.2982005141388175, "grad_norm": 0.3811749815940857, "learning_rate": 9.927862939585212e-05, "loss": 1.3843, "step": 1102 }, { "epoch": 0.2987417128940603, "grad_norm": 0.3308597803115845, "learning_rate": 9.94589720468891e-05, "loss": 1.3492, "step": 1104 }, { "epoch": 0.2992829116493032, "grad_norm": 0.31952470541000366, "learning_rate": 9.963931469792607e-05, "loss": 1.3586, "step": 1106 }, { "epoch": 0.2998241104045461, "grad_norm": 0.3433592915534973, "learning_rate": 9.981965734896304e-05, "loss": 1.3524, "step": 1108 }, { "epoch": 0.30036530915978893, "grad_norm": 0.4547680914402008, "learning_rate": 0.0001, "loss": 1.3562, "step": 1110 }, { "epoch": 0.3009065079150318, "grad_norm": 0.4963592290878296, "learning_rate": 9.999999008881264e-05, "loss": 1.3452, "step": 1112 }, { "epoch": 0.30144770667027465, "grad_norm": 1.1111193895339966, "learning_rate": 9.999996035525452e-05, "loss": 1.3732, "step": 1114 }, { "epoch": 0.30198890542551754, "grad_norm": 0.6860964298248291, "learning_rate": 9.999991079933739e-05, "loss": 1.3689, "step": 1116 }, { "epoch": 0.3025301041807604, "grad_norm": 0.7344204783439636, "learning_rate": 9.999984142108093e-05, "loss": 1.3575, "step": 1118 }, { "epoch": 0.30307130293600326, "grad_norm": 0.6534725427627563, "learning_rate": 9.999975222051263e-05, "loss": 1.376, "step": 1120 }, { "epoch": 0.3036125016912461, "grad_norm": 0.5108229517936707, "learning_rate": 9.999964319766785e-05, "loss": 1.3741, "step": 1122 }, { "epoch": 0.304153700446489, "grad_norm": 0.4888688325881958, "learning_rate": 9.99995143525898e-05, "loss": 1.3555, "step": 1124 }, { "epoch": 0.3046948992017318, "grad_norm": 0.42808806896209717, "learning_rate": 9.999936568532962e-05, "loss": 1.3548, "step": 1126 }, { "epoch": 0.3052360979569747, "grad_norm": 0.3921727240085602, "learning_rate": 9.999919719594617e-05, "loss": 1.3559, "step": 1128 }, { "epoch": 0.30577729671221754, "grad_norm": 0.3473529517650604, "learning_rate": 9.999900888450628e-05, "loss": 1.3603, "step": 1130 }, { "epoch": 0.3063184954674604, "grad_norm": 0.3337381184101105, "learning_rate": 9.999880075108464e-05, "loss": 1.3642, "step": 1132 }, { "epoch": 0.30685969422270326, "grad_norm": 0.3363231122493744, "learning_rate": 9.99985727957637e-05, "loss": 1.3606, "step": 1134 }, { "epoch": 0.30740089297794615, "grad_norm": 0.32726484537124634, "learning_rate": 9.999832501863386e-05, "loss": 1.3493, "step": 1136 }, { "epoch": 0.30794209173318904, "grad_norm": 0.3190646767616272, "learning_rate": 9.999805741979338e-05, "loss": 1.3518, "step": 1138 }, { "epoch": 0.30848329048843187, "grad_norm": 0.31244540214538574, "learning_rate": 9.999776999934831e-05, "loss": 1.3495, "step": 1140 }, { "epoch": 0.30902448924367476, "grad_norm": 0.3286384344100952, "learning_rate": 9.999746275741261e-05, "loss": 1.3517, "step": 1142 }, { "epoch": 0.3095656879989176, "grad_norm": 0.3630046546459198, "learning_rate": 9.99971356941081e-05, "loss": 1.3641, "step": 1144 }, { "epoch": 0.3101068867541605, "grad_norm": 0.30771151185035706, "learning_rate": 9.999678880956443e-05, "loss": 1.3571, "step": 1146 }, { "epoch": 0.3106480855094033, "grad_norm": 0.30026301741600037, "learning_rate": 9.99964221039191e-05, "loss": 1.3541, "step": 1148 }, { "epoch": 0.3111892842646462, "grad_norm": 0.3128298223018646, "learning_rate": 9.999603557731754e-05, "loss": 1.3556, "step": 1150 }, { "epoch": 0.31173048301988904, "grad_norm": 0.30185452103614807, "learning_rate": 9.999562922991293e-05, "loss": 1.3484, "step": 1152 }, { "epoch": 0.3122716817751319, "grad_norm": 0.3274635076522827, "learning_rate": 9.99952030618664e-05, "loss": 1.3729, "step": 1154 }, { "epoch": 0.31281288053037476, "grad_norm": 0.30549076199531555, "learning_rate": 9.999475707334692e-05, "loss": 1.3642, "step": 1156 }, { "epoch": 0.31335407928561765, "grad_norm": 0.3147718906402588, "learning_rate": 9.999429126453126e-05, "loss": 1.3493, "step": 1158 }, { "epoch": 0.3138952780408605, "grad_norm": 0.6205586791038513, "learning_rate": 9.99938056356041e-05, "loss": 1.3623, "step": 1160 }, { "epoch": 0.31443647679610337, "grad_norm": 0.3471706211566925, "learning_rate": 9.999330018675798e-05, "loss": 1.3533, "step": 1162 }, { "epoch": 0.31497767555134626, "grad_norm": 1.3515815734863281, "learning_rate": 9.999277491819328e-05, "loss": 1.3565, "step": 1164 }, { "epoch": 0.3155188743065891, "grad_norm": 733.9155883789062, "learning_rate": 9.999222983011824e-05, "loss": 5.2143, "step": 1166 }, { "epoch": 0.316060073061832, "grad_norm": 2.9439170360565186, "learning_rate": 9.999166492274894e-05, "loss": 1.4438, "step": 1168 }, { "epoch": 0.3166012718170748, "grad_norm": 1.5871142148971558, "learning_rate": 9.999108019630938e-05, "loss": 1.4426, "step": 1170 }, { "epoch": 0.3171424705723177, "grad_norm": 711.9217529296875, "learning_rate": 9.999047565103132e-05, "loss": 3.6935, "step": 1172 }, { "epoch": 0.31768366932756054, "grad_norm": 100.76264953613281, "learning_rate": 9.998985128715448e-05, "loss": 4.2396, "step": 1174 }, { "epoch": 0.3182248680828034, "grad_norm": 108.88189697265625, "learning_rate": 9.998920710492634e-05, "loss": 4.9929, "step": 1176 }, { "epoch": 0.31876606683804626, "grad_norm": 72.18595123291016, "learning_rate": 9.998854310460233e-05, "loss": 6.0375, "step": 1178 }, { "epoch": 0.31930726559328915, "grad_norm": 59.48538589477539, "learning_rate": 9.998785928644567e-05, "loss": 5.8932, "step": 1180 }, { "epoch": 0.319848464348532, "grad_norm": 36.32703399658203, "learning_rate": 9.998715565072744e-05, "loss": 6.5369, "step": 1182 }, { "epoch": 0.32038966310377487, "grad_norm": 18.565351486206055, "learning_rate": 9.998643219772664e-05, "loss": 6.1671, "step": 1184 }, { "epoch": 0.3209308618590177, "grad_norm": 45.84898376464844, "learning_rate": 9.998568892773003e-05, "loss": 5.9379, "step": 1186 }, { "epoch": 0.3214720606142606, "grad_norm": 66.2480239868164, "learning_rate": 9.998492584103232e-05, "loss": 5.7071, "step": 1188 }, { "epoch": 0.3220132593695034, "grad_norm": 41.693092346191406, "learning_rate": 9.998414293793599e-05, "loss": 6.3198, "step": 1190 }, { "epoch": 0.3225544581247463, "grad_norm": 19.323413848876953, "learning_rate": 9.998334021875147e-05, "loss": 5.377, "step": 1192 }, { "epoch": 0.3230956568799892, "grad_norm": 15.907301902770996, "learning_rate": 9.998251768379696e-05, "loss": 4.5293, "step": 1194 }, { "epoch": 0.32363685563523203, "grad_norm": 80.1374740600586, "learning_rate": 9.998167533339857e-05, "loss": 4.3471, "step": 1196 }, { "epoch": 0.3241780543904749, "grad_norm": 23.298336029052734, "learning_rate": 9.998081316789024e-05, "loss": 3.7461, "step": 1198 }, { "epoch": 0.32471925314571776, "grad_norm": 82.48027801513672, "learning_rate": 9.997993118761378e-05, "loss": 4.1647, "step": 1200 }, { "epoch": 0.32526045190096065, "grad_norm": 27.916913986206055, "learning_rate": 9.997902939291883e-05, "loss": 3.9092, "step": 1202 }, { "epoch": 0.3258016506562035, "grad_norm": 15.70148754119873, "learning_rate": 9.997810778416293e-05, "loss": 3.1628, "step": 1204 }, { "epoch": 0.32634284941144637, "grad_norm": 18.33330535888672, "learning_rate": 9.997716636171142e-05, "loss": 2.8777, "step": 1206 }, { "epoch": 0.3268840481666892, "grad_norm": 10.6620512008667, "learning_rate": 9.997620512593755e-05, "loss": 2.3009, "step": 1208 }, { "epoch": 0.3274252469219321, "grad_norm": 32.01799011230469, "learning_rate": 9.99752240772224e-05, "loss": 1.9617, "step": 1210 }, { "epoch": 0.3279664456771749, "grad_norm": 5.677090644836426, "learning_rate": 9.997422321595488e-05, "loss": 1.8401, "step": 1212 }, { "epoch": 0.3285076444324178, "grad_norm": 8.914667129516602, "learning_rate": 9.997320254253179e-05, "loss": 1.6707, "step": 1214 }, { "epoch": 0.32904884318766064, "grad_norm": 2.3725008964538574, "learning_rate": 9.997216205735779e-05, "loss": 1.5757, "step": 1216 }, { "epoch": 0.32959004194290353, "grad_norm": 2.418389320373535, "learning_rate": 9.997110176084538e-05, "loss": 1.5154, "step": 1218 }, { "epoch": 0.33013124069814637, "grad_norm": 2.802185297012329, "learning_rate": 9.997002165341487e-05, "loss": 1.4883, "step": 1220 }, { "epoch": 0.33067243945338926, "grad_norm": 2.1769211292266846, "learning_rate": 9.996892173549452e-05, "loss": 1.445, "step": 1222 }, { "epoch": 0.33121363820863214, "grad_norm": 1.799670934677124, "learning_rate": 9.996780200752035e-05, "loss": 1.4276, "step": 1224 }, { "epoch": 0.331754836963875, "grad_norm": 3.2545313835144043, "learning_rate": 9.996666246993627e-05, "loss": 1.4394, "step": 1226 }, { "epoch": 0.33229603571911787, "grad_norm": 1.1922351121902466, "learning_rate": 9.996550312319408e-05, "loss": 1.4359, "step": 1228 }, { "epoch": 0.3328372344743607, "grad_norm": 2.6813228130340576, "learning_rate": 9.996432396775339e-05, "loss": 1.4229, "step": 1230 }, { "epoch": 0.3333784332296036, "grad_norm": 1.6968843936920166, "learning_rate": 9.996312500408165e-05, "loss": 1.4281, "step": 1232 }, { "epoch": 0.3339196319848464, "grad_norm": 1.3502254486083984, "learning_rate": 9.996190623265421e-05, "loss": 1.408, "step": 1234 }, { "epoch": 0.3344608307400893, "grad_norm": 1.2809518575668335, "learning_rate": 9.996066765395424e-05, "loss": 1.4176, "step": 1236 }, { "epoch": 0.33500202949533214, "grad_norm": 1.0455057621002197, "learning_rate": 9.995940926847279e-05, "loss": 1.4056, "step": 1238 }, { "epoch": 0.33554322825057503, "grad_norm": 1.3292824029922485, "learning_rate": 9.99581310767087e-05, "loss": 1.4033, "step": 1240 }, { "epoch": 0.33608442700581787, "grad_norm": 1.5960067510604858, "learning_rate": 9.995683307916875e-05, "loss": 1.379, "step": 1242 }, { "epoch": 0.33662562576106075, "grad_norm": 1.0471105575561523, "learning_rate": 9.99555152763675e-05, "loss": 1.3823, "step": 1244 }, { "epoch": 0.3371668245163036, "grad_norm": 2.339273452758789, "learning_rate": 9.99541776688274e-05, "loss": 1.3698, "step": 1246 }, { "epoch": 0.3377080232715465, "grad_norm": 0.81674724817276, "learning_rate": 9.995282025707875e-05, "loss": 1.4154, "step": 1248 }, { "epoch": 0.33824922202678936, "grad_norm": 0.6240290999412537, "learning_rate": 9.995144304165968e-05, "loss": 1.4035, "step": 1250 }, { "epoch": 0.3387904207820322, "grad_norm": 2.281787872314453, "learning_rate": 9.995004602311619e-05, "loss": 1.3906, "step": 1252 }, { "epoch": 0.3393316195372751, "grad_norm": 0.6818395853042603, "learning_rate": 9.99486292020021e-05, "loss": 1.3853, "step": 1254 }, { "epoch": 0.3398728182925179, "grad_norm": 6.299881935119629, "learning_rate": 9.994719257887915e-05, "loss": 1.3856, "step": 1256 }, { "epoch": 0.3404140170477608, "grad_norm": 0.8173750638961792, "learning_rate": 9.994573615431686e-05, "loss": 1.3871, "step": 1258 }, { "epoch": 0.34095521580300364, "grad_norm": 2.155395746231079, "learning_rate": 9.994425992889262e-05, "loss": 1.3382, "step": 1260 }, { "epoch": 0.34149641455824653, "grad_norm": 0.5846114754676819, "learning_rate": 9.99427639031917e-05, "loss": 1.3978, "step": 1262 }, { "epoch": 0.34203761331348936, "grad_norm": 0.6624069213867188, "learning_rate": 9.994124807780717e-05, "loss": 1.3792, "step": 1264 }, { "epoch": 0.34257881206873225, "grad_norm": 0.5708588361740112, "learning_rate": 9.993971245333998e-05, "loss": 1.3677, "step": 1266 }, { "epoch": 0.3431200108239751, "grad_norm": 0.5245474576950073, "learning_rate": 9.993815703039894e-05, "loss": 1.3672, "step": 1268 }, { "epoch": 0.343661209579218, "grad_norm": 0.501871645450592, "learning_rate": 9.993658180960069e-05, "loss": 1.3674, "step": 1270 }, { "epoch": 0.3442024083344608, "grad_norm": 0.5990382432937622, "learning_rate": 9.993498679156969e-05, "loss": 1.3804, "step": 1272 }, { "epoch": 0.3447436070897037, "grad_norm": 0.42392146587371826, "learning_rate": 9.993337197693833e-05, "loss": 1.3628, "step": 1274 }, { "epoch": 0.34528480584494653, "grad_norm": 0.46936917304992676, "learning_rate": 9.993173736634676e-05, "loss": 1.3696, "step": 1276 }, { "epoch": 0.3458260046001894, "grad_norm": 0.52222740650177, "learning_rate": 9.993008296044304e-05, "loss": 1.3697, "step": 1278 }, { "epoch": 0.3463672033554323, "grad_norm": 0.3582518398761749, "learning_rate": 9.992840875988305e-05, "loss": 1.3825, "step": 1280 }, { "epoch": 0.34690840211067514, "grad_norm": 0.3533988296985626, "learning_rate": 9.99267147653305e-05, "loss": 1.361, "step": 1282 }, { "epoch": 0.34744960086591803, "grad_norm": 0.35905274748802185, "learning_rate": 9.992500097745702e-05, "loss": 1.3721, "step": 1284 }, { "epoch": 0.34799079962116086, "grad_norm": 0.3057416081428528, "learning_rate": 9.9923267396942e-05, "loss": 1.369, "step": 1286 }, { "epoch": 0.34853199837640375, "grad_norm": 0.3299311101436615, "learning_rate": 9.992151402447272e-05, "loss": 1.358, "step": 1288 }, { "epoch": 0.3490731971316466, "grad_norm": 0.3086453080177307, "learning_rate": 9.99197408607443e-05, "loss": 1.3534, "step": 1290 }, { "epoch": 0.3496143958868895, "grad_norm": 0.3111782968044281, "learning_rate": 9.991794790645969e-05, "loss": 1.3605, "step": 1292 }, { "epoch": 0.3501555946421323, "grad_norm": 0.3231568932533264, "learning_rate": 9.991613516232974e-05, "loss": 1.3543, "step": 1294 }, { "epoch": 0.3506967933973752, "grad_norm": 0.3288814425468445, "learning_rate": 9.991430262907309e-05, "loss": 1.3521, "step": 1296 }, { "epoch": 0.35123799215261803, "grad_norm": 0.3239436745643616, "learning_rate": 9.991245030741622e-05, "loss": 1.3335, "step": 1298 }, { "epoch": 0.3517791909078609, "grad_norm": 0.3560773730278015, "learning_rate": 9.991057819809353e-05, "loss": 1.3487, "step": 1300 }, { "epoch": 0.35232038966310375, "grad_norm": 0.4387347400188446, "learning_rate": 9.990868630184716e-05, "loss": 1.3548, "step": 1302 }, { "epoch": 0.35286158841834664, "grad_norm": 0.32067278027534485, "learning_rate": 9.990677461942717e-05, "loss": 1.3471, "step": 1304 }, { "epoch": 0.3534027871735895, "grad_norm": 0.4399580955505371, "learning_rate": 9.990484315159146e-05, "loss": 1.3588, "step": 1306 }, { "epoch": 0.35394398592883236, "grad_norm": 0.9175602793693542, "learning_rate": 9.990289189910571e-05, "loss": 1.3432, "step": 1308 }, { "epoch": 0.35448518468407525, "grad_norm": 0.45273318886756897, "learning_rate": 9.990092086274352e-05, "loss": 1.3434, "step": 1310 }, { "epoch": 0.3550263834393181, "grad_norm": 0.3346487879753113, "learning_rate": 9.989893004328632e-05, "loss": 1.3339, "step": 1312 }, { "epoch": 0.35556758219456097, "grad_norm": 0.4779951870441437, "learning_rate": 9.989691944152333e-05, "loss": 1.3561, "step": 1314 }, { "epoch": 0.3561087809498038, "grad_norm": 0.6359366774559021, "learning_rate": 9.989488905825166e-05, "loss": 1.3499, "step": 1316 }, { "epoch": 0.3566499797050467, "grad_norm": 0.5867050290107727, "learning_rate": 9.989283889427625e-05, "loss": 1.3791, "step": 1318 }, { "epoch": 0.3571911784602895, "grad_norm": 1.869691014289856, "learning_rate": 9.989076895040989e-05, "loss": 1.3663, "step": 1320 }, { "epoch": 0.3577323772155324, "grad_norm": 2.7147843837738037, "learning_rate": 9.98886792274732e-05, "loss": 1.358, "step": 1322 }, { "epoch": 0.35827357597077525, "grad_norm": 0.8717885613441467, "learning_rate": 9.988656972629465e-05, "loss": 1.34, "step": 1324 }, { "epoch": 0.35881477472601814, "grad_norm": 0.7126337885856628, "learning_rate": 9.988444044771054e-05, "loss": 1.3281, "step": 1326 }, { "epoch": 0.35935597348126097, "grad_norm": 0.7409217357635498, "learning_rate": 9.988229139256502e-05, "loss": 1.3571, "step": 1328 }, { "epoch": 0.35989717223650386, "grad_norm": 0.5892549157142639, "learning_rate": 9.988012256171006e-05, "loss": 1.3269, "step": 1330 }, { "epoch": 0.3604383709917467, "grad_norm": 0.4858717620372772, "learning_rate": 9.98779339560055e-05, "loss": 1.3506, "step": 1332 }, { "epoch": 0.3609795697469896, "grad_norm": 0.37409740686416626, "learning_rate": 9.987572557631903e-05, "loss": 1.3339, "step": 1334 }, { "epoch": 0.36152076850223247, "grad_norm": 0.38315168023109436, "learning_rate": 9.987349742352611e-05, "loss": 1.3404, "step": 1336 }, { "epoch": 0.3620619672574753, "grad_norm": 0.32702726125717163, "learning_rate": 9.987124949851014e-05, "loss": 1.3595, "step": 1338 }, { "epoch": 0.3626031660127182, "grad_norm": 0.3133656680583954, "learning_rate": 9.986898180216226e-05, "loss": 1.3428, "step": 1340 }, { "epoch": 0.363144364767961, "grad_norm": 0.2916230857372284, "learning_rate": 9.986669433538152e-05, "loss": 1.3381, "step": 1342 }, { "epoch": 0.3636855635232039, "grad_norm": 0.28036215901374817, "learning_rate": 9.986438709907476e-05, "loss": 1.3447, "step": 1344 }, { "epoch": 0.36422676227844675, "grad_norm": 0.30352699756622314, "learning_rate": 9.98620600941567e-05, "loss": 1.3427, "step": 1346 }, { "epoch": 0.36476796103368964, "grad_norm": 0.3100769519805908, "learning_rate": 9.985971332154984e-05, "loss": 1.3603, "step": 1348 }, { "epoch": 0.36530915978893247, "grad_norm": 0.2933647930622101, "learning_rate": 9.98573467821846e-05, "loss": 1.3646, "step": 1350 }, { "epoch": 0.36585035854417536, "grad_norm": 0.2938663959503174, "learning_rate": 9.985496047699916e-05, "loss": 1.3763, "step": 1352 }, { "epoch": 0.3663915572994182, "grad_norm": 0.2916519343852997, "learning_rate": 9.985255440693955e-05, "loss": 1.3431, "step": 1354 }, { "epoch": 0.3669327560546611, "grad_norm": 0.2954147756099701, "learning_rate": 9.985012857295968e-05, "loss": 1.338, "step": 1356 }, { "epoch": 0.3674739548099039, "grad_norm": 0.2839341163635254, "learning_rate": 9.984768297602125e-05, "loss": 1.3653, "step": 1358 }, { "epoch": 0.3680151535651468, "grad_norm": 0.2878473699092865, "learning_rate": 9.984521761709382e-05, "loss": 1.3302, "step": 1360 }, { "epoch": 0.3685563523203897, "grad_norm": 0.2859325408935547, "learning_rate": 9.984273249715478e-05, "loss": 1.3273, "step": 1362 }, { "epoch": 0.3690975510756325, "grad_norm": 0.28399959206581116, "learning_rate": 9.984022761718933e-05, "loss": 1.3516, "step": 1364 }, { "epoch": 0.3696387498308754, "grad_norm": 0.29740169644355774, "learning_rate": 9.983770297819052e-05, "loss": 1.3389, "step": 1366 }, { "epoch": 0.37017994858611825, "grad_norm": 0.3143361806869507, "learning_rate": 9.983515858115928e-05, "loss": 1.3557, "step": 1368 }, { "epoch": 0.37072114734136113, "grad_norm": 0.30783936381340027, "learning_rate": 9.983259442710429e-05, "loss": 1.3498, "step": 1370 }, { "epoch": 0.37126234609660397, "grad_norm": 0.297091543674469, "learning_rate": 9.983001051704211e-05, "loss": 1.3308, "step": 1372 }, { "epoch": 0.37180354485184686, "grad_norm": 0.3118893504142761, "learning_rate": 9.982740685199712e-05, "loss": 1.3372, "step": 1374 }, { "epoch": 0.3723447436070897, "grad_norm": 0.2826865017414093, "learning_rate": 9.982478343300155e-05, "loss": 1.3488, "step": 1376 }, { "epoch": 0.3728859423623326, "grad_norm": 0.2829175889492035, "learning_rate": 9.982214026109544e-05, "loss": 1.3693, "step": 1378 }, { "epoch": 0.3734271411175754, "grad_norm": 0.3026389479637146, "learning_rate": 9.981947733732668e-05, "loss": 1.3276, "step": 1380 }, { "epoch": 0.3739683398728183, "grad_norm": 0.30112889409065247, "learning_rate": 9.981679466275096e-05, "loss": 1.3441, "step": 1382 }, { "epoch": 0.37450953862806113, "grad_norm": 0.27241262793540955, "learning_rate": 9.981409223843183e-05, "loss": 1.3373, "step": 1384 }, { "epoch": 0.375050737383304, "grad_norm": 0.2804114520549774, "learning_rate": 9.981137006544066e-05, "loss": 1.344, "step": 1386 }, { "epoch": 0.37559193613854686, "grad_norm": 0.27698764204978943, "learning_rate": 9.980862814485665e-05, "loss": 1.3543, "step": 1388 }, { "epoch": 0.37613313489378974, "grad_norm": 0.29283177852630615, "learning_rate": 9.980586647776681e-05, "loss": 1.3332, "step": 1390 }, { "epoch": 0.37667433364903263, "grad_norm": 0.2896028459072113, "learning_rate": 9.980308506526604e-05, "loss": 1.3392, "step": 1392 }, { "epoch": 0.37721553240427547, "grad_norm": 0.27882838249206543, "learning_rate": 9.980028390845697e-05, "loss": 1.336, "step": 1394 }, { "epoch": 0.37775673115951836, "grad_norm": 0.2886262834072113, "learning_rate": 9.979746300845015e-05, "loss": 1.3331, "step": 1396 }, { "epoch": 0.3782979299147612, "grad_norm": 0.3085189163684845, "learning_rate": 9.97946223663639e-05, "loss": 1.3296, "step": 1398 }, { "epoch": 0.3788391286700041, "grad_norm": 0.3342386484146118, "learning_rate": 9.97917619833244e-05, "loss": 1.351, "step": 1400 }, { "epoch": 0.3793803274252469, "grad_norm": 0.3263756036758423, "learning_rate": 9.978888186046562e-05, "loss": 1.3526, "step": 1402 }, { "epoch": 0.3799215261804898, "grad_norm": 0.292346715927124, "learning_rate": 9.97859819989294e-05, "loss": 1.3498, "step": 1404 }, { "epoch": 0.38046272493573263, "grad_norm": 0.29072263836860657, "learning_rate": 9.978306239986536e-05, "loss": 1.3423, "step": 1406 }, { "epoch": 0.3810039236909755, "grad_norm": 0.3350834548473358, "learning_rate": 9.978012306443101e-05, "loss": 1.3559, "step": 1408 }, { "epoch": 0.38154512244621835, "grad_norm": 0.28721559047698975, "learning_rate": 9.977716399379157e-05, "loss": 1.3294, "step": 1410 }, { "epoch": 0.38208632120146124, "grad_norm": 0.3062276244163513, "learning_rate": 9.977418518912023e-05, "loss": 1.3457, "step": 1412 }, { "epoch": 0.3826275199567041, "grad_norm": 0.30255332589149475, "learning_rate": 9.977118665159791e-05, "loss": 1.3371, "step": 1414 }, { "epoch": 0.38316871871194697, "grad_norm": 0.2800199091434479, "learning_rate": 9.976816838241334e-05, "loss": 1.3439, "step": 1416 }, { "epoch": 0.3837099174671898, "grad_norm": 0.2754746675491333, "learning_rate": 9.976513038276312e-05, "loss": 1.3303, "step": 1418 }, { "epoch": 0.3842511162224327, "grad_norm": 0.29933616518974304, "learning_rate": 9.976207265385168e-05, "loss": 1.3365, "step": 1420 }, { "epoch": 0.3847923149776756, "grad_norm": 0.3023386001586914, "learning_rate": 9.975899519689122e-05, "loss": 1.3164, "step": 1422 }, { "epoch": 0.3853335137329184, "grad_norm": 0.2901383936405182, "learning_rate": 9.975589801310181e-05, "loss": 1.3209, "step": 1424 }, { "epoch": 0.3858747124881613, "grad_norm": 0.28566035628318787, "learning_rate": 9.975278110371131e-05, "loss": 1.3301, "step": 1426 }, { "epoch": 0.38641591124340413, "grad_norm": 0.3010505735874176, "learning_rate": 9.974964446995543e-05, "loss": 1.319, "step": 1428 }, { "epoch": 0.386957109998647, "grad_norm": 0.2977135479450226, "learning_rate": 9.974648811307766e-05, "loss": 1.3311, "step": 1430 }, { "epoch": 0.38749830875388985, "grad_norm": 0.28914034366607666, "learning_rate": 9.974331203432932e-05, "loss": 1.343, "step": 1432 }, { "epoch": 0.38803950750913274, "grad_norm": 0.2842980623245239, "learning_rate": 9.974011623496958e-05, "loss": 1.3162, "step": 1434 }, { "epoch": 0.3885807062643756, "grad_norm": 0.3048929274082184, "learning_rate": 9.97369007162654e-05, "loss": 1.3166, "step": 1436 }, { "epoch": 0.38912190501961846, "grad_norm": 0.3024531304836273, "learning_rate": 9.973366547949157e-05, "loss": 1.3156, "step": 1438 }, { "epoch": 0.3896631037748613, "grad_norm": 0.2911103367805481, "learning_rate": 9.973041052593068e-05, "loss": 1.3314, "step": 1440 }, { "epoch": 0.3902043025301042, "grad_norm": 0.30932334065437317, "learning_rate": 9.972713585687317e-05, "loss": 1.3144, "step": 1442 }, { "epoch": 0.390745501285347, "grad_norm": 0.302971750497818, "learning_rate": 9.972384147361725e-05, "loss": 1.3431, "step": 1444 }, { "epoch": 0.3912867000405899, "grad_norm": 0.32412296533584595, "learning_rate": 9.972052737746898e-05, "loss": 1.3167, "step": 1446 }, { "epoch": 0.3918278987958328, "grad_norm": 0.4637945890426636, "learning_rate": 9.97171935697422e-05, "loss": 1.3433, "step": 1448 }, { "epoch": 0.39236909755107563, "grad_norm": 0.32690081000328064, "learning_rate": 9.971384005175864e-05, "loss": 1.3327, "step": 1450 }, { "epoch": 0.3929102963063185, "grad_norm": 0.3049994111061096, "learning_rate": 9.971046682484776e-05, "loss": 1.3401, "step": 1452 }, { "epoch": 0.39345149506156135, "grad_norm": 0.306095689535141, "learning_rate": 9.970707389034688e-05, "loss": 1.3205, "step": 1454 }, { "epoch": 0.39399269381680424, "grad_norm": 0.3375592529773712, "learning_rate": 9.970366124960111e-05, "loss": 1.3243, "step": 1456 }, { "epoch": 0.3945338925720471, "grad_norm": 0.30508387088775635, "learning_rate": 9.970022890396338e-05, "loss": 1.3342, "step": 1458 }, { "epoch": 0.39507509132728996, "grad_norm": 0.2996918261051178, "learning_rate": 9.969677685479444e-05, "loss": 1.3457, "step": 1460 }, { "epoch": 0.3956162900825328, "grad_norm": 0.29500269889831543, "learning_rate": 9.969330510346286e-05, "loss": 1.3306, "step": 1462 }, { "epoch": 0.3961574888377757, "grad_norm": 0.28392598032951355, "learning_rate": 9.9689813651345e-05, "loss": 1.3347, "step": 1464 }, { "epoch": 0.3966986875930185, "grad_norm": 0.2859434485435486, "learning_rate": 9.968630249982503e-05, "loss": 1.3342, "step": 1466 }, { "epoch": 0.3972398863482614, "grad_norm": 0.3038876950740814, "learning_rate": 9.968277165029494e-05, "loss": 1.3248, "step": 1468 }, { "epoch": 0.39778108510350424, "grad_norm": 0.3060581088066101, "learning_rate": 9.967922110415454e-05, "loss": 1.3403, "step": 1470 }, { "epoch": 0.39832228385874713, "grad_norm": 0.30475133657455444, "learning_rate": 9.96756508628114e-05, "loss": 1.3338, "step": 1472 }, { "epoch": 0.39886348261398996, "grad_norm": 0.33263343572616577, "learning_rate": 9.967206092768095e-05, "loss": 1.3209, "step": 1474 }, { "epoch": 0.39940468136923285, "grad_norm": 0.2895435094833374, "learning_rate": 9.966845130018645e-05, "loss": 1.3352, "step": 1476 }, { "epoch": 0.39994588012447574, "grad_norm": 0.27237775921821594, "learning_rate": 9.966482198175886e-05, "loss": 1.3239, "step": 1478 }, { "epoch": 0.40048707887971857, "grad_norm": 0.2740168571472168, "learning_rate": 9.966117297383707e-05, "loss": 1.3371, "step": 1480 }, { "epoch": 0.40102827763496146, "grad_norm": 0.30601269006729126, "learning_rate": 9.965750427786768e-05, "loss": 1.343, "step": 1482 }, { "epoch": 0.4015694763902043, "grad_norm": 0.28768840432167053, "learning_rate": 9.965381589530518e-05, "loss": 1.3442, "step": 1484 }, { "epoch": 0.4021106751454472, "grad_norm": 0.28244882822036743, "learning_rate": 9.965010782761177e-05, "loss": 1.3336, "step": 1486 }, { "epoch": 0.40265187390069, "grad_norm": 0.2694818079471588, "learning_rate": 9.964638007625754e-05, "loss": 1.3448, "step": 1488 }, { "epoch": 0.4031930726559329, "grad_norm": 0.29507288336753845, "learning_rate": 9.964263264272033e-05, "loss": 1.327, "step": 1490 }, { "epoch": 0.40373427141117574, "grad_norm": 0.3036315143108368, "learning_rate": 9.963886552848581e-05, "loss": 1.3289, "step": 1492 }, { "epoch": 0.4042754701664186, "grad_norm": 0.2737107574939728, "learning_rate": 9.963507873504744e-05, "loss": 1.3281, "step": 1494 }, { "epoch": 0.40481666892166146, "grad_norm": 0.29833105206489563, "learning_rate": 9.963127226390647e-05, "loss": 1.3378, "step": 1496 }, { "epoch": 0.40535786767690435, "grad_norm": 0.32203689217567444, "learning_rate": 9.9627446116572e-05, "loss": 1.3158, "step": 1498 }, { "epoch": 0.4058990664321472, "grad_norm": 0.27837038040161133, "learning_rate": 9.962360029456086e-05, "loss": 1.3051, "step": 1500 }, { "epoch": 0.40644026518739007, "grad_norm": 0.2688932418823242, "learning_rate": 9.961973479939774e-05, "loss": 1.339, "step": 1502 }, { "epoch": 0.40698146394263296, "grad_norm": 0.2779388725757599, "learning_rate": 9.96158496326151e-05, "loss": 1.3264, "step": 1504 }, { "epoch": 0.4075226626978758, "grad_norm": 0.27401190996170044, "learning_rate": 9.961194479575321e-05, "loss": 1.3139, "step": 1506 }, { "epoch": 0.4080638614531187, "grad_norm": 0.270448237657547, "learning_rate": 9.960802029036012e-05, "loss": 1.3253, "step": 1508 }, { "epoch": 0.4086050602083615, "grad_norm": 0.29150158166885376, "learning_rate": 9.96040761179917e-05, "loss": 1.3324, "step": 1510 }, { "epoch": 0.4091462589636044, "grad_norm": 0.2666511833667755, "learning_rate": 9.960011228021159e-05, "loss": 1.325, "step": 1512 }, { "epoch": 0.40968745771884724, "grad_norm": 0.2782241106033325, "learning_rate": 9.959612877859125e-05, "loss": 1.3162, "step": 1514 }, { "epoch": 0.4102286564740901, "grad_norm": 0.2845720946788788, "learning_rate": 9.959212561470996e-05, "loss": 1.3316, "step": 1516 }, { "epoch": 0.41076985522933296, "grad_norm": 0.27991780638694763, "learning_rate": 9.958810279015473e-05, "loss": 1.3121, "step": 1518 }, { "epoch": 0.41131105398457585, "grad_norm": 0.2804965674877167, "learning_rate": 9.958406030652043e-05, "loss": 1.3246, "step": 1520 }, { "epoch": 0.4118522527398187, "grad_norm": 0.2732795178890228, "learning_rate": 9.957999816540965e-05, "loss": 1.3217, "step": 1522 }, { "epoch": 0.41239345149506157, "grad_norm": 0.28181079030036926, "learning_rate": 9.957591636843284e-05, "loss": 1.3374, "step": 1524 }, { "epoch": 0.4129346502503044, "grad_norm": 0.3096240162849426, "learning_rate": 9.957181491720822e-05, "loss": 1.3324, "step": 1526 }, { "epoch": 0.4134758490055473, "grad_norm": 0.2709742486476898, "learning_rate": 9.95676938133618e-05, "loss": 1.3055, "step": 1528 }, { "epoch": 0.4140170477607901, "grad_norm": 0.27309080958366394, "learning_rate": 9.956355305852736e-05, "loss": 1.313, "step": 1530 }, { "epoch": 0.414558246516033, "grad_norm": 0.29801151156425476, "learning_rate": 9.955939265434652e-05, "loss": 1.3185, "step": 1532 }, { "epoch": 0.4150994452712759, "grad_norm": 0.28698021173477173, "learning_rate": 9.955521260246865e-05, "loss": 1.3214, "step": 1534 }, { "epoch": 0.41564064402651874, "grad_norm": 0.2641914188861847, "learning_rate": 9.955101290455093e-05, "loss": 1.317, "step": 1536 }, { "epoch": 0.4161818427817616, "grad_norm": 0.26065558195114136, "learning_rate": 9.954679356225832e-05, "loss": 1.3253, "step": 1538 }, { "epoch": 0.41672304153700446, "grad_norm": 0.27157294750213623, "learning_rate": 9.954255457726354e-05, "loss": 1.3218, "step": 1540 }, { "epoch": 0.41726424029224735, "grad_norm": 0.2833496630191803, "learning_rate": 9.953829595124715e-05, "loss": 1.32, "step": 1542 }, { "epoch": 0.4178054390474902, "grad_norm": 0.2757824659347534, "learning_rate": 9.953401768589745e-05, "loss": 1.3165, "step": 1544 }, { "epoch": 0.41834663780273307, "grad_norm": 0.2609362304210663, "learning_rate": 9.952971978291059e-05, "loss": 1.3229, "step": 1546 }, { "epoch": 0.4188878365579759, "grad_norm": 0.2863214313983917, "learning_rate": 9.952540224399043e-05, "loss": 1.3217, "step": 1548 }, { "epoch": 0.4194290353132188, "grad_norm": 0.27573657035827637, "learning_rate": 9.952106507084864e-05, "loss": 1.3151, "step": 1550 }, { "epoch": 0.4199702340684616, "grad_norm": 0.26843398809432983, "learning_rate": 9.95167082652047e-05, "loss": 1.3185, "step": 1552 }, { "epoch": 0.4205114328237045, "grad_norm": 0.25903749465942383, "learning_rate": 9.951233182878585e-05, "loss": 1.3142, "step": 1554 }, { "epoch": 0.42105263157894735, "grad_norm": 0.27221450209617615, "learning_rate": 9.950793576332713e-05, "loss": 1.3119, "step": 1556 }, { "epoch": 0.42159383033419023, "grad_norm": 0.2897038161754608, "learning_rate": 9.950352007057134e-05, "loss": 1.3217, "step": 1558 }, { "epoch": 0.42213502908943307, "grad_norm": 0.2515231668949127, "learning_rate": 9.949908475226905e-05, "loss": 1.3263, "step": 1560 }, { "epoch": 0.42267622784467596, "grad_norm": 0.26686710119247437, "learning_rate": 9.949462981017865e-05, "loss": 1.3269, "step": 1562 }, { "epoch": 0.42321742659991884, "grad_norm": 0.2747204899787903, "learning_rate": 9.949015524606629e-05, "loss": 1.3208, "step": 1564 }, { "epoch": 0.4237586253551617, "grad_norm": 0.25866395235061646, "learning_rate": 9.948566106170589e-05, "loss": 1.3273, "step": 1566 }, { "epoch": 0.42429982411040457, "grad_norm": 0.2659189999103546, "learning_rate": 9.948114725887918e-05, "loss": 1.2955, "step": 1568 }, { "epoch": 0.4248410228656474, "grad_norm": 0.25262853503227234, "learning_rate": 9.947661383937563e-05, "loss": 1.284, "step": 1570 }, { "epoch": 0.4253822216208903, "grad_norm": 0.24780422449111938, "learning_rate": 9.94720608049925e-05, "loss": 1.3168, "step": 1572 }, { "epoch": 0.4259234203761331, "grad_norm": 0.2663845121860504, "learning_rate": 9.946748815753484e-05, "loss": 1.313, "step": 1574 }, { "epoch": 0.426464619131376, "grad_norm": 0.2906511425971985, "learning_rate": 9.946289589881545e-05, "loss": 1.3197, "step": 1576 }, { "epoch": 0.42700581788661884, "grad_norm": 0.28401264548301697, "learning_rate": 9.945828403065493e-05, "loss": 1.3254, "step": 1578 }, { "epoch": 0.42754701664186173, "grad_norm": 0.27820122241973877, "learning_rate": 9.945365255488164e-05, "loss": 1.3153, "step": 1580 }, { "epoch": 0.42808821539710457, "grad_norm": 0.2573559880256653, "learning_rate": 9.944900147333173e-05, "loss": 1.3144, "step": 1582 }, { "epoch": 0.42862941415234745, "grad_norm": 0.2536357343196869, "learning_rate": 9.944433078784909e-05, "loss": 1.3172, "step": 1584 }, { "epoch": 0.4291706129075903, "grad_norm": 0.2745160758495331, "learning_rate": 9.94396405002854e-05, "loss": 1.3023, "step": 1586 }, { "epoch": 0.4297118116628332, "grad_norm": 0.290393203496933, "learning_rate": 9.943493061250013e-05, "loss": 1.3095, "step": 1588 }, { "epoch": 0.43025301041807607, "grad_norm": 0.29357218742370605, "learning_rate": 9.94302011263605e-05, "loss": 1.3232, "step": 1590 }, { "epoch": 0.4307942091733189, "grad_norm": 0.2756180167198181, "learning_rate": 9.94254520437415e-05, "loss": 1.3179, "step": 1592 }, { "epoch": 0.4313354079285618, "grad_norm": 0.30225417017936707, "learning_rate": 9.942068336652589e-05, "loss": 1.3353, "step": 1594 }, { "epoch": 0.4318766066838046, "grad_norm": 0.26694637537002563, "learning_rate": 9.94158950966042e-05, "loss": 1.318, "step": 1596 }, { "epoch": 0.4324178054390475, "grad_norm": 0.2528863549232483, "learning_rate": 9.941108723587471e-05, "loss": 1.3282, "step": 1598 }, { "epoch": 0.43295900419429034, "grad_norm": 0.25261232256889343, "learning_rate": 9.940625978624353e-05, "loss": 1.3178, "step": 1600 }, { "epoch": 0.43350020294953323, "grad_norm": 0.2624775767326355, "learning_rate": 9.940141274962444e-05, "loss": 1.31, "step": 1602 }, { "epoch": 0.43404140170477606, "grad_norm": 0.260810524225235, "learning_rate": 9.939654612793908e-05, "loss": 1.3162, "step": 1604 }, { "epoch": 0.43458260046001895, "grad_norm": 0.2815745174884796, "learning_rate": 9.939165992311676e-05, "loss": 1.3112, "step": 1606 }, { "epoch": 0.4351237992152618, "grad_norm": 0.2773973345756531, "learning_rate": 9.938675413709466e-05, "loss": 1.3, "step": 1608 }, { "epoch": 0.4356649979705047, "grad_norm": 0.26486915349960327, "learning_rate": 9.938182877181763e-05, "loss": 1.3193, "step": 1610 }, { "epoch": 0.4362061967257475, "grad_norm": 0.26103830337524414, "learning_rate": 9.937688382923832e-05, "loss": 1.3244, "step": 1612 }, { "epoch": 0.4367473954809904, "grad_norm": 0.2556493878364563, "learning_rate": 9.937191931131716e-05, "loss": 1.3087, "step": 1614 }, { "epoch": 0.43728859423623323, "grad_norm": 0.2739090919494629, "learning_rate": 9.93669352200223e-05, "loss": 1.3009, "step": 1616 }, { "epoch": 0.4378297929914761, "grad_norm": 0.26297444105148315, "learning_rate": 9.936193155732967e-05, "loss": 1.2971, "step": 1618 }, { "epoch": 0.438370991746719, "grad_norm": 0.2587411403656006, "learning_rate": 9.935690832522297e-05, "loss": 1.3259, "step": 1620 }, { "epoch": 0.43891219050196184, "grad_norm": 0.2419731616973877, "learning_rate": 9.935186552569366e-05, "loss": 1.3123, "step": 1622 }, { "epoch": 0.43945338925720473, "grad_norm": 0.27424389123916626, "learning_rate": 9.934680316074092e-05, "loss": 1.3196, "step": 1624 }, { "epoch": 0.43999458801244756, "grad_norm": 0.258242666721344, "learning_rate": 9.934172123237173e-05, "loss": 1.3044, "step": 1626 }, { "epoch": 0.44053578676769045, "grad_norm": 0.2621035575866699, "learning_rate": 9.933661974260078e-05, "loss": 1.3111, "step": 1628 }, { "epoch": 0.4410769855229333, "grad_norm": 0.25349390506744385, "learning_rate": 9.93314986934506e-05, "loss": 1.3025, "step": 1630 }, { "epoch": 0.4416181842781762, "grad_norm": 0.2615620195865631, "learning_rate": 9.932635808695136e-05, "loss": 1.3291, "step": 1632 }, { "epoch": 0.442159383033419, "grad_norm": 0.2933880686759949, "learning_rate": 9.932119792514105e-05, "loss": 1.3327, "step": 1634 }, { "epoch": 0.4427005817886619, "grad_norm": 0.2584700286388397, "learning_rate": 9.931601821006544e-05, "loss": 1.3031, "step": 1636 }, { "epoch": 0.44324178054390473, "grad_norm": 0.2718084156513214, "learning_rate": 9.931081894377797e-05, "loss": 1.3053, "step": 1638 }, { "epoch": 0.4437829792991476, "grad_norm": 0.27105703949928284, "learning_rate": 9.93056001283399e-05, "loss": 1.3012, "step": 1640 }, { "epoch": 0.44432417805439045, "grad_norm": 0.27265292406082153, "learning_rate": 9.930036176582021e-05, "loss": 1.2957, "step": 1642 }, { "epoch": 0.44486537680963334, "grad_norm": 0.26121169328689575, "learning_rate": 9.929510385829564e-05, "loss": 1.3062, "step": 1644 }, { "epoch": 0.44540657556487623, "grad_norm": 0.26841971278190613, "learning_rate": 9.928982640785067e-05, "loss": 1.3192, "step": 1646 }, { "epoch": 0.44594777432011906, "grad_norm": 0.27634862065315247, "learning_rate": 9.928452941657755e-05, "loss": 1.3005, "step": 1648 }, { "epoch": 0.44648897307536195, "grad_norm": 0.25527122616767883, "learning_rate": 9.927921288657623e-05, "loss": 1.3121, "step": 1650 }, { "epoch": 0.4470301718306048, "grad_norm": 0.2733294665813446, "learning_rate": 9.927387681995443e-05, "loss": 1.3051, "step": 1652 }, { "epoch": 0.44757137058584767, "grad_norm": 0.2783257067203522, "learning_rate": 9.926852121882766e-05, "loss": 1.2947, "step": 1654 }, { "epoch": 0.4481125693410905, "grad_norm": 0.2672583758831024, "learning_rate": 9.926314608531911e-05, "loss": 1.3272, "step": 1656 }, { "epoch": 0.4486537680963334, "grad_norm": 0.2568219304084778, "learning_rate": 9.925775142155974e-05, "loss": 1.3025, "step": 1658 }, { "epoch": 0.4491949668515762, "grad_norm": 0.2576539218425751, "learning_rate": 9.925233722968826e-05, "loss": 1.2715, "step": 1660 }, { "epoch": 0.4497361656068191, "grad_norm": 0.25898897647857666, "learning_rate": 9.924690351185109e-05, "loss": 1.3039, "step": 1662 }, { "epoch": 0.45027736436206195, "grad_norm": 0.25795668363571167, "learning_rate": 9.924145027020242e-05, "loss": 1.3115, "step": 1664 }, { "epoch": 0.45081856311730484, "grad_norm": 0.2781166136264801, "learning_rate": 9.92359775069042e-05, "loss": 1.3017, "step": 1666 }, { "epoch": 0.45135976187254767, "grad_norm": 0.2871512770652771, "learning_rate": 9.923048522412608e-05, "loss": 1.3206, "step": 1668 }, { "epoch": 0.45190096062779056, "grad_norm": 0.27760595083236694, "learning_rate": 9.922497342404544e-05, "loss": 1.3214, "step": 1670 }, { "epoch": 0.4524421593830334, "grad_norm": 0.26959067583084106, "learning_rate": 9.921944210884746e-05, "loss": 1.3144, "step": 1672 }, { "epoch": 0.4529833581382763, "grad_norm": 0.2662011384963989, "learning_rate": 9.921389128072498e-05, "loss": 1.3022, "step": 1674 }, { "epoch": 0.45352455689351917, "grad_norm": 0.28014811873435974, "learning_rate": 9.920832094187861e-05, "loss": 1.3104, "step": 1676 }, { "epoch": 0.454065755648762, "grad_norm": 0.2560974955558777, "learning_rate": 9.920273109451673e-05, "loss": 1.3113, "step": 1678 }, { "epoch": 0.4546069544040049, "grad_norm": 0.285339891910553, "learning_rate": 9.91971217408554e-05, "loss": 1.3126, "step": 1680 }, { "epoch": 0.4551481531592477, "grad_norm": 0.29105204343795776, "learning_rate": 9.919149288311843e-05, "loss": 1.3248, "step": 1682 }, { "epoch": 0.4556893519144906, "grad_norm": 0.2868146002292633, "learning_rate": 9.918584452353739e-05, "loss": 1.3217, "step": 1684 }, { "epoch": 0.45623055066973345, "grad_norm": 0.26717278361320496, "learning_rate": 9.918017666435152e-05, "loss": 1.2991, "step": 1686 }, { "epoch": 0.45677174942497634, "grad_norm": 0.2560403048992157, "learning_rate": 9.917448930780786e-05, "loss": 1.3091, "step": 1688 }, { "epoch": 0.45731294818021917, "grad_norm": 0.2610042989253998, "learning_rate": 9.916878245616114e-05, "loss": 1.2948, "step": 1690 }, { "epoch": 0.45785414693546206, "grad_norm": 0.27322304248809814, "learning_rate": 9.916305611167382e-05, "loss": 1.3121, "step": 1692 }, { "epoch": 0.4583953456907049, "grad_norm": 0.26559844613075256, "learning_rate": 9.91573102766161e-05, "loss": 1.307, "step": 1694 }, { "epoch": 0.4589365444459478, "grad_norm": 0.2677384316921234, "learning_rate": 9.91515449532659e-05, "loss": 1.2925, "step": 1696 }, { "epoch": 0.4594777432011906, "grad_norm": 0.2670448422431946, "learning_rate": 9.914576014390888e-05, "loss": 1.3051, "step": 1698 }, { "epoch": 0.4600189419564335, "grad_norm": 0.2537919878959656, "learning_rate": 9.91399558508384e-05, "loss": 1.3047, "step": 1700 }, { "epoch": 0.46056014071167634, "grad_norm": 0.2712916433811188, "learning_rate": 9.913413207635555e-05, "loss": 1.2949, "step": 1702 }, { "epoch": 0.4611013394669192, "grad_norm": 0.27910125255584717, "learning_rate": 9.912828882276917e-05, "loss": 1.336, "step": 1704 }, { "epoch": 0.4616425382221621, "grad_norm": 0.25917065143585205, "learning_rate": 9.91224260923958e-05, "loss": 1.2938, "step": 1706 }, { "epoch": 0.46218373697740495, "grad_norm": 0.265024334192276, "learning_rate": 9.91165438875597e-05, "loss": 1.2876, "step": 1708 }, { "epoch": 0.46272493573264784, "grad_norm": 0.2637651860713959, "learning_rate": 9.911064221059286e-05, "loss": 1.3128, "step": 1710 }, { "epoch": 0.46326613448789067, "grad_norm": 0.25448864698410034, "learning_rate": 9.910472106383495e-05, "loss": 1.3289, "step": 1712 }, { "epoch": 0.46380733324313356, "grad_norm": 0.24903124570846558, "learning_rate": 9.909878044963346e-05, "loss": 1.3013, "step": 1714 }, { "epoch": 0.4643485319983764, "grad_norm": 0.258848637342453, "learning_rate": 9.909282037034347e-05, "loss": 1.3052, "step": 1716 }, { "epoch": 0.4648897307536193, "grad_norm": 0.25806304812431335, "learning_rate": 9.908684082832787e-05, "loss": 1.286, "step": 1718 }, { "epoch": 0.4654309295088621, "grad_norm": 0.26794132590293884, "learning_rate": 9.908084182595723e-05, "loss": 1.3069, "step": 1720 }, { "epoch": 0.465972128264105, "grad_norm": 0.26079118251800537, "learning_rate": 9.907482336560983e-05, "loss": 1.3145, "step": 1722 }, { "epoch": 0.46651332701934783, "grad_norm": 0.25958481431007385, "learning_rate": 9.906878544967169e-05, "loss": 1.3098, "step": 1724 }, { "epoch": 0.4670545257745907, "grad_norm": 0.2390812784433365, "learning_rate": 9.906272808053652e-05, "loss": 1.3085, "step": 1726 }, { "epoch": 0.46759572452983356, "grad_norm": 0.263637900352478, "learning_rate": 9.905665126060574e-05, "loss": 1.2933, "step": 1728 }, { "epoch": 0.46813692328507645, "grad_norm": 0.2462746798992157, "learning_rate": 9.90505549922885e-05, "loss": 1.2877, "step": 1730 }, { "epoch": 0.46867812204031933, "grad_norm": 0.244845911860466, "learning_rate": 9.904443927800164e-05, "loss": 1.325, "step": 1732 }, { "epoch": 0.46921932079556217, "grad_norm": 0.28249332308769226, "learning_rate": 9.903830412016974e-05, "loss": 1.313, "step": 1734 }, { "epoch": 0.46976051955080506, "grad_norm": 0.29556336998939514, "learning_rate": 9.903214952122504e-05, "loss": 1.3142, "step": 1736 }, { "epoch": 0.4703017183060479, "grad_norm": 0.2746431827545166, "learning_rate": 9.902597548360754e-05, "loss": 1.3096, "step": 1738 }, { "epoch": 0.4708429170612908, "grad_norm": 0.2979538142681122, "learning_rate": 9.901978200976492e-05, "loss": 1.2849, "step": 1740 }, { "epoch": 0.4713841158165336, "grad_norm": 0.2766527235507965, "learning_rate": 9.901356910215255e-05, "loss": 1.3089, "step": 1742 }, { "epoch": 0.4719253145717765, "grad_norm": 0.25000783801078796, "learning_rate": 9.900733676323353e-05, "loss": 1.308, "step": 1744 }, { "epoch": 0.47246651332701933, "grad_norm": 0.26226234436035156, "learning_rate": 9.900108499547864e-05, "loss": 1.3041, "step": 1746 }, { "epoch": 0.4730077120822622, "grad_norm": 0.2794544994831085, "learning_rate": 9.899481380136642e-05, "loss": 1.3312, "step": 1748 }, { "epoch": 0.47354891083750505, "grad_norm": 0.24771127104759216, "learning_rate": 9.898852318338303e-05, "loss": 1.2853, "step": 1750 }, { "epoch": 0.47409010959274794, "grad_norm": 0.2811632752418518, "learning_rate": 9.898221314402238e-05, "loss": 1.3019, "step": 1752 }, { "epoch": 0.4746313083479908, "grad_norm": 0.2812533378601074, "learning_rate": 9.897588368578608e-05, "loss": 1.3298, "step": 1754 }, { "epoch": 0.47517250710323367, "grad_norm": 0.25955653190612793, "learning_rate": 9.896953481118341e-05, "loss": 1.3093, "step": 1756 }, { "epoch": 0.4757137058584765, "grad_norm": 0.2653108537197113, "learning_rate": 9.896316652273136e-05, "loss": 1.2898, "step": 1758 }, { "epoch": 0.4762549046137194, "grad_norm": 0.27985796332359314, "learning_rate": 9.895677882295466e-05, "loss": 1.2928, "step": 1760 }, { "epoch": 0.4767961033689623, "grad_norm": 0.2889133393764496, "learning_rate": 9.895037171438568e-05, "loss": 1.3088, "step": 1762 }, { "epoch": 0.4773373021242051, "grad_norm": 0.2615009546279907, "learning_rate": 9.894394519956448e-05, "loss": 1.3212, "step": 1764 }, { "epoch": 0.477878500879448, "grad_norm": 0.24938960373401642, "learning_rate": 9.893749928103885e-05, "loss": 1.2982, "step": 1766 }, { "epoch": 0.47841969963469083, "grad_norm": 0.27132853865623474, "learning_rate": 9.893103396136427e-05, "loss": 1.294, "step": 1768 }, { "epoch": 0.4789608983899337, "grad_norm": 0.2632822096347809, "learning_rate": 9.89245492431039e-05, "loss": 1.2852, "step": 1770 }, { "epoch": 0.47950209714517655, "grad_norm": 0.27269670367240906, "learning_rate": 9.891804512882856e-05, "loss": 1.2934, "step": 1772 }, { "epoch": 0.48004329590041944, "grad_norm": 0.2572595179080963, "learning_rate": 9.891152162111683e-05, "loss": 1.2719, "step": 1774 }, { "epoch": 0.4805844946556623, "grad_norm": 0.2708267867565155, "learning_rate": 9.890497872255489e-05, "loss": 1.2907, "step": 1776 }, { "epoch": 0.48112569341090516, "grad_norm": 0.28407028317451477, "learning_rate": 9.889841643573671e-05, "loss": 1.2977, "step": 1778 }, { "epoch": 0.481666892166148, "grad_norm": 0.26248103380203247, "learning_rate": 9.889183476326386e-05, "loss": 1.2993, "step": 1780 }, { "epoch": 0.4822080909213909, "grad_norm": 0.26148512959480286, "learning_rate": 9.888523370774563e-05, "loss": 1.2893, "step": 1782 }, { "epoch": 0.4827492896766337, "grad_norm": 0.2815425395965576, "learning_rate": 9.8878613271799e-05, "loss": 1.3015, "step": 1784 }, { "epoch": 0.4832904884318766, "grad_norm": 0.26061713695526123, "learning_rate": 9.887197345804862e-05, "loss": 1.2781, "step": 1786 }, { "epoch": 0.4838316871871195, "grad_norm": 0.2641533613204956, "learning_rate": 9.886531426912683e-05, "loss": 1.2993, "step": 1788 }, { "epoch": 0.48437288594236233, "grad_norm": 0.25920137763023376, "learning_rate": 9.885863570767364e-05, "loss": 1.2955, "step": 1790 }, { "epoch": 0.4849140846976052, "grad_norm": 0.24002158641815186, "learning_rate": 9.885193777633676e-05, "loss": 1.2932, "step": 1792 }, { "epoch": 0.48545528345284805, "grad_norm": 0.2643393576145172, "learning_rate": 9.884522047777157e-05, "loss": 1.2963, "step": 1794 }, { "epoch": 0.48599648220809094, "grad_norm": 0.2522197663784027, "learning_rate": 9.883848381464112e-05, "loss": 1.2947, "step": 1796 }, { "epoch": 0.4865376809633338, "grad_norm": 0.2431286871433258, "learning_rate": 9.883172778961613e-05, "loss": 1.3112, "step": 1798 }, { "epoch": 0.48707887971857666, "grad_norm": 0.26892608404159546, "learning_rate": 9.882495240537505e-05, "loss": 1.2904, "step": 1800 }, { "epoch": 0.4876200784738195, "grad_norm": 0.2528528571128845, "learning_rate": 9.881815766460392e-05, "loss": 1.2949, "step": 1802 }, { "epoch": 0.4881612772290624, "grad_norm": 0.2614927291870117, "learning_rate": 9.881134356999652e-05, "loss": 1.288, "step": 1804 }, { "epoch": 0.4887024759843052, "grad_norm": 0.2523605227470398, "learning_rate": 9.880451012425426e-05, "loss": 1.3029, "step": 1806 }, { "epoch": 0.4892436747395481, "grad_norm": 0.24303248524665833, "learning_rate": 9.879765733008627e-05, "loss": 1.3107, "step": 1808 }, { "epoch": 0.48978487349479094, "grad_norm": 0.2470557987689972, "learning_rate": 9.879078519020933e-05, "loss": 1.2856, "step": 1810 }, { "epoch": 0.49032607225003383, "grad_norm": 0.2526317536830902, "learning_rate": 9.878389370734784e-05, "loss": 1.2965, "step": 1812 }, { "epoch": 0.49086727100527666, "grad_norm": 0.2483314871788025, "learning_rate": 9.877698288423394e-05, "loss": 1.3016, "step": 1814 }, { "epoch": 0.49140846976051955, "grad_norm": 0.24746839702129364, "learning_rate": 9.877005272360741e-05, "loss": 1.2944, "step": 1816 }, { "epoch": 0.49194966851576244, "grad_norm": 0.24739988148212433, "learning_rate": 9.876310322821568e-05, "loss": 1.3037, "step": 1818 }, { "epoch": 0.4924908672710053, "grad_norm": 0.2740204632282257, "learning_rate": 9.875613440081387e-05, "loss": 1.3116, "step": 1820 }, { "epoch": 0.49303206602624816, "grad_norm": 0.27116379141807556, "learning_rate": 9.874914624416475e-05, "loss": 1.288, "step": 1822 }, { "epoch": 0.493573264781491, "grad_norm": 0.24231554567813873, "learning_rate": 9.874213876103878e-05, "loss": 1.2975, "step": 1824 }, { "epoch": 0.4941144635367339, "grad_norm": 0.2590995728969574, "learning_rate": 9.873511195421402e-05, "loss": 1.2678, "step": 1826 }, { "epoch": 0.4946556622919767, "grad_norm": 0.25694531202316284, "learning_rate": 9.872806582647625e-05, "loss": 1.28, "step": 1828 }, { "epoch": 0.4951968610472196, "grad_norm": 0.25455620884895325, "learning_rate": 9.87210003806189e-05, "loss": 1.2942, "step": 1830 }, { "epoch": 0.49573805980246244, "grad_norm": 0.2639889121055603, "learning_rate": 9.871391561944302e-05, "loss": 1.3161, "step": 1832 }, { "epoch": 0.4962792585577053, "grad_norm": 0.271282821893692, "learning_rate": 9.870681154575737e-05, "loss": 1.3071, "step": 1834 }, { "epoch": 0.49682045731294816, "grad_norm": 0.26479372382164, "learning_rate": 9.869968816237833e-05, "loss": 1.2841, "step": 1836 }, { "epoch": 0.49736165606819105, "grad_norm": 0.26040130853652954, "learning_rate": 9.869254547212997e-05, "loss": 1.2989, "step": 1838 }, { "epoch": 0.4979028548234339, "grad_norm": 0.26563623547554016, "learning_rate": 9.868538347784396e-05, "loss": 1.2965, "step": 1840 }, { "epoch": 0.49844405357867677, "grad_norm": 0.26089224219322205, "learning_rate": 9.867820218235969e-05, "loss": 1.3071, "step": 1842 }, { "epoch": 0.4989852523339196, "grad_norm": 0.27151811122894287, "learning_rate": 9.867100158852412e-05, "loss": 1.287, "step": 1844 }, { "epoch": 0.4995264510891625, "grad_norm": 0.2477792203426361, "learning_rate": 9.866378169919192e-05, "loss": 1.2894, "step": 1846 }, { "epoch": 0.5000676498444053, "grad_norm": 0.24871942400932312, "learning_rate": 9.865654251722545e-05, "loss": 1.3024, "step": 1848 }, { "epoch": 0.5006088485996483, "grad_norm": 0.26377877593040466, "learning_rate": 9.86492840454946e-05, "loss": 1.2939, "step": 1850 }, { "epoch": 0.5011500473548911, "grad_norm": 0.258228063583374, "learning_rate": 9.8642006286877e-05, "loss": 1.291, "step": 1852 }, { "epoch": 0.5016912461101339, "grad_norm": 0.26982301473617554, "learning_rate": 9.86347092442579e-05, "loss": 1.2845, "step": 1854 }, { "epoch": 0.5022324448653768, "grad_norm": 0.24094600975513458, "learning_rate": 9.862739292053021e-05, "loss": 1.2744, "step": 1856 }, { "epoch": 0.5027736436206197, "grad_norm": 0.25840380787849426, "learning_rate": 9.862005731859442e-05, "loss": 1.2966, "step": 1858 }, { "epoch": 0.5033148423758625, "grad_norm": 0.26734429597854614, "learning_rate": 9.861270244135877e-05, "loss": 1.2856, "step": 1860 }, { "epoch": 0.5038560411311054, "grad_norm": 0.24431397020816803, "learning_rate": 9.860532829173903e-05, "loss": 1.2871, "step": 1862 }, { "epoch": 0.5043972398863482, "grad_norm": 0.25425857305526733, "learning_rate": 9.859793487265869e-05, "loss": 1.2822, "step": 1864 }, { "epoch": 0.5049384386415912, "grad_norm": 0.25332111120224, "learning_rate": 9.859052218704885e-05, "loss": 1.2723, "step": 1866 }, { "epoch": 0.505479637396834, "grad_norm": 0.24775418639183044, "learning_rate": 9.858309023784826e-05, "loss": 1.2934, "step": 1868 }, { "epoch": 0.5060208361520768, "grad_norm": 0.24880458414554596, "learning_rate": 9.857563902800328e-05, "loss": 1.3041, "step": 1870 }, { "epoch": 0.5065620349073197, "grad_norm": 0.2574135959148407, "learning_rate": 9.856816856046793e-05, "loss": 1.2855, "step": 1872 }, { "epoch": 0.5071032336625626, "grad_norm": 0.26873350143432617, "learning_rate": 9.856067883820386e-05, "loss": 1.3055, "step": 1874 }, { "epoch": 0.5076444324178054, "grad_norm": 0.23742420971393585, "learning_rate": 9.855316986418036e-05, "loss": 1.3029, "step": 1876 }, { "epoch": 0.5081856311730483, "grad_norm": 0.2398921549320221, "learning_rate": 9.854564164137432e-05, "loss": 1.2849, "step": 1878 }, { "epoch": 0.5087268299282912, "grad_norm": 0.25182288885116577, "learning_rate": 9.85380941727703e-05, "loss": 1.2981, "step": 1880 }, { "epoch": 0.509268028683534, "grad_norm": 0.23373378813266754, "learning_rate": 9.853052746136048e-05, "loss": 1.2772, "step": 1882 }, { "epoch": 0.5098092274387769, "grad_norm": 0.2581213712692261, "learning_rate": 9.852294151014466e-05, "loss": 1.3147, "step": 1884 }, { "epoch": 0.5103504261940197, "grad_norm": 0.26642751693725586, "learning_rate": 9.851533632213028e-05, "loss": 1.2885, "step": 1886 }, { "epoch": 0.5108916249492627, "grad_norm": 0.24029181897640228, "learning_rate": 9.850771190033237e-05, "loss": 1.297, "step": 1888 }, { "epoch": 0.5114328237045055, "grad_norm": 0.2555221915245056, "learning_rate": 9.850006824777364e-05, "loss": 1.284, "step": 1890 }, { "epoch": 0.5119740224597483, "grad_norm": 0.2723660171031952, "learning_rate": 9.849240536748439e-05, "loss": 1.2821, "step": 1892 }, { "epoch": 0.5125152212149912, "grad_norm": 0.24772705137729645, "learning_rate": 9.848472326250253e-05, "loss": 1.2743, "step": 1894 }, { "epoch": 0.5130564199702341, "grad_norm": 0.2344834804534912, "learning_rate": 9.847702193587365e-05, "loss": 1.286, "step": 1896 }, { "epoch": 0.5135976187254769, "grad_norm": 0.23948362469673157, "learning_rate": 9.846930139065088e-05, "loss": 1.2673, "step": 1898 }, { "epoch": 0.5141388174807198, "grad_norm": 0.27207908034324646, "learning_rate": 9.846156162989503e-05, "loss": 1.3041, "step": 1900 }, { "epoch": 0.5146800162359627, "grad_norm": 0.2407965511083603, "learning_rate": 9.845380265667454e-05, "loss": 1.2875, "step": 1902 }, { "epoch": 0.5152212149912055, "grad_norm": 0.2517203688621521, "learning_rate": 9.844602447406538e-05, "loss": 1.2855, "step": 1904 }, { "epoch": 0.5157624137464484, "grad_norm": 0.24267178773880005, "learning_rate": 9.843822708515123e-05, "loss": 1.2711, "step": 1906 }, { "epoch": 0.5163036125016912, "grad_norm": 0.23933006823062897, "learning_rate": 9.843041049302331e-05, "loss": 1.3094, "step": 1908 }, { "epoch": 0.5168448112569342, "grad_norm": 0.21948301792144775, "learning_rate": 9.842257470078054e-05, "loss": 1.2686, "step": 1910 }, { "epoch": 0.517386010012177, "grad_norm": 0.239594966173172, "learning_rate": 9.841471971152933e-05, "loss": 1.2959, "step": 1912 }, { "epoch": 0.5179272087674198, "grad_norm": 0.26850634813308716, "learning_rate": 9.840684552838385e-05, "loss": 1.2969, "step": 1914 }, { "epoch": 0.5184684075226627, "grad_norm": 0.26066869497299194, "learning_rate": 9.839895215446573e-05, "loss": 1.2935, "step": 1916 }, { "epoch": 0.5190096062779056, "grad_norm": 0.25288596749305725, "learning_rate": 9.839103959290433e-05, "loss": 1.2922, "step": 1918 }, { "epoch": 0.5195508050331484, "grad_norm": 0.24453966319561005, "learning_rate": 9.838310784683655e-05, "loss": 1.3058, "step": 1920 }, { "epoch": 0.5200920037883913, "grad_norm": 0.25353509187698364, "learning_rate": 9.837515691940689e-05, "loss": 1.3161, "step": 1922 }, { "epoch": 0.5206332025436341, "grad_norm": 0.24898375570774078, "learning_rate": 9.836718681376749e-05, "loss": 1.2925, "step": 1924 }, { "epoch": 0.521174401298877, "grad_norm": 0.2576977014541626, "learning_rate": 9.835919753307807e-05, "loss": 1.2916, "step": 1926 }, { "epoch": 0.5217156000541199, "grad_norm": 0.25432518124580383, "learning_rate": 9.8351189080506e-05, "loss": 1.2866, "step": 1928 }, { "epoch": 0.5222567988093627, "grad_norm": 0.2504200339317322, "learning_rate": 9.834316145922615e-05, "loss": 1.2728, "step": 1930 }, { "epoch": 0.5227979975646057, "grad_norm": 0.2627692222595215, "learning_rate": 9.83351146724211e-05, "loss": 1.2853, "step": 1932 }, { "epoch": 0.5233391963198485, "grad_norm": 0.2776716351509094, "learning_rate": 9.832704872328094e-05, "loss": 1.2881, "step": 1934 }, { "epoch": 0.5238803950750913, "grad_norm": 0.24669450521469116, "learning_rate": 9.831896361500344e-05, "loss": 1.2681, "step": 1936 }, { "epoch": 0.5244215938303342, "grad_norm": 0.24949464201927185, "learning_rate": 9.831085935079387e-05, "loss": 1.2851, "step": 1938 }, { "epoch": 0.5249627925855771, "grad_norm": 0.2585392892360687, "learning_rate": 9.830273593386518e-05, "loss": 1.2796, "step": 1940 }, { "epoch": 0.5255039913408199, "grad_norm": 0.26086801290512085, "learning_rate": 9.829459336743787e-05, "loss": 1.293, "step": 1942 }, { "epoch": 0.5260451900960628, "grad_norm": 0.25490057468414307, "learning_rate": 9.828643165474006e-05, "loss": 1.2824, "step": 1944 }, { "epoch": 0.5265863888513056, "grad_norm": 0.24865177273750305, "learning_rate": 9.827825079900739e-05, "loss": 1.2835, "step": 1946 }, { "epoch": 0.5271275876065485, "grad_norm": 0.25498902797698975, "learning_rate": 9.827005080348317e-05, "loss": 1.2931, "step": 1948 }, { "epoch": 0.5276687863617914, "grad_norm": 0.2585375905036926, "learning_rate": 9.826183167141828e-05, "loss": 1.2659, "step": 1950 }, { "epoch": 0.5282099851170342, "grad_norm": 0.2300305813550949, "learning_rate": 9.825359340607116e-05, "loss": 1.3019, "step": 1952 }, { "epoch": 0.528751183872277, "grad_norm": 0.24674038589000702, "learning_rate": 9.824533601070784e-05, "loss": 1.2784, "step": 1954 }, { "epoch": 0.52929238262752, "grad_norm": 0.23458759486675262, "learning_rate": 9.823705948860195e-05, "loss": 1.2779, "step": 1956 }, { "epoch": 0.5298335813827628, "grad_norm": 0.24736309051513672, "learning_rate": 9.822876384303472e-05, "loss": 1.3083, "step": 1958 }, { "epoch": 0.5303747801380057, "grad_norm": 0.25108450651168823, "learning_rate": 9.82204490772949e-05, "loss": 1.3044, "step": 1960 }, { "epoch": 0.5309159788932486, "grad_norm": 0.23308375477790833, "learning_rate": 9.82121151946789e-05, "loss": 1.2694, "step": 1962 }, { "epoch": 0.5314571776484914, "grad_norm": 0.2283206284046173, "learning_rate": 9.820376219849064e-05, "loss": 1.2735, "step": 1964 }, { "epoch": 0.5319983764037343, "grad_norm": 0.24121573567390442, "learning_rate": 9.819539009204164e-05, "loss": 1.2799, "step": 1966 }, { "epoch": 0.5325395751589771, "grad_norm": 0.24135661125183105, "learning_rate": 9.8186998878651e-05, "loss": 1.295, "step": 1968 }, { "epoch": 0.53308077391422, "grad_norm": 0.24390241503715515, "learning_rate": 9.817858856164542e-05, "loss": 1.2812, "step": 1970 }, { "epoch": 0.5336219726694629, "grad_norm": 0.24739502370357513, "learning_rate": 9.817015914435913e-05, "loss": 1.2872, "step": 1972 }, { "epoch": 0.5341631714247057, "grad_norm": 0.25517916679382324, "learning_rate": 9.816171063013395e-05, "loss": 1.2718, "step": 1974 }, { "epoch": 0.5347043701799485, "grad_norm": 0.25479528307914734, "learning_rate": 9.815324302231928e-05, "loss": 1.2952, "step": 1976 }, { "epoch": 0.5352455689351915, "grad_norm": 0.24998174607753754, "learning_rate": 9.814475632427206e-05, "loss": 1.2914, "step": 1978 }, { "epoch": 0.5357867676904343, "grad_norm": 0.2341603934764862, "learning_rate": 9.813625053935686e-05, "loss": 1.2793, "step": 1980 }, { "epoch": 0.5363279664456772, "grad_norm": 0.23716285824775696, "learning_rate": 9.812772567094574e-05, "loss": 1.2872, "step": 1982 }, { "epoch": 0.53686916520092, "grad_norm": 0.2324230819940567, "learning_rate": 9.81191817224184e-05, "loss": 1.2604, "step": 1984 }, { "epoch": 0.5374103639561629, "grad_norm": 0.24399405717849731, "learning_rate": 9.811061869716205e-05, "loss": 1.2972, "step": 1986 }, { "epoch": 0.5379515627114058, "grad_norm": 0.24572497606277466, "learning_rate": 9.810203659857145e-05, "loss": 1.2784, "step": 1988 }, { "epoch": 0.5384927614666486, "grad_norm": 0.22993844747543335, "learning_rate": 9.8093435430049e-05, "loss": 1.2886, "step": 1990 }, { "epoch": 0.5390339602218915, "grad_norm": 0.24518661201000214, "learning_rate": 9.808481519500458e-05, "loss": 1.2622, "step": 1992 }, { "epoch": 0.5395751589771344, "grad_norm": 0.2601888179779053, "learning_rate": 9.807617589685568e-05, "loss": 1.2739, "step": 1994 }, { "epoch": 0.5401163577323772, "grad_norm": 0.24736261367797852, "learning_rate": 9.80675175390273e-05, "loss": 1.2748, "step": 1996 }, { "epoch": 0.54065755648762, "grad_norm": 0.2332574725151062, "learning_rate": 9.805884012495203e-05, "loss": 1.2639, "step": 1998 }, { "epoch": 0.541198755242863, "grad_norm": 0.2662294805049896, "learning_rate": 9.805014365807004e-05, "loss": 1.2914, "step": 2000 }, { "epoch": 0.5417399539981058, "grad_norm": 0.28600943088531494, "learning_rate": 9.804142814182902e-05, "loss": 1.2657, "step": 2002 }, { "epoch": 0.5422811527533486, "grad_norm": 0.2814892530441284, "learning_rate": 9.803269357968416e-05, "loss": 1.2839, "step": 2004 }, { "epoch": 0.5428223515085915, "grad_norm": 0.24939605593681335, "learning_rate": 9.802393997509833e-05, "loss": 1.2692, "step": 2006 }, { "epoch": 0.5433635502638344, "grad_norm": 0.2562806308269501, "learning_rate": 9.801516733154181e-05, "loss": 1.291, "step": 2008 }, { "epoch": 0.5439047490190773, "grad_norm": 0.2617442011833191, "learning_rate": 9.800637565249255e-05, "loss": 1.2808, "step": 2010 }, { "epoch": 0.5444459477743201, "grad_norm": 0.2421412616968155, "learning_rate": 9.799756494143593e-05, "loss": 1.2733, "step": 2012 }, { "epoch": 0.5449871465295629, "grad_norm": 0.25231024622917175, "learning_rate": 9.798873520186497e-05, "loss": 1.2695, "step": 2014 }, { "epoch": 0.5455283452848059, "grad_norm": 0.25108659267425537, "learning_rate": 9.79798864372802e-05, "loss": 1.298, "step": 2016 }, { "epoch": 0.5460695440400487, "grad_norm": 0.24615678191184998, "learning_rate": 9.79710186511897e-05, "loss": 1.3127, "step": 2018 }, { "epoch": 0.5466107427952915, "grad_norm": 0.23436503112316132, "learning_rate": 9.796213184710904e-05, "loss": 1.2896, "step": 2020 }, { "epoch": 0.5471519415505345, "grad_norm": 0.23453901708126068, "learning_rate": 9.79532260285614e-05, "loss": 1.2761, "step": 2022 }, { "epoch": 0.5476931403057773, "grad_norm": 0.2413233071565628, "learning_rate": 9.794430119907748e-05, "loss": 1.2744, "step": 2024 }, { "epoch": 0.5482343390610201, "grad_norm": 0.2426893562078476, "learning_rate": 9.793535736219546e-05, "loss": 1.2615, "step": 2026 }, { "epoch": 0.548775537816263, "grad_norm": 0.23853014409542084, "learning_rate": 9.792639452146115e-05, "loss": 1.2897, "step": 2028 }, { "epoch": 0.5493167365715059, "grad_norm": 0.24866445362567902, "learning_rate": 9.791741268042784e-05, "loss": 1.2957, "step": 2030 }, { "epoch": 0.5498579353267488, "grad_norm": 0.24467822909355164, "learning_rate": 9.790841184265633e-05, "loss": 1.2867, "step": 2032 }, { "epoch": 0.5503991340819916, "grad_norm": 0.2393324077129364, "learning_rate": 9.7899392011715e-05, "loss": 1.3061, "step": 2034 }, { "epoch": 0.5509403328372344, "grad_norm": 0.23834531009197235, "learning_rate": 9.789035319117974e-05, "loss": 1.2957, "step": 2036 }, { "epoch": 0.5514815315924774, "grad_norm": 0.2603852450847626, "learning_rate": 9.788129538463397e-05, "loss": 1.2897, "step": 2038 }, { "epoch": 0.5520227303477202, "grad_norm": 0.26540425419807434, "learning_rate": 9.787221859566861e-05, "loss": 1.2829, "step": 2040 }, { "epoch": 0.552563929102963, "grad_norm": 0.25125250220298767, "learning_rate": 9.786312282788216e-05, "loss": 1.2708, "step": 2042 }, { "epoch": 0.553105127858206, "grad_norm": 0.23911471664905548, "learning_rate": 9.785400808488061e-05, "loss": 1.2949, "step": 2044 }, { "epoch": 0.5536463266134488, "grad_norm": 0.23871150612831116, "learning_rate": 9.784487437027746e-05, "loss": 1.2863, "step": 2046 }, { "epoch": 0.5541875253686916, "grad_norm": 0.25253376364707947, "learning_rate": 9.783572168769376e-05, "loss": 1.2797, "step": 2048 }, { "epoch": 0.5547287241239345, "grad_norm": 0.25140559673309326, "learning_rate": 9.782655004075807e-05, "loss": 1.2666, "step": 2050 }, { "epoch": 0.5552699228791774, "grad_norm": 0.25297242403030396, "learning_rate": 9.781735943310646e-05, "loss": 1.2935, "step": 2052 }, { "epoch": 0.5558111216344203, "grad_norm": 0.28536322712898254, "learning_rate": 9.780814986838252e-05, "loss": 1.2891, "step": 2054 }, { "epoch": 0.5563523203896631, "grad_norm": 0.28267911076545715, "learning_rate": 9.779892135023738e-05, "loss": 1.2846, "step": 2056 }, { "epoch": 0.5568935191449059, "grad_norm": 0.24850498139858246, "learning_rate": 9.778967388232964e-05, "loss": 1.2823, "step": 2058 }, { "epoch": 0.5574347179001489, "grad_norm": 0.4929364025592804, "learning_rate": 9.778040746832544e-05, "loss": 1.2681, "step": 2060 }, { "epoch": 0.5579759166553917, "grad_norm": 0.25423306226730347, "learning_rate": 9.777112211189843e-05, "loss": 1.2765, "step": 2062 }, { "epoch": 0.5585171154106345, "grad_norm": 0.23608753085136414, "learning_rate": 9.776181781672977e-05, "loss": 1.2756, "step": 2064 }, { "epoch": 0.5590583141658774, "grad_norm": 0.3117451071739197, "learning_rate": 9.775249458650812e-05, "loss": 1.2731, "step": 2066 }, { "epoch": 0.5595995129211203, "grad_norm": 0.2454603612422943, "learning_rate": 9.774315242492965e-05, "loss": 1.2821, "step": 2068 }, { "epoch": 0.5601407116763631, "grad_norm": 0.3214171528816223, "learning_rate": 9.773379133569804e-05, "loss": 1.2964, "step": 2070 }, { "epoch": 0.560681910431606, "grad_norm": 0.23589906096458435, "learning_rate": 9.772441132252448e-05, "loss": 1.2794, "step": 2072 }, { "epoch": 0.5612231091868489, "grad_norm": 0.23020370304584503, "learning_rate": 9.771501238912763e-05, "loss": 1.2753, "step": 2074 }, { "epoch": 0.5617643079420918, "grad_norm": 0.2368050515651703, "learning_rate": 9.77055945392337e-05, "loss": 1.3048, "step": 2076 }, { "epoch": 0.5623055066973346, "grad_norm": 0.2581866383552551, "learning_rate": 9.769615777657633e-05, "loss": 1.2765, "step": 2078 }, { "epoch": 0.5628467054525774, "grad_norm": 0.2481439858675003, "learning_rate": 9.768670210489675e-05, "loss": 1.2957, "step": 2080 }, { "epoch": 0.5633879042078204, "grad_norm": 0.2861919701099396, "learning_rate": 9.767722752794361e-05, "loss": 1.2647, "step": 2082 }, { "epoch": 0.5639291029630632, "grad_norm": 0.2552880346775055, "learning_rate": 9.766773404947309e-05, "loss": 1.2675, "step": 2084 }, { "epoch": 0.564470301718306, "grad_norm": 0.251891165971756, "learning_rate": 9.765822167324885e-05, "loss": 1.2799, "step": 2086 }, { "epoch": 0.5650115004735489, "grad_norm": 0.25395113229751587, "learning_rate": 9.764869040304205e-05, "loss": 1.2916, "step": 2088 }, { "epoch": 0.5655526992287918, "grad_norm": 0.2496347427368164, "learning_rate": 9.763914024263136e-05, "loss": 1.2722, "step": 2090 }, { "epoch": 0.5660938979840346, "grad_norm": 0.24722573161125183, "learning_rate": 9.762957119580287e-05, "loss": 1.2722, "step": 2092 }, { "epoch": 0.5666350967392775, "grad_norm": 0.23567502200603485, "learning_rate": 9.761998326635026e-05, "loss": 1.2681, "step": 2094 }, { "epoch": 0.5671762954945203, "grad_norm": 0.2396802455186844, "learning_rate": 9.76103764580746e-05, "loss": 1.2509, "step": 2096 }, { "epoch": 0.5677174942497633, "grad_norm": 0.24394263327121735, "learning_rate": 9.76007507747845e-05, "loss": 1.2863, "step": 2098 }, { "epoch": 0.5682586930050061, "grad_norm": 0.23184406757354736, "learning_rate": 9.759110622029604e-05, "loss": 1.2827, "step": 2100 }, { "epoch": 0.5687998917602489, "grad_norm": 0.24522008001804352, "learning_rate": 9.758144279843275e-05, "loss": 1.2716, "step": 2102 }, { "epoch": 0.5693410905154919, "grad_norm": 0.2323451191186905, "learning_rate": 9.757176051302573e-05, "loss": 1.2747, "step": 2104 }, { "epoch": 0.5698822892707347, "grad_norm": 0.26049986481666565, "learning_rate": 9.756205936791344e-05, "loss": 1.2976, "step": 2106 }, { "epoch": 0.5704234880259775, "grad_norm": 0.24207298457622528, "learning_rate": 9.75523393669419e-05, "loss": 1.2637, "step": 2108 }, { "epoch": 0.5709646867812204, "grad_norm": 0.23590189218521118, "learning_rate": 9.754260051396459e-05, "loss": 1.2725, "step": 2110 }, { "epoch": 0.5715058855364633, "grad_norm": 0.25714561343193054, "learning_rate": 9.753284281284243e-05, "loss": 1.2751, "step": 2112 }, { "epoch": 0.5720470842917061, "grad_norm": 0.2563743591308594, "learning_rate": 9.752306626744385e-05, "loss": 1.2552, "step": 2114 }, { "epoch": 0.572588283046949, "grad_norm": 0.23137059807777405, "learning_rate": 9.751327088164474e-05, "loss": 1.2826, "step": 2116 }, { "epoch": 0.5731294818021918, "grad_norm": 0.23491452634334564, "learning_rate": 9.750345665932844e-05, "loss": 1.2909, "step": 2118 }, { "epoch": 0.5736706805574348, "grad_norm": 0.23278982937335968, "learning_rate": 9.749362360438579e-05, "loss": 1.2904, "step": 2120 }, { "epoch": 0.5742118793126776, "grad_norm": 0.22500935196876526, "learning_rate": 9.748377172071508e-05, "loss": 1.2822, "step": 2122 }, { "epoch": 0.5747530780679204, "grad_norm": 0.23745082318782806, "learning_rate": 9.747390101222205e-05, "loss": 1.279, "step": 2124 }, { "epoch": 0.5752942768231633, "grad_norm": 0.24000118672847748, "learning_rate": 9.746401148281993e-05, "loss": 1.2806, "step": 2126 }, { "epoch": 0.5758354755784062, "grad_norm": 0.24468575417995453, "learning_rate": 9.74541031364294e-05, "loss": 1.2867, "step": 2128 }, { "epoch": 0.576376674333649, "grad_norm": 0.23120936751365662, "learning_rate": 9.744417597697859e-05, "loss": 1.2666, "step": 2130 }, { "epoch": 0.5769178730888919, "grad_norm": 0.25929006934165955, "learning_rate": 9.743423000840309e-05, "loss": 1.2672, "step": 2132 }, { "epoch": 0.5774590718441348, "grad_norm": 0.2530214786529541, "learning_rate": 9.742426523464598e-05, "loss": 1.2769, "step": 2134 }, { "epoch": 0.5780002705993776, "grad_norm": 0.2752387225627899, "learning_rate": 9.741428165965775e-05, "loss": 1.2562, "step": 2136 }, { "epoch": 0.5785414693546205, "grad_norm": 0.25394052267074585, "learning_rate": 9.740427928739638e-05, "loss": 1.2758, "step": 2138 }, { "epoch": 0.5790826681098633, "grad_norm": 0.25311702489852905, "learning_rate": 9.739425812182728e-05, "loss": 1.2603, "step": 2140 }, { "epoch": 0.5796238668651063, "grad_norm": 0.26108497381210327, "learning_rate": 9.738421816692329e-05, "loss": 1.2627, "step": 2142 }, { "epoch": 0.5801650656203491, "grad_norm": 0.2541772425174713, "learning_rate": 9.737415942666476e-05, "loss": 1.2752, "step": 2144 }, { "epoch": 0.5807062643755919, "grad_norm": 0.24984823167324066, "learning_rate": 9.736408190503943e-05, "loss": 1.2673, "step": 2146 }, { "epoch": 0.5812474631308348, "grad_norm": 0.2763904333114624, "learning_rate": 9.735398560604251e-05, "loss": 1.2936, "step": 2148 }, { "epoch": 0.5817886618860777, "grad_norm": 0.26247066259384155, "learning_rate": 9.734387053367669e-05, "loss": 1.2636, "step": 2150 }, { "epoch": 0.5823298606413205, "grad_norm": 0.27749454975128174, "learning_rate": 9.7333736691952e-05, "loss": 1.2857, "step": 2152 }, { "epoch": 0.5828710593965634, "grad_norm": 0.39380860328674316, "learning_rate": 9.732358408488602e-05, "loss": 1.2916, "step": 2154 }, { "epoch": 0.5834122581518062, "grad_norm": 0.2505074441432953, "learning_rate": 9.731341271650372e-05, "loss": 1.2548, "step": 2156 }, { "epoch": 0.5839534569070491, "grad_norm": 0.2549828588962555, "learning_rate": 9.730322259083751e-05, "loss": 1.2884, "step": 2158 }, { "epoch": 0.584494655662292, "grad_norm": 0.24714533984661102, "learning_rate": 9.729301371192724e-05, "loss": 1.2823, "step": 2160 }, { "epoch": 0.5850358544175348, "grad_norm": 0.24945247173309326, "learning_rate": 9.728278608382018e-05, "loss": 1.2976, "step": 2162 }, { "epoch": 0.5855770531727778, "grad_norm": 0.2512315511703491, "learning_rate": 9.727253971057109e-05, "loss": 1.2883, "step": 2164 }, { "epoch": 0.5861182519280206, "grad_norm": 0.2401745468378067, "learning_rate": 9.726227459624207e-05, "loss": 1.2637, "step": 2166 }, { "epoch": 0.5866594506832634, "grad_norm": 0.260251522064209, "learning_rate": 9.725199074490271e-05, "loss": 1.2618, "step": 2168 }, { "epoch": 0.5872006494385063, "grad_norm": 0.2533782124519348, "learning_rate": 9.724168816063004e-05, "loss": 1.2825, "step": 2170 }, { "epoch": 0.5877418481937492, "grad_norm": 0.2545458972454071, "learning_rate": 9.723136684750847e-05, "loss": 1.2784, "step": 2172 }, { "epoch": 0.588283046948992, "grad_norm": 0.24370916187763214, "learning_rate": 9.722102680962988e-05, "loss": 1.2601, "step": 2174 }, { "epoch": 0.5888242457042349, "grad_norm": 0.23707440495491028, "learning_rate": 9.721066805109353e-05, "loss": 1.2818, "step": 2176 }, { "epoch": 0.5893654444594777, "grad_norm": 0.22903890907764435, "learning_rate": 9.720029057600615e-05, "loss": 1.2686, "step": 2178 }, { "epoch": 0.5899066432147206, "grad_norm": 0.22820548713207245, "learning_rate": 9.718989438848182e-05, "loss": 1.2749, "step": 2180 }, { "epoch": 0.5904478419699635, "grad_norm": 0.2249859720468521, "learning_rate": 9.717947949264214e-05, "loss": 1.2649, "step": 2182 }, { "epoch": 0.5909890407252063, "grad_norm": 0.23568090796470642, "learning_rate": 9.716904589261602e-05, "loss": 1.2764, "step": 2184 }, { "epoch": 0.5915302394804492, "grad_norm": 0.24089080095291138, "learning_rate": 9.715859359253987e-05, "loss": 1.2801, "step": 2186 }, { "epoch": 0.5920714382356921, "grad_norm": 0.2259254902601242, "learning_rate": 9.714812259655746e-05, "loss": 1.2805, "step": 2188 }, { "epoch": 0.5926126369909349, "grad_norm": 0.23276519775390625, "learning_rate": 9.713763290881999e-05, "loss": 1.2635, "step": 2190 }, { "epoch": 0.5931538357461777, "grad_norm": 0.24884091317653656, "learning_rate": 9.712712453348607e-05, "loss": 1.2984, "step": 2192 }, { "epoch": 0.5936950345014207, "grad_norm": 0.23471422493457794, "learning_rate": 9.711659747472171e-05, "loss": 1.2742, "step": 2194 }, { "epoch": 0.5942362332566635, "grad_norm": 0.25790145993232727, "learning_rate": 9.710605173670037e-05, "loss": 1.2865, "step": 2196 }, { "epoch": 0.5947774320119064, "grad_norm": 0.24584504961967468, "learning_rate": 9.709548732360285e-05, "loss": 1.2826, "step": 2198 }, { "epoch": 0.5953186307671492, "grad_norm": 0.23682548105716705, "learning_rate": 9.708490423961741e-05, "loss": 1.2499, "step": 2200 }, { "epoch": 0.5958598295223921, "grad_norm": 0.24267072975635529, "learning_rate": 9.707430248893964e-05, "loss": 1.2514, "step": 2202 }, { "epoch": 0.596401028277635, "grad_norm": 0.2546815276145935, "learning_rate": 9.706368207577264e-05, "loss": 1.2755, "step": 2204 }, { "epoch": 0.5969422270328778, "grad_norm": 0.24322691559791565, "learning_rate": 9.70530430043268e-05, "loss": 1.2817, "step": 2206 }, { "epoch": 0.5974834257881206, "grad_norm": 0.22995691001415253, "learning_rate": 9.704238527882e-05, "loss": 1.2487, "step": 2208 }, { "epoch": 0.5980246245433636, "grad_norm": 0.25768396258354187, "learning_rate": 9.70317089034774e-05, "loss": 1.2956, "step": 2210 }, { "epoch": 0.5985658232986064, "grad_norm": 0.2691928744316101, "learning_rate": 9.702101388253167e-05, "loss": 1.2704, "step": 2212 }, { "epoch": 0.5991070220538492, "grad_norm": 0.2356652021408081, "learning_rate": 9.701030022022282e-05, "loss": 1.2548, "step": 2214 }, { "epoch": 0.5996482208090922, "grad_norm": 0.24094751477241516, "learning_rate": 9.699956792079825e-05, "loss": 1.2616, "step": 2216 }, { "epoch": 0.600189419564335, "grad_norm": 0.2429209202528, "learning_rate": 9.698881698851274e-05, "loss": 1.2603, "step": 2218 }, { "epoch": 0.6007306183195779, "grad_norm": 0.24263691902160645, "learning_rate": 9.69780474276285e-05, "loss": 1.2788, "step": 2220 }, { "epoch": 0.6012718170748207, "grad_norm": 0.25438740849494934, "learning_rate": 9.696725924241506e-05, "loss": 1.2823, "step": 2222 }, { "epoch": 0.6018130158300636, "grad_norm": 0.258472204208374, "learning_rate": 9.695645243714939e-05, "loss": 1.2673, "step": 2224 }, { "epoch": 0.6023542145853065, "grad_norm": 0.2571878433227539, "learning_rate": 9.694562701611583e-05, "loss": 1.295, "step": 2226 }, { "epoch": 0.6028954133405493, "grad_norm": 0.2430989295244217, "learning_rate": 9.693478298360607e-05, "loss": 1.2595, "step": 2228 }, { "epoch": 0.6034366120957921, "grad_norm": 0.23489908874034882, "learning_rate": 9.692392034391922e-05, "loss": 1.2773, "step": 2230 }, { "epoch": 0.6039778108510351, "grad_norm": 0.2507382035255432, "learning_rate": 9.691303910136171e-05, "loss": 1.2782, "step": 2232 }, { "epoch": 0.6045190096062779, "grad_norm": 0.23355726897716522, "learning_rate": 9.690213926024743e-05, "loss": 1.263, "step": 2234 }, { "epoch": 0.6050602083615207, "grad_norm": 0.2275291532278061, "learning_rate": 9.689122082489754e-05, "loss": 1.2677, "step": 2236 }, { "epoch": 0.6056014071167636, "grad_norm": 0.2314850389957428, "learning_rate": 9.688028379964068e-05, "loss": 1.2646, "step": 2238 }, { "epoch": 0.6061426058720065, "grad_norm": 0.24879969656467438, "learning_rate": 9.686932818881278e-05, "loss": 1.2704, "step": 2240 }, { "epoch": 0.6066838046272494, "grad_norm": 0.23156161606311798, "learning_rate": 9.685835399675717e-05, "loss": 1.2795, "step": 2242 }, { "epoch": 0.6072250033824922, "grad_norm": 0.22952421009540558, "learning_rate": 9.684736122782454e-05, "loss": 1.2597, "step": 2244 }, { "epoch": 0.6077662021377351, "grad_norm": 1.0009911060333252, "learning_rate": 9.683634988637293e-05, "loss": 1.2504, "step": 2246 }, { "epoch": 0.608307400892978, "grad_norm": 0.2649003267288208, "learning_rate": 9.682531997676777e-05, "loss": 1.2376, "step": 2248 }, { "epoch": 0.6088485996482208, "grad_norm": 0.31321823596954346, "learning_rate": 9.681427150338187e-05, "loss": 1.2607, "step": 2250 }, { "epoch": 0.6093897984034636, "grad_norm": 0.3142634332180023, "learning_rate": 9.680320447059532e-05, "loss": 1.261, "step": 2252 }, { "epoch": 0.6099309971587066, "grad_norm": 0.31247085332870483, "learning_rate": 9.679211888279565e-05, "loss": 1.2685, "step": 2254 }, { "epoch": 0.6104721959139494, "grad_norm": 0.25763556361198425, "learning_rate": 9.67810147443777e-05, "loss": 1.2542, "step": 2256 }, { "epoch": 0.6110133946691922, "grad_norm": 0.2788141667842865, "learning_rate": 9.676989205974367e-05, "loss": 1.2747, "step": 2258 }, { "epoch": 0.6115545934244351, "grad_norm": 0.26279813051223755, "learning_rate": 9.675875083330315e-05, "loss": 1.261, "step": 2260 }, { "epoch": 0.612095792179678, "grad_norm": 0.24764376878738403, "learning_rate": 9.674759106947302e-05, "loss": 1.2632, "step": 2262 }, { "epoch": 0.6126369909349209, "grad_norm": 0.2378121018409729, "learning_rate": 9.673641277267756e-05, "loss": 1.2569, "step": 2264 }, { "epoch": 0.6131781896901637, "grad_norm": 0.25457054376602173, "learning_rate": 9.672521594734838e-05, "loss": 1.2667, "step": 2266 }, { "epoch": 0.6137193884454065, "grad_norm": 0.2589806616306305, "learning_rate": 9.67140005979244e-05, "loss": 1.2515, "step": 2268 }, { "epoch": 0.6142605872006495, "grad_norm": 0.23375307023525238, "learning_rate": 9.670276672885195e-05, "loss": 1.2608, "step": 2270 }, { "epoch": 0.6148017859558923, "grad_norm": 0.22978229820728302, "learning_rate": 9.669151434458468e-05, "loss": 1.2516, "step": 2272 }, { "epoch": 0.6153429847111351, "grad_norm": 0.22958585619926453, "learning_rate": 9.668024344958353e-05, "loss": 1.2617, "step": 2274 }, { "epoch": 0.6158841834663781, "grad_norm": 0.22783328592777252, "learning_rate": 9.666895404831685e-05, "loss": 1.2732, "step": 2276 }, { "epoch": 0.6164253822216209, "grad_norm": 0.2413301318883896, "learning_rate": 9.665764614526027e-05, "loss": 1.2501, "step": 2278 }, { "epoch": 0.6169665809768637, "grad_norm": 0.2591399550437927, "learning_rate": 9.66463197448968e-05, "loss": 1.2657, "step": 2280 }, { "epoch": 0.6175077797321066, "grad_norm": 0.23001301288604736, "learning_rate": 9.663497485171675e-05, "loss": 1.2698, "step": 2282 }, { "epoch": 0.6180489784873495, "grad_norm": 0.21373826265335083, "learning_rate": 9.662361147021779e-05, "loss": 1.2651, "step": 2284 }, { "epoch": 0.6185901772425924, "grad_norm": 0.2302403599023819, "learning_rate": 9.66122296049049e-05, "loss": 1.2786, "step": 2286 }, { "epoch": 0.6191313759978352, "grad_norm": 0.24121953547000885, "learning_rate": 9.660082926029038e-05, "loss": 1.2639, "step": 2288 }, { "epoch": 0.619672574753078, "grad_norm": 0.22190925478935242, "learning_rate": 9.658941044089387e-05, "loss": 1.2472, "step": 2290 }, { "epoch": 0.620213773508321, "grad_norm": 0.22907890379428864, "learning_rate": 9.657797315124234e-05, "loss": 1.2475, "step": 2292 }, { "epoch": 0.6207549722635638, "grad_norm": 0.23821642994880676, "learning_rate": 9.656651739587008e-05, "loss": 1.2689, "step": 2294 }, { "epoch": 0.6212961710188066, "grad_norm": 0.25953301787376404, "learning_rate": 9.655504317931867e-05, "loss": 1.2587, "step": 2296 }, { "epoch": 0.6218373697740495, "grad_norm": 0.24054677784442902, "learning_rate": 9.654355050613705e-05, "loss": 1.2541, "step": 2298 }, { "epoch": 0.6223785685292924, "grad_norm": 0.23474815487861633, "learning_rate": 9.65320393808815e-05, "loss": 1.2605, "step": 2300 }, { "epoch": 0.6229197672845352, "grad_norm": 0.23981337249279022, "learning_rate": 9.652050980811551e-05, "loss": 1.267, "step": 2302 }, { "epoch": 0.6234609660397781, "grad_norm": 0.21820946037769318, "learning_rate": 9.650896179240997e-05, "loss": 1.2555, "step": 2304 }, { "epoch": 0.624002164795021, "grad_norm": 0.2165161520242691, "learning_rate": 9.64973953383431e-05, "loss": 1.271, "step": 2306 }, { "epoch": 0.6245433635502639, "grad_norm": 0.22105515003204346, "learning_rate": 9.648581045050035e-05, "loss": 1.2663, "step": 2308 }, { "epoch": 0.6250845623055067, "grad_norm": 0.22626088559627533, "learning_rate": 9.647420713347454e-05, "loss": 1.2645, "step": 2310 }, { "epoch": 0.6256257610607495, "grad_norm": 0.2347354292869568, "learning_rate": 9.646258539186577e-05, "loss": 1.2372, "step": 2312 }, { "epoch": 0.6261669598159925, "grad_norm": 0.2388308346271515, "learning_rate": 9.645094523028144e-05, "loss": 1.2652, "step": 2314 }, { "epoch": 0.6267081585712353, "grad_norm": 0.2252940982580185, "learning_rate": 9.643928665333628e-05, "loss": 1.2595, "step": 2316 }, { "epoch": 0.6272493573264781, "grad_norm": 0.24020199477672577, "learning_rate": 9.64276096656523e-05, "loss": 1.3079, "step": 2318 }, { "epoch": 0.627790556081721, "grad_norm": 0.23432402312755585, "learning_rate": 9.64159142718588e-05, "loss": 1.2718, "step": 2320 }, { "epoch": 0.6283317548369639, "grad_norm": 0.22962002456188202, "learning_rate": 9.640420047659239e-05, "loss": 1.2606, "step": 2322 }, { "epoch": 0.6288729535922067, "grad_norm": 0.21251855790615082, "learning_rate": 9.6392468284497e-05, "loss": 1.2568, "step": 2324 }, { "epoch": 0.6294141523474496, "grad_norm": 0.2140374481678009, "learning_rate": 9.63807177002238e-05, "loss": 1.276, "step": 2326 }, { "epoch": 0.6299553511026925, "grad_norm": 0.21366523206233978, "learning_rate": 9.636894872843132e-05, "loss": 1.2521, "step": 2328 }, { "epoch": 0.6304965498579354, "grad_norm": 0.22407646477222443, "learning_rate": 9.635716137378528e-05, "loss": 1.2692, "step": 2330 }, { "epoch": 0.6310377486131782, "grad_norm": 0.24414391815662384, "learning_rate": 9.63453556409588e-05, "loss": 1.2554, "step": 2332 }, { "epoch": 0.631578947368421, "grad_norm": 0.23787756264209747, "learning_rate": 9.633353153463219e-05, "loss": 1.2471, "step": 2334 }, { "epoch": 0.632120146123664, "grad_norm": 0.2248927801847458, "learning_rate": 9.63216890594931e-05, "loss": 1.2586, "step": 2336 }, { "epoch": 0.6326613448789068, "grad_norm": 0.2224208265542984, "learning_rate": 9.630982822023648e-05, "loss": 1.293, "step": 2338 }, { "epoch": 0.6332025436341496, "grad_norm": 0.2312854379415512, "learning_rate": 9.62979490215645e-05, "loss": 1.2604, "step": 2340 }, { "epoch": 0.6337437423893925, "grad_norm": 0.22154025733470917, "learning_rate": 9.628605146818665e-05, "loss": 1.2645, "step": 2342 }, { "epoch": 0.6342849411446354, "grad_norm": 0.2136823982000351, "learning_rate": 9.627413556481968e-05, "loss": 1.2375, "step": 2344 }, { "epoch": 0.6348261398998782, "grad_norm": 0.21541404724121094, "learning_rate": 9.626220131618763e-05, "loss": 1.2771, "step": 2346 }, { "epoch": 0.6353673386551211, "grad_norm": 0.22025029361248016, "learning_rate": 9.625024872702178e-05, "loss": 1.261, "step": 2348 }, { "epoch": 0.6359085374103639, "grad_norm": 0.2375534474849701, "learning_rate": 9.623827780206073e-05, "loss": 1.2808, "step": 2350 }, { "epoch": 0.6364497361656068, "grad_norm": 0.23297767341136932, "learning_rate": 9.62262885460503e-05, "loss": 1.2697, "step": 2352 }, { "epoch": 0.6369909349208497, "grad_norm": 0.24082797765731812, "learning_rate": 9.621428096374363e-05, "loss": 1.2347, "step": 2354 }, { "epoch": 0.6375321336760925, "grad_norm": 0.22009813785552979, "learning_rate": 9.620225505990105e-05, "loss": 1.2631, "step": 2356 }, { "epoch": 0.6380733324313355, "grad_norm": 0.22501374781131744, "learning_rate": 9.619021083929025e-05, "loss": 1.2563, "step": 2358 }, { "epoch": 0.6386145311865783, "grad_norm": 0.22494594752788544, "learning_rate": 9.61781483066861e-05, "loss": 1.2532, "step": 2360 }, { "epoch": 0.6391557299418211, "grad_norm": 0.3569008409976959, "learning_rate": 9.616606746687078e-05, "loss": 1.2684, "step": 2362 }, { "epoch": 0.639696928697064, "grad_norm": 207.0965576171875, "learning_rate": 9.61539683246337e-05, "loss": 1.3637, "step": 2364 }, { "epoch": 0.6402381274523069, "grad_norm": 0.4599202573299408, "learning_rate": 9.614185088477152e-05, "loss": 1.292, "step": 2366 }, { "epoch": 0.6407793262075497, "grad_norm": 0.3244802951812744, "learning_rate": 9.61297151520882e-05, "loss": 1.2585, "step": 2368 }, { "epoch": 0.6413205249627926, "grad_norm": 0.30332016944885254, "learning_rate": 9.611756113139488e-05, "loss": 1.2619, "step": 2370 }, { "epoch": 0.6418617237180354, "grad_norm": 0.2982909083366394, "learning_rate": 9.610538882751001e-05, "loss": 1.2637, "step": 2372 }, { "epoch": 0.6424029224732783, "grad_norm": 5.417288303375244, "learning_rate": 9.609319824525928e-05, "loss": 1.2713, "step": 2374 }, { "epoch": 0.6429441212285212, "grad_norm": 0.4198252260684967, "learning_rate": 9.608098938947562e-05, "loss": 1.2541, "step": 2376 }, { "epoch": 0.643485319983764, "grad_norm": 0.8178582191467285, "learning_rate": 9.606876226499918e-05, "loss": 1.2884, "step": 2378 }, { "epoch": 0.6440265187390068, "grad_norm": 0.33514025807380676, "learning_rate": 9.60565168766774e-05, "loss": 1.2719, "step": 2380 }, { "epoch": 0.6445677174942498, "grad_norm": 0.2973354756832123, "learning_rate": 9.60442532293649e-05, "loss": 1.2515, "step": 2382 }, { "epoch": 0.6451089162494926, "grad_norm": 0.4670213758945465, "learning_rate": 9.603197132792359e-05, "loss": 1.2665, "step": 2384 }, { "epoch": 0.6456501150047355, "grad_norm": 0.3197322189807892, "learning_rate": 9.60196711772226e-05, "loss": 1.2574, "step": 2386 }, { "epoch": 0.6461913137599784, "grad_norm": 1.1344069242477417, "learning_rate": 9.600735278213828e-05, "loss": 1.2689, "step": 2388 }, { "epoch": 0.6467325125152212, "grad_norm": 0.5379347801208496, "learning_rate": 9.599501614755425e-05, "loss": 1.249, "step": 2390 }, { "epoch": 0.6472737112704641, "grad_norm": 0.33201339840888977, "learning_rate": 9.598266127836131e-05, "loss": 1.2729, "step": 2392 }, { "epoch": 0.6478149100257069, "grad_norm": 8.969808578491211, "learning_rate": 9.597028817945753e-05, "loss": 1.2768, "step": 2394 }, { "epoch": 0.6483561087809498, "grad_norm": 0.3650411069393158, "learning_rate": 9.595789685574821e-05, "loss": 1.2511, "step": 2396 }, { "epoch": 0.6488973075361927, "grad_norm": 0.8414996862411499, "learning_rate": 9.594548731214583e-05, "loss": 1.2707, "step": 2398 }, { "epoch": 0.6494385062914355, "grad_norm": 0.5362874269485474, "learning_rate": 9.593305955357016e-05, "loss": 1.2453, "step": 2400 }, { "epoch": 0.6499797050466783, "grad_norm": 0.40546804666519165, "learning_rate": 9.592061358494813e-05, "loss": 1.2665, "step": 2402 }, { "epoch": 0.6505209038019213, "grad_norm": 0.29758453369140625, "learning_rate": 9.590814941121389e-05, "loss": 1.2538, "step": 2404 }, { "epoch": 0.6510621025571641, "grad_norm": 0.2636415660381317, "learning_rate": 9.589566703730888e-05, "loss": 1.2457, "step": 2406 }, { "epoch": 0.651603301312407, "grad_norm": 0.2844487130641937, "learning_rate": 9.588316646818168e-05, "loss": 1.257, "step": 2408 }, { "epoch": 0.6521445000676498, "grad_norm": 0.2777060568332672, "learning_rate": 9.587064770878808e-05, "loss": 1.2506, "step": 2410 }, { "epoch": 0.6526856988228927, "grad_norm": 0.2585492730140686, "learning_rate": 9.585811076409117e-05, "loss": 1.2472, "step": 2412 }, { "epoch": 0.6532268975781356, "grad_norm": 0.24312525987625122, "learning_rate": 9.584555563906116e-05, "loss": 1.2703, "step": 2414 }, { "epoch": 0.6537680963333784, "grad_norm": 0.2286798357963562, "learning_rate": 9.583298233867549e-05, "loss": 1.2582, "step": 2416 }, { "epoch": 0.6543092950886213, "grad_norm": 0.22804994881153107, "learning_rate": 9.582039086791883e-05, "loss": 1.2538, "step": 2418 }, { "epoch": 0.6548504938438642, "grad_norm": 0.2244635969400406, "learning_rate": 9.580778123178303e-05, "loss": 1.2481, "step": 2420 }, { "epoch": 0.655391692599107, "grad_norm": 0.22303158044815063, "learning_rate": 9.579515343526714e-05, "loss": 1.2574, "step": 2422 }, { "epoch": 0.6559328913543498, "grad_norm": 0.2208811491727829, "learning_rate": 9.578250748337742e-05, "loss": 1.2579, "step": 2424 }, { "epoch": 0.6564740901095928, "grad_norm": 0.20853403210639954, "learning_rate": 9.576984338112736e-05, "loss": 1.2619, "step": 2426 }, { "epoch": 0.6570152888648356, "grad_norm": 0.20974035561084747, "learning_rate": 9.575716113353757e-05, "loss": 1.2605, "step": 2428 }, { "epoch": 0.6575564876200785, "grad_norm": 0.22891463339328766, "learning_rate": 9.57444607456359e-05, "loss": 1.2586, "step": 2430 }, { "epoch": 0.6580976863753213, "grad_norm": 0.21693287789821625, "learning_rate": 9.57317422224574e-05, "loss": 1.2505, "step": 2432 }, { "epoch": 0.6586388851305642, "grad_norm": 0.21806494891643524, "learning_rate": 9.57190055690443e-05, "loss": 1.261, "step": 2434 }, { "epoch": 0.6591800838858071, "grad_norm": 0.24015147984027863, "learning_rate": 9.570625079044601e-05, "loss": 1.2564, "step": 2436 }, { "epoch": 0.6597212826410499, "grad_norm": 0.26577669382095337, "learning_rate": 9.569347789171912e-05, "loss": 1.2716, "step": 2438 }, { "epoch": 0.6602624813962927, "grad_norm": 0.2382255643606186, "learning_rate": 9.568068687792741e-05, "loss": 1.2465, "step": 2440 }, { "epoch": 0.6608036801515357, "grad_norm": 0.22770415246486664, "learning_rate": 9.566787775414188e-05, "loss": 1.2229, "step": 2442 }, { "epoch": 0.6613448789067785, "grad_norm": 0.23449081182479858, "learning_rate": 9.565505052544065e-05, "loss": 1.2582, "step": 2444 }, { "epoch": 0.6618860776620213, "grad_norm": 0.22105945646762848, "learning_rate": 9.564220519690903e-05, "loss": 1.2505, "step": 2446 }, { "epoch": 0.6624272764172643, "grad_norm": 0.22349369525909424, "learning_rate": 9.562934177363953e-05, "loss": 1.2578, "step": 2448 }, { "epoch": 0.6629684751725071, "grad_norm": 0.23770608007907867, "learning_rate": 9.561646026073184e-05, "loss": 1.2399, "step": 2450 }, { "epoch": 0.66350967392775, "grad_norm": 0.2204604148864746, "learning_rate": 9.56035606632928e-05, "loss": 1.2512, "step": 2452 }, { "epoch": 0.6640508726829928, "grad_norm": 0.2204030454158783, "learning_rate": 9.559064298643638e-05, "loss": 1.2821, "step": 2454 }, { "epoch": 0.6645920714382357, "grad_norm": 0.2169465720653534, "learning_rate": 9.55777072352838e-05, "loss": 1.2529, "step": 2456 }, { "epoch": 0.6651332701934786, "grad_norm": 0.2273695021867752, "learning_rate": 9.55647534149634e-05, "loss": 1.2497, "step": 2458 }, { "epoch": 0.6656744689487214, "grad_norm": 0.22077496349811554, "learning_rate": 9.555178153061069e-05, "loss": 1.2433, "step": 2460 }, { "epoch": 0.6662156677039642, "grad_norm": 0.2203417718410492, "learning_rate": 9.553879158736833e-05, "loss": 1.2464, "step": 2462 }, { "epoch": 0.6667568664592072, "grad_norm": 0.22205059230327606, "learning_rate": 9.552578359038617e-05, "loss": 1.2611, "step": 2464 }, { "epoch": 0.66729806521445, "grad_norm": 0.2206515222787857, "learning_rate": 9.551275754482119e-05, "loss": 1.2624, "step": 2466 }, { "epoch": 0.6678392639696928, "grad_norm": 0.21758343279361725, "learning_rate": 9.549971345583753e-05, "loss": 1.2406, "step": 2468 }, { "epoch": 0.6683804627249358, "grad_norm": 0.21517138183116913, "learning_rate": 9.548665132860647e-05, "loss": 1.2538, "step": 2470 }, { "epoch": 0.6689216614801786, "grad_norm": 0.21490350365638733, "learning_rate": 9.547357116830648e-05, "loss": 1.2534, "step": 2472 }, { "epoch": 0.6694628602354215, "grad_norm": 0.2156359702348709, "learning_rate": 9.546047298012315e-05, "loss": 1.2459, "step": 2474 }, { "epoch": 0.6700040589906643, "grad_norm": 0.2196791172027588, "learning_rate": 9.544735676924923e-05, "loss": 1.2534, "step": 2476 }, { "epoch": 0.6705452577459072, "grad_norm": 0.22666549682617188, "learning_rate": 9.54342225408846e-05, "loss": 1.252, "step": 2478 }, { "epoch": 0.6710864565011501, "grad_norm": 0.2314993143081665, "learning_rate": 9.54210703002363e-05, "loss": 1.2478, "step": 2480 }, { "epoch": 0.6716276552563929, "grad_norm": 0.2225077599287033, "learning_rate": 9.54079000525185e-05, "loss": 1.2465, "step": 2482 }, { "epoch": 0.6721688540116357, "grad_norm": 0.22268906235694885, "learning_rate": 9.539471180295249e-05, "loss": 1.2453, "step": 2484 }, { "epoch": 0.6727100527668787, "grad_norm": 0.30744513869285583, "learning_rate": 9.538150555676677e-05, "loss": 1.2874, "step": 2486 }, { "epoch": 0.6732512515221215, "grad_norm": 0.27435171604156494, "learning_rate": 9.536828131919686e-05, "loss": 1.2533, "step": 2488 }, { "epoch": 0.6737924502773643, "grad_norm": 0.5657795667648315, "learning_rate": 9.535503909548553e-05, "loss": 1.2567, "step": 2490 }, { "epoch": 0.6743336490326072, "grad_norm": 0.4795803129673004, "learning_rate": 9.53417788908826e-05, "loss": 1.2563, "step": 2492 }, { "epoch": 0.6748748477878501, "grad_norm": 0.3125123977661133, "learning_rate": 9.532850071064503e-05, "loss": 1.251, "step": 2494 }, { "epoch": 0.675416046543093, "grad_norm": 0.2949443459510803, "learning_rate": 9.531520456003696e-05, "loss": 1.2491, "step": 2496 }, { "epoch": 0.6759572452983358, "grad_norm": 0.289389967918396, "learning_rate": 9.530189044432959e-05, "loss": 1.2571, "step": 2498 }, { "epoch": 0.6764984440535787, "grad_norm": 0.24411126971244812, "learning_rate": 9.528855836880127e-05, "loss": 1.2528, "step": 2500 }, { "epoch": 0.6770396428088216, "grad_norm": 0.38176965713500977, "learning_rate": 9.527520833873748e-05, "loss": 1.2462, "step": 2502 }, { "epoch": 0.6775808415640644, "grad_norm": 0.25295090675354004, "learning_rate": 9.52618403594308e-05, "loss": 1.2601, "step": 2504 }, { "epoch": 0.6781220403193072, "grad_norm": 0.24630951881408691, "learning_rate": 9.524845443618091e-05, "loss": 1.2398, "step": 2506 }, { "epoch": 0.6786632390745502, "grad_norm": 0.25156068801879883, "learning_rate": 9.523505057429466e-05, "loss": 1.2429, "step": 2508 }, { "epoch": 0.679204437829793, "grad_norm": 0.23003700375556946, "learning_rate": 9.522162877908596e-05, "loss": 1.2569, "step": 2510 }, { "epoch": 0.6797456365850358, "grad_norm": 0.2248392552137375, "learning_rate": 9.520818905587585e-05, "loss": 1.2506, "step": 2512 }, { "epoch": 0.6802868353402787, "grad_norm": 0.22383219003677368, "learning_rate": 9.519473140999246e-05, "loss": 1.2294, "step": 2514 }, { "epoch": 0.6808280340955216, "grad_norm": 0.22723117470741272, "learning_rate": 9.518125584677106e-05, "loss": 1.2658, "step": 2516 }, { "epoch": 0.6813692328507645, "grad_norm": 0.24425800144672394, "learning_rate": 9.516776237155402e-05, "loss": 1.233, "step": 2518 }, { "epoch": 0.6819104316060073, "grad_norm": 0.22345170378684998, "learning_rate": 9.515425098969075e-05, "loss": 1.248, "step": 2520 }, { "epoch": 0.6824516303612501, "grad_norm": 0.21297229826450348, "learning_rate": 9.514072170653782e-05, "loss": 1.2453, "step": 2522 }, { "epoch": 0.6829928291164931, "grad_norm": 0.21216444671154022, "learning_rate": 9.51271745274589e-05, "loss": 1.2473, "step": 2524 }, { "epoch": 0.6835340278717359, "grad_norm": 0.2091735154390335, "learning_rate": 9.511360945782472e-05, "loss": 1.2451, "step": 2526 }, { "epoch": 0.6840752266269787, "grad_norm": 0.21291106939315796, "learning_rate": 9.510002650301313e-05, "loss": 1.2772, "step": 2528 }, { "epoch": 0.6846164253822217, "grad_norm": 0.21953986585140228, "learning_rate": 9.508642566840901e-05, "loss": 1.2533, "step": 2530 }, { "epoch": 0.6851576241374645, "grad_norm": 0.21948380768299103, "learning_rate": 9.507280695940446e-05, "loss": 1.2797, "step": 2532 }, { "epoch": 0.6856988228927073, "grad_norm": 0.21971148252487183, "learning_rate": 9.505917038139851e-05, "loss": 1.2609, "step": 2534 }, { "epoch": 0.6862400216479502, "grad_norm": 0.21478046476840973, "learning_rate": 9.504551593979738e-05, "loss": 1.2625, "step": 2536 }, { "epoch": 0.6867812204031931, "grad_norm": 0.21927322447299957, "learning_rate": 9.503184364001431e-05, "loss": 1.2415, "step": 2538 }, { "epoch": 0.687322419158436, "grad_norm": 0.2084941267967224, "learning_rate": 9.501815348746971e-05, "loss": 1.2455, "step": 2540 }, { "epoch": 0.6878636179136788, "grad_norm": 0.20336540043354034, "learning_rate": 9.500444548759095e-05, "loss": 1.2505, "step": 2542 }, { "epoch": 0.6884048166689216, "grad_norm": 0.21661430597305298, "learning_rate": 9.499071964581256e-05, "loss": 1.235, "step": 2544 }, { "epoch": 0.6889460154241646, "grad_norm": 0.2240605354309082, "learning_rate": 9.497697596757609e-05, "loss": 1.2546, "step": 2546 }, { "epoch": 0.6894872141794074, "grad_norm": 0.2289547622203827, "learning_rate": 9.496321445833022e-05, "loss": 1.2387, "step": 2548 }, { "epoch": 0.6900284129346502, "grad_norm": 0.22886811196804047, "learning_rate": 9.494943512353063e-05, "loss": 1.2531, "step": 2550 }, { "epoch": 0.6905696116898931, "grad_norm": 0.2151922732591629, "learning_rate": 9.493563796864014e-05, "loss": 1.2447, "step": 2552 }, { "epoch": 0.691110810445136, "grad_norm": 0.2263440489768982, "learning_rate": 9.492182299912857e-05, "loss": 1.245, "step": 2554 }, { "epoch": 0.6916520092003788, "grad_norm": 0.23101641237735748, "learning_rate": 9.490799022047286e-05, "loss": 1.2253, "step": 2556 }, { "epoch": 0.6921932079556217, "grad_norm": 0.2258201241493225, "learning_rate": 9.489413963815694e-05, "loss": 1.2477, "step": 2558 }, { "epoch": 0.6927344067108646, "grad_norm": 0.2227460741996765, "learning_rate": 9.488027125767187e-05, "loss": 1.2215, "step": 2560 }, { "epoch": 0.6932756054661074, "grad_norm": 0.2213139533996582, "learning_rate": 9.48663850845157e-05, "loss": 1.2308, "step": 2562 }, { "epoch": 0.6938168042213503, "grad_norm": 0.22192241251468658, "learning_rate": 9.485248112419363e-05, "loss": 1.2487, "step": 2564 }, { "epoch": 0.6943580029765931, "grad_norm": 0.21532469987869263, "learning_rate": 9.483855938221777e-05, "loss": 1.2498, "step": 2566 }, { "epoch": 0.6948992017318361, "grad_norm": 0.21143551170825958, "learning_rate": 9.482461986410743e-05, "loss": 1.2453, "step": 2568 }, { "epoch": 0.6954404004870789, "grad_norm": 0.21282954514026642, "learning_rate": 9.481066257538886e-05, "loss": 1.2499, "step": 2570 }, { "epoch": 0.6959815992423217, "grad_norm": 0.219988152384758, "learning_rate": 9.47966875215954e-05, "loss": 1.2478, "step": 2572 }, { "epoch": 0.6965227979975646, "grad_norm": 0.21327020227909088, "learning_rate": 9.478269470826744e-05, "loss": 1.2364, "step": 2574 }, { "epoch": 0.6970639967528075, "grad_norm": 0.2091750204563141, "learning_rate": 9.476868414095237e-05, "loss": 1.2494, "step": 2576 }, { "epoch": 0.6976051955080503, "grad_norm": 0.2145649939775467, "learning_rate": 9.475465582520466e-05, "loss": 1.254, "step": 2578 }, { "epoch": 0.6981463942632932, "grad_norm": 0.21477670967578888, "learning_rate": 9.474060976658578e-05, "loss": 1.2678, "step": 2580 }, { "epoch": 0.6986875930185361, "grad_norm": 0.21862445771694183, "learning_rate": 9.472654597066431e-05, "loss": 1.2512, "step": 2582 }, { "epoch": 0.699228791773779, "grad_norm": 0.21111270785331726, "learning_rate": 9.471246444301574e-05, "loss": 1.2587, "step": 2584 }, { "epoch": 0.6997699905290218, "grad_norm": 0.21332062780857086, "learning_rate": 9.469836518922269e-05, "loss": 1.2569, "step": 2586 }, { "epoch": 0.7003111892842646, "grad_norm": 0.21386279165744781, "learning_rate": 9.468424821487476e-05, "loss": 1.2308, "step": 2588 }, { "epoch": 0.7008523880395076, "grad_norm": 0.20638014376163483, "learning_rate": 9.46701135255686e-05, "loss": 1.2453, "step": 2590 }, { "epoch": 0.7013935867947504, "grad_norm": 0.2437312752008438, "learning_rate": 9.465596112690787e-05, "loss": 1.2523, "step": 2592 }, { "epoch": 0.7019347855499932, "grad_norm": 0.22395059466362, "learning_rate": 9.464179102450325e-05, "loss": 1.2535, "step": 2594 }, { "epoch": 0.7024759843052361, "grad_norm": 0.22118812799453735, "learning_rate": 9.462760322397246e-05, "loss": 1.2488, "step": 2596 }, { "epoch": 0.703017183060479, "grad_norm": 0.22880488634109497, "learning_rate": 9.461339773094021e-05, "loss": 1.2407, "step": 2598 }, { "epoch": 0.7035583818157218, "grad_norm": 0.21199798583984375, "learning_rate": 9.45991745510382e-05, "loss": 1.2476, "step": 2600 }, { "epoch": 0.7040995805709647, "grad_norm": 0.20646455883979797, "learning_rate": 9.458493368990519e-05, "loss": 1.2556, "step": 2602 }, { "epoch": 0.7046407793262075, "grad_norm": 0.2136593908071518, "learning_rate": 9.457067515318698e-05, "loss": 1.2567, "step": 2604 }, { "epoch": 0.7051819780814504, "grad_norm": 0.214664489030838, "learning_rate": 9.455639894653627e-05, "loss": 1.266, "step": 2606 }, { "epoch": 0.7057231768366933, "grad_norm": 0.2101629078388214, "learning_rate": 9.454210507561285e-05, "loss": 1.2499, "step": 2608 }, { "epoch": 0.7062643755919361, "grad_norm": 0.2157791256904602, "learning_rate": 9.452779354608348e-05, "loss": 1.2421, "step": 2610 }, { "epoch": 0.706805574347179, "grad_norm": 0.20827960968017578, "learning_rate": 9.451346436362196e-05, "loss": 1.2566, "step": 2612 }, { "epoch": 0.7073467731024219, "grad_norm": 0.21283753216266632, "learning_rate": 9.449911753390901e-05, "loss": 1.2561, "step": 2614 }, { "epoch": 0.7078879718576647, "grad_norm": 0.22358572483062744, "learning_rate": 9.448475306263245e-05, "loss": 1.2418, "step": 2616 }, { "epoch": 0.7084291706129076, "grad_norm": 0.21198727190494537, "learning_rate": 9.4470370955487e-05, "loss": 1.2511, "step": 2618 }, { "epoch": 0.7089703693681505, "grad_norm": 0.21495653688907623, "learning_rate": 9.445597121817442e-05, "loss": 1.2294, "step": 2620 }, { "epoch": 0.7095115681233933, "grad_norm": 0.21378777921199799, "learning_rate": 9.444155385640345e-05, "loss": 1.2375, "step": 2622 }, { "epoch": 0.7100527668786362, "grad_norm": 0.21197205781936646, "learning_rate": 9.442711887588981e-05, "loss": 1.251, "step": 2624 }, { "epoch": 0.710593965633879, "grad_norm": 0.21979504823684692, "learning_rate": 9.441266628235624e-05, "loss": 1.2467, "step": 2626 }, { "epoch": 0.7111351643891219, "grad_norm": 0.21565599739551544, "learning_rate": 9.43981960815324e-05, "loss": 1.22, "step": 2628 }, { "epoch": 0.7116763631443648, "grad_norm": 0.19891119003295898, "learning_rate": 9.438370827915499e-05, "loss": 1.215, "step": 2630 }, { "epoch": 0.7122175618996076, "grad_norm": 0.21079830825328827, "learning_rate": 9.436920288096764e-05, "loss": 1.2407, "step": 2632 }, { "epoch": 0.7127587606548504, "grad_norm": 0.21531549096107483, "learning_rate": 9.435467989272099e-05, "loss": 1.2348, "step": 2634 }, { "epoch": 0.7132999594100934, "grad_norm": 0.22583681344985962, "learning_rate": 9.434013932017265e-05, "loss": 1.2567, "step": 2636 }, { "epoch": 0.7138411581653362, "grad_norm": 0.24707137048244476, "learning_rate": 9.432558116908718e-05, "loss": 1.244, "step": 2638 }, { "epoch": 0.714382356920579, "grad_norm": 0.23890820145606995, "learning_rate": 9.431100544523614e-05, "loss": 1.2361, "step": 2640 }, { "epoch": 0.714923555675822, "grad_norm": 0.2275097668170929, "learning_rate": 9.429641215439802e-05, "loss": 1.2337, "step": 2642 }, { "epoch": 0.7154647544310648, "grad_norm": 0.22068314254283905, "learning_rate": 9.42818013023583e-05, "loss": 1.246, "step": 2644 }, { "epoch": 0.7160059531863077, "grad_norm": 0.22214053571224213, "learning_rate": 9.426717289490943e-05, "loss": 1.2507, "step": 2646 }, { "epoch": 0.7165471519415505, "grad_norm": 0.21483547985553741, "learning_rate": 9.425252693785078e-05, "loss": 1.2223, "step": 2648 }, { "epoch": 0.7170883506967934, "grad_norm": 0.21457841992378235, "learning_rate": 9.423786343698872e-05, "loss": 1.2494, "step": 2650 }, { "epoch": 0.7176295494520363, "grad_norm": 0.20471327006816864, "learning_rate": 9.422318239813656e-05, "loss": 1.2426, "step": 2652 }, { "epoch": 0.7181707482072791, "grad_norm": 0.20799721777439117, "learning_rate": 9.420848382711455e-05, "loss": 1.2409, "step": 2654 }, { "epoch": 0.7187119469625219, "grad_norm": 0.2095753401517868, "learning_rate": 9.41937677297499e-05, "loss": 1.2349, "step": 2656 }, { "epoch": 0.7192531457177649, "grad_norm": 0.2103864848613739, "learning_rate": 9.417903411187678e-05, "loss": 1.2432, "step": 2658 }, { "epoch": 0.7197943444730077, "grad_norm": 0.20874999463558197, "learning_rate": 9.416428297933631e-05, "loss": 1.24, "step": 2660 }, { "epoch": 0.7203355432282506, "grad_norm": 0.21667924523353577, "learning_rate": 9.41495143379765e-05, "loss": 1.254, "step": 2662 }, { "epoch": 0.7208767419834934, "grad_norm": 0.20849965512752533, "learning_rate": 9.413472819365237e-05, "loss": 1.2494, "step": 2664 }, { "epoch": 0.7214179407387363, "grad_norm": 0.2131972759962082, "learning_rate": 9.411992455222585e-05, "loss": 1.2233, "step": 2666 }, { "epoch": 0.7219591394939792, "grad_norm": 0.21590593457221985, "learning_rate": 9.410510341956579e-05, "loss": 1.2428, "step": 2668 }, { "epoch": 0.722500338249222, "grad_norm": 0.21747298538684845, "learning_rate": 9.409026480154801e-05, "loss": 1.2495, "step": 2670 }, { "epoch": 0.7230415370044649, "grad_norm": 0.21579551696777344, "learning_rate": 9.407540870405523e-05, "loss": 1.2513, "step": 2672 }, { "epoch": 0.7235827357597078, "grad_norm": 0.20697540044784546, "learning_rate": 9.40605351329771e-05, "loss": 1.2364, "step": 2674 }, { "epoch": 0.7241239345149506, "grad_norm": 0.215818852186203, "learning_rate": 9.404564409421024e-05, "loss": 1.2242, "step": 2676 }, { "epoch": 0.7246651332701934, "grad_norm": 0.21552613377571106, "learning_rate": 9.403073559365816e-05, "loss": 1.2378, "step": 2678 }, { "epoch": 0.7252063320254364, "grad_norm": 0.20463980734348297, "learning_rate": 9.401580963723127e-05, "loss": 1.2144, "step": 2680 }, { "epoch": 0.7257475307806792, "grad_norm": 0.20748072862625122, "learning_rate": 9.400086623084696e-05, "loss": 1.2422, "step": 2682 }, { "epoch": 0.726288729535922, "grad_norm": 0.21622253954410553, "learning_rate": 9.398590538042948e-05, "loss": 1.2466, "step": 2684 }, { "epoch": 0.7268299282911649, "grad_norm": 0.21229557693004608, "learning_rate": 9.397092709191005e-05, "loss": 1.2533, "step": 2686 }, { "epoch": 0.7273711270464078, "grad_norm": 0.2206655591726303, "learning_rate": 9.395593137122676e-05, "loss": 1.2368, "step": 2688 }, { "epoch": 0.7279123258016507, "grad_norm": 0.22106198966503143, "learning_rate": 9.39409182243246e-05, "loss": 1.2523, "step": 2690 }, { "epoch": 0.7284535245568935, "grad_norm": 0.21155452728271484, "learning_rate": 9.392588765715554e-05, "loss": 1.2558, "step": 2692 }, { "epoch": 0.7289947233121363, "grad_norm": 0.2205546647310257, "learning_rate": 9.39108396756784e-05, "loss": 1.2409, "step": 2694 }, { "epoch": 0.7295359220673793, "grad_norm": 0.2159835547208786, "learning_rate": 9.389577428585888e-05, "loss": 1.248, "step": 2696 }, { "epoch": 0.7300771208226221, "grad_norm": 0.20885945856571198, "learning_rate": 9.388069149366966e-05, "loss": 1.2388, "step": 2698 }, { "epoch": 0.7306183195778649, "grad_norm": 0.2038174420595169, "learning_rate": 9.386559130509026e-05, "loss": 1.2213, "step": 2700 }, { "epoch": 0.7311595183331079, "grad_norm": 0.21526674926280975, "learning_rate": 9.385047372610709e-05, "loss": 1.2369, "step": 2702 }, { "epoch": 0.7317007170883507, "grad_norm": 0.21164047718048096, "learning_rate": 9.383533876271349e-05, "loss": 1.2414, "step": 2704 }, { "epoch": 0.7322419158435935, "grad_norm": 0.20897522568702698, "learning_rate": 9.38201864209097e-05, "loss": 1.2396, "step": 2706 }, { "epoch": 0.7327831145988364, "grad_norm": 0.20399637520313263, "learning_rate": 9.38050167067028e-05, "loss": 1.2254, "step": 2708 }, { "epoch": 0.7333243133540793, "grad_norm": 0.21298326551914215, "learning_rate": 9.37898296261068e-05, "loss": 1.2465, "step": 2710 }, { "epoch": 0.7338655121093222, "grad_norm": 0.2132457196712494, "learning_rate": 9.377462518514257e-05, "loss": 1.2309, "step": 2712 }, { "epoch": 0.734406710864565, "grad_norm": 0.22349213063716888, "learning_rate": 9.375940338983789e-05, "loss": 1.2446, "step": 2714 }, { "epoch": 0.7349479096198078, "grad_norm": 0.2120126485824585, "learning_rate": 9.374416424622738e-05, "loss": 1.2468, "step": 2716 }, { "epoch": 0.7354891083750508, "grad_norm": 0.21380288898944855, "learning_rate": 9.372890776035259e-05, "loss": 1.2379, "step": 2718 }, { "epoch": 0.7360303071302936, "grad_norm": 0.20808126032352448, "learning_rate": 9.371363393826187e-05, "loss": 1.2507, "step": 2720 }, { "epoch": 0.7365715058855364, "grad_norm": 0.21934252977371216, "learning_rate": 9.369834278601052e-05, "loss": 1.2572, "step": 2722 }, { "epoch": 0.7371127046407794, "grad_norm": 0.20850279927253723, "learning_rate": 9.36830343096607e-05, "loss": 1.2538, "step": 2724 }, { "epoch": 0.7376539033960222, "grad_norm": 0.216210275888443, "learning_rate": 9.366770851528137e-05, "loss": 1.2307, "step": 2726 }, { "epoch": 0.738195102151265, "grad_norm": 0.20865590870380402, "learning_rate": 9.365236540894842e-05, "loss": 1.2293, "step": 2728 }, { "epoch": 0.7387363009065079, "grad_norm": 0.20254139602184296, "learning_rate": 9.363700499674462e-05, "loss": 1.2543, "step": 2730 }, { "epoch": 0.7392774996617508, "grad_norm": 0.21307919919490814, "learning_rate": 9.36216272847595e-05, "loss": 1.2353, "step": 2732 }, { "epoch": 0.7398186984169937, "grad_norm": 0.21729370951652527, "learning_rate": 9.360623227908957e-05, "loss": 1.2384, "step": 2734 }, { "epoch": 0.7403598971722365, "grad_norm": 0.2232731133699417, "learning_rate": 9.359081998583812e-05, "loss": 1.2237, "step": 2736 }, { "epoch": 0.7409010959274793, "grad_norm": 0.2216210663318634, "learning_rate": 9.357539041111531e-05, "loss": 1.2338, "step": 2738 }, { "epoch": 0.7414422946827223, "grad_norm": 0.22623343765735626, "learning_rate": 9.355994356103818e-05, "loss": 1.2307, "step": 2740 }, { "epoch": 0.7419834934379651, "grad_norm": 0.23438121378421783, "learning_rate": 9.354447944173059e-05, "loss": 1.2363, "step": 2742 }, { "epoch": 0.7425246921932079, "grad_norm": 0.22417870163917542, "learning_rate": 9.352899805932322e-05, "loss": 1.2658, "step": 2744 }, { "epoch": 0.7430658909484508, "grad_norm": 0.23071123659610748, "learning_rate": 9.351349941995366e-05, "loss": 1.2414, "step": 2746 }, { "epoch": 0.7436070897036937, "grad_norm": 0.2217751145362854, "learning_rate": 9.349798352976629e-05, "loss": 1.2392, "step": 2748 }, { "epoch": 0.7441482884589365, "grad_norm": 0.23684772849082947, "learning_rate": 9.348245039491235e-05, "loss": 1.2503, "step": 2750 }, { "epoch": 0.7446894872141794, "grad_norm": 0.22171282768249512, "learning_rate": 9.34669000215499e-05, "loss": 1.2287, "step": 2752 }, { "epoch": 0.7452306859694223, "grad_norm": 0.22197501361370087, "learning_rate": 9.345133241584387e-05, "loss": 1.1991, "step": 2754 }, { "epoch": 0.7457718847246652, "grad_norm": 0.2621997594833374, "learning_rate": 9.343574758396598e-05, "loss": 1.2346, "step": 2756 }, { "epoch": 0.746313083479908, "grad_norm": 0.21338815987110138, "learning_rate": 9.342014553209482e-05, "loss": 1.2437, "step": 2758 }, { "epoch": 0.7468542822351508, "grad_norm": 0.21545028686523438, "learning_rate": 9.340452626641574e-05, "loss": 1.2558, "step": 2760 }, { "epoch": 0.7473954809903938, "grad_norm": 0.20994403958320618, "learning_rate": 9.338888979312101e-05, "loss": 1.2272, "step": 2762 }, { "epoch": 0.7479366797456366, "grad_norm": 0.21458999812602997, "learning_rate": 9.337323611840964e-05, "loss": 1.2522, "step": 2764 }, { "epoch": 0.7484778785008794, "grad_norm": 0.21569861471652985, "learning_rate": 9.335756524848751e-05, "loss": 1.2348, "step": 2766 }, { "epoch": 0.7490190772561223, "grad_norm": 0.21545200049877167, "learning_rate": 9.334187718956727e-05, "loss": 1.2351, "step": 2768 }, { "epoch": 0.7495602760113652, "grad_norm": 0.20550045371055603, "learning_rate": 9.332617194786844e-05, "loss": 1.2333, "step": 2770 }, { "epoch": 0.750101474766608, "grad_norm": 0.21121762692928314, "learning_rate": 9.331044952961729e-05, "loss": 1.2347, "step": 2772 }, { "epoch": 0.7506426735218509, "grad_norm": 0.2130371332168579, "learning_rate": 9.329470994104697e-05, "loss": 1.2384, "step": 2774 }, { "epoch": 0.7511838722770937, "grad_norm": 0.2100599855184555, "learning_rate": 9.327895318839739e-05, "loss": 1.2572, "step": 2776 }, { "epoch": 0.7517250710323367, "grad_norm": 0.2141609936952591, "learning_rate": 9.326317927791526e-05, "loss": 1.2493, "step": 2778 }, { "epoch": 0.7522662697875795, "grad_norm": 0.20215147733688354, "learning_rate": 9.32473882158541e-05, "loss": 1.2328, "step": 2780 }, { "epoch": 0.7528074685428223, "grad_norm": 0.20483511686325073, "learning_rate": 9.323158000847428e-05, "loss": 1.2467, "step": 2782 }, { "epoch": 0.7533486672980653, "grad_norm": 0.21057730913162231, "learning_rate": 9.32157546620429e-05, "loss": 1.226, "step": 2784 }, { "epoch": 0.7538898660533081, "grad_norm": 0.21694819629192352, "learning_rate": 9.319991218283385e-05, "loss": 1.2269, "step": 2786 }, { "epoch": 0.7544310648085509, "grad_norm": 0.22040502727031708, "learning_rate": 9.318405257712788e-05, "loss": 1.2336, "step": 2788 }, { "epoch": 0.7549722635637938, "grad_norm": 0.21631376445293427, "learning_rate": 9.31681758512125e-05, "loss": 1.2426, "step": 2790 }, { "epoch": 0.7555134623190367, "grad_norm": 0.2051629275083542, "learning_rate": 9.315228201138194e-05, "loss": 1.2528, "step": 2792 }, { "epoch": 0.7560546610742795, "grad_norm": 0.2192334532737732, "learning_rate": 9.313637106393733e-05, "loss": 1.2262, "step": 2794 }, { "epoch": 0.7565958598295224, "grad_norm": 0.21075467765331268, "learning_rate": 9.31204430151865e-05, "loss": 1.2464, "step": 2796 }, { "epoch": 0.7571370585847652, "grad_norm": 0.20599377155303955, "learning_rate": 9.31044978714441e-05, "loss": 1.2284, "step": 2798 }, { "epoch": 0.7576782573400082, "grad_norm": 0.20556782186031342, "learning_rate": 9.308853563903153e-05, "loss": 1.2337, "step": 2800 }, { "epoch": 0.758219456095251, "grad_norm": 0.2129114270210266, "learning_rate": 9.307255632427698e-05, "loss": 1.2351, "step": 2802 }, { "epoch": 0.7587606548504938, "grad_norm": 0.22170618176460266, "learning_rate": 9.305655993351539e-05, "loss": 1.2509, "step": 2804 }, { "epoch": 0.7593018536057367, "grad_norm": 0.2149934470653534, "learning_rate": 9.304054647308853e-05, "loss": 1.2506, "step": 2806 }, { "epoch": 0.7598430523609796, "grad_norm": 0.21681110560894012, "learning_rate": 9.302451594934488e-05, "loss": 1.2446, "step": 2808 }, { "epoch": 0.7603842511162224, "grad_norm": 0.2138003557920456, "learning_rate": 9.300846836863966e-05, "loss": 1.2315, "step": 2810 }, { "epoch": 0.7609254498714653, "grad_norm": 0.21714389324188232, "learning_rate": 9.299240373733495e-05, "loss": 1.2237, "step": 2812 }, { "epoch": 0.7614666486267082, "grad_norm": 0.20781435072422028, "learning_rate": 9.297632206179951e-05, "loss": 1.235, "step": 2814 }, { "epoch": 0.762007847381951, "grad_norm": 0.21106529235839844, "learning_rate": 9.296022334840889e-05, "loss": 1.241, "step": 2816 }, { "epoch": 0.7625490461371939, "grad_norm": 0.20268838107585907, "learning_rate": 9.294410760354537e-05, "loss": 1.2582, "step": 2818 }, { "epoch": 0.7630902448924367, "grad_norm": 0.20030425488948822, "learning_rate": 9.292797483359801e-05, "loss": 1.2428, "step": 2820 }, { "epoch": 0.7636314436476797, "grad_norm": 0.2100449800491333, "learning_rate": 9.291182504496258e-05, "loss": 1.2367, "step": 2822 }, { "epoch": 0.7641726424029225, "grad_norm": 0.21454234421253204, "learning_rate": 9.289565824404165e-05, "loss": 1.2261, "step": 2824 }, { "epoch": 0.7647138411581653, "grad_norm": 0.21005463600158691, "learning_rate": 9.28794744372445e-05, "loss": 1.237, "step": 2826 }, { "epoch": 0.7652550399134082, "grad_norm": 0.2123933732509613, "learning_rate": 9.286327363098717e-05, "loss": 1.2115, "step": 2828 }, { "epoch": 0.7657962386686511, "grad_norm": 0.2171681821346283, "learning_rate": 9.284705583169239e-05, "loss": 1.2415, "step": 2830 }, { "epoch": 0.7663374374238939, "grad_norm": 0.21714134514331818, "learning_rate": 9.283082104578972e-05, "loss": 1.237, "step": 2832 }, { "epoch": 0.7668786361791368, "grad_norm": 0.20111635327339172, "learning_rate": 9.281456927971536e-05, "loss": 1.2237, "step": 2834 }, { "epoch": 0.7674198349343796, "grad_norm": 0.20329216122627258, "learning_rate": 9.279830053991232e-05, "loss": 1.2338, "step": 2836 }, { "epoch": 0.7679610336896225, "grad_norm": 0.21309934556484222, "learning_rate": 9.278201483283026e-05, "loss": 1.2314, "step": 2838 }, { "epoch": 0.7685022324448654, "grad_norm": 0.2031441330909729, "learning_rate": 9.276571216492562e-05, "loss": 1.2016, "step": 2840 }, { "epoch": 0.7690434312001082, "grad_norm": 0.20808055996894836, "learning_rate": 9.274939254266157e-05, "loss": 1.2367, "step": 2842 }, { "epoch": 0.7695846299553512, "grad_norm": 0.2009919136762619, "learning_rate": 9.273305597250797e-05, "loss": 1.2488, "step": 2844 }, { "epoch": 0.770125828710594, "grad_norm": 0.2107824683189392, "learning_rate": 9.27167024609414e-05, "loss": 1.2268, "step": 2846 }, { "epoch": 0.7706670274658368, "grad_norm": 0.21597540378570557, "learning_rate": 9.270033201444517e-05, "loss": 1.251, "step": 2848 }, { "epoch": 0.7712082262210797, "grad_norm": 0.21570149064064026, "learning_rate": 9.268394463950934e-05, "loss": 1.2291, "step": 2850 }, { "epoch": 0.7717494249763226, "grad_norm": 0.2073807716369629, "learning_rate": 9.266754034263061e-05, "loss": 1.2229, "step": 2852 }, { "epoch": 0.7722906237315654, "grad_norm": 0.21396131813526154, "learning_rate": 9.265111913031243e-05, "loss": 1.2354, "step": 2854 }, { "epoch": 0.7728318224868083, "grad_norm": 0.22582246363162994, "learning_rate": 9.263468100906494e-05, "loss": 1.2304, "step": 2856 }, { "epoch": 0.7733730212420511, "grad_norm": 0.20621982216835022, "learning_rate": 9.2618225985405e-05, "loss": 1.2178, "step": 2858 }, { "epoch": 0.773914219997294, "grad_norm": 0.20848476886749268, "learning_rate": 9.260175406585619e-05, "loss": 1.2256, "step": 2860 }, { "epoch": 0.7744554187525369, "grad_norm": 0.20401284098625183, "learning_rate": 9.258526525694871e-05, "loss": 1.2201, "step": 2862 }, { "epoch": 0.7749966175077797, "grad_norm": 0.20163412392139435, "learning_rate": 9.256875956521953e-05, "loss": 1.2537, "step": 2864 }, { "epoch": 0.7755378162630227, "grad_norm": 0.21364827454090118, "learning_rate": 9.255223699721229e-05, "loss": 1.2473, "step": 2866 }, { "epoch": 0.7760790150182655, "grad_norm": 0.20431366562843323, "learning_rate": 9.253569755947732e-05, "loss": 1.2261, "step": 2868 }, { "epoch": 0.7766202137735083, "grad_norm": 0.20286224782466888, "learning_rate": 9.251914125857167e-05, "loss": 1.2227, "step": 2870 }, { "epoch": 0.7771614125287511, "grad_norm": 0.20155613124370575, "learning_rate": 9.2502568101059e-05, "loss": 1.223, "step": 2872 }, { "epoch": 0.7777026112839941, "grad_norm": 0.20083202421665192, "learning_rate": 9.24859780935097e-05, "loss": 1.2361, "step": 2874 }, { "epoch": 0.7782438100392369, "grad_norm": 0.20565393567085266, "learning_rate": 9.246937124250086e-05, "loss": 1.2454, "step": 2876 }, { "epoch": 0.7787850087944798, "grad_norm": 0.20953059196472168, "learning_rate": 9.24527475546162e-05, "loss": 1.2402, "step": 2878 }, { "epoch": 0.7793262075497226, "grad_norm": 0.20281900465488434, "learning_rate": 9.243610703644616e-05, "loss": 1.243, "step": 2880 }, { "epoch": 0.7798674063049655, "grad_norm": 0.2107364684343338, "learning_rate": 9.241944969458784e-05, "loss": 1.2531, "step": 2882 }, { "epoch": 0.7804086050602084, "grad_norm": 0.2223702222108841, "learning_rate": 9.240277553564495e-05, "loss": 1.2358, "step": 2884 }, { "epoch": 0.7809498038154512, "grad_norm": 0.21632793545722961, "learning_rate": 9.2386084566228e-05, "loss": 1.2228, "step": 2886 }, { "epoch": 0.781491002570694, "grad_norm": 0.20548765361309052, "learning_rate": 9.2369376792954e-05, "loss": 1.2211, "step": 2888 }, { "epoch": 0.782032201325937, "grad_norm": 0.2088259905576706, "learning_rate": 9.235265222244676e-05, "loss": 1.2279, "step": 2890 }, { "epoch": 0.7825734000811798, "grad_norm": 0.20582431554794312, "learning_rate": 9.233591086133666e-05, "loss": 1.2302, "step": 2892 }, { "epoch": 0.7831145988364226, "grad_norm": 0.20779459178447723, "learning_rate": 9.23191527162608e-05, "loss": 1.2463, "step": 2894 }, { "epoch": 0.7836557975916656, "grad_norm": 0.20974069833755493, "learning_rate": 9.23023777938629e-05, "loss": 1.2299, "step": 2896 }, { "epoch": 0.7841969963469084, "grad_norm": 0.21260978281497955, "learning_rate": 9.228558610079331e-05, "loss": 1.2288, "step": 2898 }, { "epoch": 0.7847381951021513, "grad_norm": 0.21376334130764008, "learning_rate": 9.226877764370908e-05, "loss": 1.2365, "step": 2900 }, { "epoch": 0.7852793938573941, "grad_norm": 0.2044704556465149, "learning_rate": 9.225195242927387e-05, "loss": 1.2308, "step": 2902 }, { "epoch": 0.785820592612637, "grad_norm": 0.20474430918693542, "learning_rate": 9.2235110464158e-05, "loss": 1.2247, "step": 2904 }, { "epoch": 0.7863617913678799, "grad_norm": 0.20992141962051392, "learning_rate": 9.221825175503842e-05, "loss": 1.2319, "step": 2906 }, { "epoch": 0.7869029901231227, "grad_norm": 0.21773672103881836, "learning_rate": 9.220137630859874e-05, "loss": 1.2248, "step": 2908 }, { "epoch": 0.7874441888783655, "grad_norm": 0.21675674617290497, "learning_rate": 9.218448413152913e-05, "loss": 1.235, "step": 2910 }, { "epoch": 0.7879853876336085, "grad_norm": 0.2061057984828949, "learning_rate": 9.216757523052653e-05, "loss": 1.2381, "step": 2912 }, { "epoch": 0.7885265863888513, "grad_norm": 0.20627647638320923, "learning_rate": 9.215064961229438e-05, "loss": 1.2299, "step": 2914 }, { "epoch": 0.7890677851440941, "grad_norm": 0.616995096206665, "learning_rate": 9.213370728354283e-05, "loss": 1.2687, "step": 2916 }, { "epoch": 0.789608983899337, "grad_norm": 0.23795656859874725, "learning_rate": 9.21167482509886e-05, "loss": 1.2376, "step": 2918 }, { "epoch": 0.7901501826545799, "grad_norm": 0.2422102987766266, "learning_rate": 9.209977252135506e-05, "loss": 1.2429, "step": 2920 }, { "epoch": 0.7906913814098228, "grad_norm": 0.2670714259147644, "learning_rate": 9.208278010137222e-05, "loss": 1.2101, "step": 2922 }, { "epoch": 0.7912325801650656, "grad_norm": 0.294065922498703, "learning_rate": 9.206577099777664e-05, "loss": 1.2465, "step": 2924 }, { "epoch": 0.7917737789203085, "grad_norm": 0.2536512315273285, "learning_rate": 9.204874521731158e-05, "loss": 1.224, "step": 2926 }, { "epoch": 0.7923149776755514, "grad_norm": 0.23533768951892853, "learning_rate": 9.203170276672681e-05, "loss": 1.2375, "step": 2928 }, { "epoch": 0.7928561764307942, "grad_norm": 0.29083576798439026, "learning_rate": 9.201464365277883e-05, "loss": 1.2247, "step": 2930 }, { "epoch": 0.793397375186037, "grad_norm": 0.278577595949173, "learning_rate": 9.199756788223067e-05, "loss": 1.2459, "step": 2932 }, { "epoch": 0.79393857394128, "grad_norm": 0.28439784049987793, "learning_rate": 9.198047546185193e-05, "loss": 1.224, "step": 2934 }, { "epoch": 0.7944797726965228, "grad_norm": 0.45653703808784485, "learning_rate": 9.196336639841892e-05, "loss": 1.2389, "step": 2936 }, { "epoch": 0.7950209714517656, "grad_norm": 0.43514567613601685, "learning_rate": 9.194624069871442e-05, "loss": 1.2365, "step": 2938 }, { "epoch": 0.7955621702070085, "grad_norm": 0.39155837893486023, "learning_rate": 9.192909836952794e-05, "loss": 1.2364, "step": 2940 }, { "epoch": 0.7961033689622514, "grad_norm": 0.2874111235141754, "learning_rate": 9.191193941765546e-05, "loss": 1.2255, "step": 2942 }, { "epoch": 0.7966445677174943, "grad_norm": 0.2747213840484619, "learning_rate": 9.189476384989963e-05, "loss": 1.2283, "step": 2944 }, { "epoch": 0.7971857664727371, "grad_norm": 0.2161729484796524, "learning_rate": 9.187757167306966e-05, "loss": 1.2346, "step": 2946 }, { "epoch": 0.7977269652279799, "grad_norm": 0.22049780189990997, "learning_rate": 9.186036289398134e-05, "loss": 1.2422, "step": 2948 }, { "epoch": 0.7982681639832229, "grad_norm": 0.22543483972549438, "learning_rate": 9.184313751945704e-05, "loss": 1.2366, "step": 2950 }, { "epoch": 0.7988093627384657, "grad_norm": 0.227324977517128, "learning_rate": 9.182589555632572e-05, "loss": 1.2251, "step": 2952 }, { "epoch": 0.7993505614937085, "grad_norm": 0.21022869646549225, "learning_rate": 9.180863701142293e-05, "loss": 1.2337, "step": 2954 }, { "epoch": 0.7998917602489515, "grad_norm": 0.21206127107143402, "learning_rate": 9.179136189159074e-05, "loss": 1.2277, "step": 2956 }, { "epoch": 0.8004329590041943, "grad_norm": 0.22186043858528137, "learning_rate": 9.177407020367788e-05, "loss": 1.2471, "step": 2958 }, { "epoch": 0.8009741577594371, "grad_norm": 0.24853624403476715, "learning_rate": 9.175676195453955e-05, "loss": 1.245, "step": 2960 }, { "epoch": 0.80151535651468, "grad_norm": 0.24665629863739014, "learning_rate": 9.173943715103757e-05, "loss": 1.2357, "step": 2962 }, { "epoch": 0.8020565552699229, "grad_norm": 0.25784316658973694, "learning_rate": 9.172209580004035e-05, "loss": 1.2382, "step": 2964 }, { "epoch": 0.8025977540251658, "grad_norm": 0.22399091720581055, "learning_rate": 9.170473790842278e-05, "loss": 1.2208, "step": 2966 }, { "epoch": 0.8031389527804086, "grad_norm": 0.2071622610092163, "learning_rate": 9.168736348306638e-05, "loss": 1.217, "step": 2968 }, { "epoch": 0.8036801515356514, "grad_norm": 0.20199859142303467, "learning_rate": 9.166997253085918e-05, "loss": 1.2489, "step": 2970 }, { "epoch": 0.8042213502908944, "grad_norm": 0.22274377942085266, "learning_rate": 9.165256505869581e-05, "loss": 1.2417, "step": 2972 }, { "epoch": 0.8047625490461372, "grad_norm": 0.2210993617773056, "learning_rate": 9.163514107347738e-05, "loss": 1.2395, "step": 2974 }, { "epoch": 0.80530374780138, "grad_norm": 0.22720351815223694, "learning_rate": 9.161770058211161e-05, "loss": 1.2454, "step": 2976 }, { "epoch": 0.8058449465566229, "grad_norm": 0.22297875583171844, "learning_rate": 9.160024359151274e-05, "loss": 1.2279, "step": 2978 }, { "epoch": 0.8063861453118658, "grad_norm": 0.2187831699848175, "learning_rate": 9.158277010860153e-05, "loss": 1.2481, "step": 2980 }, { "epoch": 0.8069273440671086, "grad_norm": 0.2193155139684677, "learning_rate": 9.15652801403053e-05, "loss": 1.2349, "step": 2982 }, { "epoch": 0.8074685428223515, "grad_norm": 0.21247895061969757, "learning_rate": 9.154777369355793e-05, "loss": 1.2109, "step": 2984 }, { "epoch": 0.8080097415775944, "grad_norm": 0.2187221348285675, "learning_rate": 9.15302507752998e-05, "loss": 1.2453, "step": 2986 }, { "epoch": 0.8085509403328373, "grad_norm": 0.2421714961528778, "learning_rate": 9.151271139247782e-05, "loss": 1.2325, "step": 2988 }, { "epoch": 0.8090921390880801, "grad_norm": 0.25165337324142456, "learning_rate": 9.149515555204542e-05, "loss": 1.2345, "step": 2990 }, { "epoch": 0.8096333378433229, "grad_norm": 0.6235466003417969, "learning_rate": 9.147758326096259e-05, "loss": 1.2307, "step": 2992 }, { "epoch": 0.8101745365985659, "grad_norm": 1.8468120098114014, "learning_rate": 9.14599945261958e-05, "loss": 1.2226, "step": 2994 }, { "epoch": 0.8107157353538087, "grad_norm": 0.6419652104377747, "learning_rate": 9.144238935471809e-05, "loss": 1.237, "step": 2996 }, { "epoch": 0.8112569341090515, "grad_norm": 9.949187278747559, "learning_rate": 9.142476775350895e-05, "loss": 1.2359, "step": 2998 }, { "epoch": 0.8117981328642944, "grad_norm": 51.645015716552734, "learning_rate": 9.140712972955445e-05, "loss": 3.9273, "step": 3000 }, { "epoch": 0.8123393316195373, "grad_norm": 162.77838134765625, "learning_rate": 9.138947528984714e-05, "loss": 7.3207, "step": 3002 }, { "epoch": 0.8128805303747801, "grad_norm": 143.2057342529297, "learning_rate": 9.137180444138604e-05, "loss": 6.7469, "step": 3004 }, { "epoch": 0.813421729130023, "grad_norm": 24.596132278442383, "learning_rate": 9.135411719117677e-05, "loss": 6.7275, "step": 3006 }, { "epoch": 0.8139629278852659, "grad_norm": 113.07401275634766, "learning_rate": 9.133641354623135e-05, "loss": 6.8831, "step": 3008 }, { "epoch": 0.8145041266405088, "grad_norm": 54.99746322631836, "learning_rate": 9.131869351356836e-05, "loss": 6.957, "step": 3010 }, { "epoch": 0.8150453253957516, "grad_norm": 41.836971282958984, "learning_rate": 9.130095710021287e-05, "loss": 6.9197, "step": 3012 }, { "epoch": 0.8155865241509944, "grad_norm": 16.159334182739258, "learning_rate": 9.128320431319643e-05, "loss": 6.6745, "step": 3014 }, { "epoch": 0.8161277229062374, "grad_norm": 35.35638427734375, "learning_rate": 9.12654351595571e-05, "loss": 6.5526, "step": 3016 }, { "epoch": 0.8166689216614802, "grad_norm": 13.418272018432617, "learning_rate": 9.124764964633941e-05, "loss": 6.5421, "step": 3018 }, { "epoch": 0.817210120416723, "grad_norm": 76.91167449951172, "learning_rate": 9.122984778059436e-05, "loss": 6.598, "step": 3020 }, { "epoch": 0.8177513191719659, "grad_norm": 29.271677017211914, "learning_rate": 9.121202956937949e-05, "loss": 6.746, "step": 3022 }, { "epoch": 0.8182925179272088, "grad_norm": 6.679831027984619, "learning_rate": 9.119419501975876e-05, "loss": 6.6051, "step": 3024 }, { "epoch": 0.8188337166824516, "grad_norm": 2.620072364807129, "learning_rate": 9.117634413880264e-05, "loss": 6.4967, "step": 3026 }, { "epoch": 0.8193749154376945, "grad_norm": 35.06947708129883, "learning_rate": 9.115847693358808e-05, "loss": 6.5821, "step": 3028 }, { "epoch": 0.8199161141929373, "grad_norm": 7.402040004730225, "learning_rate": 9.114059341119846e-05, "loss": 6.6581, "step": 3030 }, { "epoch": 0.8204573129481803, "grad_norm": 9.003362655639648, "learning_rate": 9.112269357872367e-05, "loss": 6.5743, "step": 3032 }, { "epoch": 0.8209985117034231, "grad_norm": 19.55501937866211, "learning_rate": 9.110477744326008e-05, "loss": 6.5286, "step": 3034 }, { "epoch": 0.8215397104586659, "grad_norm": 16.157516479492188, "learning_rate": 9.108684501191048e-05, "loss": 6.5647, "step": 3036 }, { "epoch": 0.8220809092139089, "grad_norm": 11.481574058532715, "learning_rate": 9.10688962917841e-05, "loss": 6.5452, "step": 3038 }, { "epoch": 0.8226221079691517, "grad_norm": 12.504718780517578, "learning_rate": 9.105093128999672e-05, "loss": 6.4514, "step": 3040 }, { "epoch": 0.8231633067243945, "grad_norm": 12.287551879882812, "learning_rate": 9.103295001367049e-05, "loss": 6.4944, "step": 3042 }, { "epoch": 0.8237045054796374, "grad_norm": 6.438191890716553, "learning_rate": 9.101495246993405e-05, "loss": 6.5133, "step": 3044 }, { "epoch": 0.8242457042348803, "grad_norm": 10.490718841552734, "learning_rate": 9.099693866592249e-05, "loss": 6.6264, "step": 3046 }, { "epoch": 0.8247869029901231, "grad_norm": 3.485543966293335, "learning_rate": 9.097890860877732e-05, "loss": 6.6551, "step": 3048 }, { "epoch": 0.825328101745366, "grad_norm": 7.037774562835693, "learning_rate": 9.096086230564653e-05, "loss": 6.5162, "step": 3050 }, { "epoch": 0.8258693005006088, "grad_norm": 7.147155284881592, "learning_rate": 9.094279976368452e-05, "loss": 6.4698, "step": 3052 }, { "epoch": 0.8264104992558517, "grad_norm": 6.157515048980713, "learning_rate": 9.092472099005212e-05, "loss": 6.452, "step": 3054 }, { "epoch": 0.8269516980110946, "grad_norm": 24.150165557861328, "learning_rate": 9.090662599191666e-05, "loss": 6.4559, "step": 3056 }, { "epoch": 0.8274928967663374, "grad_norm": 2.645434617996216, "learning_rate": 9.088851477645181e-05, "loss": 6.4264, "step": 3058 }, { "epoch": 0.8280340955215802, "grad_norm": 26.285385131835938, "learning_rate": 9.087038735083775e-05, "loss": 6.4145, "step": 3060 }, { "epoch": 0.8285752942768232, "grad_norm": 37.51917266845703, "learning_rate": 9.085224372226105e-05, "loss": 6.6315, "step": 3062 }, { "epoch": 0.829116493032066, "grad_norm": 4.452447891235352, "learning_rate": 9.083408389791468e-05, "loss": 6.5029, "step": 3064 }, { "epoch": 0.8296576917873089, "grad_norm": 2.6523871421813965, "learning_rate": 9.081590788499807e-05, "loss": 6.4376, "step": 3066 }, { "epoch": 0.8301988905425518, "grad_norm": 8.193984031677246, "learning_rate": 9.079771569071706e-05, "loss": 6.4238, "step": 3068 }, { "epoch": 0.8307400892977946, "grad_norm": 6.554834365844727, "learning_rate": 9.07795073222839e-05, "loss": 6.4618, "step": 3070 }, { "epoch": 0.8312812880530375, "grad_norm": 9.348752975463867, "learning_rate": 9.076128278691726e-05, "loss": 6.4341, "step": 3072 }, { "epoch": 0.8318224868082803, "grad_norm": 5.486053466796875, "learning_rate": 9.07430420918422e-05, "loss": 6.4018, "step": 3074 }, { "epoch": 0.8323636855635232, "grad_norm": 6.757177352905273, "learning_rate": 9.07247852442902e-05, "loss": 6.4055, "step": 3076 }, { "epoch": 0.8329048843187661, "grad_norm": 2.9033281803131104, "learning_rate": 9.070651225149913e-05, "loss": 6.3955, "step": 3078 }, { "epoch": 0.8334460830740089, "grad_norm": 9.029162406921387, "learning_rate": 9.068822312071328e-05, "loss": 6.3819, "step": 3080 }, { "epoch": 0.8339872818292517, "grad_norm": 1.991795539855957, "learning_rate": 9.066991785918333e-05, "loss": 6.4096, "step": 3082 }, { "epoch": 0.8345284805844947, "grad_norm": 1.6264301538467407, "learning_rate": 9.065159647416637e-05, "loss": 6.3804, "step": 3084 }, { "epoch": 0.8350696793397375, "grad_norm": 2.102339506149292, "learning_rate": 9.063325897292587e-05, "loss": 6.3781, "step": 3086 }, { "epoch": 0.8356108780949804, "grad_norm": 1.4373193979263306, "learning_rate": 9.061490536273164e-05, "loss": 6.3785, "step": 3088 }, { "epoch": 0.8361520768502232, "grad_norm": 1.3095070123672485, "learning_rate": 9.059653565085997e-05, "loss": 6.3726, "step": 3090 }, { "epoch": 0.8366932756054661, "grad_norm": 1.7201792001724243, "learning_rate": 9.057814984459347e-05, "loss": 6.3754, "step": 3092 }, { "epoch": 0.837234474360709, "grad_norm": 2.334815740585327, "learning_rate": 9.055974795122113e-05, "loss": 6.3692, "step": 3094 }, { "epoch": 0.8377756731159518, "grad_norm": 2.3444583415985107, "learning_rate": 9.054132997803837e-05, "loss": 6.353, "step": 3096 }, { "epoch": 0.8383168718711947, "grad_norm": 4.413025379180908, "learning_rate": 9.052289593234693e-05, "loss": 6.345, "step": 3098 }, { "epoch": 0.8388580706264376, "grad_norm": 6.012264251708984, "learning_rate": 9.050444582145495e-05, "loss": 6.3102, "step": 3100 }, { "epoch": 0.8393992693816804, "grad_norm": 16.267066955566406, "learning_rate": 9.04859796526769e-05, "loss": 6.2907, "step": 3102 }, { "epoch": 0.8399404681369232, "grad_norm": 22.159154891967773, "learning_rate": 9.046749743333369e-05, "loss": 6.2638, "step": 3104 }, { "epoch": 0.8404816668921662, "grad_norm": 12.406789779663086, "learning_rate": 9.044899917075251e-05, "loss": 6.2353, "step": 3106 }, { "epoch": 0.841022865647409, "grad_norm": 13.376330375671387, "learning_rate": 9.043048487226697e-05, "loss": 6.2521, "step": 3108 }, { "epoch": 0.8415640644026519, "grad_norm": 56.80801773071289, "learning_rate": 9.041195454521702e-05, "loss": 6.525, "step": 3110 }, { "epoch": 0.8421052631578947, "grad_norm": 25.448945999145508, "learning_rate": 9.039340819694897e-05, "loss": 6.2921, "step": 3112 }, { "epoch": 0.8426464619131376, "grad_norm": 14.529874801635742, "learning_rate": 9.037484583481544e-05, "loss": 6.3741, "step": 3114 }, { "epoch": 0.8431876606683805, "grad_norm": 60.734962463378906, "learning_rate": 9.035626746617547e-05, "loss": 6.4179, "step": 3116 }, { "epoch": 0.8437288594236233, "grad_norm": 65.94615936279297, "learning_rate": 9.033767309839438e-05, "loss": 7.1273, "step": 3118 }, { "epoch": 0.8442700581788661, "grad_norm": 18.26249122619629, "learning_rate": 9.031906273884388e-05, "loss": 6.5407, "step": 3120 }, { "epoch": 0.8448112569341091, "grad_norm": 35.836326599121094, "learning_rate": 9.030043639490197e-05, "loss": 6.3452, "step": 3122 }, { "epoch": 0.8453524556893519, "grad_norm": 31.0974178314209, "learning_rate": 9.028179407395305e-05, "loss": 6.7146, "step": 3124 }, { "epoch": 0.8458936544445947, "grad_norm": 20.31138801574707, "learning_rate": 9.026313578338782e-05, "loss": 6.3789, "step": 3126 }, { "epoch": 0.8464348531998377, "grad_norm": 22.042652130126953, "learning_rate": 9.024446153060328e-05, "loss": 6.2103, "step": 3128 }, { "epoch": 0.8469760519550805, "grad_norm": 24.26772117614746, "learning_rate": 9.022577132300283e-05, "loss": 6.2928, "step": 3130 }, { "epoch": 0.8475172507103234, "grad_norm": 23.121070861816406, "learning_rate": 9.020706516799615e-05, "loss": 6.3883, "step": 3132 }, { "epoch": 0.8480584494655662, "grad_norm": 22.5421199798584, "learning_rate": 9.018834307299922e-05, "loss": 6.2152, "step": 3134 }, { "epoch": 0.8485996482208091, "grad_norm": 15.424832344055176, "learning_rate": 9.016960504543439e-05, "loss": 6.2134, "step": 3136 }, { "epoch": 0.849140846976052, "grad_norm": 4.891734600067139, "learning_rate": 9.015085109273029e-05, "loss": 6.137, "step": 3138 }, { "epoch": 0.8496820457312948, "grad_norm": 3.1807522773742676, "learning_rate": 9.01320812223219e-05, "loss": 6.075, "step": 3140 }, { "epoch": 0.8502232444865376, "grad_norm": 3.0970818996429443, "learning_rate": 9.011329544165047e-05, "loss": 6.0443, "step": 3142 }, { "epoch": 0.8507644432417806, "grad_norm": 1.8074407577514648, "learning_rate": 9.009449375816358e-05, "loss": 6.0025, "step": 3144 }, { "epoch": 0.8513056419970234, "grad_norm": 2.570284843444824, "learning_rate": 9.007567617931512e-05, "loss": 5.9863, "step": 3146 }, { "epoch": 0.8518468407522662, "grad_norm": 4.418685436248779, "learning_rate": 9.005684271256525e-05, "loss": 5.9694, "step": 3148 }, { "epoch": 0.8523880395075092, "grad_norm": 6.861987113952637, "learning_rate": 9.003799336538046e-05, "loss": 5.9542, "step": 3150 }, { "epoch": 0.852929238262752, "grad_norm": 4.844227313995361, "learning_rate": 9.001912814523353e-05, "loss": 5.9202, "step": 3152 }, { "epoch": 0.8534704370179949, "grad_norm": 3.8856005668640137, "learning_rate": 9.000024705960352e-05, "loss": 5.9001, "step": 3154 }, { "epoch": 0.8540116357732377, "grad_norm": 7.827765464782715, "learning_rate": 8.998135011597583e-05, "loss": 5.9147, "step": 3156 }, { "epoch": 0.8545528345284806, "grad_norm": 8.146918296813965, "learning_rate": 8.996243732184206e-05, "loss": 5.8791, "step": 3158 }, { "epoch": 0.8550940332837235, "grad_norm": 2.4434125423431396, "learning_rate": 8.994350868470015e-05, "loss": 5.8594, "step": 3160 }, { "epoch": 0.8556352320389663, "grad_norm": 13.466486930847168, "learning_rate": 8.99245642120543e-05, "loss": 5.9065, "step": 3162 }, { "epoch": 0.8561764307942091, "grad_norm": 15.81242561340332, "learning_rate": 8.990560391141503e-05, "loss": 5.8803, "step": 3164 }, { "epoch": 0.8567176295494521, "grad_norm": 9.728450775146484, "learning_rate": 8.988662779029909e-05, "loss": 5.9393, "step": 3166 }, { "epoch": 0.8572588283046949, "grad_norm": 15.22099494934082, "learning_rate": 8.98676358562295e-05, "loss": 5.9002, "step": 3168 }, { "epoch": 0.8578000270599377, "grad_norm": 6.861898422241211, "learning_rate": 8.98486281167356e-05, "loss": 5.8433, "step": 3170 }, { "epoch": 0.8583412258151806, "grad_norm": 1.44621741771698, "learning_rate": 8.982960457935293e-05, "loss": 5.8085, "step": 3172 }, { "epoch": 0.8588824245704235, "grad_norm": 6.946808815002441, "learning_rate": 8.981056525162332e-05, "loss": 5.8282, "step": 3174 }, { "epoch": 0.8594236233256664, "grad_norm": 7.4296555519104, "learning_rate": 8.979151014109488e-05, "loss": 5.8066, "step": 3176 }, { "epoch": 0.8599648220809092, "grad_norm": 1.931650161743164, "learning_rate": 8.977243925532196e-05, "loss": 5.7569, "step": 3178 }, { "epoch": 0.8605060208361521, "grad_norm": 1.9481853246688843, "learning_rate": 8.975335260186515e-05, "loss": 5.7733, "step": 3180 }, { "epoch": 0.861047219591395, "grad_norm": 5.015133380889893, "learning_rate": 8.973425018829134e-05, "loss": 5.7617, "step": 3182 }, { "epoch": 0.8615884183466378, "grad_norm": 9.222882270812988, "learning_rate": 8.971513202217359e-05, "loss": 5.7758, "step": 3184 }, { "epoch": 0.8621296171018806, "grad_norm": 9.576149940490723, "learning_rate": 8.969599811109128e-05, "loss": 5.7407, "step": 3186 }, { "epoch": 0.8626708158571236, "grad_norm": 10.73227310180664, "learning_rate": 8.967684846262997e-05, "loss": 5.746, "step": 3188 }, { "epoch": 0.8632120146123664, "grad_norm": 3.917820692062378, "learning_rate": 8.965768308438155e-05, "loss": 5.7495, "step": 3190 }, { "epoch": 0.8637532133676092, "grad_norm": 16.54001808166504, "learning_rate": 8.963850198394402e-05, "loss": 5.8388, "step": 3192 }, { "epoch": 0.8642944121228521, "grad_norm": 7.986750602722168, "learning_rate": 8.961930516892172e-05, "loss": 5.7186, "step": 3194 }, { "epoch": 0.864835610878095, "grad_norm": 6.636990547180176, "learning_rate": 8.960009264692518e-05, "loss": 5.7177, "step": 3196 }, { "epoch": 0.8653768096333379, "grad_norm": 10.408087730407715, "learning_rate": 8.958086442557111e-05, "loss": 5.7057, "step": 3198 }, { "epoch": 0.8659180083885807, "grad_norm": 7.17811918258667, "learning_rate": 8.956162051248253e-05, "loss": 5.6626, "step": 3200 }, { "epoch": 0.8664592071438235, "grad_norm": 3.971498489379883, "learning_rate": 8.954236091528865e-05, "loss": 5.6487, "step": 3202 }, { "epoch": 0.8670004058990665, "grad_norm": 12.803196907043457, "learning_rate": 8.952308564162486e-05, "loss": 5.6778, "step": 3204 }, { "epoch": 0.8675416046543093, "grad_norm": 6.128145694732666, "learning_rate": 8.950379469913281e-05, "loss": 5.6023, "step": 3206 }, { "epoch": 0.8680828034095521, "grad_norm": 2.572782278060913, "learning_rate": 8.948448809546033e-05, "loss": 5.5705, "step": 3208 }, { "epoch": 0.8686240021647951, "grad_norm": 7.732709884643555, "learning_rate": 8.94651658382615e-05, "loss": 5.5648, "step": 3210 }, { "epoch": 0.8691652009200379, "grad_norm": 5.356006145477295, "learning_rate": 8.944582793519657e-05, "loss": 5.544, "step": 3212 }, { "epoch": 0.8697063996752807, "grad_norm": 6.853109359741211, "learning_rate": 8.9426474393932e-05, "loss": 5.5661, "step": 3214 }, { "epoch": 0.8702475984305236, "grad_norm": 6.834400177001953, "learning_rate": 8.940710522214044e-05, "loss": 5.4848, "step": 3216 }, { "epoch": 0.8707887971857665, "grad_norm": 3.9893531799316406, "learning_rate": 8.938772042750078e-05, "loss": 5.4885, "step": 3218 }, { "epoch": 0.8713299959410093, "grad_norm": 5.868941307067871, "learning_rate": 8.936832001769805e-05, "loss": 5.4513, "step": 3220 }, { "epoch": 0.8718711946962522, "grad_norm": 7.860538482666016, "learning_rate": 8.934890400042351e-05, "loss": 5.3947, "step": 3222 }, { "epoch": 0.872412393451495, "grad_norm": 7.617331027984619, "learning_rate": 8.932947238337456e-05, "loss": 5.3621, "step": 3224 }, { "epoch": 0.872953592206738, "grad_norm": 13.739005088806152, "learning_rate": 8.931002517425484e-05, "loss": 5.3659, "step": 3226 }, { "epoch": 0.8734947909619808, "grad_norm": 32.63043212890625, "learning_rate": 8.929056238077416e-05, "loss": 5.4167, "step": 3228 }, { "epoch": 0.8740359897172236, "grad_norm": 10.652158737182617, "learning_rate": 8.927108401064847e-05, "loss": 5.44, "step": 3230 }, { "epoch": 0.8745771884724665, "grad_norm": 8.121881484985352, "learning_rate": 8.925159007159994e-05, "loss": 5.321, "step": 3232 }, { "epoch": 0.8751183872277094, "grad_norm": 14.56945514678955, "learning_rate": 8.923208057135688e-05, "loss": 5.3423, "step": 3234 }, { "epoch": 0.8756595859829522, "grad_norm": 19.6383113861084, "learning_rate": 8.92125555176538e-05, "loss": 5.3221, "step": 3236 }, { "epoch": 0.8762007847381951, "grad_norm": 6.179553985595703, "learning_rate": 8.919301491823133e-05, "loss": 5.2839, "step": 3238 }, { "epoch": 0.876741983493438, "grad_norm": 14.963215827941895, "learning_rate": 8.917345878083631e-05, "loss": 5.2851, "step": 3240 }, { "epoch": 0.8772831822486808, "grad_norm": 9.411989212036133, "learning_rate": 8.915388711322173e-05, "loss": 5.2005, "step": 3242 }, { "epoch": 0.8778243810039237, "grad_norm": 4.158975601196289, "learning_rate": 8.91342999231467e-05, "loss": 5.1834, "step": 3244 }, { "epoch": 0.8783655797591665, "grad_norm": 6.446174621582031, "learning_rate": 8.911469721837655e-05, "loss": 5.1028, "step": 3246 }, { "epoch": 0.8789067785144095, "grad_norm": 7.448984622955322, "learning_rate": 8.909507900668269e-05, "loss": 5.0743, "step": 3248 }, { "epoch": 0.8794479772696523, "grad_norm": 13.622466087341309, "learning_rate": 8.907544529584273e-05, "loss": 5.0598, "step": 3250 }, { "epoch": 0.8799891760248951, "grad_norm": 7.748263835906982, "learning_rate": 8.905579609364041e-05, "loss": 5.009, "step": 3252 }, { "epoch": 0.880530374780138, "grad_norm": 7.795956611633301, "learning_rate": 8.903613140786558e-05, "loss": 4.9863, "step": 3254 }, { "epoch": 0.8810715735353809, "grad_norm": 4.537596702575684, "learning_rate": 8.901645124631428e-05, "loss": 4.9221, "step": 3256 }, { "epoch": 0.8816127722906237, "grad_norm": 9.186697006225586, "learning_rate": 8.899675561678863e-05, "loss": 4.9103, "step": 3258 }, { "epoch": 0.8821539710458666, "grad_norm": 6.000752925872803, "learning_rate": 8.897704452709697e-05, "loss": 4.8237, "step": 3260 }, { "epoch": 0.8826951698011094, "grad_norm": 3.594982862472534, "learning_rate": 8.895731798505366e-05, "loss": 4.8564, "step": 3262 }, { "epoch": 0.8832363685563523, "grad_norm": 6.02200174331665, "learning_rate": 8.893757599847927e-05, "loss": 4.8013, "step": 3264 }, { "epoch": 0.8837775673115952, "grad_norm": 4.270401477813721, "learning_rate": 8.891781857520044e-05, "loss": 4.7153, "step": 3266 }, { "epoch": 0.884318766066838, "grad_norm": 3.288161516189575, "learning_rate": 8.889804572304995e-05, "loss": 4.6645, "step": 3268 }, { "epoch": 0.884859964822081, "grad_norm": 3.725242853164673, "learning_rate": 8.887825744986674e-05, "loss": 4.6046, "step": 3270 }, { "epoch": 0.8854011635773238, "grad_norm": 5.191972732543945, "learning_rate": 8.885845376349574e-05, "loss": 4.5773, "step": 3272 }, { "epoch": 0.8859423623325666, "grad_norm": 4.172395706176758, "learning_rate": 8.883863467178814e-05, "loss": 4.5419, "step": 3274 }, { "epoch": 0.8864835610878095, "grad_norm": 3.393796920776367, "learning_rate": 8.881880018260116e-05, "loss": 4.4957, "step": 3276 }, { "epoch": 0.8870247598430524, "grad_norm": 4.771553993225098, "learning_rate": 8.87989503037981e-05, "loss": 4.4532, "step": 3278 }, { "epoch": 0.8875659585982952, "grad_norm": 3.170055389404297, "learning_rate": 8.877908504324843e-05, "loss": 4.3909, "step": 3280 }, { "epoch": 0.8881071573535381, "grad_norm": 2.902820110321045, "learning_rate": 8.875920440882767e-05, "loss": 4.3511, "step": 3282 }, { "epoch": 0.8886483561087809, "grad_norm": 3.18314528465271, "learning_rate": 8.873930840841745e-05, "loss": 4.3131, "step": 3284 }, { "epoch": 0.8891895548640238, "grad_norm": 2.8853447437286377, "learning_rate": 8.871939704990548e-05, "loss": 4.2587, "step": 3286 }, { "epoch": 0.8897307536192667, "grad_norm": 3.534005880355835, "learning_rate": 8.869947034118557e-05, "loss": 4.2324, "step": 3288 }, { "epoch": 0.8902719523745095, "grad_norm": 3.527754068374634, "learning_rate": 8.867952829015761e-05, "loss": 4.2085, "step": 3290 }, { "epoch": 0.8908131511297525, "grad_norm": 3.0101592540740967, "learning_rate": 8.86595709047276e-05, "loss": 4.1537, "step": 3292 }, { "epoch": 0.8913543498849953, "grad_norm": 2.5968217849731445, "learning_rate": 8.863959819280759e-05, "loss": 4.1401, "step": 3294 }, { "epoch": 0.8918955486402381, "grad_norm": 2.631981134414673, "learning_rate": 8.861961016231569e-05, "loss": 4.0802, "step": 3296 }, { "epoch": 0.892436747395481, "grad_norm": 2.6680941581726074, "learning_rate": 8.859960682117612e-05, "loss": 4.085, "step": 3298 }, { "epoch": 0.8929779461507239, "grad_norm": 3.104212999343872, "learning_rate": 8.857958817731915e-05, "loss": 4.0593, "step": 3300 }, { "epoch": 0.8935191449059667, "grad_norm": 3.0496253967285156, "learning_rate": 8.855955423868112e-05, "loss": 4.0423, "step": 3302 }, { "epoch": 0.8940603436612096, "grad_norm": 2.760368824005127, "learning_rate": 8.853950501320443e-05, "loss": 4.0191, "step": 3304 }, { "epoch": 0.8946015424164524, "grad_norm": 3.808211326599121, "learning_rate": 8.851944050883756e-05, "loss": 3.9884, "step": 3306 }, { "epoch": 0.8951427411716953, "grad_norm": 3.354142427444458, "learning_rate": 8.849936073353502e-05, "loss": 3.9332, "step": 3308 }, { "epoch": 0.8956839399269382, "grad_norm": 4.658494472503662, "learning_rate": 8.84792656952574e-05, "loss": 3.9316, "step": 3310 }, { "epoch": 0.896225138682181, "grad_norm": 3.3359267711639404, "learning_rate": 8.845915540197132e-05, "loss": 3.9064, "step": 3312 }, { "epoch": 0.8967663374374238, "grad_norm": 4.346761703491211, "learning_rate": 8.843902986164943e-05, "loss": 3.8806, "step": 3314 }, { "epoch": 0.8973075361926668, "grad_norm": 1.9214677810668945, "learning_rate": 8.84188890822705e-05, "loss": 3.8454, "step": 3316 }, { "epoch": 0.8978487349479096, "grad_norm": 3.275961399078369, "learning_rate": 8.839873307181925e-05, "loss": 3.8613, "step": 3318 }, { "epoch": 0.8983899337031525, "grad_norm": 2.879671573638916, "learning_rate": 8.83785618382865e-05, "loss": 3.8142, "step": 3320 }, { "epoch": 0.8989311324583954, "grad_norm": 2.393937587738037, "learning_rate": 8.83583753896691e-05, "loss": 3.7892, "step": 3322 }, { "epoch": 0.8994723312136382, "grad_norm": 2.331608772277832, "learning_rate": 8.833817373396986e-05, "loss": 3.7633, "step": 3324 }, { "epoch": 0.9000135299688811, "grad_norm": 3.3159894943237305, "learning_rate": 8.831795687919775e-05, "loss": 3.7549, "step": 3326 }, { "epoch": 0.9005547287241239, "grad_norm": 1.6604549884796143, "learning_rate": 8.829772483336763e-05, "loss": 3.7204, "step": 3328 }, { "epoch": 0.9010959274793668, "grad_norm": 3.669424057006836, "learning_rate": 8.827747760450047e-05, "loss": 3.6946, "step": 3330 }, { "epoch": 0.9016371262346097, "grad_norm": 5.51679801940918, "learning_rate": 8.825721520062325e-05, "loss": 3.6713, "step": 3332 }, { "epoch": 0.9021783249898525, "grad_norm": 3.6048996448516846, "learning_rate": 8.823693762976891e-05, "loss": 3.6855, "step": 3334 }, { "epoch": 0.9027195237450953, "grad_norm": 3.728710174560547, "learning_rate": 8.821664489997648e-05, "loss": 3.6089, "step": 3336 }, { "epoch": 0.9032607225003383, "grad_norm": 2.6480770111083984, "learning_rate": 8.819633701929093e-05, "loss": 3.609, "step": 3338 }, { "epoch": 0.9038019212555811, "grad_norm": 2.4537196159362793, "learning_rate": 8.817601399576329e-05, "loss": 3.5668, "step": 3340 }, { "epoch": 0.904343120010824, "grad_norm": 8.629188537597656, "learning_rate": 8.815567583745056e-05, "loss": 3.6679, "step": 3342 }, { "epoch": 0.9048843187660668, "grad_norm": 11.455950736999512, "learning_rate": 8.813532255241576e-05, "loss": 3.6838, "step": 3344 }, { "epoch": 0.9054255175213097, "grad_norm": 9.66180419921875, "learning_rate": 8.81149541487279e-05, "loss": 3.5955, "step": 3346 }, { "epoch": 0.9059667162765526, "grad_norm": 5.880641937255859, "learning_rate": 8.809457063446198e-05, "loss": 3.4443, "step": 3348 }, { "epoch": 0.9065079150317954, "grad_norm": 4.776944637298584, "learning_rate": 8.807417201769899e-05, "loss": 3.3301, "step": 3350 }, { "epoch": 0.9070491137870383, "grad_norm": 19.446060180664062, "learning_rate": 8.805375830652591e-05, "loss": 3.3896, "step": 3352 }, { "epoch": 0.9075903125422812, "grad_norm": 7.380261421203613, "learning_rate": 8.80333295090357e-05, "loss": 3.4393, "step": 3354 }, { "epoch": 0.908131511297524, "grad_norm": 4.7472734451293945, "learning_rate": 8.801288563332732e-05, "loss": 3.223, "step": 3356 }, { "epoch": 0.9086727100527668, "grad_norm": 4.367702960968018, "learning_rate": 8.799242668750567e-05, "loss": 2.9019, "step": 3358 }, { "epoch": 0.9092139088080098, "grad_norm": 12.423567771911621, "learning_rate": 8.797195267968169e-05, "loss": 2.6458, "step": 3360 }, { "epoch": 0.9097551075632526, "grad_norm": 57.905941009521484, "learning_rate": 8.795146361797219e-05, "loss": 2.5072, "step": 3362 }, { "epoch": 0.9102963063184955, "grad_norm": 29.289100646972656, "learning_rate": 8.793095951050007e-05, "loss": 2.077, "step": 3364 }, { "epoch": 0.9108375050737383, "grad_norm": 4.703507423400879, "learning_rate": 8.79104403653941e-05, "loss": 1.6952, "step": 3366 }, { "epoch": 0.9113787038289812, "grad_norm": 3.7362613677978516, "learning_rate": 8.788990619078903e-05, "loss": 1.5028, "step": 3368 }, { "epoch": 0.9119199025842241, "grad_norm": 1.9051291942596436, "learning_rate": 8.78693569948256e-05, "loss": 1.4128, "step": 3370 }, { "epoch": 0.9124611013394669, "grad_norm": 4.28729772567749, "learning_rate": 8.784879278565049e-05, "loss": 1.3943, "step": 3372 }, { "epoch": 0.9130023000947097, "grad_norm": 1.3635573387145996, "learning_rate": 8.782821357141633e-05, "loss": 1.3544, "step": 3374 }, { "epoch": 0.9135434988499527, "grad_norm": 1.0148718357086182, "learning_rate": 8.780761936028168e-05, "loss": 1.3325, "step": 3376 }, { "epoch": 0.9140846976051955, "grad_norm": 1.0702952146530151, "learning_rate": 8.778701016041108e-05, "loss": 1.2865, "step": 3378 }, { "epoch": 0.9146258963604383, "grad_norm": 0.7008850574493408, "learning_rate": 8.776638597997498e-05, "loss": 1.2967, "step": 3380 }, { "epoch": 0.9151670951156813, "grad_norm": 0.6870774626731873, "learning_rate": 8.77457468271498e-05, "loss": 1.2745, "step": 3382 }, { "epoch": 0.9157082938709241, "grad_norm": 0.42447954416275024, "learning_rate": 8.772509271011788e-05, "loss": 1.2755, "step": 3384 }, { "epoch": 0.916249492626167, "grad_norm": 0.37188005447387695, "learning_rate": 8.77044236370675e-05, "loss": 1.2747, "step": 3386 }, { "epoch": 0.9167906913814098, "grad_norm": 0.29276731610298157, "learning_rate": 8.768373961619283e-05, "loss": 1.2385, "step": 3388 }, { "epoch": 0.9173318901366527, "grad_norm": 0.29821205139160156, "learning_rate": 8.766304065569404e-05, "loss": 1.2333, "step": 3390 }, { "epoch": 0.9178730888918956, "grad_norm": 0.3106642961502075, "learning_rate": 8.764232676377715e-05, "loss": 1.2528, "step": 3392 }, { "epoch": 0.9184142876471384, "grad_norm": 0.2433456927537918, "learning_rate": 8.762159794865414e-05, "loss": 1.2644, "step": 3394 }, { "epoch": 0.9189554864023812, "grad_norm": 0.2584933042526245, "learning_rate": 8.76008542185429e-05, "loss": 1.2563, "step": 3396 }, { "epoch": 0.9194966851576242, "grad_norm": 0.2554602324962616, "learning_rate": 8.758009558166723e-05, "loss": 1.2529, "step": 3398 }, { "epoch": 0.920037883912867, "grad_norm": 0.2501952052116394, "learning_rate": 8.755932204625682e-05, "loss": 1.2461, "step": 3400 }, { "epoch": 0.9205790826681098, "grad_norm": 0.2322869598865509, "learning_rate": 8.753853362054731e-05, "loss": 1.2455, "step": 3402 }, { "epoch": 0.9211202814233527, "grad_norm": 0.26621463894844055, "learning_rate": 8.751773031278022e-05, "loss": 1.2414, "step": 3404 }, { "epoch": 0.9216614801785956, "grad_norm": 0.2485044300556183, "learning_rate": 8.749691213120297e-05, "loss": 1.2543, "step": 3406 }, { "epoch": 0.9222026789338384, "grad_norm": 0.22953349351882935, "learning_rate": 8.747607908406886e-05, "loss": 1.2455, "step": 3408 }, { "epoch": 0.9227438776890813, "grad_norm": 0.2378481775522232, "learning_rate": 8.74552311796371e-05, "loss": 1.2324, "step": 3410 }, { "epoch": 0.9232850764443242, "grad_norm": 0.2420920431613922, "learning_rate": 8.743436842617279e-05, "loss": 1.2472, "step": 3412 }, { "epoch": 0.9238262751995671, "grad_norm": 0.240481436252594, "learning_rate": 8.741349083194694e-05, "loss": 1.2512, "step": 3414 }, { "epoch": 0.9243674739548099, "grad_norm": 0.21981249749660492, "learning_rate": 8.73925984052364e-05, "loss": 1.2234, "step": 3416 }, { "epoch": 0.9249086727100527, "grad_norm": 0.21758708357810974, "learning_rate": 8.73716911543239e-05, "loss": 1.2423, "step": 3418 }, { "epoch": 0.9254498714652957, "grad_norm": 0.2222822904586792, "learning_rate": 8.735076908749811e-05, "loss": 1.2475, "step": 3420 }, { "epoch": 0.9259910702205385, "grad_norm": 0.2375306487083435, "learning_rate": 8.73298322130535e-05, "loss": 1.2511, "step": 3422 }, { "epoch": 0.9265322689757813, "grad_norm": 0.20608864724636078, "learning_rate": 8.730888053929047e-05, "loss": 1.2334, "step": 3424 }, { "epoch": 0.9270734677310242, "grad_norm": 0.21576453745365143, "learning_rate": 8.728791407451524e-05, "loss": 1.2389, "step": 3426 }, { "epoch": 0.9276146664862671, "grad_norm": 0.21437953412532806, "learning_rate": 8.726693282703991e-05, "loss": 1.2383, "step": 3428 }, { "epoch": 0.92815586524151, "grad_norm": 0.23263736069202423, "learning_rate": 8.724593680518243e-05, "loss": 1.2585, "step": 3430 }, { "epoch": 0.9286970639967528, "grad_norm": 0.2303396314382553, "learning_rate": 8.722492601726665e-05, "loss": 1.2426, "step": 3432 }, { "epoch": 0.9292382627519957, "grad_norm": 0.21872375905513763, "learning_rate": 8.720390047162223e-05, "loss": 1.2487, "step": 3434 }, { "epoch": 0.9297794615072386, "grad_norm": 1.3575583696365356, "learning_rate": 8.71828601765847e-05, "loss": 1.2599, "step": 3436 }, { "epoch": 0.9303206602624814, "grad_norm": 0.2798626720905304, "learning_rate": 8.716180514049543e-05, "loss": 1.2413, "step": 3438 }, { "epoch": 0.9308618590177242, "grad_norm": 0.3947781026363373, "learning_rate": 8.714073537170162e-05, "loss": 1.2391, "step": 3440 }, { "epoch": 0.9314030577729672, "grad_norm": 0.30611512064933777, "learning_rate": 8.711965087855635e-05, "loss": 1.2285, "step": 3442 }, { "epoch": 0.93194425652821, "grad_norm": 0.2964535355567932, "learning_rate": 8.709855166941849e-05, "loss": 1.2332, "step": 3444 }, { "epoch": 0.9324854552834528, "grad_norm": 0.30566874146461487, "learning_rate": 8.70774377526528e-05, "loss": 1.254, "step": 3446 }, { "epoch": 0.9330266540386957, "grad_norm": 0.25171148777008057, "learning_rate": 8.705630913662983e-05, "loss": 1.2445, "step": 3448 }, { "epoch": 0.9335678527939386, "grad_norm": 0.2296695113182068, "learning_rate": 8.703516582972595e-05, "loss": 1.2306, "step": 3450 }, { "epoch": 0.9341090515491814, "grad_norm": 0.2543555498123169, "learning_rate": 8.701400784032339e-05, "loss": 1.2623, "step": 3452 }, { "epoch": 0.9346502503044243, "grad_norm": 0.22564424574375153, "learning_rate": 8.699283517681017e-05, "loss": 1.2413, "step": 3454 }, { "epoch": 0.9351914490596671, "grad_norm": 0.22345957159996033, "learning_rate": 8.697164784758014e-05, "loss": 1.2244, "step": 3456 }, { "epoch": 0.9357326478149101, "grad_norm": 0.23204098641872406, "learning_rate": 8.695044586103296e-05, "loss": 1.2322, "step": 3458 }, { "epoch": 0.9362738465701529, "grad_norm": 0.36212965846061707, "learning_rate": 8.692922922557412e-05, "loss": 1.2371, "step": 3460 }, { "epoch": 0.9368150453253957, "grad_norm": 0.22496424615383148, "learning_rate": 8.690799794961489e-05, "loss": 1.2255, "step": 3462 }, { "epoch": 0.9373562440806387, "grad_norm": 0.23747360706329346, "learning_rate": 8.688675204157236e-05, "loss": 1.2515, "step": 3464 }, { "epoch": 0.9378974428358815, "grad_norm": 0.20290309190750122, "learning_rate": 8.686549150986943e-05, "loss": 1.2445, "step": 3466 }, { "epoch": 0.9384386415911243, "grad_norm": 0.22112032771110535, "learning_rate": 8.684421636293474e-05, "loss": 1.2467, "step": 3468 }, { "epoch": 0.9389798403463672, "grad_norm": 0.19764398038387299, "learning_rate": 8.682292660920281e-05, "loss": 1.2528, "step": 3470 }, { "epoch": 0.9395210391016101, "grad_norm": 0.22094492614269257, "learning_rate": 8.680162225711392e-05, "loss": 1.2208, "step": 3472 }, { "epoch": 0.940062237856853, "grad_norm": 0.20873764157295227, "learning_rate": 8.678030331511409e-05, "loss": 1.2376, "step": 3474 }, { "epoch": 0.9406034366120958, "grad_norm": 0.21512436866760254, "learning_rate": 8.675896979165517e-05, "loss": 1.2481, "step": 3476 }, { "epoch": 0.9411446353673386, "grad_norm": 0.20858894288539886, "learning_rate": 8.673762169519479e-05, "loss": 1.2354, "step": 3478 }, { "epoch": 0.9416858341225816, "grad_norm": 0.21578043699264526, "learning_rate": 8.671625903419636e-05, "loss": 1.231, "step": 3480 }, { "epoch": 0.9422270328778244, "grad_norm": 0.214979350566864, "learning_rate": 8.669488181712904e-05, "loss": 1.246, "step": 3482 }, { "epoch": 0.9427682316330672, "grad_norm": 0.21148213744163513, "learning_rate": 8.667349005246776e-05, "loss": 1.2501, "step": 3484 }, { "epoch": 0.94330943038831, "grad_norm": 0.21602432429790497, "learning_rate": 8.665208374869327e-05, "loss": 1.2312, "step": 3486 }, { "epoch": 0.943850629143553, "grad_norm": 0.2216794639825821, "learning_rate": 8.6630662914292e-05, "loss": 1.2405, "step": 3488 }, { "epoch": 0.9443918278987958, "grad_norm": 0.20914500951766968, "learning_rate": 8.660922755775622e-05, "loss": 1.2429, "step": 3490 }, { "epoch": 0.9449330266540387, "grad_norm": 0.22172749042510986, "learning_rate": 8.658777768758393e-05, "loss": 1.2467, "step": 3492 }, { "epoch": 0.9454742254092816, "grad_norm": 0.20988859236240387, "learning_rate": 8.656631331227883e-05, "loss": 1.2299, "step": 3494 }, { "epoch": 0.9460154241645244, "grad_norm": 0.2031324952840805, "learning_rate": 8.654483444035047e-05, "loss": 1.2186, "step": 3496 }, { "epoch": 0.9465566229197673, "grad_norm": 0.21033494174480438, "learning_rate": 8.652334108031406e-05, "loss": 1.2293, "step": 3498 }, { "epoch": 0.9470978216750101, "grad_norm": 0.2091301679611206, "learning_rate": 8.650183324069059e-05, "loss": 1.2181, "step": 3500 }, { "epoch": 0.947639020430253, "grad_norm": 0.2000979781150818, "learning_rate": 8.648031093000681e-05, "loss": 1.2375, "step": 3502 }, { "epoch": 0.9481802191854959, "grad_norm": 0.20794962346553802, "learning_rate": 8.645877415679519e-05, "loss": 1.2363, "step": 3504 }, { "epoch": 0.9487214179407387, "grad_norm": 0.2179345339536667, "learning_rate": 8.64372229295939e-05, "loss": 1.2251, "step": 3506 }, { "epoch": 0.9492626166959816, "grad_norm": 0.21006426215171814, "learning_rate": 8.64156572569469e-05, "loss": 1.2252, "step": 3508 }, { "epoch": 0.9498038154512245, "grad_norm": 0.21278280019760132, "learning_rate": 8.639407714740382e-05, "loss": 1.2507, "step": 3510 }, { "epoch": 0.9503450142064673, "grad_norm": 0.19911755621433258, "learning_rate": 8.637248260952006e-05, "loss": 1.224, "step": 3512 }, { "epoch": 0.9508862129617102, "grad_norm": 0.19069471955299377, "learning_rate": 8.63508736518567e-05, "loss": 1.2134, "step": 3514 }, { "epoch": 0.951427411716953, "grad_norm": 0.20871645212173462, "learning_rate": 8.632925028298059e-05, "loss": 1.2223, "step": 3516 }, { "epoch": 0.9519686104721959, "grad_norm": 0.2007288932800293, "learning_rate": 8.630761251146424e-05, "loss": 1.2283, "step": 3518 }, { "epoch": 0.9525098092274388, "grad_norm": 0.21398819983005524, "learning_rate": 8.628596034588588e-05, "loss": 1.2378, "step": 3520 }, { "epoch": 0.9530510079826816, "grad_norm": 0.2010694146156311, "learning_rate": 8.626429379482946e-05, "loss": 1.229, "step": 3522 }, { "epoch": 0.9535922067379246, "grad_norm": 0.20102912187576294, "learning_rate": 8.624261286688466e-05, "loss": 1.2346, "step": 3524 }, { "epoch": 0.9541334054931674, "grad_norm": 0.20535993576049805, "learning_rate": 8.62209175706468e-05, "loss": 1.2358, "step": 3526 }, { "epoch": 0.9546746042484102, "grad_norm": 0.22004395723342896, "learning_rate": 8.619920791471693e-05, "loss": 1.2261, "step": 3528 }, { "epoch": 0.955215803003653, "grad_norm": 0.19609929621219635, "learning_rate": 8.617748390770179e-05, "loss": 1.2432, "step": 3530 }, { "epoch": 0.955757001758896, "grad_norm": 0.19269846379756927, "learning_rate": 8.615574555821382e-05, "loss": 1.2244, "step": 3532 }, { "epoch": 0.9562982005141388, "grad_norm": 0.1955200880765915, "learning_rate": 8.613399287487112e-05, "loss": 1.2339, "step": 3534 }, { "epoch": 0.9568393992693817, "grad_norm": 0.19891057908535004, "learning_rate": 8.611222586629749e-05, "loss": 1.2197, "step": 3536 }, { "epoch": 0.9573805980246245, "grad_norm": 0.20394091308116913, "learning_rate": 8.60904445411224e-05, "loss": 1.2386, "step": 3538 }, { "epoch": 0.9579217967798674, "grad_norm": 0.2031053602695465, "learning_rate": 8.606864890798104e-05, "loss": 1.2273, "step": 3540 }, { "epoch": 0.9584629955351103, "grad_norm": 0.1948944330215454, "learning_rate": 8.604683897551417e-05, "loss": 1.2232, "step": 3542 }, { "epoch": 0.9590041942903531, "grad_norm": 0.20535525679588318, "learning_rate": 8.602501475236833e-05, "loss": 1.2411, "step": 3544 }, { "epoch": 0.9595453930455959, "grad_norm": 0.2302812784910202, "learning_rate": 8.600317624719565e-05, "loss": 1.2151, "step": 3546 }, { "epoch": 0.9600865918008389, "grad_norm": 0.21900974214076996, "learning_rate": 8.598132346865398e-05, "loss": 1.2153, "step": 3548 }, { "epoch": 0.9606277905560817, "grad_norm": 0.2264300286769867, "learning_rate": 8.59594564254068e-05, "loss": 1.2273, "step": 3550 }, { "epoch": 0.9611689893113246, "grad_norm": 0.20842230319976807, "learning_rate": 8.59375751261232e-05, "loss": 1.2131, "step": 3552 }, { "epoch": 0.9617101880665675, "grad_norm": 0.20842507481575012, "learning_rate": 8.5915679579478e-05, "loss": 1.2412, "step": 3554 }, { "epoch": 0.9622513868218103, "grad_norm": 0.21368186175823212, "learning_rate": 8.589376979415164e-05, "loss": 1.2435, "step": 3556 }, { "epoch": 0.9627925855770532, "grad_norm": 0.23424792289733887, "learning_rate": 8.587184577883018e-05, "loss": 1.2162, "step": 3558 }, { "epoch": 0.963333784332296, "grad_norm": 0.2133053094148636, "learning_rate": 8.584990754220536e-05, "loss": 1.2279, "step": 3560 }, { "epoch": 0.9638749830875389, "grad_norm": 0.2146514356136322, "learning_rate": 8.582795509297453e-05, "loss": 1.2622, "step": 3562 }, { "epoch": 0.9644161818427818, "grad_norm": 0.20293276011943817, "learning_rate": 8.580598843984069e-05, "loss": 1.2154, "step": 3564 }, { "epoch": 0.9649573805980246, "grad_norm": 0.2056434154510498, "learning_rate": 8.578400759151244e-05, "loss": 1.2203, "step": 3566 }, { "epoch": 0.9654985793532674, "grad_norm": 0.20615997910499573, "learning_rate": 8.576201255670406e-05, "loss": 1.2452, "step": 3568 }, { "epoch": 0.9660397781085104, "grad_norm": 0.2047366350889206, "learning_rate": 8.574000334413541e-05, "loss": 1.2406, "step": 3570 }, { "epoch": 0.9665809768637532, "grad_norm": 0.2215116322040558, "learning_rate": 8.571797996253201e-05, "loss": 1.2117, "step": 3572 }, { "epoch": 0.967122175618996, "grad_norm": 0.21521443128585815, "learning_rate": 8.569594242062494e-05, "loss": 1.2336, "step": 3574 }, { "epoch": 0.967663374374239, "grad_norm": 0.20681914687156677, "learning_rate": 8.567389072715095e-05, "loss": 1.2217, "step": 3576 }, { "epoch": 0.9682045731294818, "grad_norm": 0.21918296813964844, "learning_rate": 8.56518248908524e-05, "loss": 1.2301, "step": 3578 }, { "epoch": 0.9687457718847247, "grad_norm": 0.21497774124145508, "learning_rate": 8.562974492047717e-05, "loss": 1.2128, "step": 3580 }, { "epoch": 0.9692869706399675, "grad_norm": 0.23987126350402832, "learning_rate": 8.560765082477887e-05, "loss": 1.2442, "step": 3582 }, { "epoch": 0.9698281693952104, "grad_norm": 0.25350961089134216, "learning_rate": 8.558554261251663e-05, "loss": 1.2145, "step": 3584 }, { "epoch": 0.9703693681504533, "grad_norm": 0.21898071467876434, "learning_rate": 8.556342029245518e-05, "loss": 1.2319, "step": 3586 }, { "epoch": 0.9709105669056961, "grad_norm": 0.20007579028606415, "learning_rate": 8.554128387336489e-05, "loss": 1.2261, "step": 3588 }, { "epoch": 0.9714517656609389, "grad_norm": 0.2057974636554718, "learning_rate": 8.551913336402167e-05, "loss": 1.2354, "step": 3590 }, { "epoch": 0.9719929644161819, "grad_norm": 0.22897422313690186, "learning_rate": 8.549696877320701e-05, "loss": 1.2264, "step": 3592 }, { "epoch": 0.9725341631714247, "grad_norm": 0.21826642751693726, "learning_rate": 8.547479010970805e-05, "loss": 1.2464, "step": 3594 }, { "epoch": 0.9730753619266675, "grad_norm": 0.24179087579250336, "learning_rate": 8.545259738231744e-05, "loss": 1.2296, "step": 3596 }, { "epoch": 0.9736165606819104, "grad_norm": 0.21856723725795746, "learning_rate": 8.543039059983344e-05, "loss": 1.2291, "step": 3598 }, { "epoch": 0.9741577594371533, "grad_norm": 0.22182489931583405, "learning_rate": 8.540816977105986e-05, "loss": 1.244, "step": 3600 }, { "epoch": 0.9746989581923962, "grad_norm": 0.276157021522522, "learning_rate": 8.538593490480612e-05, "loss": 1.2137, "step": 3602 }, { "epoch": 0.975240156947639, "grad_norm": 0.2525583505630493, "learning_rate": 8.536368600988715e-05, "loss": 1.2271, "step": 3604 }, { "epoch": 0.9757813557028819, "grad_norm": 0.2494456022977829, "learning_rate": 8.534142309512348e-05, "loss": 1.2274, "step": 3606 }, { "epoch": 0.9763225544581248, "grad_norm": 0.23215800523757935, "learning_rate": 8.531914616934119e-05, "loss": 1.2183, "step": 3608 }, { "epoch": 0.9768637532133676, "grad_norm": 0.22265788912773132, "learning_rate": 8.529685524137188e-05, "loss": 1.2279, "step": 3610 }, { "epoch": 0.9774049519686104, "grad_norm": 0.23256999254226685, "learning_rate": 8.527455032005278e-05, "loss": 1.2368, "step": 3612 }, { "epoch": 0.9779461507238534, "grad_norm": 0.22369737923145294, "learning_rate": 8.52522314142266e-05, "loss": 1.2486, "step": 3614 }, { "epoch": 0.9784873494790962, "grad_norm": 0.21790017187595367, "learning_rate": 8.522989853274159e-05, "loss": 1.2191, "step": 3616 }, { "epoch": 0.979028548234339, "grad_norm": 0.20743697881698608, "learning_rate": 8.520755168445162e-05, "loss": 1.2325, "step": 3618 }, { "epoch": 0.9795697469895819, "grad_norm": 0.20409737527370453, "learning_rate": 8.518519087821599e-05, "loss": 1.2286, "step": 3620 }, { "epoch": 0.9801109457448248, "grad_norm": 0.21322283148765564, "learning_rate": 8.51628161228996e-05, "loss": 1.2238, "step": 3622 }, { "epoch": 0.9806521445000677, "grad_norm": 0.2082875818014145, "learning_rate": 8.514042742737289e-05, "loss": 1.2297, "step": 3624 }, { "epoch": 0.9811933432553105, "grad_norm": 0.205684632062912, "learning_rate": 8.511802480051178e-05, "loss": 1.2187, "step": 3626 }, { "epoch": 0.9817345420105533, "grad_norm": 0.20009024441242218, "learning_rate": 8.509560825119772e-05, "loss": 1.2217, "step": 3628 }, { "epoch": 0.9822757407657963, "grad_norm": 0.24374030530452728, "learning_rate": 8.507317778831774e-05, "loss": 1.2232, "step": 3630 }, { "epoch": 0.9828169395210391, "grad_norm": 0.2268858402967453, "learning_rate": 8.505073342076429e-05, "loss": 1.2308, "step": 3632 }, { "epoch": 0.9833581382762819, "grad_norm": 1.9946271181106567, "learning_rate": 8.502827515743541e-05, "loss": 1.2203, "step": 3634 }, { "epoch": 0.9838993370315249, "grad_norm": 0.3267524838447571, "learning_rate": 8.500580300723464e-05, "loss": 1.2236, "step": 3636 }, { "epoch": 0.9844405357867677, "grad_norm": 0.2528623640537262, "learning_rate": 8.498331697907096e-05, "loss": 1.2204, "step": 3638 }, { "epoch": 0.9849817345420105, "grad_norm": 0.22566676139831543, "learning_rate": 8.496081708185895e-05, "loss": 1.2249, "step": 3640 }, { "epoch": 0.9855229332972534, "grad_norm": 0.24136385321617126, "learning_rate": 8.493830332451861e-05, "loss": 1.2328, "step": 3642 }, { "epoch": 0.9860641320524963, "grad_norm": 0.2164882868528366, "learning_rate": 8.491577571597546e-05, "loss": 1.2525, "step": 3644 }, { "epoch": 0.9866053308077392, "grad_norm": 0.22188369929790497, "learning_rate": 8.489323426516054e-05, "loss": 1.2069, "step": 3646 }, { "epoch": 0.987146529562982, "grad_norm": 0.23576316237449646, "learning_rate": 8.487067898101031e-05, "loss": 1.2429, "step": 3648 }, { "epoch": 0.9876877283182248, "grad_norm": 0.2139056921005249, "learning_rate": 8.484810987246678e-05, "loss": 1.2196, "step": 3650 }, { "epoch": 0.9882289270734678, "grad_norm": 0.22105053067207336, "learning_rate": 8.482552694847744e-05, "loss": 1.2389, "step": 3652 }, { "epoch": 0.9887701258287106, "grad_norm": 0.22355502843856812, "learning_rate": 8.480293021799518e-05, "loss": 1.2162, "step": 3654 }, { "epoch": 0.9893113245839534, "grad_norm": 0.23202285170555115, "learning_rate": 8.478031968997845e-05, "loss": 1.2233, "step": 3656 }, { "epoch": 0.9898525233391963, "grad_norm": 0.209551602602005, "learning_rate": 8.475769537339115e-05, "loss": 1.2196, "step": 3658 }, { "epoch": 0.9903937220944392, "grad_norm": 0.23315675556659698, "learning_rate": 8.473505727720261e-05, "loss": 1.2342, "step": 3660 }, { "epoch": 0.990934920849682, "grad_norm": 0.22182142734527588, "learning_rate": 8.471240541038765e-05, "loss": 1.2239, "step": 3662 }, { "epoch": 0.9914761196049249, "grad_norm": 0.2126477062702179, "learning_rate": 8.468973978192654e-05, "loss": 1.2344, "step": 3664 }, { "epoch": 0.9920173183601678, "grad_norm": 0.19775603711605072, "learning_rate": 8.466706040080504e-05, "loss": 1.2101, "step": 3666 }, { "epoch": 0.9925585171154107, "grad_norm": 0.21096171438694, "learning_rate": 8.46443672760143e-05, "loss": 1.2345, "step": 3668 }, { "epoch": 0.9930997158706535, "grad_norm": 0.2033991813659668, "learning_rate": 8.462166041655098e-05, "loss": 1.2307, "step": 3670 }, { "epoch": 0.9936409146258963, "grad_norm": 0.1952131688594818, "learning_rate": 8.459893983141714e-05, "loss": 1.2362, "step": 3672 }, { "epoch": 0.9941821133811393, "grad_norm": 0.2031036764383316, "learning_rate": 8.45762055296203e-05, "loss": 1.2249, "step": 3674 }, { "epoch": 0.9947233121363821, "grad_norm": 0.1928708553314209, "learning_rate": 8.455345752017343e-05, "loss": 1.2276, "step": 3676 }, { "epoch": 0.9952645108916249, "grad_norm": 0.1981291025876999, "learning_rate": 8.453069581209489e-05, "loss": 1.2154, "step": 3678 }, { "epoch": 0.9958057096468678, "grad_norm": 0.2076874077320099, "learning_rate": 8.450792041440856e-05, "loss": 1.2306, "step": 3680 }, { "epoch": 0.9963469084021107, "grad_norm": 0.23212751746177673, "learning_rate": 8.448513133614364e-05, "loss": 1.2212, "step": 3682 }, { "epoch": 0.9968881071573535, "grad_norm": 0.22317342460155487, "learning_rate": 8.446232858633481e-05, "loss": 1.2364, "step": 3684 }, { "epoch": 0.9974293059125964, "grad_norm": 0.2007218599319458, "learning_rate": 8.443951217402216e-05, "loss": 1.223, "step": 3686 }, { "epoch": 0.9979705046678392, "grad_norm": 0.2146870344877243, "learning_rate": 8.441668210825122e-05, "loss": 1.2248, "step": 3688 }, { "epoch": 0.9985117034230822, "grad_norm": 0.18961116671562195, "learning_rate": 8.43938383980729e-05, "loss": 1.2217, "step": 3690 }, { "epoch": 0.999052902178325, "grad_norm": 0.2126687914133072, "learning_rate": 8.437098105254353e-05, "loss": 1.1985, "step": 3692 }, { "epoch": 0.9995941009335678, "grad_norm": 0.2034890204668045, "learning_rate": 8.434811008072486e-05, "loss": 1.2364, "step": 3694 }, { "epoch": 1.0, "grad_norm": 0.25010621547698975, "learning_rate": 8.432522549168402e-05, "loss": 1.2423, "step": 3696 }, { "epoch": 1.0005411987552428, "grad_norm": 0.3276897072792053, "learning_rate": 8.430232729449353e-05, "loss": 1.1664, "step": 3698 }, { "epoch": 1.0010823975104857, "grad_norm": 0.25672757625579834, "learning_rate": 8.427941549823134e-05, "loss": 1.1803, "step": 3700 }, { "epoch": 1.0016235962657285, "grad_norm": 0.24125026166439056, "learning_rate": 8.42564901119808e-05, "loss": 1.174, "step": 3702 }, { "epoch": 1.0021647950209716, "grad_norm": 0.23144879937171936, "learning_rate": 8.42335511448306e-05, "loss": 1.1553, "step": 3704 }, { "epoch": 1.0027059937762144, "grad_norm": 0.20953825116157532, "learning_rate": 8.421059860587481e-05, "loss": 1.153, "step": 3706 }, { "epoch": 1.0032471925314572, "grad_norm": 0.22265411913394928, "learning_rate": 8.418763250421293e-05, "loss": 1.1571, "step": 3708 }, { "epoch": 1.0037883912867, "grad_norm": 0.21143150329589844, "learning_rate": 8.416465284894983e-05, "loss": 1.1711, "step": 3710 }, { "epoch": 1.0043295900419429, "grad_norm": 0.22948767244815826, "learning_rate": 8.41416596491957e-05, "loss": 1.1781, "step": 3712 }, { "epoch": 1.0048707887971857, "grad_norm": 0.2108934074640274, "learning_rate": 8.411865291406618e-05, "loss": 1.1737, "step": 3714 }, { "epoch": 1.0054119875524286, "grad_norm": 0.21712428331375122, "learning_rate": 8.409563265268218e-05, "loss": 1.1585, "step": 3716 }, { "epoch": 1.0059531863076714, "grad_norm": 0.20881038904190063, "learning_rate": 8.407259887417007e-05, "loss": 1.1594, "step": 3718 }, { "epoch": 1.0064943850629144, "grad_norm": 0.22524814307689667, "learning_rate": 8.404955158766153e-05, "loss": 1.1535, "step": 3720 }, { "epoch": 1.0070355838181573, "grad_norm": 0.2055925875902176, "learning_rate": 8.402649080229357e-05, "loss": 1.1518, "step": 3722 }, { "epoch": 1.0075767825734, "grad_norm": 0.22745977342128754, "learning_rate": 8.40034165272086e-05, "loss": 1.1666, "step": 3724 }, { "epoch": 1.008117981328643, "grad_norm": 0.21135374903678894, "learning_rate": 8.398032877155435e-05, "loss": 1.1599, "step": 3726 }, { "epoch": 1.0086591800838858, "grad_norm": 0.22054436802864075, "learning_rate": 8.395722754448392e-05, "loss": 1.1608, "step": 3728 }, { "epoch": 1.0092003788391286, "grad_norm": 0.24490900337696075, "learning_rate": 8.393411285515571e-05, "loss": 1.1391, "step": 3730 }, { "epoch": 1.0097415775943714, "grad_norm": 0.21600517630577087, "learning_rate": 8.39109847127335e-05, "loss": 1.186, "step": 3732 }, { "epoch": 1.0102827763496145, "grad_norm": 0.21650472283363342, "learning_rate": 8.388784312638638e-05, "loss": 1.1682, "step": 3734 }, { "epoch": 1.0108239751048573, "grad_norm": 0.2596750557422638, "learning_rate": 8.386468810528875e-05, "loss": 1.1753, "step": 3736 }, { "epoch": 1.0113651738601002, "grad_norm": 0.20751477777957916, "learning_rate": 8.38415196586204e-05, "loss": 1.1714, "step": 3738 }, { "epoch": 1.011906372615343, "grad_norm": 0.20294234156608582, "learning_rate": 8.381833779556638e-05, "loss": 1.1517, "step": 3740 }, { "epoch": 1.0124475713705858, "grad_norm": 0.2066117227077484, "learning_rate": 8.379514252531709e-05, "loss": 1.141, "step": 3742 }, { "epoch": 1.0129887701258287, "grad_norm": 0.21005451679229736, "learning_rate": 8.377193385706823e-05, "loss": 1.1613, "step": 3744 }, { "epoch": 1.0135299688810715, "grad_norm": 0.21950848400592804, "learning_rate": 8.374871180002082e-05, "loss": 1.1692, "step": 3746 }, { "epoch": 1.0140711676363143, "grad_norm": 0.2529325485229492, "learning_rate": 8.372547636338117e-05, "loss": 1.1721, "step": 3748 }, { "epoch": 1.0146123663915574, "grad_norm": 0.24388043582439423, "learning_rate": 8.370222755636094e-05, "loss": 1.1608, "step": 3750 }, { "epoch": 1.0151535651468002, "grad_norm": 0.22465898096561432, "learning_rate": 8.367896538817704e-05, "loss": 1.1759, "step": 3752 }, { "epoch": 1.015694763902043, "grad_norm": 0.24493010342121124, "learning_rate": 8.365568986805172e-05, "loss": 1.1578, "step": 3754 }, { "epoch": 1.0162359626572859, "grad_norm": 0.24231097102165222, "learning_rate": 8.363240100521249e-05, "loss": 1.1726, "step": 3756 }, { "epoch": 1.0167771614125287, "grad_norm": 0.20873676240444183, "learning_rate": 8.360909880889214e-05, "loss": 1.1745, "step": 3758 }, { "epoch": 1.0173183601677716, "grad_norm": 0.2147914171218872, "learning_rate": 8.35857832883288e-05, "loss": 1.1444, "step": 3760 }, { "epoch": 1.0178595589230144, "grad_norm": 0.22383643686771393, "learning_rate": 8.356245445276585e-05, "loss": 1.1597, "step": 3762 }, { "epoch": 1.0184007576782574, "grad_norm": 0.23194357752799988, "learning_rate": 8.35391123114519e-05, "loss": 1.152, "step": 3764 }, { "epoch": 1.0189419564335003, "grad_norm": 0.23192541301250458, "learning_rate": 8.351575687364095e-05, "loss": 1.187, "step": 3766 }, { "epoch": 1.019483155188743, "grad_norm": 0.24341407418251038, "learning_rate": 8.349238814859217e-05, "loss": 1.1569, "step": 3768 }, { "epoch": 1.020024353943986, "grad_norm": 0.22612228989601135, "learning_rate": 8.346900614557001e-05, "loss": 1.1721, "step": 3770 }, { "epoch": 1.0205655526992288, "grad_norm": 0.22228077054023743, "learning_rate": 8.344561087384426e-05, "loss": 1.1564, "step": 3772 }, { "epoch": 1.0211067514544716, "grad_norm": 0.22433893382549286, "learning_rate": 8.342220234268987e-05, "loss": 1.14, "step": 3774 }, { "epoch": 1.0216479502097144, "grad_norm": 0.22573336958885193, "learning_rate": 8.339878056138712e-05, "loss": 1.1738, "step": 3776 }, { "epoch": 1.0221891489649573, "grad_norm": 0.21434976160526276, "learning_rate": 8.337534553922151e-05, "loss": 1.1667, "step": 3778 }, { "epoch": 1.0227303477202003, "grad_norm": 0.20959998667240143, "learning_rate": 8.335189728548381e-05, "loss": 1.1811, "step": 3780 }, { "epoch": 1.0232715464754432, "grad_norm": 0.21443971991539001, "learning_rate": 8.332843580946999e-05, "loss": 1.1715, "step": 3782 }, { "epoch": 1.023812745230686, "grad_norm": 0.21527649462223053, "learning_rate": 8.330496112048131e-05, "loss": 1.1938, "step": 3784 }, { "epoch": 1.0243539439859288, "grad_norm": 0.22661571204662323, "learning_rate": 8.328147322782424e-05, "loss": 1.1676, "step": 3786 }, { "epoch": 1.0248951427411717, "grad_norm": 0.23696081340312958, "learning_rate": 8.325797214081052e-05, "loss": 1.1905, "step": 3788 }, { "epoch": 1.0254363414964145, "grad_norm": 0.22171412408351898, "learning_rate": 8.323445786875709e-05, "loss": 1.1466, "step": 3790 }, { "epoch": 1.0259775402516573, "grad_norm": 0.20770622789859772, "learning_rate": 8.321093042098612e-05, "loss": 1.1502, "step": 3792 }, { "epoch": 1.0265187390069004, "grad_norm": 0.20591381192207336, "learning_rate": 8.318738980682499e-05, "loss": 1.159, "step": 3794 }, { "epoch": 1.0270599377621432, "grad_norm": 0.22844432294368744, "learning_rate": 8.316383603560633e-05, "loss": 1.1805, "step": 3796 }, { "epoch": 1.027601136517386, "grad_norm": 0.22971852123737335, "learning_rate": 8.314026911666799e-05, "loss": 1.1606, "step": 3798 }, { "epoch": 1.0281423352726289, "grad_norm": 0.20094196498394012, "learning_rate": 8.311668905935301e-05, "loss": 1.1648, "step": 3800 }, { "epoch": 1.0286835340278717, "grad_norm": 0.21529503166675568, "learning_rate": 8.309309587300964e-05, "loss": 1.1672, "step": 3802 }, { "epoch": 1.0292247327831145, "grad_norm": 0.21473126113414764, "learning_rate": 8.306948956699132e-05, "loss": 1.1534, "step": 3804 }, { "epoch": 1.0297659315383574, "grad_norm": 0.21291570365428925, "learning_rate": 8.304587015065674e-05, "loss": 1.1472, "step": 3806 }, { "epoch": 1.0303071302936004, "grad_norm": 0.21232275664806366, "learning_rate": 8.302223763336976e-05, "loss": 1.1697, "step": 3808 }, { "epoch": 1.0308483290488433, "grad_norm": 0.23822760581970215, "learning_rate": 8.299859202449939e-05, "loss": 1.1677, "step": 3810 }, { "epoch": 1.031389527804086, "grad_norm": 0.24717482924461365, "learning_rate": 8.297493333341992e-05, "loss": 1.171, "step": 3812 }, { "epoch": 1.031930726559329, "grad_norm": 0.2262924462556839, "learning_rate": 8.295126156951076e-05, "loss": 1.1654, "step": 3814 }, { "epoch": 1.0324719253145718, "grad_norm": 0.22285550832748413, "learning_rate": 8.292757674215652e-05, "loss": 1.1583, "step": 3816 }, { "epoch": 1.0330131240698146, "grad_norm": 0.22245456278324127, "learning_rate": 8.290387886074698e-05, "loss": 1.146, "step": 3818 }, { "epoch": 1.0335543228250574, "grad_norm": 0.2081366628408432, "learning_rate": 8.288016793467714e-05, "loss": 1.1551, "step": 3820 }, { "epoch": 1.0340955215803003, "grad_norm": 0.20822925865650177, "learning_rate": 8.28564439733471e-05, "loss": 1.1625, "step": 3822 }, { "epoch": 1.0346367203355433, "grad_norm": 0.2068616896867752, "learning_rate": 8.283270698616218e-05, "loss": 1.1715, "step": 3824 }, { "epoch": 1.0351779190907862, "grad_norm": 0.19563379883766174, "learning_rate": 8.280895698253286e-05, "loss": 1.1446, "step": 3826 }, { "epoch": 1.035719117846029, "grad_norm": 0.214201420545578, "learning_rate": 8.278519397187474e-05, "loss": 1.1653, "step": 3828 }, { "epoch": 1.0362603166012718, "grad_norm": 0.22149145603179932, "learning_rate": 8.276141796360865e-05, "loss": 1.1601, "step": 3830 }, { "epoch": 1.0368015153565147, "grad_norm": 0.20305697619915009, "learning_rate": 8.273762896716049e-05, "loss": 1.1684, "step": 3832 }, { "epoch": 1.0373427141117575, "grad_norm": 0.22398139536380768, "learning_rate": 8.271382699196135e-05, "loss": 1.1711, "step": 3834 }, { "epoch": 1.0378839128670003, "grad_norm": 0.23009321093559265, "learning_rate": 8.26900120474475e-05, "loss": 1.1759, "step": 3836 }, { "epoch": 1.0384251116222432, "grad_norm": 0.2264915555715561, "learning_rate": 8.266618414306028e-05, "loss": 1.1633, "step": 3838 }, { "epoch": 1.0389663103774862, "grad_norm": 0.20079673826694489, "learning_rate": 8.26423432882462e-05, "loss": 1.1672, "step": 3840 }, { "epoch": 1.039507509132729, "grad_norm": 0.20294247567653656, "learning_rate": 8.261848949245694e-05, "loss": 1.1736, "step": 3842 }, { "epoch": 1.0400487078879719, "grad_norm": 0.21752537786960602, "learning_rate": 8.259462276514924e-05, "loss": 1.1472, "step": 3844 }, { "epoch": 1.0405899066432147, "grad_norm": 0.21777775883674622, "learning_rate": 8.257074311578504e-05, "loss": 1.1658, "step": 3846 }, { "epoch": 1.0411311053984575, "grad_norm": 0.2083350569009781, "learning_rate": 8.254685055383135e-05, "loss": 1.1561, "step": 3848 }, { "epoch": 1.0416723041537004, "grad_norm": 0.22833497822284698, "learning_rate": 8.252294508876031e-05, "loss": 1.1512, "step": 3850 }, { "epoch": 1.0422135029089432, "grad_norm": 0.22746221721172333, "learning_rate": 8.249902673004917e-05, "loss": 1.1617, "step": 3852 }, { "epoch": 1.0427547016641863, "grad_norm": 0.23666687309741974, "learning_rate": 8.247509548718035e-05, "loss": 1.1588, "step": 3854 }, { "epoch": 1.043295900419429, "grad_norm": 0.22944943606853485, "learning_rate": 8.245115136964128e-05, "loss": 1.184, "step": 3856 }, { "epoch": 1.043837099174672, "grad_norm": 0.22142989933490753, "learning_rate": 8.242719438692458e-05, "loss": 1.1535, "step": 3858 }, { "epoch": 1.0443782979299148, "grad_norm": 0.22694510221481323, "learning_rate": 8.240322454852791e-05, "loss": 1.16, "step": 3860 }, { "epoch": 1.0449194966851576, "grad_norm": 0.2044788897037506, "learning_rate": 8.237924186395408e-05, "loss": 1.1324, "step": 3862 }, { "epoch": 1.0454606954404004, "grad_norm": 0.1951093226671219, "learning_rate": 8.235524634271094e-05, "loss": 1.1572, "step": 3864 }, { "epoch": 1.0460018941956433, "grad_norm": 0.2163376808166504, "learning_rate": 8.233123799431148e-05, "loss": 1.1703, "step": 3866 }, { "epoch": 1.0465430929508863, "grad_norm": 0.2503316402435303, "learning_rate": 8.230721682827372e-05, "loss": 1.1462, "step": 3868 }, { "epoch": 1.0470842917061292, "grad_norm": 0.30522701144218445, "learning_rate": 8.228318285412081e-05, "loss": 1.1584, "step": 3870 }, { "epoch": 1.047625490461372, "grad_norm": 1.271936058998108, "learning_rate": 8.225913608138095e-05, "loss": 1.1548, "step": 3872 }, { "epoch": 1.0481666892166148, "grad_norm": 0.40829160809516907, "learning_rate": 8.223507651958743e-05, "loss": 1.1716, "step": 3874 }, { "epoch": 1.0487078879718577, "grad_norm": 0.4559307396411896, "learning_rate": 8.22110041782786e-05, "loss": 1.1752, "step": 3876 }, { "epoch": 1.0492490867271005, "grad_norm": 0.30671998858451843, "learning_rate": 8.21869190669979e-05, "loss": 1.1629, "step": 3878 }, { "epoch": 1.0497902854823433, "grad_norm": 0.2638482451438904, "learning_rate": 8.216282119529378e-05, "loss": 1.1504, "step": 3880 }, { "epoch": 1.0503314842375862, "grad_norm": 0.24638111889362335, "learning_rate": 8.213871057271978e-05, "loss": 1.1551, "step": 3882 }, { "epoch": 1.0508726829928292, "grad_norm": 0.25287169218063354, "learning_rate": 8.21145872088345e-05, "loss": 1.1707, "step": 3884 }, { "epoch": 1.051413881748072, "grad_norm": 0.23718564212322235, "learning_rate": 8.209045111320161e-05, "loss": 1.1642, "step": 3886 }, { "epoch": 1.0519550805033149, "grad_norm": 0.21913887560367584, "learning_rate": 8.20663022953898e-05, "loss": 1.175, "step": 3888 }, { "epoch": 1.0524962792585577, "grad_norm": 0.24881695210933685, "learning_rate": 8.204214076497278e-05, "loss": 1.1722, "step": 3890 }, { "epoch": 1.0530374780138005, "grad_norm": 0.24020962417125702, "learning_rate": 8.201796653152936e-05, "loss": 1.1563, "step": 3892 }, { "epoch": 1.0535786767690434, "grad_norm": 0.20904816687107086, "learning_rate": 8.199377960464333e-05, "loss": 1.1779, "step": 3894 }, { "epoch": 1.0541198755242862, "grad_norm": 0.2172718644142151, "learning_rate": 8.196957999390356e-05, "loss": 1.1471, "step": 3896 }, { "epoch": 1.054661074279529, "grad_norm": 0.2129276990890503, "learning_rate": 8.194536770890391e-05, "loss": 1.1648, "step": 3898 }, { "epoch": 1.055202273034772, "grad_norm": 0.22173850238323212, "learning_rate": 8.192114275924327e-05, "loss": 1.1558, "step": 3900 }, { "epoch": 1.055743471790015, "grad_norm": 0.20564569532871246, "learning_rate": 8.18969051545256e-05, "loss": 1.1654, "step": 3902 }, { "epoch": 1.0562846705452578, "grad_norm": 0.20729513466358185, "learning_rate": 8.187265490435978e-05, "loss": 1.164, "step": 3904 }, { "epoch": 1.0568258693005006, "grad_norm": 0.23436667025089264, "learning_rate": 8.184839201835981e-05, "loss": 1.1603, "step": 3906 }, { "epoch": 1.0573670680557434, "grad_norm": 0.2048719972372055, "learning_rate": 8.182411650614464e-05, "loss": 1.1805, "step": 3908 }, { "epoch": 1.0579082668109863, "grad_norm": 0.21470339596271515, "learning_rate": 8.17998283773382e-05, "loss": 1.1507, "step": 3910 }, { "epoch": 1.058449465566229, "grad_norm": 0.2106187790632248, "learning_rate": 8.177552764156951e-05, "loss": 1.1654, "step": 3912 }, { "epoch": 1.0589906643214722, "grad_norm": 0.2404891699552536, "learning_rate": 8.17512143084725e-05, "loss": 1.1374, "step": 3914 }, { "epoch": 1.059531863076715, "grad_norm": 0.20607243478298187, "learning_rate": 8.172688838768614e-05, "loss": 1.1777, "step": 3916 }, { "epoch": 1.0600730618319578, "grad_norm": 0.2091047763824463, "learning_rate": 8.170254988885438e-05, "loss": 1.1609, "step": 3918 }, { "epoch": 1.0606142605872007, "grad_norm": 0.21691367030143738, "learning_rate": 8.167819882162617e-05, "loss": 1.1554, "step": 3920 }, { "epoch": 1.0611554593424435, "grad_norm": 0.221197247505188, "learning_rate": 8.16538351956554e-05, "loss": 1.1579, "step": 3922 }, { "epoch": 1.0616966580976863, "grad_norm": 0.21998266875743866, "learning_rate": 8.162945902060099e-05, "loss": 1.1558, "step": 3924 }, { "epoch": 1.0622378568529292, "grad_norm": 0.21962489187717438, "learning_rate": 8.160507030612684e-05, "loss": 1.1618, "step": 3926 }, { "epoch": 1.0627790556081722, "grad_norm": 0.20770075917243958, "learning_rate": 8.158066906190174e-05, "loss": 1.1664, "step": 3928 }, { "epoch": 1.063320254363415, "grad_norm": 0.20497769117355347, "learning_rate": 8.155625529759951e-05, "loss": 1.1577, "step": 3930 }, { "epoch": 1.0638614531186579, "grad_norm": 0.21967573463916779, "learning_rate": 8.153182902289897e-05, "loss": 1.163, "step": 3932 }, { "epoch": 1.0644026518739007, "grad_norm": 0.21568840742111206, "learning_rate": 8.150739024748383e-05, "loss": 1.1735, "step": 3934 }, { "epoch": 1.0649438506291435, "grad_norm": 0.20917922258377075, "learning_rate": 8.148293898104277e-05, "loss": 1.1491, "step": 3936 }, { "epoch": 1.0654850493843864, "grad_norm": 0.21231794357299805, "learning_rate": 8.145847523326944e-05, "loss": 1.1523, "step": 3938 }, { "epoch": 1.0660262481396292, "grad_norm": 0.2191847562789917, "learning_rate": 8.143399901386244e-05, "loss": 1.154, "step": 3940 }, { "epoch": 1.066567446894872, "grad_norm": 0.23816804587841034, "learning_rate": 8.140951033252528e-05, "loss": 1.1512, "step": 3942 }, { "epoch": 1.067108645650115, "grad_norm": 0.20847490429878235, "learning_rate": 8.138500919896649e-05, "loss": 1.1584, "step": 3944 }, { "epoch": 1.067649844405358, "grad_norm": 0.2102094441652298, "learning_rate": 8.136049562289943e-05, "loss": 1.1688, "step": 3946 }, { "epoch": 1.0681910431606008, "grad_norm": 0.2000993937253952, "learning_rate": 8.133596961404246e-05, "loss": 1.1703, "step": 3948 }, { "epoch": 1.0687322419158436, "grad_norm": 0.20474949479103088, "learning_rate": 8.131143118211888e-05, "loss": 1.1433, "step": 3950 }, { "epoch": 1.0692734406710864, "grad_norm": 0.2065473198890686, "learning_rate": 8.128688033685685e-05, "loss": 1.1583, "step": 3952 }, { "epoch": 1.0698146394263293, "grad_norm": 0.22109980881214142, "learning_rate": 8.126231708798953e-05, "loss": 1.1619, "step": 3954 }, { "epoch": 1.070355838181572, "grad_norm": 0.2283199280500412, "learning_rate": 8.123774144525492e-05, "loss": 1.1649, "step": 3956 }, { "epoch": 1.070897036936815, "grad_norm": 0.21421821415424347, "learning_rate": 8.1213153418396e-05, "loss": 1.1548, "step": 3958 }, { "epoch": 1.071438235692058, "grad_norm": 0.20380717515945435, "learning_rate": 8.118855301716061e-05, "loss": 1.1502, "step": 3960 }, { "epoch": 1.0719794344473008, "grad_norm": 0.2934326231479645, "learning_rate": 8.116394025130156e-05, "loss": 1.1655, "step": 3962 }, { "epoch": 1.0725206332025436, "grad_norm": 0.637283444404602, "learning_rate": 8.113931513057647e-05, "loss": 1.163, "step": 3964 }, { "epoch": 1.0730618319577865, "grad_norm": 0.3647371828556061, "learning_rate": 8.111467766474793e-05, "loss": 1.1736, "step": 3966 }, { "epoch": 1.0736030307130293, "grad_norm": 0.8815774917602539, "learning_rate": 8.10900278635834e-05, "loss": 1.1622, "step": 3968 }, { "epoch": 1.0741442294682721, "grad_norm": 1.764570713043213, "learning_rate": 8.106536573685523e-05, "loss": 1.1594, "step": 3970 }, { "epoch": 1.074685428223515, "grad_norm": 0.295211523771286, "learning_rate": 8.104069129434067e-05, "loss": 1.1623, "step": 3972 }, { "epoch": 1.075226626978758, "grad_norm": 0.5825856328010559, "learning_rate": 8.10160045458218e-05, "loss": 1.1773, "step": 3974 }, { "epoch": 1.0757678257340009, "grad_norm": 0.40720388293266296, "learning_rate": 8.099130550108566e-05, "loss": 1.1581, "step": 3976 }, { "epoch": 1.0763090244892437, "grad_norm": 0.3680490553379059, "learning_rate": 8.096659416992414e-05, "loss": 1.1747, "step": 3978 }, { "epoch": 1.0768502232444865, "grad_norm": 0.2958071529865265, "learning_rate": 8.094187056213393e-05, "loss": 1.1517, "step": 3980 }, { "epoch": 1.0773914219997294, "grad_norm": 0.28484034538269043, "learning_rate": 8.09171346875167e-05, "loss": 1.145, "step": 3982 }, { "epoch": 1.0779326207549722, "grad_norm": 0.3037899434566498, "learning_rate": 8.089238655587887e-05, "loss": 1.1532, "step": 3984 }, { "epoch": 1.078473819510215, "grad_norm": 0.27155500650405884, "learning_rate": 8.086762617703181e-05, "loss": 1.1713, "step": 3986 }, { "epoch": 1.079015018265458, "grad_norm": 0.2606882154941559, "learning_rate": 8.08428535607917e-05, "loss": 1.1773, "step": 3988 }, { "epoch": 1.079556217020701, "grad_norm": 0.22694651782512665, "learning_rate": 8.081806871697959e-05, "loss": 1.1613, "step": 3990 }, { "epoch": 1.0800974157759438, "grad_norm": 0.23277917504310608, "learning_rate": 8.079327165542135e-05, "loss": 1.1573, "step": 3992 }, { "epoch": 1.0806386145311866, "grad_norm": 0.24309536814689636, "learning_rate": 8.076846238594774e-05, "loss": 1.1787, "step": 3994 }, { "epoch": 1.0811798132864294, "grad_norm": 0.23015187680721283, "learning_rate": 8.074364091839432e-05, "loss": 1.161, "step": 3996 }, { "epoch": 1.0817210120416723, "grad_norm": 0.22507344186306, "learning_rate": 8.071880726260149e-05, "loss": 1.1693, "step": 3998 }, { "epoch": 1.082262210796915, "grad_norm": 0.23293937742710114, "learning_rate": 8.06939614284145e-05, "loss": 1.1496, "step": 4000 }, { "epoch": 1.0828034095521581, "grad_norm": 0.25221702456474304, "learning_rate": 8.06691034256834e-05, "loss": 1.1638, "step": 4002 }, { "epoch": 1.083344608307401, "grad_norm": 0.4965110123157501, "learning_rate": 8.064423326426313e-05, "loss": 1.1511, "step": 4004 }, { "epoch": 1.0838858070626438, "grad_norm": 0.23343190550804138, "learning_rate": 8.061935095401336e-05, "loss": 1.1619, "step": 4006 }, { "epoch": 1.0844270058178866, "grad_norm": 0.24027478694915771, "learning_rate": 8.059445650479862e-05, "loss": 1.1642, "step": 4008 }, { "epoch": 1.0849682045731295, "grad_norm": 0.2377437800168991, "learning_rate": 8.056954992648828e-05, "loss": 1.1653, "step": 4010 }, { "epoch": 1.0855094033283723, "grad_norm": 0.21449270844459534, "learning_rate": 8.054463122895645e-05, "loss": 1.1623, "step": 4012 }, { "epoch": 1.0860506020836151, "grad_norm": 0.22771266102790833, "learning_rate": 8.051970042208214e-05, "loss": 1.1848, "step": 4014 }, { "epoch": 1.086591800838858, "grad_norm": 0.22908978164196014, "learning_rate": 8.049475751574907e-05, "loss": 1.1716, "step": 4016 }, { "epoch": 1.087132999594101, "grad_norm": 0.21853183209896088, "learning_rate": 8.046980251984579e-05, "loss": 1.1488, "step": 4018 }, { "epoch": 1.0876741983493439, "grad_norm": 0.2258382886648178, "learning_rate": 8.044483544426565e-05, "loss": 1.1516, "step": 4020 }, { "epoch": 1.0882153971045867, "grad_norm": 0.22110265493392944, "learning_rate": 8.04198562989068e-05, "loss": 1.1651, "step": 4022 }, { "epoch": 1.0887565958598295, "grad_norm": 0.2222428172826767, "learning_rate": 8.039486509367213e-05, "loss": 1.1689, "step": 4024 }, { "epoch": 1.0892977946150724, "grad_norm": 0.26905685663223267, "learning_rate": 8.036986183846937e-05, "loss": 1.1664, "step": 4026 }, { "epoch": 1.0898389933703152, "grad_norm": 0.2191791832447052, "learning_rate": 8.0344846543211e-05, "loss": 1.1662, "step": 4028 }, { "epoch": 1.090380192125558, "grad_norm": 0.22855934500694275, "learning_rate": 8.031981921781425e-05, "loss": 1.1686, "step": 4030 }, { "epoch": 1.0909213908808009, "grad_norm": 0.22431407868862152, "learning_rate": 8.029477987220113e-05, "loss": 1.1699, "step": 4032 }, { "epoch": 1.091462589636044, "grad_norm": 0.21042400598526, "learning_rate": 8.026972851629846e-05, "loss": 1.1574, "step": 4034 }, { "epoch": 1.0920037883912868, "grad_norm": 0.22836509346961975, "learning_rate": 8.024466516003777e-05, "loss": 1.1717, "step": 4036 }, { "epoch": 1.0925449871465296, "grad_norm": 0.21852664649486542, "learning_rate": 8.021958981335535e-05, "loss": 1.148, "step": 4038 }, { "epoch": 1.0930861859017724, "grad_norm": 0.23865893483161926, "learning_rate": 8.01945024861923e-05, "loss": 1.1716, "step": 4040 }, { "epoch": 1.0936273846570153, "grad_norm": 0.21468840539455414, "learning_rate": 8.01694031884944e-05, "loss": 1.1609, "step": 4042 }, { "epoch": 1.094168583412258, "grad_norm": 0.219735786318779, "learning_rate": 8.014429193021219e-05, "loss": 1.183, "step": 4044 }, { "epoch": 1.094709782167501, "grad_norm": 0.23305675387382507, "learning_rate": 8.011916872130099e-05, "loss": 1.1545, "step": 4046 }, { "epoch": 1.095250980922744, "grad_norm": 0.2143971025943756, "learning_rate": 8.009403357172083e-05, "loss": 1.155, "step": 4048 }, { "epoch": 1.0957921796779868, "grad_norm": 0.20684604346752167, "learning_rate": 8.006888649143646e-05, "loss": 1.1501, "step": 4050 }, { "epoch": 1.0963333784332296, "grad_norm": 0.22377672791481018, "learning_rate": 8.00437274904174e-05, "loss": 1.1559, "step": 4052 }, { "epoch": 1.0968745771884725, "grad_norm": 0.20589472353458405, "learning_rate": 8.001855657863787e-05, "loss": 1.1682, "step": 4054 }, { "epoch": 1.0974157759437153, "grad_norm": 0.20585069060325623, "learning_rate": 7.999337376607677e-05, "loss": 1.1539, "step": 4056 }, { "epoch": 1.0979569746989581, "grad_norm": 0.2118963897228241, "learning_rate": 7.996817906271782e-05, "loss": 1.1888, "step": 4058 }, { "epoch": 1.098498173454201, "grad_norm": 0.2089332491159439, "learning_rate": 7.994297247854939e-05, "loss": 1.1705, "step": 4060 }, { "epoch": 1.099039372209444, "grad_norm": 0.2157619744539261, "learning_rate": 7.991775402356453e-05, "loss": 1.1468, "step": 4062 }, { "epoch": 1.0995805709646869, "grad_norm": 0.21696984767913818, "learning_rate": 7.989252370776108e-05, "loss": 1.1585, "step": 4064 }, { "epoch": 1.1001217697199297, "grad_norm": 0.2212984263896942, "learning_rate": 7.98672815411415e-05, "loss": 1.1624, "step": 4066 }, { "epoch": 1.1006629684751725, "grad_norm": 0.2152889221906662, "learning_rate": 7.984202753371299e-05, "loss": 1.1794, "step": 4068 }, { "epoch": 1.1012041672304154, "grad_norm": 0.22469989955425262, "learning_rate": 7.981676169548747e-05, "loss": 1.1651, "step": 4070 }, { "epoch": 1.1017453659856582, "grad_norm": 0.2330954372882843, "learning_rate": 7.979148403648146e-05, "loss": 1.1536, "step": 4072 }, { "epoch": 1.102286564740901, "grad_norm": 0.22769203782081604, "learning_rate": 7.976619456671628e-05, "loss": 1.1484, "step": 4074 }, { "epoch": 1.1028277634961439, "grad_norm": 0.21332180500030518, "learning_rate": 7.974089329621786e-05, "loss": 1.1462, "step": 4076 }, { "epoch": 1.103368962251387, "grad_norm": 0.20643813908100128, "learning_rate": 7.971558023501679e-05, "loss": 1.1538, "step": 4078 }, { "epoch": 1.1039101610066298, "grad_norm": 0.2165161520242691, "learning_rate": 7.96902553931484e-05, "loss": 1.1447, "step": 4080 }, { "epoch": 1.1044513597618726, "grad_norm": 0.2074541449546814, "learning_rate": 7.96649187806527e-05, "loss": 1.1592, "step": 4082 }, { "epoch": 1.1049925585171154, "grad_norm": 0.2030552178621292, "learning_rate": 7.963957040757424e-05, "loss": 1.1648, "step": 4084 }, { "epoch": 1.1055337572723583, "grad_norm": 0.8349897861480713, "learning_rate": 7.961421028396239e-05, "loss": 1.1549, "step": 4086 }, { "epoch": 1.106074956027601, "grad_norm": 0.22756335139274597, "learning_rate": 7.958883841987108e-05, "loss": 1.1577, "step": 4088 }, { "epoch": 1.106616154782844, "grad_norm": 0.21515905857086182, "learning_rate": 7.956345482535892e-05, "loss": 1.1522, "step": 4090 }, { "epoch": 1.1071573535380868, "grad_norm": 0.211012601852417, "learning_rate": 7.953805951048916e-05, "loss": 1.1654, "step": 4092 }, { "epoch": 1.1076985522933298, "grad_norm": 0.2122959941625595, "learning_rate": 7.951265248532976e-05, "loss": 1.1376, "step": 4094 }, { "epoch": 1.1082397510485726, "grad_norm": 0.21735821664333344, "learning_rate": 7.94872337599532e-05, "loss": 1.1609, "step": 4096 }, { "epoch": 1.1087809498038155, "grad_norm": 0.2157265990972519, "learning_rate": 7.946180334443673e-05, "loss": 1.1413, "step": 4098 }, { "epoch": 1.1093221485590583, "grad_norm": 0.23656488955020905, "learning_rate": 7.943636124886214e-05, "loss": 1.1706, "step": 4100 }, { "epoch": 1.1098633473143011, "grad_norm": 0.44128191471099854, "learning_rate": 7.94109074833159e-05, "loss": 1.1422, "step": 4102 }, { "epoch": 1.110404546069544, "grad_norm": 0.30563896894454956, "learning_rate": 7.938544205788909e-05, "loss": 1.1624, "step": 4104 }, { "epoch": 1.1109457448247868, "grad_norm": 0.3123810291290283, "learning_rate": 7.935996498267742e-05, "loss": 1.1615, "step": 4106 }, { "epoch": 1.1114869435800299, "grad_norm": 0.2737880051136017, "learning_rate": 7.933447626778119e-05, "loss": 1.1816, "step": 4108 }, { "epoch": 1.1120281423352727, "grad_norm": 0.22107571363449097, "learning_rate": 7.930897592330535e-05, "loss": 1.1561, "step": 4110 }, { "epoch": 1.1125693410905155, "grad_norm": 0.2256803661584854, "learning_rate": 7.928346395935945e-05, "loss": 1.1749, "step": 4112 }, { "epoch": 1.1131105398457584, "grad_norm": 0.2229342758655548, "learning_rate": 7.925794038605766e-05, "loss": 1.1587, "step": 4114 }, { "epoch": 1.1136517386010012, "grad_norm": 0.211014524102211, "learning_rate": 7.923240521351871e-05, "loss": 1.1754, "step": 4116 }, { "epoch": 1.114192937356244, "grad_norm": 0.2330767959356308, "learning_rate": 7.920685845186595e-05, "loss": 1.1499, "step": 4118 }, { "epoch": 1.1147341361114869, "grad_norm": 0.21222126483917236, "learning_rate": 7.918130011122737e-05, "loss": 1.1561, "step": 4120 }, { "epoch": 1.11527533486673, "grad_norm": 0.6723094582557678, "learning_rate": 7.915573020173547e-05, "loss": 1.1633, "step": 4122 }, { "epoch": 1.1158165336219728, "grad_norm": 0.2816324532032013, "learning_rate": 7.91301487335274e-05, "loss": 1.1713, "step": 4124 }, { "epoch": 1.1163577323772156, "grad_norm": 0.25782155990600586, "learning_rate": 7.910455571674486e-05, "loss": 1.1626, "step": 4126 }, { "epoch": 1.1168989311324584, "grad_norm": 0.22253093123435974, "learning_rate": 7.907895116153413e-05, "loss": 1.1676, "step": 4128 }, { "epoch": 1.1174401298877013, "grad_norm": 0.24996554851531982, "learning_rate": 7.905333507804608e-05, "loss": 1.1613, "step": 4130 }, { "epoch": 1.117981328642944, "grad_norm": 0.20716790854930878, "learning_rate": 7.902770747643615e-05, "loss": 1.1557, "step": 4132 }, { "epoch": 1.118522527398187, "grad_norm": 0.21587280929088593, "learning_rate": 7.900206836686432e-05, "loss": 1.1508, "step": 4134 }, { "epoch": 1.1190637261534297, "grad_norm": 0.22769996523857117, "learning_rate": 7.897641775949518e-05, "loss": 1.1477, "step": 4136 }, { "epoch": 1.1196049249086728, "grad_norm": 0.2523115277290344, "learning_rate": 7.895075566449781e-05, "loss": 1.1744, "step": 4138 }, { "epoch": 1.1201461236639156, "grad_norm": 0.2227836698293686, "learning_rate": 7.892508209204592e-05, "loss": 1.1511, "step": 4140 }, { "epoch": 1.1206873224191585, "grad_norm": 0.23955698311328888, "learning_rate": 7.88993970523177e-05, "loss": 1.1604, "step": 4142 }, { "epoch": 1.1212285211744013, "grad_norm": 0.217301145195961, "learning_rate": 7.887370055549594e-05, "loss": 1.1441, "step": 4144 }, { "epoch": 1.1217697199296441, "grad_norm": 0.22721989452838898, "learning_rate": 7.884799261176795e-05, "loss": 1.131, "step": 4146 }, { "epoch": 1.122310918684887, "grad_norm": 0.23621070384979248, "learning_rate": 7.882227323132558e-05, "loss": 1.1413, "step": 4148 }, { "epoch": 1.1228521174401298, "grad_norm": 0.20073488354682922, "learning_rate": 7.879654242436523e-05, "loss": 1.1474, "step": 4150 }, { "epoch": 1.1233933161953726, "grad_norm": 0.20614007115364075, "learning_rate": 7.877080020108776e-05, "loss": 1.171, "step": 4152 }, { "epoch": 1.1239345149506157, "grad_norm": 0.21867454051971436, "learning_rate": 7.874504657169868e-05, "loss": 1.1486, "step": 4154 }, { "epoch": 1.1244757137058585, "grad_norm": 0.22386595606803894, "learning_rate": 7.871928154640788e-05, "loss": 1.1462, "step": 4156 }, { "epoch": 1.1250169124611014, "grad_norm": 0.20961107313632965, "learning_rate": 7.86935051354299e-05, "loss": 1.1833, "step": 4158 }, { "epoch": 1.1255581112163442, "grad_norm": 0.205885648727417, "learning_rate": 7.866771734898373e-05, "loss": 1.1513, "step": 4160 }, { "epoch": 1.126099309971587, "grad_norm": 0.20518140494823456, "learning_rate": 7.864191819729282e-05, "loss": 1.1696, "step": 4162 }, { "epoch": 1.1266405087268299, "grad_norm": 0.205685555934906, "learning_rate": 7.861610769058523e-05, "loss": 1.1437, "step": 4164 }, { "epoch": 1.1271817074820727, "grad_norm": 0.2135431170463562, "learning_rate": 7.859028583909345e-05, "loss": 1.1604, "step": 4166 }, { "epoch": 1.1277229062373157, "grad_norm": 0.21558403968811035, "learning_rate": 7.85644526530545e-05, "loss": 1.1771, "step": 4168 }, { "epoch": 1.1282641049925586, "grad_norm": 0.220762237906456, "learning_rate": 7.853860814270985e-05, "loss": 1.1391, "step": 4170 }, { "epoch": 1.1288053037478014, "grad_norm": 0.21308547258377075, "learning_rate": 7.851275231830555e-05, "loss": 1.1463, "step": 4172 }, { "epoch": 1.1293465025030442, "grad_norm": 0.2130470871925354, "learning_rate": 7.848688519009202e-05, "loss": 1.1331, "step": 4174 }, { "epoch": 1.129887701258287, "grad_norm": 0.20271500945091248, "learning_rate": 7.846100676832424e-05, "loss": 1.1561, "step": 4176 }, { "epoch": 1.13042890001353, "grad_norm": 0.21627353131771088, "learning_rate": 7.843511706326165e-05, "loss": 1.1573, "step": 4178 }, { "epoch": 1.1309700987687727, "grad_norm": 0.22262731194496155, "learning_rate": 7.840921608516815e-05, "loss": 1.1461, "step": 4180 }, { "epoch": 1.1315112975240158, "grad_norm": 0.1991957724094391, "learning_rate": 7.838330384431214e-05, "loss": 1.1438, "step": 4182 }, { "epoch": 1.1320524962792586, "grad_norm": 0.20617301762104034, "learning_rate": 7.835738035096643e-05, "loss": 1.1505, "step": 4184 }, { "epoch": 1.1325936950345015, "grad_norm": 0.22232504189014435, "learning_rate": 7.833144561540835e-05, "loss": 1.1501, "step": 4186 }, { "epoch": 1.1331348937897443, "grad_norm": 0.1919006109237671, "learning_rate": 7.830549964791964e-05, "loss": 1.1465, "step": 4188 }, { "epoch": 1.1336760925449871, "grad_norm": 0.20378994941711426, "learning_rate": 7.827954245878654e-05, "loss": 1.1598, "step": 4190 }, { "epoch": 1.13421729130023, "grad_norm": 0.21754945814609528, "learning_rate": 7.825357405829967e-05, "loss": 1.1612, "step": 4192 }, { "epoch": 1.1347584900554728, "grad_norm": 0.20612432062625885, "learning_rate": 7.822759445675419e-05, "loss": 1.1518, "step": 4194 }, { "epoch": 1.1352996888107159, "grad_norm": 0.21937181055545807, "learning_rate": 7.820160366444961e-05, "loss": 1.1567, "step": 4196 }, { "epoch": 1.1358408875659587, "grad_norm": 0.2110893726348877, "learning_rate": 7.817560169168991e-05, "loss": 1.1582, "step": 4198 }, { "epoch": 1.1363820863212015, "grad_norm": 0.2051175981760025, "learning_rate": 7.814958854878356e-05, "loss": 1.1777, "step": 4200 }, { "epoch": 1.1369232850764444, "grad_norm": 0.21144556999206543, "learning_rate": 7.812356424604335e-05, "loss": 1.1568, "step": 4202 }, { "epoch": 1.1374644838316872, "grad_norm": 0.22127197682857513, "learning_rate": 7.809752879378656e-05, "loss": 1.145, "step": 4204 }, { "epoch": 1.13800568258693, "grad_norm": 0.20766502618789673, "learning_rate": 7.807148220233488e-05, "loss": 1.1506, "step": 4206 }, { "epoch": 1.1385468813421729, "grad_norm": 0.20828627049922943, "learning_rate": 7.804542448201447e-05, "loss": 1.177, "step": 4208 }, { "epoch": 1.1390880800974157, "grad_norm": 0.19857019186019897, "learning_rate": 7.801935564315574e-05, "loss": 1.1514, "step": 4210 }, { "epoch": 1.1396292788526585, "grad_norm": 0.1956413984298706, "learning_rate": 7.799327569609373e-05, "loss": 1.1578, "step": 4212 }, { "epoch": 1.1401704776079016, "grad_norm": 0.20685957372188568, "learning_rate": 7.79671846511677e-05, "loss": 1.1484, "step": 4214 }, { "epoch": 1.1407116763631444, "grad_norm": 0.21370911598205566, "learning_rate": 7.794108251872138e-05, "loss": 1.1445, "step": 4216 }, { "epoch": 1.1412528751183872, "grad_norm": 0.21143577992916107, "learning_rate": 7.791496930910293e-05, "loss": 1.1488, "step": 4218 }, { "epoch": 1.14179407387363, "grad_norm": 0.20833255350589752, "learning_rate": 7.788884503266485e-05, "loss": 1.1553, "step": 4220 }, { "epoch": 1.142335272628873, "grad_norm": 0.21482530236244202, "learning_rate": 7.786270969976403e-05, "loss": 1.1521, "step": 4222 }, { "epoch": 1.1428764713841157, "grad_norm": 0.21887533366680145, "learning_rate": 7.783656332076177e-05, "loss": 1.1381, "step": 4224 }, { "epoch": 1.1434176701393586, "grad_norm": 0.22636641561985016, "learning_rate": 7.781040590602373e-05, "loss": 1.1572, "step": 4226 }, { "epoch": 1.1439588688946016, "grad_norm": 0.2340206354856491, "learning_rate": 7.778423746591999e-05, "loss": 1.1493, "step": 4228 }, { "epoch": 1.1445000676498445, "grad_norm": 0.23177602887153625, "learning_rate": 7.77580580108249e-05, "loss": 1.1586, "step": 4230 }, { "epoch": 1.1450412664050873, "grad_norm": 0.19879846274852753, "learning_rate": 7.773186755111728e-05, "loss": 1.171, "step": 4232 }, { "epoch": 1.1455824651603301, "grad_norm": 0.2035079002380371, "learning_rate": 7.770566609718026e-05, "loss": 1.1593, "step": 4234 }, { "epoch": 1.146123663915573, "grad_norm": 0.2164110541343689, "learning_rate": 7.767945365940134e-05, "loss": 1.1546, "step": 4236 }, { "epoch": 1.1466648626708158, "grad_norm": 0.22445808351039886, "learning_rate": 7.765323024817237e-05, "loss": 1.1536, "step": 4238 }, { "epoch": 1.1472060614260586, "grad_norm": 0.2171921283006668, "learning_rate": 7.762699587388957e-05, "loss": 1.166, "step": 4240 }, { "epoch": 1.1477472601813017, "grad_norm": 0.20324265956878662, "learning_rate": 7.76007505469535e-05, "loss": 1.1514, "step": 4242 }, { "epoch": 1.1482884589365445, "grad_norm": 0.20602954924106598, "learning_rate": 7.757449427776902e-05, "loss": 1.1526, "step": 4244 }, { "epoch": 1.1488296576917874, "grad_norm": 0.19193702936172485, "learning_rate": 7.754822707674538e-05, "loss": 1.1461, "step": 4246 }, { "epoch": 1.1493708564470302, "grad_norm": 0.2005046159029007, "learning_rate": 7.752194895429617e-05, "loss": 1.1422, "step": 4248 }, { "epoch": 1.149912055202273, "grad_norm": 0.20979368686676025, "learning_rate": 7.749565992083925e-05, "loss": 1.1508, "step": 4250 }, { "epoch": 1.1504532539575159, "grad_norm": 0.20569835603237152, "learning_rate": 7.746935998679685e-05, "loss": 1.1587, "step": 4252 }, { "epoch": 1.1509944527127587, "grad_norm": 0.21431037783622742, "learning_rate": 7.744304916259553e-05, "loss": 1.1743, "step": 4254 }, { "epoch": 1.1515356514680017, "grad_norm": 0.23459142446517944, "learning_rate": 7.741672745866612e-05, "loss": 1.1511, "step": 4256 }, { "epoch": 1.1520768502232446, "grad_norm": 0.20567552745342255, "learning_rate": 7.739039488544382e-05, "loss": 1.1432, "step": 4258 }, { "epoch": 1.1526180489784874, "grad_norm": 0.21821950376033783, "learning_rate": 7.73640514533681e-05, "loss": 1.1565, "step": 4260 }, { "epoch": 1.1531592477337302, "grad_norm": 0.21483366191387177, "learning_rate": 7.733769717288275e-05, "loss": 1.1753, "step": 4262 }, { "epoch": 1.153700446488973, "grad_norm": 0.2083878517150879, "learning_rate": 7.731133205443587e-05, "loss": 1.1564, "step": 4264 }, { "epoch": 1.154241645244216, "grad_norm": 0.22914838790893555, "learning_rate": 7.728495610847984e-05, "loss": 1.1463, "step": 4266 }, { "epoch": 1.1547828439994587, "grad_norm": 0.21248088777065277, "learning_rate": 7.72585693454713e-05, "loss": 1.167, "step": 4268 }, { "epoch": 1.1553240427547016, "grad_norm": 0.21737425029277802, "learning_rate": 7.723217177587129e-05, "loss": 1.1391, "step": 4270 }, { "epoch": 1.1558652415099444, "grad_norm": 0.20481689274311066, "learning_rate": 7.7205763410145e-05, "loss": 1.1659, "step": 4272 }, { "epoch": 1.1564064402651875, "grad_norm": 0.2236364781856537, "learning_rate": 7.717934425876199e-05, "loss": 1.156, "step": 4274 }, { "epoch": 1.1569476390204303, "grad_norm": 0.20564323663711548, "learning_rate": 7.715291433219605e-05, "loss": 1.1251, "step": 4276 }, { "epoch": 1.1574888377756731, "grad_norm": 0.21151994168758392, "learning_rate": 7.712647364092525e-05, "loss": 1.1438, "step": 4278 }, { "epoch": 1.158030036530916, "grad_norm": 0.2575263977050781, "learning_rate": 7.710002219543198e-05, "loss": 1.1655, "step": 4280 }, { "epoch": 1.1585712352861588, "grad_norm": 0.2303873598575592, "learning_rate": 7.707356000620279e-05, "loss": 1.1703, "step": 4282 }, { "epoch": 1.1591124340414016, "grad_norm": 0.2279600352048874, "learning_rate": 7.704708708372858e-05, "loss": 1.1732, "step": 4284 }, { "epoch": 1.1596536327966445, "grad_norm": 0.21688218414783478, "learning_rate": 7.702060343850449e-05, "loss": 1.1598, "step": 4286 }, { "epoch": 1.1601948315518875, "grad_norm": 0.20281021296977997, "learning_rate": 7.699410908102987e-05, "loss": 1.1689, "step": 4288 }, { "epoch": 1.1607360303071304, "grad_norm": 0.2117750495672226, "learning_rate": 7.696760402180834e-05, "loss": 1.1557, "step": 4290 }, { "epoch": 1.1612772290623732, "grad_norm": 0.21930202841758728, "learning_rate": 7.694108827134779e-05, "loss": 1.1408, "step": 4292 }, { "epoch": 1.161818427817616, "grad_norm": 0.22055776417255402, "learning_rate": 7.691456184016031e-05, "loss": 1.1635, "step": 4294 }, { "epoch": 1.1623596265728589, "grad_norm": 0.24794168770313263, "learning_rate": 7.68880247387622e-05, "loss": 1.1491, "step": 4296 }, { "epoch": 1.1629008253281017, "grad_norm": 0.23536056280136108, "learning_rate": 7.686147697767407e-05, "loss": 1.1591, "step": 4298 }, { "epoch": 1.1634420240833445, "grad_norm": 0.2640872001647949, "learning_rate": 7.683491856742071e-05, "loss": 1.1616, "step": 4300 }, { "epoch": 1.1639832228385876, "grad_norm": 0.22453339397907257, "learning_rate": 7.680834951853113e-05, "loss": 1.135, "step": 4302 }, { "epoch": 1.1645244215938304, "grad_norm": 0.23823754489421844, "learning_rate": 7.678176984153855e-05, "loss": 1.1487, "step": 4304 }, { "epoch": 1.1650656203490732, "grad_norm": 0.20748983323574066, "learning_rate": 7.675517954698044e-05, "loss": 1.1523, "step": 4306 }, { "epoch": 1.165606819104316, "grad_norm": 0.21772535145282745, "learning_rate": 7.672857864539844e-05, "loss": 1.1461, "step": 4308 }, { "epoch": 1.166148017859559, "grad_norm": 0.22087602317333221, "learning_rate": 7.670196714733842e-05, "loss": 1.1619, "step": 4310 }, { "epoch": 1.1666892166148017, "grad_norm": 0.22839230298995972, "learning_rate": 7.667534506335043e-05, "loss": 1.1533, "step": 4312 }, { "epoch": 1.1672304153700446, "grad_norm": 0.21815411746501923, "learning_rate": 7.664871240398875e-05, "loss": 1.1403, "step": 4314 }, { "epoch": 1.1677716141252876, "grad_norm": 0.19821114838123322, "learning_rate": 7.66220691798118e-05, "loss": 1.1472, "step": 4316 }, { "epoch": 1.1683128128805305, "grad_norm": 0.19013382494449615, "learning_rate": 7.659541540138222e-05, "loss": 1.1535, "step": 4318 }, { "epoch": 1.1688540116357733, "grad_norm": 0.1978456676006317, "learning_rate": 7.656875107926687e-05, "loss": 1.146, "step": 4320 }, { "epoch": 1.1693952103910161, "grad_norm": 0.20645636320114136, "learning_rate": 7.654207622403673e-05, "loss": 1.1576, "step": 4322 }, { "epoch": 1.169936409146259, "grad_norm": 0.2145734280347824, "learning_rate": 7.651539084626698e-05, "loss": 1.1563, "step": 4324 }, { "epoch": 1.1704776079015018, "grad_norm": 0.20490793883800507, "learning_rate": 7.648869495653697e-05, "loss": 1.1827, "step": 4326 }, { "epoch": 1.1710188066567446, "grad_norm": 0.20274941623210907, "learning_rate": 7.646198856543021e-05, "loss": 1.1498, "step": 4328 }, { "epoch": 1.1715600054119875, "grad_norm": 0.2002313733100891, "learning_rate": 7.643527168353439e-05, "loss": 1.1296, "step": 4330 }, { "epoch": 1.1721012041672303, "grad_norm": 0.21061885356903076, "learning_rate": 7.640854432144137e-05, "loss": 1.186, "step": 4332 }, { "epoch": 1.1726424029224733, "grad_norm": 0.20808294415473938, "learning_rate": 7.638180648974715e-05, "loss": 1.1428, "step": 4334 }, { "epoch": 1.1731836016777162, "grad_norm": 0.2033407986164093, "learning_rate": 7.635505819905182e-05, "loss": 1.1533, "step": 4336 }, { "epoch": 1.173724800432959, "grad_norm": 0.21454395353794098, "learning_rate": 7.632829945995974e-05, "loss": 1.1726, "step": 4338 }, { "epoch": 1.1742659991882018, "grad_norm": 0.20768827199935913, "learning_rate": 7.630153028307929e-05, "loss": 1.1801, "step": 4340 }, { "epoch": 1.1748071979434447, "grad_norm": 0.21167118847370148, "learning_rate": 7.627475067902307e-05, "loss": 1.1561, "step": 4342 }, { "epoch": 1.1753483966986875, "grad_norm": 0.2219628542661667, "learning_rate": 7.62479606584078e-05, "loss": 1.1455, "step": 4344 }, { "epoch": 1.1758895954539303, "grad_norm": 0.22176925837993622, "learning_rate": 7.622116023185429e-05, "loss": 1.1583, "step": 4346 }, { "epoch": 1.1764307942091734, "grad_norm": 0.21282261610031128, "learning_rate": 7.619434940998751e-05, "loss": 1.1415, "step": 4348 }, { "epoch": 1.1769719929644162, "grad_norm": 0.20743747055530548, "learning_rate": 7.616752820343655e-05, "loss": 1.1552, "step": 4350 }, { "epoch": 1.177513191719659, "grad_norm": 0.21067087352275848, "learning_rate": 7.61406966228346e-05, "loss": 1.1677, "step": 4352 }, { "epoch": 1.178054390474902, "grad_norm": 0.22039294242858887, "learning_rate": 7.611385467881898e-05, "loss": 1.1495, "step": 4354 }, { "epoch": 1.1785955892301447, "grad_norm": 0.1977587342262268, "learning_rate": 7.60870023820311e-05, "loss": 1.1562, "step": 4356 }, { "epoch": 1.1791367879853876, "grad_norm": 0.2049712985754013, "learning_rate": 7.60601397431165e-05, "loss": 1.1526, "step": 4358 }, { "epoch": 1.1796779867406304, "grad_norm": 0.22354736924171448, "learning_rate": 7.603326677272482e-05, "loss": 1.1611, "step": 4360 }, { "epoch": 1.1802191854958735, "grad_norm": 0.3315167725086212, "learning_rate": 7.600638348150978e-05, "loss": 1.1621, "step": 4362 }, { "epoch": 1.1807603842511163, "grad_norm": 0.26512739062309265, "learning_rate": 7.597948988012912e-05, "loss": 1.1573, "step": 4364 }, { "epoch": 1.1813015830063591, "grad_norm": 0.2502736449241638, "learning_rate": 7.595258597924484e-05, "loss": 1.1786, "step": 4366 }, { "epoch": 1.181842781761602, "grad_norm": 0.2125827521085739, "learning_rate": 7.592567178952288e-05, "loss": 1.1357, "step": 4368 }, { "epoch": 1.1823839805168448, "grad_norm": 0.21820925176143646, "learning_rate": 7.589874732163328e-05, "loss": 1.1459, "step": 4370 }, { "epoch": 1.1829251792720876, "grad_norm": 0.23182585835456848, "learning_rate": 7.587181258625022e-05, "loss": 1.1397, "step": 4372 }, { "epoch": 1.1834663780273305, "grad_norm": 0.2198820859193802, "learning_rate": 7.58448675940519e-05, "loss": 1.1438, "step": 4374 }, { "epoch": 1.1840075767825735, "grad_norm": 0.21208171546459198, "learning_rate": 7.581791235572058e-05, "loss": 1.1307, "step": 4376 }, { "epoch": 1.1845487755378163, "grad_norm": 0.19944527745246887, "learning_rate": 7.57909468819426e-05, "loss": 1.1548, "step": 4378 }, { "epoch": 1.1850899742930592, "grad_norm": 0.20075197517871857, "learning_rate": 7.576397118340834e-05, "loss": 1.1445, "step": 4380 }, { "epoch": 1.185631173048302, "grad_norm": 0.2044142484664917, "learning_rate": 7.573698527081228e-05, "loss": 1.1474, "step": 4382 }, { "epoch": 1.1861723718035448, "grad_norm": 0.2024833858013153, "learning_rate": 7.57099891548529e-05, "loss": 1.1648, "step": 4384 }, { "epoch": 1.1867135705587877, "grad_norm": 0.20577891170978546, "learning_rate": 7.568298284623274e-05, "loss": 1.1693, "step": 4386 }, { "epoch": 1.1872547693140305, "grad_norm": 0.21271593868732452, "learning_rate": 7.565596635565841e-05, "loss": 1.1331, "step": 4388 }, { "epoch": 1.1877959680692733, "grad_norm": 0.20553138852119446, "learning_rate": 7.562893969384051e-05, "loss": 1.1618, "step": 4390 }, { "epoch": 1.1883371668245164, "grad_norm": 0.21582964062690735, "learning_rate": 7.560190287149367e-05, "loss": 1.1616, "step": 4392 }, { "epoch": 1.1888783655797592, "grad_norm": 0.2435728907585144, "learning_rate": 7.55748558993366e-05, "loss": 1.1741, "step": 4394 }, { "epoch": 1.189419564335002, "grad_norm": 0.2033778578042984, "learning_rate": 7.5547798788092e-05, "loss": 1.1519, "step": 4396 }, { "epoch": 1.189960763090245, "grad_norm": 0.20006676018238068, "learning_rate": 7.552073154848656e-05, "loss": 1.1649, "step": 4398 }, { "epoch": 1.1905019618454877, "grad_norm": 0.19640152156352997, "learning_rate": 7.549365419125109e-05, "loss": 1.1381, "step": 4400 }, { "epoch": 1.1910431606007306, "grad_norm": 0.21240241825580597, "learning_rate": 7.546656672712027e-05, "loss": 1.1589, "step": 4402 }, { "epoch": 1.1915843593559734, "grad_norm": 0.22936300933361053, "learning_rate": 7.54394691668329e-05, "loss": 1.1346, "step": 4404 }, { "epoch": 1.1921255581112162, "grad_norm": 0.20601502060890198, "learning_rate": 7.541236152113172e-05, "loss": 1.1672, "step": 4406 }, { "epoch": 1.1926667568664593, "grad_norm": 0.22351358830928802, "learning_rate": 7.538524380076351e-05, "loss": 1.1545, "step": 4408 }, { "epoch": 1.1932079556217021, "grad_norm": 0.23404501378536224, "learning_rate": 7.535811601647897e-05, "loss": 1.1484, "step": 4410 }, { "epoch": 1.193749154376945, "grad_norm": 0.20127235352993011, "learning_rate": 7.533097817903292e-05, "loss": 1.1559, "step": 4412 }, { "epoch": 1.1942903531321878, "grad_norm": 0.20970293879508972, "learning_rate": 7.530383029918404e-05, "loss": 1.1478, "step": 4414 }, { "epoch": 1.1948315518874306, "grad_norm": 0.2203601449728012, "learning_rate": 7.527667238769503e-05, "loss": 1.1399, "step": 4416 }, { "epoch": 1.1953727506426735, "grad_norm": 0.21253615617752075, "learning_rate": 7.524950445533259e-05, "loss": 1.1687, "step": 4418 }, { "epoch": 1.1959139493979163, "grad_norm": 0.20153376460075378, "learning_rate": 7.522232651286741e-05, "loss": 1.1491, "step": 4420 }, { "epoch": 1.1964551481531593, "grad_norm": 0.20877231657505035, "learning_rate": 7.519513857107405e-05, "loss": 1.1559, "step": 4422 }, { "epoch": 1.1969963469084022, "grad_norm": 0.2033756971359253, "learning_rate": 7.516794064073117e-05, "loss": 1.1526, "step": 4424 }, { "epoch": 1.197537545663645, "grad_norm": 0.19848887622356415, "learning_rate": 7.514073273262126e-05, "loss": 1.147, "step": 4426 }, { "epoch": 1.1980787444188878, "grad_norm": 0.25644296407699585, "learning_rate": 7.511351485753089e-05, "loss": 1.1269, "step": 4428 }, { "epoch": 1.1986199431741307, "grad_norm": 0.23441651463508606, "learning_rate": 7.508628702625044e-05, "loss": 1.1587, "step": 4430 }, { "epoch": 1.1991611419293735, "grad_norm": 0.2278803586959839, "learning_rate": 7.50590492495744e-05, "loss": 1.1536, "step": 4432 }, { "epoch": 1.1997023406846163, "grad_norm": 0.21613961458206177, "learning_rate": 7.503180153830107e-05, "loss": 1.1339, "step": 4434 }, { "epoch": 1.2002435394398594, "grad_norm": 0.2126021385192871, "learning_rate": 7.500454390323274e-05, "loss": 1.1446, "step": 4436 }, { "epoch": 1.2007847381951022, "grad_norm": 0.21502047777175903, "learning_rate": 7.497727635517564e-05, "loss": 1.1458, "step": 4438 }, { "epoch": 1.201325936950345, "grad_norm": 0.20378261804580688, "learning_rate": 7.494999890493993e-05, "loss": 1.1531, "step": 4440 }, { "epoch": 1.201867135705588, "grad_norm": 0.20229879021644592, "learning_rate": 7.492271156333968e-05, "loss": 1.1401, "step": 4442 }, { "epoch": 1.2024083344608307, "grad_norm": 0.21491043269634247, "learning_rate": 7.489541434119286e-05, "loss": 1.1537, "step": 4444 }, { "epoch": 1.2029495332160736, "grad_norm": 0.2054162323474884, "learning_rate": 7.486810724932142e-05, "loss": 1.1618, "step": 4446 }, { "epoch": 1.2034907319713164, "grad_norm": 0.21886977553367615, "learning_rate": 7.484079029855118e-05, "loss": 1.1356, "step": 4448 }, { "epoch": 1.2040319307265595, "grad_norm": 0.2097829133272171, "learning_rate": 7.481346349971187e-05, "loss": 1.1374, "step": 4450 }, { "epoch": 1.2045731294818023, "grad_norm": 0.200583815574646, "learning_rate": 7.478612686363713e-05, "loss": 1.1758, "step": 4452 }, { "epoch": 1.2051143282370451, "grad_norm": 0.21000835299491882, "learning_rate": 7.475878040116451e-05, "loss": 1.1608, "step": 4454 }, { "epoch": 1.205655526992288, "grad_norm": 0.21297107636928558, "learning_rate": 7.473142412313543e-05, "loss": 1.1434, "step": 4456 }, { "epoch": 1.2061967257475308, "grad_norm": 0.2080170065164566, "learning_rate": 7.470405804039524e-05, "loss": 1.1663, "step": 4458 }, { "epoch": 1.2067379245027736, "grad_norm": 0.19678303599357605, "learning_rate": 7.467668216379316e-05, "loss": 1.1476, "step": 4460 }, { "epoch": 1.2072791232580165, "grad_norm": 0.2111286073923111, "learning_rate": 7.464929650418225e-05, "loss": 1.1465, "step": 4462 }, { "epoch": 1.2078203220132593, "grad_norm": 0.22287851572036743, "learning_rate": 7.462190107241952e-05, "loss": 1.1675, "step": 4464 }, { "epoch": 1.2083615207685021, "grad_norm": 0.21192297339439392, "learning_rate": 7.45944958793658e-05, "loss": 1.1533, "step": 4466 }, { "epoch": 1.2089027195237452, "grad_norm": 0.21232372522354126, "learning_rate": 7.456708093588582e-05, "loss": 1.1379, "step": 4468 }, { "epoch": 1.209443918278988, "grad_norm": 0.2120969444513321, "learning_rate": 7.453965625284818e-05, "loss": 1.1505, "step": 4470 }, { "epoch": 1.2099851170342308, "grad_norm": 0.21147684752941132, "learning_rate": 7.45122218411253e-05, "loss": 1.142, "step": 4472 }, { "epoch": 1.2105263157894737, "grad_norm": 0.208118736743927, "learning_rate": 7.44847777115935e-05, "loss": 1.145, "step": 4474 }, { "epoch": 1.2110675145447165, "grad_norm": 0.21549181640148163, "learning_rate": 7.445732387513293e-05, "loss": 1.1442, "step": 4476 }, { "epoch": 1.2116087132999593, "grad_norm": 0.22884537279605865, "learning_rate": 7.442986034262757e-05, "loss": 1.1266, "step": 4478 }, { "epoch": 1.2121499120552022, "grad_norm": 0.22194430232048035, "learning_rate": 7.440238712496533e-05, "loss": 1.1374, "step": 4480 }, { "epoch": 1.2126911108104452, "grad_norm": 0.2395986169576645, "learning_rate": 7.437490423303786e-05, "loss": 1.149, "step": 4482 }, { "epoch": 1.213232309565688, "grad_norm": 0.2180141806602478, "learning_rate": 7.434741167774067e-05, "loss": 1.1577, "step": 4484 }, { "epoch": 1.213773508320931, "grad_norm": 0.21561622619628906, "learning_rate": 7.431990946997313e-05, "loss": 1.1446, "step": 4486 }, { "epoch": 1.2143147070761737, "grad_norm": 0.21005584299564362, "learning_rate": 7.429239762063844e-05, "loss": 1.1385, "step": 4488 }, { "epoch": 1.2148559058314166, "grad_norm": 0.21435660123825073, "learning_rate": 7.426487614064358e-05, "loss": 1.1719, "step": 4490 }, { "epoch": 1.2153971045866594, "grad_norm": 0.21810956299304962, "learning_rate": 7.423734504089939e-05, "loss": 1.1459, "step": 4492 }, { "epoch": 1.2159383033419022, "grad_norm": 0.20335710048675537, "learning_rate": 7.420980433232048e-05, "loss": 1.1223, "step": 4494 }, { "epoch": 1.2164795020971453, "grad_norm": 0.20494681596755981, "learning_rate": 7.41822540258253e-05, "loss": 1.1547, "step": 4496 }, { "epoch": 1.2170207008523881, "grad_norm": 0.19216883182525635, "learning_rate": 7.415469413233612e-05, "loss": 1.1338, "step": 4498 }, { "epoch": 1.217561899607631, "grad_norm": 0.20295827090740204, "learning_rate": 7.412712466277898e-05, "loss": 1.159, "step": 4500 }, { "epoch": 1.2181030983628738, "grad_norm": 0.20249909162521362, "learning_rate": 7.409954562808373e-05, "loss": 1.142, "step": 4502 }, { "epoch": 1.2186442971181166, "grad_norm": 0.21104544401168823, "learning_rate": 7.407195703918399e-05, "loss": 1.1451, "step": 4504 }, { "epoch": 1.2191854958733594, "grad_norm": 0.22997575998306274, "learning_rate": 7.404435890701722e-05, "loss": 1.1469, "step": 4506 }, { "epoch": 1.2197266946286023, "grad_norm": 0.21492913365364075, "learning_rate": 7.40167512425246e-05, "loss": 1.1562, "step": 4508 }, { "epoch": 1.2202678933838453, "grad_norm": 0.2012024223804474, "learning_rate": 7.398913405665114e-05, "loss": 1.1507, "step": 4510 }, { "epoch": 1.2208090921390882, "grad_norm": 0.20585717260837555, "learning_rate": 7.39615073603456e-05, "loss": 1.1417, "step": 4512 }, { "epoch": 1.221350290894331, "grad_norm": 0.20011591911315918, "learning_rate": 7.393387116456049e-05, "loss": 1.1236, "step": 4514 }, { "epoch": 1.2218914896495738, "grad_norm": 0.19506965577602386, "learning_rate": 7.390622548025217e-05, "loss": 1.1482, "step": 4516 }, { "epoch": 1.2224326884048167, "grad_norm": 0.20323827862739563, "learning_rate": 7.387857031838063e-05, "loss": 1.1576, "step": 4518 }, { "epoch": 1.2229738871600595, "grad_norm": 0.2006043940782547, "learning_rate": 7.385090568990974e-05, "loss": 1.1468, "step": 4520 }, { "epoch": 1.2235150859153023, "grad_norm": 0.19386060535907745, "learning_rate": 7.382323160580706e-05, "loss": 1.1407, "step": 4522 }, { "epoch": 1.2240562846705452, "grad_norm": 0.19958168268203735, "learning_rate": 7.37955480770439e-05, "loss": 1.1386, "step": 4524 }, { "epoch": 1.224597483425788, "grad_norm": 0.20519964396953583, "learning_rate": 7.376785511459533e-05, "loss": 1.1366, "step": 4526 }, { "epoch": 1.225138682181031, "grad_norm": 0.19929784536361694, "learning_rate": 7.374015272944015e-05, "loss": 1.1518, "step": 4528 }, { "epoch": 1.225679880936274, "grad_norm": 0.20659440755844116, "learning_rate": 7.371244093256093e-05, "loss": 1.1427, "step": 4530 }, { "epoch": 1.2262210796915167, "grad_norm": 0.2114669233560562, "learning_rate": 7.36847197349439e-05, "loss": 1.1547, "step": 4532 }, { "epoch": 1.2267622784467596, "grad_norm": 0.20865440368652344, "learning_rate": 7.36569891475791e-05, "loss": 1.1316, "step": 4534 }, { "epoch": 1.2273034772020024, "grad_norm": 0.21056681871414185, "learning_rate": 7.362924918146022e-05, "loss": 1.1551, "step": 4536 }, { "epoch": 1.2278446759572452, "grad_norm": 0.2034716010093689, "learning_rate": 7.360149984758473e-05, "loss": 1.134, "step": 4538 }, { "epoch": 1.228385874712488, "grad_norm": 0.2167576402425766, "learning_rate": 7.357374115695377e-05, "loss": 1.1559, "step": 4540 }, { "epoch": 1.2289270734677311, "grad_norm": 0.20762617886066437, "learning_rate": 7.354597312057219e-05, "loss": 1.1443, "step": 4542 }, { "epoch": 1.229468272222974, "grad_norm": 0.2080763280391693, "learning_rate": 7.351819574944856e-05, "loss": 1.1495, "step": 4544 }, { "epoch": 1.2300094709782168, "grad_norm": 0.2057662308216095, "learning_rate": 7.349040905459517e-05, "loss": 1.1378, "step": 4546 }, { "epoch": 1.2305506697334596, "grad_norm": 0.22165626287460327, "learning_rate": 7.346261304702797e-05, "loss": 1.1525, "step": 4548 }, { "epoch": 1.2310918684887024, "grad_norm": 0.21772271394729614, "learning_rate": 7.343480773776664e-05, "loss": 1.1523, "step": 4550 }, { "epoch": 1.2316330672439453, "grad_norm": 0.20765674114227295, "learning_rate": 7.340699313783448e-05, "loss": 1.1308, "step": 4552 }, { "epoch": 1.2321742659991881, "grad_norm": 0.2179335653781891, "learning_rate": 7.337916925825855e-05, "loss": 1.162, "step": 4554 }, { "epoch": 1.2327154647544312, "grad_norm": 0.2079366147518158, "learning_rate": 7.335133611006956e-05, "loss": 1.1509, "step": 4556 }, { "epoch": 1.233256663509674, "grad_norm": 0.21322664618492126, "learning_rate": 7.332349370430188e-05, "loss": 1.1411, "step": 4558 }, { "epoch": 1.2337978622649168, "grad_norm": 0.19475920498371124, "learning_rate": 7.329564205199356e-05, "loss": 1.154, "step": 4560 }, { "epoch": 1.2343390610201597, "grad_norm": 0.20483115315437317, "learning_rate": 7.326778116418633e-05, "loss": 1.1481, "step": 4562 }, { "epoch": 1.2348802597754025, "grad_norm": 0.19957055151462555, "learning_rate": 7.323991105192557e-05, "loss": 1.1433, "step": 4564 }, { "epoch": 1.2354214585306453, "grad_norm": 0.2019873857498169, "learning_rate": 7.32120317262603e-05, "loss": 1.1444, "step": 4566 }, { "epoch": 1.2359626572858882, "grad_norm": 0.20566652715206146, "learning_rate": 7.318414319824323e-05, "loss": 1.1394, "step": 4568 }, { "epoch": 1.2365038560411312, "grad_norm": 0.19783833622932434, "learning_rate": 7.315624547893067e-05, "loss": 1.1411, "step": 4570 }, { "epoch": 1.237045054796374, "grad_norm": 0.20047402381896973, "learning_rate": 7.312833857938264e-05, "loss": 1.1288, "step": 4572 }, { "epoch": 1.2375862535516169, "grad_norm": 0.20567147433757782, "learning_rate": 7.310042251066272e-05, "loss": 1.1474, "step": 4574 }, { "epoch": 1.2381274523068597, "grad_norm": 0.2040497362613678, "learning_rate": 7.307249728383817e-05, "loss": 1.1216, "step": 4576 }, { "epoch": 1.2386686510621026, "grad_norm": 0.19251488149166107, "learning_rate": 7.304456290997991e-05, "loss": 1.1425, "step": 4578 }, { "epoch": 1.2392098498173454, "grad_norm": 0.2024182677268982, "learning_rate": 7.30166194001624e-05, "loss": 1.1337, "step": 4580 }, { "epoch": 1.2397510485725882, "grad_norm": 0.21187058091163635, "learning_rate": 7.298866676546383e-05, "loss": 1.1373, "step": 4582 }, { "epoch": 1.240292247327831, "grad_norm": 0.20792464911937714, "learning_rate": 7.296070501696593e-05, "loss": 1.1464, "step": 4584 }, { "epoch": 1.2408334460830739, "grad_norm": 0.214219868183136, "learning_rate": 7.293273416575405e-05, "loss": 1.1431, "step": 4586 }, { "epoch": 1.241374644838317, "grad_norm": 0.1944214552640915, "learning_rate": 7.290475422291719e-05, "loss": 1.1416, "step": 4588 }, { "epoch": 1.2419158435935598, "grad_norm": 0.20861075818538666, "learning_rate": 7.287676519954792e-05, "loss": 1.1192, "step": 4590 }, { "epoch": 1.2424570423488026, "grad_norm": 0.19681338965892792, "learning_rate": 7.284876710674238e-05, "loss": 1.125, "step": 4592 }, { "epoch": 1.2429982411040454, "grad_norm": 0.1965487152338028, "learning_rate": 7.28207599556004e-05, "loss": 1.159, "step": 4594 }, { "epoch": 1.2435394398592883, "grad_norm": 0.2032519429922104, "learning_rate": 7.279274375722533e-05, "loss": 1.144, "step": 4596 }, { "epoch": 1.244080638614531, "grad_norm": 0.19022652506828308, "learning_rate": 7.276471852272409e-05, "loss": 1.1467, "step": 4598 }, { "epoch": 1.244621837369774, "grad_norm": 0.21404559910297394, "learning_rate": 7.273668426320724e-05, "loss": 1.1426, "step": 4600 }, { "epoch": 1.245163036125017, "grad_norm": 0.19883404672145844, "learning_rate": 7.27086409897889e-05, "loss": 1.1312, "step": 4602 }, { "epoch": 1.2457042348802598, "grad_norm": 0.2046336680650711, "learning_rate": 7.268058871358674e-05, "loss": 1.1467, "step": 4604 }, { "epoch": 1.2462454336355027, "grad_norm": 0.20716378092765808, "learning_rate": 7.265252744572201e-05, "loss": 1.1281, "step": 4606 }, { "epoch": 1.2467866323907455, "grad_norm": 0.20886875689029694, "learning_rate": 7.262445719731956e-05, "loss": 1.1357, "step": 4608 }, { "epoch": 1.2473278311459883, "grad_norm": 0.22756427526474, "learning_rate": 7.259637797950771e-05, "loss": 1.1405, "step": 4610 }, { "epoch": 1.2478690299012312, "grad_norm": 0.229325532913208, "learning_rate": 7.256828980341846e-05, "loss": 1.1456, "step": 4612 }, { "epoch": 1.248410228656474, "grad_norm": 0.20845824480056763, "learning_rate": 7.254019268018728e-05, "loss": 1.1507, "step": 4614 }, { "epoch": 1.248951427411717, "grad_norm": 0.20090307295322418, "learning_rate": 7.251208662095318e-05, "loss": 1.1433, "step": 4616 }, { "epoch": 1.2494926261669599, "grad_norm": 0.19882068037986755, "learning_rate": 7.248397163685874e-05, "loss": 1.1416, "step": 4618 }, { "epoch": 1.2500338249222027, "grad_norm": 0.21998406946659088, "learning_rate": 7.245584773905012e-05, "loss": 1.1489, "step": 4620 }, { "epoch": 1.2505750236774456, "grad_norm": 0.21411266922950745, "learning_rate": 7.242771493867691e-05, "loss": 1.148, "step": 4622 }, { "epoch": 1.2511162224326884, "grad_norm": 0.20658662915229797, "learning_rate": 7.239957324689232e-05, "loss": 1.1259, "step": 4624 }, { "epoch": 1.2516574211879312, "grad_norm": 0.20834538340568542, "learning_rate": 7.237142267485305e-05, "loss": 1.148, "step": 4626 }, { "epoch": 1.252198619943174, "grad_norm": 0.1978132128715515, "learning_rate": 7.234326323371931e-05, "loss": 1.1529, "step": 4628 }, { "epoch": 1.252739818698417, "grad_norm": 0.2106226533651352, "learning_rate": 7.231509493465484e-05, "loss": 1.1225, "step": 4630 }, { "epoch": 1.2532810174536597, "grad_norm": 0.20942124724388123, "learning_rate": 7.228691778882693e-05, "loss": 1.145, "step": 4632 }, { "epoch": 1.2538222162089028, "grad_norm": 0.2049248218536377, "learning_rate": 7.225873180740627e-05, "loss": 1.1555, "step": 4634 }, { "epoch": 1.2543634149641456, "grad_norm": 0.20401233434677124, "learning_rate": 7.22305370015672e-05, "loss": 1.113, "step": 4636 }, { "epoch": 1.2549046137193884, "grad_norm": 0.20931097865104675, "learning_rate": 7.220233338248743e-05, "loss": 1.1501, "step": 4638 }, { "epoch": 1.2554458124746313, "grad_norm": 0.20634891092777252, "learning_rate": 7.217412096134823e-05, "loss": 1.1439, "step": 4640 }, { "epoch": 1.255987011229874, "grad_norm": 0.21860553324222565, "learning_rate": 7.214589974933434e-05, "loss": 1.1515, "step": 4642 }, { "epoch": 1.2565282099851172, "grad_norm": 0.19998759031295776, "learning_rate": 7.211766975763397e-05, "loss": 1.1339, "step": 4644 }, { "epoch": 1.2570694087403598, "grad_norm": 0.21334104239940643, "learning_rate": 7.208943099743888e-05, "loss": 1.1504, "step": 4646 }, { "epoch": 1.2576106074956028, "grad_norm": 0.2096724510192871, "learning_rate": 7.206118347994421e-05, "loss": 1.1493, "step": 4648 }, { "epoch": 1.2581518062508457, "grad_norm": 0.20401859283447266, "learning_rate": 7.203292721634863e-05, "loss": 1.1447, "step": 4650 }, { "epoch": 1.2586930050060885, "grad_norm": 0.20878173410892487, "learning_rate": 7.200466221785427e-05, "loss": 1.1426, "step": 4652 }, { "epoch": 1.2592342037613313, "grad_norm": 0.2019282877445221, "learning_rate": 7.19763884956667e-05, "loss": 1.135, "step": 4654 }, { "epoch": 1.2597754025165742, "grad_norm": 0.21409070491790771, "learning_rate": 7.194810606099498e-05, "loss": 1.1504, "step": 4656 }, { "epoch": 1.260316601271817, "grad_norm": 0.1969073861837387, "learning_rate": 7.191981492505163e-05, "loss": 1.1646, "step": 4658 }, { "epoch": 1.2608578000270598, "grad_norm": 0.21602770686149597, "learning_rate": 7.189151509905257e-05, "loss": 1.1428, "step": 4660 }, { "epoch": 1.2613989987823029, "grad_norm": 0.20433968305587769, "learning_rate": 7.186320659421721e-05, "loss": 1.1438, "step": 4662 }, { "epoch": 1.2619401975375457, "grad_norm": 0.18745940923690796, "learning_rate": 7.183488942176838e-05, "loss": 1.1233, "step": 4664 }, { "epoch": 1.2624813962927885, "grad_norm": 0.2125159353017807, "learning_rate": 7.180656359293236e-05, "loss": 1.1365, "step": 4666 }, { "epoch": 1.2630225950480314, "grad_norm": 0.2109309583902359, "learning_rate": 7.177822911893883e-05, "loss": 1.1189, "step": 4668 }, { "epoch": 1.2635637938032742, "grad_norm": 0.20971255004405975, "learning_rate": 7.174988601102096e-05, "loss": 1.1547, "step": 4670 }, { "epoch": 1.264104992558517, "grad_norm": 0.20887215435504913, "learning_rate": 7.172153428041527e-05, "loss": 1.1158, "step": 4672 }, { "epoch": 1.2646461913137599, "grad_norm": 0.208766907453537, "learning_rate": 7.169317393836175e-05, "loss": 1.16, "step": 4674 }, { "epoch": 1.265187390069003, "grad_norm": 0.2097134292125702, "learning_rate": 7.166480499610379e-05, "loss": 1.1458, "step": 4676 }, { "epoch": 1.2657285888242458, "grad_norm": 0.20617030560970306, "learning_rate": 7.163642746488817e-05, "loss": 1.1327, "step": 4678 }, { "epoch": 1.2662697875794886, "grad_norm": 0.19682592153549194, "learning_rate": 7.160804135596509e-05, "loss": 1.1516, "step": 4680 }, { "epoch": 1.2668109863347314, "grad_norm": 0.21410858631134033, "learning_rate": 7.157964668058818e-05, "loss": 1.1456, "step": 4682 }, { "epoch": 1.2673521850899743, "grad_norm": 0.20145538449287415, "learning_rate": 7.15512434500144e-05, "loss": 1.141, "step": 4684 }, { "epoch": 1.267893383845217, "grad_norm": 0.22975938022136688, "learning_rate": 7.152283167550416e-05, "loss": 1.1372, "step": 4686 }, { "epoch": 1.26843458260046, "grad_norm": 0.21135157346725464, "learning_rate": 7.149441136832126e-05, "loss": 1.133, "step": 4688 }, { "epoch": 1.268975781355703, "grad_norm": 0.20664750039577484, "learning_rate": 7.14659825397328e-05, "loss": 1.1604, "step": 4690 }, { "epoch": 1.2695169801109458, "grad_norm": 0.20334216952323914, "learning_rate": 7.143754520100938e-05, "loss": 1.1567, "step": 4692 }, { "epoch": 1.2700581788661887, "grad_norm": 0.20391784608364105, "learning_rate": 7.140909936342488e-05, "loss": 1.1401, "step": 4694 }, { "epoch": 1.2705993776214315, "grad_norm": 0.21056121587753296, "learning_rate": 7.138064503825658e-05, "loss": 1.141, "step": 4696 }, { "epoch": 1.2711405763766743, "grad_norm": 0.2019626796245575, "learning_rate": 7.135218223678514e-05, "loss": 1.149, "step": 4698 }, { "epoch": 1.2716817751319172, "grad_norm": 0.19824273884296417, "learning_rate": 7.132371097029454e-05, "loss": 1.143, "step": 4700 }, { "epoch": 1.27222297388716, "grad_norm": 0.19521044194698334, "learning_rate": 7.129523125007217e-05, "loss": 1.1512, "step": 4702 }, { "epoch": 1.272764172642403, "grad_norm": 0.18712207674980164, "learning_rate": 7.126674308740874e-05, "loss": 1.141, "step": 4704 }, { "epoch": 1.2733053713976457, "grad_norm": 0.20281393826007843, "learning_rate": 7.123824649359829e-05, "loss": 1.135, "step": 4706 }, { "epoch": 1.2738465701528887, "grad_norm": 0.18838536739349365, "learning_rate": 7.120974147993826e-05, "loss": 1.1516, "step": 4708 }, { "epoch": 1.2743877689081315, "grad_norm": 0.1939174383878708, "learning_rate": 7.118122805772934e-05, "loss": 1.1513, "step": 4710 }, { "epoch": 1.2749289676633744, "grad_norm": 0.19968074560165405, "learning_rate": 7.115270623827565e-05, "loss": 1.1444, "step": 4712 }, { "epoch": 1.2754701664186172, "grad_norm": 0.19661501049995422, "learning_rate": 7.112417603288458e-05, "loss": 1.1416, "step": 4714 }, { "epoch": 1.27601136517386, "grad_norm": 0.20567114651203156, "learning_rate": 7.109563745286684e-05, "loss": 1.1413, "step": 4716 }, { "epoch": 1.276552563929103, "grad_norm": 0.19401569664478302, "learning_rate": 7.10670905095365e-05, "loss": 1.153, "step": 4718 }, { "epoch": 1.2770937626843457, "grad_norm": 0.21700969338417053, "learning_rate": 7.103853521421094e-05, "loss": 1.1523, "step": 4720 }, { "epoch": 1.2776349614395888, "grad_norm": 0.6526715159416199, "learning_rate": 7.10099715782108e-05, "loss": 1.1385, "step": 4722 }, { "epoch": 1.2781761601948316, "grad_norm": 0.23611807823181152, "learning_rate": 7.098139961286007e-05, "loss": 1.1578, "step": 4724 }, { "epoch": 1.2787173589500744, "grad_norm": 0.22932860255241394, "learning_rate": 7.095281932948605e-05, "loss": 1.1433, "step": 4726 }, { "epoch": 1.2792585577053173, "grad_norm": 0.2409789115190506, "learning_rate": 7.092423073941931e-05, "loss": 1.1409, "step": 4728 }, { "epoch": 1.27979975646056, "grad_norm": 0.2185688316822052, "learning_rate": 7.089563385399371e-05, "loss": 1.1486, "step": 4730 }, { "epoch": 1.280340955215803, "grad_norm": 0.22509334981441498, "learning_rate": 7.086702868454645e-05, "loss": 1.1492, "step": 4732 }, { "epoch": 1.2808821539710458, "grad_norm": 0.2434450089931488, "learning_rate": 7.083841524241794e-05, "loss": 1.1533, "step": 4734 }, { "epoch": 1.2814233527262888, "grad_norm": 0.21914516389369965, "learning_rate": 7.080979353895193e-05, "loss": 1.1658, "step": 4736 }, { "epoch": 1.2819645514815317, "grad_norm": 0.23442131280899048, "learning_rate": 7.078116358549544e-05, "loss": 1.1355, "step": 4738 }, { "epoch": 1.2825057502367745, "grad_norm": 0.23039007186889648, "learning_rate": 7.075252539339871e-05, "loss": 1.1671, "step": 4740 }, { "epoch": 1.2830469489920173, "grad_norm": 0.2244633287191391, "learning_rate": 7.072387897401528e-05, "loss": 1.1298, "step": 4742 }, { "epoch": 1.2835881477472602, "grad_norm": 0.20873397588729858, "learning_rate": 7.069522433870197e-05, "loss": 1.1485, "step": 4744 }, { "epoch": 1.284129346502503, "grad_norm": 0.20467349886894226, "learning_rate": 7.066656149881881e-05, "loss": 1.1349, "step": 4746 }, { "epoch": 1.2846705452577458, "grad_norm": 0.21368630230426788, "learning_rate": 7.063789046572916e-05, "loss": 1.1396, "step": 4748 }, { "epoch": 1.2852117440129889, "grad_norm": 0.21021267771720886, "learning_rate": 7.060921125079954e-05, "loss": 1.1241, "step": 4750 }, { "epoch": 1.2857529427682317, "grad_norm": 0.20324338972568512, "learning_rate": 7.058052386539975e-05, "loss": 1.1451, "step": 4752 }, { "epoch": 1.2862941415234745, "grad_norm": 0.20919355750083923, "learning_rate": 7.055182832090287e-05, "loss": 1.1474, "step": 4754 }, { "epoch": 1.2868353402787174, "grad_norm": 0.1998199224472046, "learning_rate": 7.052312462868514e-05, "loss": 1.1181, "step": 4756 }, { "epoch": 1.2873765390339602, "grad_norm": 0.209974467754364, "learning_rate": 7.049441280012608e-05, "loss": 1.1292, "step": 4758 }, { "epoch": 1.287917737789203, "grad_norm": 0.19232650101184845, "learning_rate": 7.046569284660841e-05, "loss": 1.1363, "step": 4760 }, { "epoch": 1.2884589365444459, "grad_norm": 0.19009630382061005, "learning_rate": 7.043696477951812e-05, "loss": 1.1195, "step": 4762 }, { "epoch": 1.289000135299689, "grad_norm": 0.19685257971286774, "learning_rate": 7.040822861024434e-05, "loss": 1.1556, "step": 4764 }, { "epoch": 1.2895413340549315, "grad_norm": 0.19877856969833374, "learning_rate": 7.037948435017946e-05, "loss": 1.1463, "step": 4766 }, { "epoch": 1.2900825328101746, "grad_norm": 0.20576737821102142, "learning_rate": 7.035073201071909e-05, "loss": 1.1489, "step": 4768 }, { "epoch": 1.2906237315654174, "grad_norm": 0.22239983081817627, "learning_rate": 7.0321971603262e-05, "loss": 1.1443, "step": 4770 }, { "epoch": 1.2911649303206603, "grad_norm": 0.2060551643371582, "learning_rate": 7.02932031392102e-05, "loss": 1.1411, "step": 4772 }, { "epoch": 1.291706129075903, "grad_norm": 0.20176757872104645, "learning_rate": 7.026442662996888e-05, "loss": 1.1542, "step": 4774 }, { "epoch": 1.292247327831146, "grad_norm": 0.19776682555675507, "learning_rate": 7.023564208694638e-05, "loss": 1.1313, "step": 4776 }, { "epoch": 1.292788526586389, "grad_norm": 0.19966423511505127, "learning_rate": 7.020684952155428e-05, "loss": 1.1169, "step": 4778 }, { "epoch": 1.2933297253416316, "grad_norm": 0.18727391958236694, "learning_rate": 7.017804894520735e-05, "loss": 1.1263, "step": 4780 }, { "epoch": 1.2938709240968747, "grad_norm": 0.19659611582756042, "learning_rate": 7.014924036932345e-05, "loss": 1.1227, "step": 4782 }, { "epoch": 1.2944121228521175, "grad_norm": 0.20023833215236664, "learning_rate": 7.01204238053237e-05, "loss": 1.1196, "step": 4784 }, { "epoch": 1.2949533216073603, "grad_norm": 0.20402821898460388, "learning_rate": 7.009159926463237e-05, "loss": 1.1413, "step": 4786 }, { "epoch": 1.2954945203626032, "grad_norm": 0.21623113751411438, "learning_rate": 7.006276675867685e-05, "loss": 1.1611, "step": 4788 }, { "epoch": 1.296035719117846, "grad_norm": 0.2232583463191986, "learning_rate": 7.003392629888772e-05, "loss": 1.1486, "step": 4790 }, { "epoch": 1.2965769178730888, "grad_norm": 0.20442326366901398, "learning_rate": 7.00050778966987e-05, "loss": 1.141, "step": 4792 }, { "epoch": 1.2971181166283317, "grad_norm": 0.20763815939426422, "learning_rate": 6.99762215635467e-05, "loss": 1.1333, "step": 4794 }, { "epoch": 1.2976593153835747, "grad_norm": 0.1989380568265915, "learning_rate": 6.994735731087171e-05, "loss": 1.1338, "step": 4796 }, { "epoch": 1.2982005141388175, "grad_norm": 0.20569868385791779, "learning_rate": 6.991848515011689e-05, "loss": 1.1211, "step": 4798 }, { "epoch": 1.2987417128940604, "grad_norm": 0.3479338586330414, "learning_rate": 6.988960509272855e-05, "loss": 1.1212, "step": 4800 }, { "epoch": 1.2992829116493032, "grad_norm": 0.2268143892288208, "learning_rate": 6.986071715015611e-05, "loss": 1.1217, "step": 4802 }, { "epoch": 1.299824110404546, "grad_norm": 0.22618204355239868, "learning_rate": 6.983182133385213e-05, "loss": 1.1542, "step": 4804 }, { "epoch": 1.3003653091597889, "grad_norm": 0.20148879289627075, "learning_rate": 6.980291765527228e-05, "loss": 1.129, "step": 4806 }, { "epoch": 1.3009065079150317, "grad_norm": 0.2690919041633606, "learning_rate": 6.977400612587535e-05, "loss": 1.1441, "step": 4808 }, { "epoch": 1.3014477066702748, "grad_norm": 0.19888824224472046, "learning_rate": 6.974508675712323e-05, "loss": 1.1387, "step": 4810 }, { "epoch": 1.3019889054255176, "grad_norm": 0.21588613092899323, "learning_rate": 6.971615956048094e-05, "loss": 1.1558, "step": 4812 }, { "epoch": 1.3025301041807604, "grad_norm": 0.2101002186536789, "learning_rate": 6.968722454741662e-05, "loss": 1.1429, "step": 4814 }, { "epoch": 1.3030713029360033, "grad_norm": 0.20919887721538544, "learning_rate": 6.965828172940145e-05, "loss": 1.1504, "step": 4816 }, { "epoch": 1.303612501691246, "grad_norm": 0.21550770103931427, "learning_rate": 6.962933111790975e-05, "loss": 1.1487, "step": 4818 }, { "epoch": 1.304153700446489, "grad_norm": 0.20764297246932983, "learning_rate": 6.96003727244189e-05, "loss": 1.1305, "step": 4820 }, { "epoch": 1.3046948992017318, "grad_norm": 0.2233119159936905, "learning_rate": 6.957140656040942e-05, "loss": 1.1314, "step": 4822 }, { "epoch": 1.3052360979569748, "grad_norm": 0.22899368405342102, "learning_rate": 6.954243263736486e-05, "loss": 1.1515, "step": 4824 }, { "epoch": 1.3057772967122174, "grad_norm": 0.26306775212287903, "learning_rate": 6.951345096677183e-05, "loss": 1.1531, "step": 4826 }, { "epoch": 1.3063184954674605, "grad_norm": 0.23401707410812378, "learning_rate": 6.948446156012007e-05, "loss": 1.1498, "step": 4828 }, { "epoch": 1.3068596942227033, "grad_norm": 0.24562807381153107, "learning_rate": 6.945546442890236e-05, "loss": 1.1173, "step": 4830 }, { "epoch": 1.3074008929779461, "grad_norm": 0.2258317917585373, "learning_rate": 6.942645958461451e-05, "loss": 1.1403, "step": 4832 }, { "epoch": 1.307942091733189, "grad_norm": 0.2683268189430237, "learning_rate": 6.939744703875546e-05, "loss": 1.1346, "step": 4834 }, { "epoch": 1.3084832904884318, "grad_norm": 0.23453928530216217, "learning_rate": 6.93684268028271e-05, "loss": 1.1543, "step": 4836 }, { "epoch": 1.3090244892436749, "grad_norm": 0.22976765036582947, "learning_rate": 6.933939888833451e-05, "loss": 1.1534, "step": 4838 }, { "epoch": 1.3095656879989175, "grad_norm": 0.19898580014705658, "learning_rate": 6.931036330678568e-05, "loss": 1.1559, "step": 4840 }, { "epoch": 1.3101068867541605, "grad_norm": 0.20745742321014404, "learning_rate": 6.92813200696917e-05, "loss": 1.1476, "step": 4842 }, { "epoch": 1.3106480855094034, "grad_norm": 0.19812755286693573, "learning_rate": 6.92522691885667e-05, "loss": 1.1116, "step": 4844 }, { "epoch": 1.3111892842646462, "grad_norm": 0.2005460262298584, "learning_rate": 6.922321067492783e-05, "loss": 1.1444, "step": 4846 }, { "epoch": 1.311730483019889, "grad_norm": 0.22607260942459106, "learning_rate": 6.919414454029525e-05, "loss": 1.1302, "step": 4848 }, { "epoch": 1.3122716817751319, "grad_norm": 0.21330516040325165, "learning_rate": 6.916507079619217e-05, "loss": 1.1336, "step": 4850 }, { "epoch": 1.3128128805303747, "grad_norm": 0.2110857516527176, "learning_rate": 6.913598945414479e-05, "loss": 1.1563, "step": 4852 }, { "epoch": 1.3133540792856175, "grad_norm": 0.21332454681396484, "learning_rate": 6.910690052568236e-05, "loss": 1.1303, "step": 4854 }, { "epoch": 1.3138952780408606, "grad_norm": 0.21654580533504486, "learning_rate": 6.907780402233706e-05, "loss": 1.1323, "step": 4856 }, { "epoch": 1.3144364767961034, "grad_norm": 0.22266875207424164, "learning_rate": 6.904869995564419e-05, "loss": 1.1477, "step": 4858 }, { "epoch": 1.3149776755513463, "grad_norm": 0.21026547253131866, "learning_rate": 6.901958833714196e-05, "loss": 1.1545, "step": 4860 }, { "epoch": 1.315518874306589, "grad_norm": 0.24251089990139008, "learning_rate": 6.899046917837157e-05, "loss": 1.1345, "step": 4862 }, { "epoch": 1.316060073061832, "grad_norm": 0.2157732993364334, "learning_rate": 6.896134249087727e-05, "loss": 1.1591, "step": 4864 }, { "epoch": 1.3166012718170748, "grad_norm": 0.21439550817012787, "learning_rate": 6.893220828620626e-05, "loss": 1.1239, "step": 4866 }, { "epoch": 1.3171424705723176, "grad_norm": 0.20335568487644196, "learning_rate": 6.890306657590871e-05, "loss": 1.128, "step": 4868 }, { "epoch": 1.3176836693275606, "grad_norm": 0.20837850868701935, "learning_rate": 6.88739173715378e-05, "loss": 1.1493, "step": 4870 }, { "epoch": 1.3182248680828035, "grad_norm": 0.20061874389648438, "learning_rate": 6.884476068464962e-05, "loss": 1.1321, "step": 4872 }, { "epoch": 1.3187660668380463, "grad_norm": 0.20998738706111908, "learning_rate": 6.881559652680332e-05, "loss": 1.1449, "step": 4874 }, { "epoch": 1.3193072655932891, "grad_norm": 0.215812548995018, "learning_rate": 6.878642490956091e-05, "loss": 1.1387, "step": 4876 }, { "epoch": 1.319848464348532, "grad_norm": 0.21340392529964447, "learning_rate": 6.87572458444874e-05, "loss": 1.1323, "step": 4878 }, { "epoch": 1.3203896631037748, "grad_norm": 0.19966621696949005, "learning_rate": 6.87280593431508e-05, "loss": 1.1465, "step": 4880 }, { "epoch": 1.3209308618590176, "grad_norm": 0.19357992708683014, "learning_rate": 6.869886541712201e-05, "loss": 1.1317, "step": 4882 }, { "epoch": 1.3214720606142607, "grad_norm": 0.20211121439933777, "learning_rate": 6.866966407797488e-05, "loss": 1.136, "step": 4884 }, { "epoch": 1.3220132593695033, "grad_norm": 0.20985981822013855, "learning_rate": 6.864045533728618e-05, "loss": 1.143, "step": 4886 }, { "epoch": 1.3225544581247464, "grad_norm": 0.1939292699098587, "learning_rate": 6.86112392066357e-05, "loss": 1.1362, "step": 4888 }, { "epoch": 1.3230956568799892, "grad_norm": 0.19840145111083984, "learning_rate": 6.858201569760606e-05, "loss": 1.135, "step": 4890 }, { "epoch": 1.323636855635232, "grad_norm": 0.22339017689228058, "learning_rate": 6.855278482178288e-05, "loss": 1.1429, "step": 4892 }, { "epoch": 1.3241780543904749, "grad_norm": 0.2093936651945114, "learning_rate": 6.852354659075464e-05, "loss": 1.1434, "step": 4894 }, { "epoch": 1.3247192531457177, "grad_norm": 0.19721046090126038, "learning_rate": 6.849430101611276e-05, "loss": 1.135, "step": 4896 }, { "epoch": 1.3252604519009608, "grad_norm": 0.20016610622406006, "learning_rate": 6.84650481094516e-05, "loss": 1.1211, "step": 4898 }, { "epoch": 1.3258016506562034, "grad_norm": 0.20054888725280762, "learning_rate": 6.843578788236837e-05, "loss": 1.1325, "step": 4900 }, { "epoch": 1.3263428494114464, "grad_norm": 0.20290598273277283, "learning_rate": 6.840652034646325e-05, "loss": 1.1403, "step": 4902 }, { "epoch": 1.3268840481666893, "grad_norm": 0.19229461252689362, "learning_rate": 6.837724551333926e-05, "loss": 1.1283, "step": 4904 }, { "epoch": 1.327425246921932, "grad_norm": 0.2016628235578537, "learning_rate": 6.834796339460232e-05, "loss": 1.1405, "step": 4906 }, { "epoch": 1.327966445677175, "grad_norm": 0.19562238454818726, "learning_rate": 6.83186740018613e-05, "loss": 1.1397, "step": 4908 }, { "epoch": 1.3285076444324178, "grad_norm": 0.19663389027118683, "learning_rate": 6.828937734672785e-05, "loss": 1.1182, "step": 4910 }, { "epoch": 1.3290488431876606, "grad_norm": 0.18820057809352875, "learning_rate": 6.826007344081658e-05, "loss": 1.1551, "step": 4912 }, { "epoch": 1.3295900419429034, "grad_norm": 0.19613684713840485, "learning_rate": 6.823076229574496e-05, "loss": 1.1525, "step": 4914 }, { "epoch": 1.3301312406981465, "grad_norm": 0.1969241350889206, "learning_rate": 6.820144392313333e-05, "loss": 1.1255, "step": 4916 }, { "epoch": 1.3306724394533893, "grad_norm": 0.20861990749835968, "learning_rate": 6.817211833460483e-05, "loss": 1.1375, "step": 4918 }, { "epoch": 1.3312136382086321, "grad_norm": 0.2094329297542572, "learning_rate": 6.814278554178558e-05, "loss": 1.1308, "step": 4920 }, { "epoch": 1.331754836963875, "grad_norm": 0.19369575381278992, "learning_rate": 6.811344555630446e-05, "loss": 1.1471, "step": 4922 }, { "epoch": 1.3322960357191178, "grad_norm": 0.21166963875293732, "learning_rate": 6.808409838979324e-05, "loss": 1.1284, "step": 4924 }, { "epoch": 1.3328372344743606, "grad_norm": 0.205407053232193, "learning_rate": 6.805474405388652e-05, "loss": 1.1391, "step": 4926 }, { "epoch": 1.3333784332296035, "grad_norm": 0.19018761813640594, "learning_rate": 6.802538256022177e-05, "loss": 1.111, "step": 4928 }, { "epoch": 1.3339196319848465, "grad_norm": 0.19279932975769043, "learning_rate": 6.799601392043927e-05, "loss": 1.1369, "step": 4930 }, { "epoch": 1.3344608307400894, "grad_norm": 0.20698602497577667, "learning_rate": 6.796663814618216e-05, "loss": 1.1507, "step": 4932 }, { "epoch": 1.3350020294953322, "grad_norm": 0.21521639823913574, "learning_rate": 6.793725524909635e-05, "loss": 1.1271, "step": 4934 }, { "epoch": 1.335543228250575, "grad_norm": 0.200445294380188, "learning_rate": 6.790786524083067e-05, "loss": 1.1364, "step": 4936 }, { "epoch": 1.3360844270058179, "grad_norm": 0.22544774413108826, "learning_rate": 6.787846813303668e-05, "loss": 1.1368, "step": 4938 }, { "epoch": 1.3366256257610607, "grad_norm": 0.19603730738162994, "learning_rate": 6.78490639373688e-05, "loss": 1.123, "step": 4940 }, { "epoch": 1.3371668245163035, "grad_norm": 0.21301992237567902, "learning_rate": 6.781965266548425e-05, "loss": 1.1448, "step": 4942 }, { "epoch": 1.3377080232715466, "grad_norm": 0.2113674283027649, "learning_rate": 6.779023432904305e-05, "loss": 1.1511, "step": 4944 }, { "epoch": 1.3382492220267894, "grad_norm": 0.21615102887153625, "learning_rate": 6.776080893970803e-05, "loss": 1.125, "step": 4946 }, { "epoch": 1.3387904207820323, "grad_norm": 0.2178725004196167, "learning_rate": 6.773137650914483e-05, "loss": 1.1153, "step": 4948 }, { "epoch": 1.339331619537275, "grad_norm": 0.19788886606693268, "learning_rate": 6.770193704902184e-05, "loss": 1.128, "step": 4950 }, { "epoch": 1.339872818292518, "grad_norm": 0.20174042880535126, "learning_rate": 6.767249057101025e-05, "loss": 1.1316, "step": 4952 }, { "epoch": 1.3404140170477608, "grad_norm": 0.20333023369312286, "learning_rate": 6.764303708678406e-05, "loss": 1.1357, "step": 4954 }, { "epoch": 1.3409552158030036, "grad_norm": 0.20150478184223175, "learning_rate": 6.761357660802003e-05, "loss": 1.1434, "step": 4956 }, { "epoch": 1.3414964145582466, "grad_norm": 0.20875629782676697, "learning_rate": 6.75841091463977e-05, "loss": 1.1429, "step": 4958 }, { "epoch": 1.3420376133134893, "grad_norm": 0.23204512894153595, "learning_rate": 6.755463471359936e-05, "loss": 1.1299, "step": 4960 }, { "epoch": 1.3425788120687323, "grad_norm": 0.20591384172439575, "learning_rate": 6.752515332131006e-05, "loss": 1.1358, "step": 4962 }, { "epoch": 1.3431200108239751, "grad_norm": 0.22469563782215118, "learning_rate": 6.749566498121765e-05, "loss": 1.1347, "step": 4964 }, { "epoch": 1.343661209579218, "grad_norm": 0.2194630652666092, "learning_rate": 6.746616970501272e-05, "loss": 1.1391, "step": 4966 }, { "epoch": 1.3442024083344608, "grad_norm": 0.2115604132413864, "learning_rate": 6.743666750438856e-05, "loss": 1.1504, "step": 4968 }, { "epoch": 1.3447436070897036, "grad_norm": 0.22515109181404114, "learning_rate": 6.740715839104126e-05, "loss": 1.1221, "step": 4970 }, { "epoch": 1.3452848058449465, "grad_norm": 0.21035178005695343, "learning_rate": 6.737764237666964e-05, "loss": 1.1414, "step": 4972 }, { "epoch": 1.3458260046001893, "grad_norm": 0.24386465549468994, "learning_rate": 6.734811947297526e-05, "loss": 1.1212, "step": 4974 }, { "epoch": 1.3463672033554324, "grad_norm": 0.21872912347316742, "learning_rate": 6.731858969166236e-05, "loss": 1.1272, "step": 4976 }, { "epoch": 1.3469084021106752, "grad_norm": 0.22619320452213287, "learning_rate": 6.7289053044438e-05, "loss": 1.1312, "step": 4978 }, { "epoch": 1.347449600865918, "grad_norm": 0.23938359320163727, "learning_rate": 6.725950954301186e-05, "loss": 1.1321, "step": 4980 }, { "epoch": 1.3479907996211609, "grad_norm": 0.22768397629261017, "learning_rate": 6.722995919909643e-05, "loss": 1.1291, "step": 4982 }, { "epoch": 1.3485319983764037, "grad_norm": 0.2165715992450714, "learning_rate": 6.720040202440684e-05, "loss": 1.1657, "step": 4984 }, { "epoch": 1.3490731971316465, "grad_norm": 0.2011752873659134, "learning_rate": 6.717083803066096e-05, "loss": 1.1198, "step": 4986 }, { "epoch": 1.3496143958868894, "grad_norm": 0.2029523253440857, "learning_rate": 6.714126722957938e-05, "loss": 1.1299, "step": 4988 }, { "epoch": 1.3501555946421324, "grad_norm": 0.21974807977676392, "learning_rate": 6.711168963288537e-05, "loss": 1.125, "step": 4990 }, { "epoch": 1.3506967933973753, "grad_norm": 0.21973131597042084, "learning_rate": 6.708210525230487e-05, "loss": 1.1352, "step": 4992 }, { "epoch": 1.351237992152618, "grad_norm": 0.22160713374614716, "learning_rate": 6.705251409956657e-05, "loss": 1.1398, "step": 4994 }, { "epoch": 1.351779190907861, "grad_norm": 0.5308296084403992, "learning_rate": 6.702291618640178e-05, "loss": 1.1859, "step": 4996 }, { "epoch": 1.3523203896631038, "grad_norm": 0.3140636086463928, "learning_rate": 6.699331152454451e-05, "loss": 1.1285, "step": 4998 }, { "epoch": 1.3528615884183466, "grad_norm": 0.318144828081131, "learning_rate": 6.696370012573148e-05, "loss": 1.1123, "step": 5000 }, { "epoch": 1.3534027871735894, "grad_norm": 0.2733546495437622, "learning_rate": 6.693408200170205e-05, "loss": 1.1354, "step": 5002 }, { "epoch": 1.3539439859288325, "grad_norm": 0.22881552577018738, "learning_rate": 6.690445716419822e-05, "loss": 1.1446, "step": 5004 }, { "epoch": 1.3544851846840753, "grad_norm": 0.22428303956985474, "learning_rate": 6.687482562496473e-05, "loss": 1.1469, "step": 5006 }, { "epoch": 1.3550263834393181, "grad_norm": 0.23629316687583923, "learning_rate": 6.68451873957489e-05, "loss": 1.1412, "step": 5008 }, { "epoch": 1.355567582194561, "grad_norm": 0.2209664285182953, "learning_rate": 6.681554248830074e-05, "loss": 1.1508, "step": 5010 }, { "epoch": 1.3561087809498038, "grad_norm": 0.24471548199653625, "learning_rate": 6.678589091437288e-05, "loss": 1.142, "step": 5012 }, { "epoch": 1.3566499797050466, "grad_norm": 0.23942361772060394, "learning_rate": 6.675623268572066e-05, "loss": 1.1451, "step": 5014 }, { "epoch": 1.3571911784602895, "grad_norm": 0.25475817918777466, "learning_rate": 6.672656781410196e-05, "loss": 1.1508, "step": 5016 }, { "epoch": 1.3577323772155325, "grad_norm": 0.23169760406017303, "learning_rate": 6.669689631127738e-05, "loss": 1.1231, "step": 5018 }, { "epoch": 1.3582735759707751, "grad_norm": 0.221586674451828, "learning_rate": 6.666721818901009e-05, "loss": 1.12, "step": 5020 }, { "epoch": 1.3588147747260182, "grad_norm": 0.21877366304397583, "learning_rate": 6.663753345906591e-05, "loss": 1.1266, "step": 5022 }, { "epoch": 1.359355973481261, "grad_norm": 0.20271623134613037, "learning_rate": 6.660784213321328e-05, "loss": 1.1278, "step": 5024 }, { "epoch": 1.3598971722365039, "grad_norm": 0.24258723855018616, "learning_rate": 6.657814422322326e-05, "loss": 1.1396, "step": 5026 }, { "epoch": 1.3604383709917467, "grad_norm": 0.204366996884346, "learning_rate": 6.65484397408695e-05, "loss": 1.1221, "step": 5028 }, { "epoch": 1.3609795697469895, "grad_norm": 0.2240559309720993, "learning_rate": 6.65187286979283e-05, "loss": 1.1552, "step": 5030 }, { "epoch": 1.3615207685022326, "grad_norm": 0.21712256968021393, "learning_rate": 6.648901110617846e-05, "loss": 1.1402, "step": 5032 }, { "epoch": 1.3620619672574752, "grad_norm": 0.23759770393371582, "learning_rate": 6.64592869774015e-05, "loss": 1.1289, "step": 5034 }, { "epoch": 1.3626031660127182, "grad_norm": 0.20966365933418274, "learning_rate": 6.642955632338148e-05, "loss": 1.1275, "step": 5036 }, { "epoch": 1.363144364767961, "grad_norm": 0.20241115987300873, "learning_rate": 6.639981915590501e-05, "loss": 1.1366, "step": 5038 }, { "epoch": 1.363685563523204, "grad_norm": 0.1985287219285965, "learning_rate": 6.637007548676132e-05, "loss": 1.1201, "step": 5040 }, { "epoch": 1.3642267622784467, "grad_norm": 0.23188084363937378, "learning_rate": 6.634032532774224e-05, "loss": 1.1256, "step": 5042 }, { "epoch": 1.3647679610336896, "grad_norm": 1.0067830085754395, "learning_rate": 6.631056869064211e-05, "loss": 1.1488, "step": 5044 }, { "epoch": 1.3653091597889324, "grad_norm": 0.26151588559150696, "learning_rate": 6.628080558725794e-05, "loss": 1.1368, "step": 5046 }, { "epoch": 1.3658503585441752, "grad_norm": 0.2546170949935913, "learning_rate": 6.625103602938916e-05, "loss": 1.1213, "step": 5048 }, { "epoch": 1.3663915572994183, "grad_norm": 0.24770961701869965, "learning_rate": 6.622126002883786e-05, "loss": 1.1517, "step": 5050 }, { "epoch": 1.3669327560546611, "grad_norm": 0.20164626836776733, "learning_rate": 6.619147759740869e-05, "loss": 1.1274, "step": 5052 }, { "epoch": 1.367473954809904, "grad_norm": 0.21095354855060577, "learning_rate": 6.61616887469088e-05, "loss": 1.1387, "step": 5054 }, { "epoch": 1.3680151535651468, "grad_norm": 0.20875804126262665, "learning_rate": 6.613189348914788e-05, "loss": 1.1321, "step": 5056 }, { "epoch": 1.3685563523203896, "grad_norm": 0.20944173634052277, "learning_rate": 6.610209183593824e-05, "loss": 1.1093, "step": 5058 }, { "epoch": 1.3690975510756325, "grad_norm": 0.195838063955307, "learning_rate": 6.607228379909463e-05, "loss": 1.128, "step": 5060 }, { "epoch": 1.3696387498308753, "grad_norm": 0.20506072044372559, "learning_rate": 6.604246939043437e-05, "loss": 1.1258, "step": 5062 }, { "epoch": 1.3701799485861184, "grad_norm": 0.21078158915042877, "learning_rate": 6.601264862177735e-05, "loss": 1.1111, "step": 5064 }, { "epoch": 1.3707211473413612, "grad_norm": 0.20859210193157196, "learning_rate": 6.598282150494588e-05, "loss": 1.1477, "step": 5066 }, { "epoch": 1.371262346096604, "grad_norm": 0.26835790276527405, "learning_rate": 6.595298805176488e-05, "loss": 1.1459, "step": 5068 }, { "epoch": 1.3718035448518469, "grad_norm": 0.28039342164993286, "learning_rate": 6.592314827406177e-05, "loss": 1.137, "step": 5070 }, { "epoch": 1.3723447436070897, "grad_norm": 0.25044727325439453, "learning_rate": 6.58933021836664e-05, "loss": 1.1468, "step": 5072 }, { "epoch": 1.3728859423623325, "grad_norm": 0.2226564884185791, "learning_rate": 6.586344979241122e-05, "loss": 1.1252, "step": 5074 }, { "epoch": 1.3734271411175754, "grad_norm": 0.2516416907310486, "learning_rate": 6.58335911121311e-05, "loss": 1.1248, "step": 5076 }, { "epoch": 1.3739683398728184, "grad_norm": 0.2427980750799179, "learning_rate": 6.580372615466348e-05, "loss": 1.1575, "step": 5078 }, { "epoch": 1.374509538628061, "grad_norm": 0.22615928947925568, "learning_rate": 6.577385493184822e-05, "loss": 1.1181, "step": 5080 }, { "epoch": 1.375050737383304, "grad_norm": 0.24130937457084656, "learning_rate": 6.574397745552772e-05, "loss": 1.1158, "step": 5082 }, { "epoch": 1.375591936138547, "grad_norm": 0.22093050181865692, "learning_rate": 6.571409373754678e-05, "loss": 1.1598, "step": 5084 }, { "epoch": 1.3761331348937897, "grad_norm": 20.267744064331055, "learning_rate": 6.568420378975278e-05, "loss": 1.1315, "step": 5086 }, { "epoch": 1.3766743336490326, "grad_norm": 0.8621043562889099, "learning_rate": 6.565430762399546e-05, "loss": 1.1657, "step": 5088 }, { "epoch": 1.3772155324042754, "grad_norm": 0.5339746475219727, "learning_rate": 6.562440525212712e-05, "loss": 1.1516, "step": 5090 }, { "epoch": 1.3777567311595185, "grad_norm": 7.3805365562438965, "learning_rate": 6.559449668600248e-05, "loss": 1.2317, "step": 5092 }, { "epoch": 1.378297929914761, "grad_norm": 0.4800806939601898, "learning_rate": 6.556458193747871e-05, "loss": 1.248, "step": 5094 }, { "epoch": 1.3788391286700041, "grad_norm": 0.3002035617828369, "learning_rate": 6.553466101841542e-05, "loss": 1.2319, "step": 5096 }, { "epoch": 1.379380327425247, "grad_norm": 0.3063008785247803, "learning_rate": 6.550473394067472e-05, "loss": 1.2244, "step": 5098 }, { "epoch": 1.3799215261804898, "grad_norm": 0.2721727192401886, "learning_rate": 6.547480071612107e-05, "loss": 1.144, "step": 5100 }, { "epoch": 1.3804627249357326, "grad_norm": 0.2548552453517914, "learning_rate": 6.544486135662146e-05, "loss": 1.1369, "step": 5102 }, { "epoch": 1.3810039236909755, "grad_norm": 0.2472192496061325, "learning_rate": 6.541491587404529e-05, "loss": 1.1254, "step": 5104 }, { "epoch": 1.3815451224462183, "grad_norm": 0.2365529090166092, "learning_rate": 6.538496428026434e-05, "loss": 1.1292, "step": 5106 }, { "epoch": 1.3820863212014611, "grad_norm": 0.2324395775794983, "learning_rate": 6.535500658715286e-05, "loss": 1.1315, "step": 5108 }, { "epoch": 1.3826275199567042, "grad_norm": 0.22274981439113617, "learning_rate": 6.532504280658747e-05, "loss": 1.1334, "step": 5110 }, { "epoch": 1.383168718711947, "grad_norm": 0.22897140681743622, "learning_rate": 6.529507295044728e-05, "loss": 1.1357, "step": 5112 }, { "epoch": 1.3837099174671899, "grad_norm": 0.21158845722675323, "learning_rate": 6.526509703061375e-05, "loss": 1.1315, "step": 5114 }, { "epoch": 1.3842511162224327, "grad_norm": 0.25797906517982483, "learning_rate": 6.523511505897074e-05, "loss": 1.1264, "step": 5116 }, { "epoch": 1.3847923149776755, "grad_norm": 0.20588941872119904, "learning_rate": 6.520512704740455e-05, "loss": 1.1293, "step": 5118 }, { "epoch": 1.3853335137329184, "grad_norm": 0.20949113368988037, "learning_rate": 6.517513300780385e-05, "loss": 1.1399, "step": 5120 }, { "epoch": 1.3858747124881612, "grad_norm": 0.5824947953224182, "learning_rate": 6.51451329520597e-05, "loss": 1.1393, "step": 5122 }, { "epoch": 1.3864159112434042, "grad_norm": 0.2793792486190796, "learning_rate": 6.511512689206552e-05, "loss": 1.1285, "step": 5124 }, { "epoch": 1.386957109998647, "grad_norm": 0.24267977476119995, "learning_rate": 6.508511483971718e-05, "loss": 1.1481, "step": 5126 }, { "epoch": 1.38749830875389, "grad_norm": 0.23414160311222076, "learning_rate": 6.505509680691285e-05, "loss": 1.1364, "step": 5128 }, { "epoch": 1.3880395075091327, "grad_norm": 0.22055640816688538, "learning_rate": 6.502507280555313e-05, "loss": 1.1362, "step": 5130 }, { "epoch": 1.3885807062643756, "grad_norm": 0.22319395840168, "learning_rate": 6.499504284754093e-05, "loss": 1.1341, "step": 5132 }, { "epoch": 1.3891219050196184, "grad_norm": 0.2219814509153366, "learning_rate": 6.496500694478158e-05, "loss": 1.1291, "step": 5134 }, { "epoch": 1.3896631037748612, "grad_norm": 0.22638678550720215, "learning_rate": 6.493496510918273e-05, "loss": 1.1493, "step": 5136 }, { "epoch": 1.3902043025301043, "grad_norm": 0.21132950484752655, "learning_rate": 6.490491735265438e-05, "loss": 1.1283, "step": 5138 }, { "epoch": 1.390745501285347, "grad_norm": 0.23047256469726562, "learning_rate": 6.48748636871089e-05, "loss": 1.1402, "step": 5140 }, { "epoch": 1.39128670004059, "grad_norm": 0.21426177024841309, "learning_rate": 6.484480412446097e-05, "loss": 1.1259, "step": 5142 }, { "epoch": 1.3918278987958328, "grad_norm": 0.22578999400138855, "learning_rate": 6.481473867662766e-05, "loss": 1.1304, "step": 5144 }, { "epoch": 1.3923690975510756, "grad_norm": 0.20824173092842102, "learning_rate": 6.478466735552832e-05, "loss": 1.1291, "step": 5146 }, { "epoch": 1.3929102963063185, "grad_norm": 0.20857419073581696, "learning_rate": 6.475459017308466e-05, "loss": 1.1132, "step": 5148 }, { "epoch": 1.3934514950615613, "grad_norm": 0.20939777791500092, "learning_rate": 6.47245071412207e-05, "loss": 1.1358, "step": 5150 }, { "epoch": 1.3939926938168044, "grad_norm": 0.2175268530845642, "learning_rate": 6.469441827186278e-05, "loss": 1.1368, "step": 5152 }, { "epoch": 1.394533892572047, "grad_norm": 0.9276557564735413, "learning_rate": 6.466432357693955e-05, "loss": 1.1468, "step": 5154 }, { "epoch": 1.39507509132729, "grad_norm": 0.2582313120365143, "learning_rate": 6.4634223068382e-05, "loss": 1.143, "step": 5156 }, { "epoch": 1.3956162900825329, "grad_norm": 0.2258533239364624, "learning_rate": 6.460411675812337e-05, "loss": 1.1324, "step": 5158 }, { "epoch": 1.3961574888377757, "grad_norm": 0.23796269297599792, "learning_rate": 6.457400465809925e-05, "loss": 1.1181, "step": 5160 }, { "epoch": 1.3966986875930185, "grad_norm": 0.2054450958967209, "learning_rate": 6.454388678024752e-05, "loss": 1.1342, "step": 5162 }, { "epoch": 1.3972398863482614, "grad_norm": 0.30061593651771545, "learning_rate": 6.45137631365083e-05, "loss": 1.1404, "step": 5164 }, { "epoch": 1.3977810851035042, "grad_norm": 0.2129613757133484, "learning_rate": 6.448363373882405e-05, "loss": 1.1331, "step": 5166 }, { "epoch": 1.398322283858747, "grad_norm": 0.21098573505878448, "learning_rate": 6.445349859913952e-05, "loss": 1.1163, "step": 5168 }, { "epoch": 1.39886348261399, "grad_norm": 0.7791925668716431, "learning_rate": 6.442335772940167e-05, "loss": 1.122, "step": 5170 }, { "epoch": 1.399404681369233, "grad_norm": 0.27684348821640015, "learning_rate": 6.439321114155981e-05, "loss": 1.1117, "step": 5172 }, { "epoch": 1.3999458801244757, "grad_norm": 0.27352577447891235, "learning_rate": 6.436305884756543e-05, "loss": 1.1255, "step": 5174 }, { "epoch": 1.4004870788797186, "grad_norm": 0.22613908350467682, "learning_rate": 6.433290085937239e-05, "loss": 1.1343, "step": 5176 }, { "epoch": 1.4010282776349614, "grad_norm": 0.2271454930305481, "learning_rate": 6.430273718893671e-05, "loss": 1.1367, "step": 5178 }, { "epoch": 1.4015694763902042, "grad_norm": 0.21145206689834595, "learning_rate": 6.427256784821671e-05, "loss": 1.1392, "step": 5180 }, { "epoch": 1.402110675145447, "grad_norm": 0.21040070056915283, "learning_rate": 6.424239284917296e-05, "loss": 1.1516, "step": 5182 }, { "epoch": 1.4026518739006901, "grad_norm": 0.22618639469146729, "learning_rate": 6.421221220376826e-05, "loss": 1.1405, "step": 5184 }, { "epoch": 1.403193072655933, "grad_norm": 0.20837995409965515, "learning_rate": 6.418202592396762e-05, "loss": 1.1209, "step": 5186 }, { "epoch": 1.4037342714111758, "grad_norm": 0.1959199756383896, "learning_rate": 6.415183402173837e-05, "loss": 1.1421, "step": 5188 }, { "epoch": 1.4042754701664186, "grad_norm": 0.21497224271297455, "learning_rate": 6.412163650904997e-05, "loss": 1.1275, "step": 5190 }, { "epoch": 1.4048166689216615, "grad_norm": 0.20324070751667023, "learning_rate": 6.409143339787416e-05, "loss": 1.147, "step": 5192 }, { "epoch": 1.4053578676769043, "grad_norm": 0.19893191754817963, "learning_rate": 6.406122470018489e-05, "loss": 1.1451, "step": 5194 }, { "epoch": 1.4058990664321471, "grad_norm": 0.2447110414505005, "learning_rate": 6.403101042795833e-05, "loss": 1.1311, "step": 5196 }, { "epoch": 1.4064402651873902, "grad_norm": 0.20944955945014954, "learning_rate": 6.400079059317283e-05, "loss": 1.1245, "step": 5198 }, { "epoch": 1.406981463942633, "grad_norm": 0.19964562356472015, "learning_rate": 6.397056520780901e-05, "loss": 1.133, "step": 5200 }, { "epoch": 1.4075226626978758, "grad_norm": 0.20749281346797943, "learning_rate": 6.394033428384961e-05, "loss": 1.1166, "step": 5202 }, { "epoch": 1.4080638614531187, "grad_norm": 0.2048967033624649, "learning_rate": 6.391009783327961e-05, "loss": 1.1212, "step": 5204 }, { "epoch": 1.4086050602083615, "grad_norm": 0.20158132910728455, "learning_rate": 6.387985586808618e-05, "loss": 1.1327, "step": 5206 }, { "epoch": 1.4091462589636043, "grad_norm": 0.2008758783340454, "learning_rate": 6.384960840025868e-05, "loss": 1.1342, "step": 5208 }, { "epoch": 1.4096874577188472, "grad_norm": 0.20249807834625244, "learning_rate": 6.381935544178863e-05, "loss": 1.1515, "step": 5210 }, { "epoch": 1.4102286564740902, "grad_norm": 0.20463450253009796, "learning_rate": 6.378909700466975e-05, "loss": 1.1372, "step": 5212 }, { "epoch": 1.4107698552293328, "grad_norm": 0.2040434032678604, "learning_rate": 6.37588331008979e-05, "loss": 1.1194, "step": 5214 }, { "epoch": 1.411311053984576, "grad_norm": 0.2022273689508438, "learning_rate": 6.372856374247116e-05, "loss": 1.1308, "step": 5216 }, { "epoch": 1.4118522527398187, "grad_norm": 0.19660891592502594, "learning_rate": 6.369828894138972e-05, "loss": 1.153, "step": 5218 }, { "epoch": 1.4123934514950616, "grad_norm": 0.20182554423809052, "learning_rate": 6.366800870965595e-05, "loss": 1.1085, "step": 5220 }, { "epoch": 1.4129346502503044, "grad_norm": 0.2155122309923172, "learning_rate": 6.363772305927439e-05, "loss": 1.1408, "step": 5222 }, { "epoch": 1.4134758490055472, "grad_norm": 0.1944376677274704, "learning_rate": 6.36074320022517e-05, "loss": 1.134, "step": 5224 }, { "epoch": 1.41401704776079, "grad_norm": 0.19909417629241943, "learning_rate": 6.357713555059667e-05, "loss": 1.1211, "step": 5226 }, { "epoch": 1.414558246516033, "grad_norm": 0.2039753794670105, "learning_rate": 6.354683371632028e-05, "loss": 1.135, "step": 5228 }, { "epoch": 1.415099445271276, "grad_norm": 0.2083454132080078, "learning_rate": 6.351652651143563e-05, "loss": 1.1153, "step": 5230 }, { "epoch": 1.4156406440265188, "grad_norm": 0.19243714213371277, "learning_rate": 6.34862139479579e-05, "loss": 1.1396, "step": 5232 }, { "epoch": 1.4161818427817616, "grad_norm": 0.19831132888793945, "learning_rate": 6.345589603790445e-05, "loss": 1.1144, "step": 5234 }, { "epoch": 1.4167230415370045, "grad_norm": 0.20038144290447235, "learning_rate": 6.342557279329473e-05, "loss": 1.1217, "step": 5236 }, { "epoch": 1.4172642402922473, "grad_norm": 0.1926330327987671, "learning_rate": 6.33952442261503e-05, "loss": 1.1121, "step": 5238 }, { "epoch": 1.4178054390474901, "grad_norm": 0.1921953707933426, "learning_rate": 6.33649103484949e-05, "loss": 1.1044, "step": 5240 }, { "epoch": 1.418346637802733, "grad_norm": 0.19673408567905426, "learning_rate": 6.333457117235426e-05, "loss": 1.1277, "step": 5242 }, { "epoch": 1.418887836557976, "grad_norm": 0.21036703884601593, "learning_rate": 6.330422670975629e-05, "loss": 1.1299, "step": 5244 }, { "epoch": 1.4194290353132188, "grad_norm": 0.20971907675266266, "learning_rate": 6.3273876972731e-05, "loss": 1.1374, "step": 5246 }, { "epoch": 1.4199702340684617, "grad_norm": 0.19726496934890747, "learning_rate": 6.324352197331043e-05, "loss": 1.158, "step": 5248 }, { "epoch": 1.4205114328237045, "grad_norm": 0.19105949997901917, "learning_rate": 6.321316172352875e-05, "loss": 1.1345, "step": 5250 }, { "epoch": 1.4210526315789473, "grad_norm": 0.19923214614391327, "learning_rate": 6.318279623542223e-05, "loss": 1.12, "step": 5252 }, { "epoch": 1.4215938303341902, "grad_norm": 0.2036782056093216, "learning_rate": 6.315242552102919e-05, "loss": 1.1147, "step": 5254 }, { "epoch": 1.422135029089433, "grad_norm": 0.20262980461120605, "learning_rate": 6.312204959238999e-05, "loss": 1.1234, "step": 5256 }, { "epoch": 1.422676227844676, "grad_norm": 0.2113923728466034, "learning_rate": 6.309166846154712e-05, "loss": 1.1301, "step": 5258 }, { "epoch": 1.423217426599919, "grad_norm": 0.20867778360843658, "learning_rate": 6.306128214054508e-05, "loss": 1.1239, "step": 5260 }, { "epoch": 1.4237586253551617, "grad_norm": 0.20628774166107178, "learning_rate": 6.303089064143049e-05, "loss": 1.1468, "step": 5262 }, { "epoch": 1.4242998241104046, "grad_norm": 0.21228572726249695, "learning_rate": 6.300049397625194e-05, "loss": 1.1144, "step": 5264 }, { "epoch": 1.4248410228656474, "grad_norm": 0.2117290496826172, "learning_rate": 6.297009215706013e-05, "loss": 1.1314, "step": 5266 }, { "epoch": 1.4253822216208902, "grad_norm": 0.2099006623029709, "learning_rate": 6.293968519590779e-05, "loss": 1.1248, "step": 5268 }, { "epoch": 1.425923420376133, "grad_norm": 0.20300981402397156, "learning_rate": 6.290927310484969e-05, "loss": 1.1097, "step": 5270 }, { "epoch": 1.4264646191313761, "grad_norm": 0.22275353968143463, "learning_rate": 6.287885589594258e-05, "loss": 1.118, "step": 5272 }, { "epoch": 1.4270058178866187, "grad_norm": 0.20226998627185822, "learning_rate": 6.284843358124538e-05, "loss": 1.1182, "step": 5274 }, { "epoch": 1.4275470166418618, "grad_norm": 0.21334192156791687, "learning_rate": 6.281800617281884e-05, "loss": 1.1023, "step": 5276 }, { "epoch": 1.4280882153971046, "grad_norm": 0.2120470404624939, "learning_rate": 6.278757368272587e-05, "loss": 1.1554, "step": 5278 }, { "epoch": 1.4286294141523475, "grad_norm": 0.20143356919288635, "learning_rate": 6.27571361230314e-05, "loss": 1.1467, "step": 5280 }, { "epoch": 1.4291706129075903, "grad_norm": 0.20335279405117035, "learning_rate": 6.272669350580225e-05, "loss": 1.1178, "step": 5282 }, { "epoch": 1.4297118116628331, "grad_norm": 0.2324477583169937, "learning_rate": 6.269624584310734e-05, "loss": 1.1288, "step": 5284 }, { "epoch": 1.4302530104180762, "grad_norm": 0.22252020239830017, "learning_rate": 6.26657931470176e-05, "loss": 1.1465, "step": 5286 }, { "epoch": 1.4307942091733188, "grad_norm": 0.19176894426345825, "learning_rate": 6.263533542960591e-05, "loss": 1.131, "step": 5288 }, { "epoch": 1.4313354079285618, "grad_norm": 0.20940960943698883, "learning_rate": 6.260487270294714e-05, "loss": 1.1133, "step": 5290 }, { "epoch": 1.4318766066838047, "grad_norm": 0.2106446623802185, "learning_rate": 6.257440497911817e-05, "loss": 1.1438, "step": 5292 }, { "epoch": 1.4324178054390475, "grad_norm": 0.20876814424991608, "learning_rate": 6.254393227019786e-05, "loss": 1.1312, "step": 5294 }, { "epoch": 1.4329590041942903, "grad_norm": 0.2158835232257843, "learning_rate": 6.251345458826703e-05, "loss": 1.1284, "step": 5296 }, { "epoch": 1.4335002029495332, "grad_norm": 0.19507522881031036, "learning_rate": 6.248297194540849e-05, "loss": 1.1194, "step": 5298 }, { "epoch": 1.434041401704776, "grad_norm": 0.20181389153003693, "learning_rate": 6.2452484353707e-05, "loss": 1.1119, "step": 5300 }, { "epoch": 1.4345826004600188, "grad_norm": 0.22877374291419983, "learning_rate": 6.242199182524931e-05, "loss": 1.1195, "step": 5302 }, { "epoch": 1.435123799215262, "grad_norm": 0.2084018886089325, "learning_rate": 6.239149437212407e-05, "loss": 1.1255, "step": 5304 }, { "epoch": 1.4356649979705047, "grad_norm": 0.20372161269187927, "learning_rate": 6.236099200642193e-05, "loss": 1.1286, "step": 5306 }, { "epoch": 1.4362061967257476, "grad_norm": 0.19950535893440247, "learning_rate": 6.233048474023551e-05, "loss": 1.1145, "step": 5308 }, { "epoch": 1.4367473954809904, "grad_norm": 0.20108552277088165, "learning_rate": 6.229997258565929e-05, "loss": 1.1341, "step": 5310 }, { "epoch": 1.4372885942362332, "grad_norm": 0.2273787260055542, "learning_rate": 6.226945555478977e-05, "loss": 1.1273, "step": 5312 }, { "epoch": 1.437829792991476, "grad_norm": 0.22451777756214142, "learning_rate": 6.223893365972535e-05, "loss": 1.1054, "step": 5314 }, { "epoch": 1.438370991746719, "grad_norm": 0.21472510695457458, "learning_rate": 6.220840691256633e-05, "loss": 1.1257, "step": 5316 }, { "epoch": 1.438912190501962, "grad_norm": 0.22151541709899902, "learning_rate": 6.217787532541499e-05, "loss": 1.137, "step": 5318 }, { "epoch": 1.4394533892572048, "grad_norm": 0.2031947374343872, "learning_rate": 6.21473389103755e-05, "loss": 1.1253, "step": 5320 }, { "epoch": 1.4399945880124476, "grad_norm": 0.20195578038692474, "learning_rate": 6.211679767955393e-05, "loss": 1.128, "step": 5322 }, { "epoch": 1.4405357867676905, "grad_norm": 0.19635425508022308, "learning_rate": 6.208625164505828e-05, "loss": 1.1214, "step": 5324 }, { "epoch": 1.4410769855229333, "grad_norm": 0.20612046122550964, "learning_rate": 6.205570081899846e-05, "loss": 1.128, "step": 5326 }, { "epoch": 1.4416181842781761, "grad_norm": 0.21617263555526733, "learning_rate": 6.202514521348627e-05, "loss": 1.1528, "step": 5328 }, { "epoch": 1.442159383033419, "grad_norm": 0.2163170725107193, "learning_rate": 6.199458484063537e-05, "loss": 1.1125, "step": 5330 }, { "epoch": 1.442700581788662, "grad_norm": 0.23022876679897308, "learning_rate": 6.196401971256138e-05, "loss": 1.1316, "step": 5332 }, { "epoch": 1.4432417805439046, "grad_norm": 0.2044323831796646, "learning_rate": 6.193344984138176e-05, "loss": 1.1188, "step": 5334 }, { "epoch": 1.4437829792991477, "grad_norm": 0.2090006023645401, "learning_rate": 6.190287523921585e-05, "loss": 1.1118, "step": 5336 }, { "epoch": 1.4443241780543905, "grad_norm": 0.21892942488193512, "learning_rate": 6.187229591818487e-05, "loss": 1.1139, "step": 5338 }, { "epoch": 1.4448653768096333, "grad_norm": 0.22355982661247253, "learning_rate": 6.184171189041194e-05, "loss": 1.1013, "step": 5340 }, { "epoch": 1.4454065755648762, "grad_norm": 0.20586957037448883, "learning_rate": 6.181112316802199e-05, "loss": 1.1317, "step": 5342 }, { "epoch": 1.445947774320119, "grad_norm": 0.2016911506652832, "learning_rate": 6.178052976314186e-05, "loss": 1.1356, "step": 5344 }, { "epoch": 1.446488973075362, "grad_norm": 0.20564743876457214, "learning_rate": 6.174993168790022e-05, "loss": 1.1019, "step": 5346 }, { "epoch": 1.4470301718306047, "grad_norm": 0.20427408814430237, "learning_rate": 6.171932895442762e-05, "loss": 1.1171, "step": 5348 }, { "epoch": 1.4475713705858477, "grad_norm": 0.21171078085899353, "learning_rate": 6.168872157485641e-05, "loss": 1.1292, "step": 5350 }, { "epoch": 1.4481125693410906, "grad_norm": 0.2225184589624405, "learning_rate": 6.165810956132082e-05, "loss": 1.1199, "step": 5352 }, { "epoch": 1.4486537680963334, "grad_norm": 0.2076125293970108, "learning_rate": 6.162749292595693e-05, "loss": 1.1369, "step": 5354 }, { "epoch": 1.4491949668515762, "grad_norm": 0.19879041612148285, "learning_rate": 6.159687168090259e-05, "loss": 1.108, "step": 5356 }, { "epoch": 1.449736165606819, "grad_norm": 0.24063818156719208, "learning_rate": 6.156624583829753e-05, "loss": 1.1098, "step": 5358 }, { "epoch": 1.450277364362062, "grad_norm": 0.23444120585918427, "learning_rate": 6.15356154102833e-05, "loss": 1.1111, "step": 5360 }, { "epoch": 1.4508185631173047, "grad_norm": 0.22161896526813507, "learning_rate": 6.150498040900325e-05, "loss": 1.112, "step": 5362 }, { "epoch": 1.4513597618725478, "grad_norm": 0.2024424970149994, "learning_rate": 6.147434084660253e-05, "loss": 1.1162, "step": 5364 }, { "epoch": 1.4519009606277906, "grad_norm": 0.20698332786560059, "learning_rate": 6.144369673522813e-05, "loss": 1.1197, "step": 5366 }, { "epoch": 1.4524421593830334, "grad_norm": 0.1972983181476593, "learning_rate": 6.141304808702886e-05, "loss": 1.1378, "step": 5368 }, { "epoch": 1.4529833581382763, "grad_norm": 0.20257173478603363, "learning_rate": 6.138239491415525e-05, "loss": 1.1276, "step": 5370 }, { "epoch": 1.4535245568935191, "grad_norm": 0.2144201546907425, "learning_rate": 6.135173722875972e-05, "loss": 1.1237, "step": 5372 }, { "epoch": 1.454065755648762, "grad_norm": 0.2175070196390152, "learning_rate": 6.132107504299641e-05, "loss": 1.1047, "step": 5374 }, { "epoch": 1.4546069544040048, "grad_norm": 0.22028861939907074, "learning_rate": 6.129040836902126e-05, "loss": 1.1263, "step": 5376 }, { "epoch": 1.4551481531592478, "grad_norm": 0.23484350740909576, "learning_rate": 6.125973721899201e-05, "loss": 1.1198, "step": 5378 }, { "epoch": 1.4556893519144907, "grad_norm": 0.24591375887393951, "learning_rate": 6.122906160506814e-05, "loss": 1.127, "step": 5380 }, { "epoch": 1.4562305506697335, "grad_norm": 0.22519347071647644, "learning_rate": 6.119838153941095e-05, "loss": 1.1405, "step": 5382 }, { "epoch": 1.4567717494249763, "grad_norm": 0.2102644294500351, "learning_rate": 6.116769703418347e-05, "loss": 1.1273, "step": 5384 }, { "epoch": 1.4573129481802192, "grad_norm": 9.886588096618652, "learning_rate": 6.113700810155046e-05, "loss": 1.1474, "step": 5386 }, { "epoch": 1.457854146935462, "grad_norm": 0.24959422647953033, "learning_rate": 6.110631475367852e-05, "loss": 1.1304, "step": 5388 }, { "epoch": 1.4583953456907048, "grad_norm": 0.2364051342010498, "learning_rate": 6.107561700273592e-05, "loss": 1.1316, "step": 5390 }, { "epoch": 1.458936544445948, "grad_norm": 0.2226945012807846, "learning_rate": 6.10449148608927e-05, "loss": 1.1081, "step": 5392 }, { "epoch": 1.4594777432011905, "grad_norm": 0.21343544125556946, "learning_rate": 6.1014208340320665e-05, "loss": 1.1128, "step": 5394 }, { "epoch": 1.4600189419564336, "grad_norm": 0.21775399148464203, "learning_rate": 6.098349745319334e-05, "loss": 1.1365, "step": 5396 }, { "epoch": 1.4605601407116764, "grad_norm": 0.21264183521270752, "learning_rate": 6.0952782211685955e-05, "loss": 1.1251, "step": 5398 }, { "epoch": 1.4611013394669192, "grad_norm": 0.21287740767002106, "learning_rate": 6.092206262797553e-05, "loss": 1.1301, "step": 5400 }, { "epoch": 1.461642538222162, "grad_norm": 0.212521493434906, "learning_rate": 6.089133871424074e-05, "loss": 1.1082, "step": 5402 }, { "epoch": 1.462183736977405, "grad_norm": 0.21181194484233856, "learning_rate": 6.0860610482662005e-05, "loss": 1.0978, "step": 5404 }, { "epoch": 1.462724935732648, "grad_norm": 0.21306376159191132, "learning_rate": 6.0829877945421464e-05, "loss": 1.1167, "step": 5406 }, { "epoch": 1.4632661344878906, "grad_norm": 0.19244280457496643, "learning_rate": 6.079914111470295e-05, "loss": 1.1316, "step": 5408 }, { "epoch": 1.4638073332431336, "grad_norm": 0.2017110288143158, "learning_rate": 6.076840000269199e-05, "loss": 1.1128, "step": 5410 }, { "epoch": 1.4643485319983764, "grad_norm": 0.2144317775964737, "learning_rate": 6.073765462157586e-05, "loss": 1.1187, "step": 5412 }, { "epoch": 1.4648897307536193, "grad_norm": 0.20555539429187775, "learning_rate": 6.0706904983543444e-05, "loss": 1.1234, "step": 5414 }, { "epoch": 1.4654309295088621, "grad_norm": 0.2013130784034729, "learning_rate": 6.0676151100785373e-05, "loss": 1.1329, "step": 5416 }, { "epoch": 1.465972128264105, "grad_norm": 0.19903089106082916, "learning_rate": 6.0645392985493966e-05, "loss": 1.1204, "step": 5418 }, { "epoch": 1.4665133270193478, "grad_norm": 0.20048683881759644, "learning_rate": 6.061463064986317e-05, "loss": 1.1237, "step": 5420 }, { "epoch": 1.4670545257745906, "grad_norm": 0.2066463679075241, "learning_rate": 6.058386410608865e-05, "loss": 1.1252, "step": 5422 }, { "epoch": 1.4675957245298337, "grad_norm": 0.19743682444095612, "learning_rate": 6.055309336636773e-05, "loss": 1.1135, "step": 5424 }, { "epoch": 1.4681369232850765, "grad_norm": 0.20476719737052917, "learning_rate": 6.05223184428994e-05, "loss": 1.1326, "step": 5426 }, { "epoch": 1.4686781220403193, "grad_norm": 0.2180105745792389, "learning_rate": 6.049153934788429e-05, "loss": 1.1321, "step": 5428 }, { "epoch": 1.4692193207955622, "grad_norm": 0.19696936011314392, "learning_rate": 6.0460756093524684e-05, "loss": 1.1273, "step": 5430 }, { "epoch": 1.469760519550805, "grad_norm": 0.20372888445854187, "learning_rate": 6.0429968692024544e-05, "loss": 1.1293, "step": 5432 }, { "epoch": 1.4703017183060478, "grad_norm": 0.20151478052139282, "learning_rate": 6.039917715558945e-05, "loss": 1.1156, "step": 5434 }, { "epoch": 1.4708429170612907, "grad_norm": 0.20314039289951324, "learning_rate": 6.036838149642664e-05, "loss": 1.1231, "step": 5436 }, { "epoch": 1.4713841158165337, "grad_norm": 0.24197211861610413, "learning_rate": 6.033758172674495e-05, "loss": 1.104, "step": 5438 }, { "epoch": 1.4719253145717766, "grad_norm": 0.20811395347118378, "learning_rate": 6.0306777858754915e-05, "loss": 1.1323, "step": 5440 }, { "epoch": 1.4724665133270194, "grad_norm": 0.1977129578590393, "learning_rate": 6.0275969904668605e-05, "loss": 1.1314, "step": 5442 }, { "epoch": 1.4730077120822622, "grad_norm": 0.2016446590423584, "learning_rate": 6.0245157876699774e-05, "loss": 1.1146, "step": 5444 }, { "epoch": 1.473548910837505, "grad_norm": 0.18964055180549622, "learning_rate": 6.0214341787063776e-05, "loss": 1.1309, "step": 5446 }, { "epoch": 1.4740901095927479, "grad_norm": 0.2042587697505951, "learning_rate": 6.018352164797759e-05, "loss": 1.1211, "step": 5448 }, { "epoch": 1.4746313083479907, "grad_norm": 0.19844447076320648, "learning_rate": 6.015269747165975e-05, "loss": 1.121, "step": 5450 }, { "epoch": 1.4751725071032338, "grad_norm": 0.19678519666194916, "learning_rate": 6.012186927033044e-05, "loss": 1.1441, "step": 5452 }, { "epoch": 1.4757137058584764, "grad_norm": 0.20729778707027435, "learning_rate": 6.009103705621144e-05, "loss": 1.1155, "step": 5454 }, { "epoch": 1.4762549046137194, "grad_norm": 0.21526449918746948, "learning_rate": 6.006020084152606e-05, "loss": 1.1449, "step": 5456 }, { "epoch": 1.4767961033689623, "grad_norm": 0.21945127844810486, "learning_rate": 6.0029360638499286e-05, "loss": 1.1031, "step": 5458 }, { "epoch": 1.477337302124205, "grad_norm": 0.1969752162694931, "learning_rate": 5.9998516459357604e-05, "loss": 1.1191, "step": 5460 }, { "epoch": 1.477878500879448, "grad_norm": 0.22696499526500702, "learning_rate": 5.996766831632913e-05, "loss": 1.1128, "step": 5462 }, { "epoch": 1.4784196996346908, "grad_norm": 0.26032930612564087, "learning_rate": 5.993681622164354e-05, "loss": 1.1213, "step": 5464 }, { "epoch": 1.4789608983899338, "grad_norm": 0.21460728347301483, "learning_rate": 5.990596018753204e-05, "loss": 1.1081, "step": 5466 }, { "epoch": 1.4795020971451764, "grad_norm": 0.20702674984931946, "learning_rate": 5.987510022622746e-05, "loss": 1.1147, "step": 5468 }, { "epoch": 1.4800432959004195, "grad_norm": 0.19713331758975983, "learning_rate": 5.9844236349964134e-05, "loss": 1.1385, "step": 5470 }, { "epoch": 1.4805844946556623, "grad_norm": 0.20525701344013214, "learning_rate": 5.981336857097799e-05, "loss": 1.1286, "step": 5472 }, { "epoch": 1.4811256934109052, "grad_norm": 0.19685855507850647, "learning_rate": 5.9782496901506444e-05, "loss": 1.1317, "step": 5474 }, { "epoch": 1.481666892166148, "grad_norm": 0.20022651553153992, "learning_rate": 5.9751621353788535e-05, "loss": 1.1248, "step": 5476 }, { "epoch": 1.4822080909213908, "grad_norm": 0.2085038274526596, "learning_rate": 5.972074194006476e-05, "loss": 1.1551, "step": 5478 }, { "epoch": 1.4827492896766337, "grad_norm": 0.20097662508487701, "learning_rate": 5.968985867257721e-05, "loss": 1.1197, "step": 5480 }, { "epoch": 1.4832904884318765, "grad_norm": 0.19229756295681, "learning_rate": 5.965897156356949e-05, "loss": 1.1258, "step": 5482 }, { "epoch": 1.4838316871871196, "grad_norm": 0.2001403421163559, "learning_rate": 5.9628080625286665e-05, "loss": 1.1131, "step": 5484 }, { "epoch": 1.4843728859423624, "grad_norm": 0.19318203628063202, "learning_rate": 5.959718586997542e-05, "loss": 1.118, "step": 5486 }, { "epoch": 1.4849140846976052, "grad_norm": 0.19797928631305695, "learning_rate": 5.95662873098839e-05, "loss": 1.103, "step": 5488 }, { "epoch": 1.485455283452848, "grad_norm": 0.20442748069763184, "learning_rate": 5.953538495726172e-05, "loss": 1.1066, "step": 5490 }, { "epoch": 1.4859964822080909, "grad_norm": 0.19988074898719788, "learning_rate": 5.9504478824360077e-05, "loss": 1.1022, "step": 5492 }, { "epoch": 1.4865376809633337, "grad_norm": 0.20216168463230133, "learning_rate": 5.947356892343161e-05, "loss": 1.1222, "step": 5494 }, { "epoch": 1.4870788797185766, "grad_norm": 0.2014523148536682, "learning_rate": 5.944265526673051e-05, "loss": 1.1327, "step": 5496 }, { "epoch": 1.4876200784738196, "grad_norm": 0.1946011334657669, "learning_rate": 5.941173786651236e-05, "loss": 1.0985, "step": 5498 }, { "epoch": 1.4881612772290624, "grad_norm": 0.19211317598819733, "learning_rate": 5.938081673503433e-05, "loss": 1.1405, "step": 5500 }, { "epoch": 1.4887024759843053, "grad_norm": 0.19912263751029968, "learning_rate": 5.934989188455502e-05, "loss": 1.1234, "step": 5502 }, { "epoch": 1.489243674739548, "grad_norm": 0.2064303755760193, "learning_rate": 5.931896332733451e-05, "loss": 1.1317, "step": 5504 }, { "epoch": 1.489784873494791, "grad_norm": 0.19061782956123352, "learning_rate": 5.928803107563432e-05, "loss": 1.1099, "step": 5506 }, { "epoch": 1.4903260722500338, "grad_norm": 0.1989295333623886, "learning_rate": 5.92570951417175e-05, "loss": 1.1273, "step": 5508 }, { "epoch": 1.4908672710052766, "grad_norm": 0.19522297382354736, "learning_rate": 5.92261555378485e-05, "loss": 1.1123, "step": 5510 }, { "epoch": 1.4914084697605197, "grad_norm": 0.1995413452386856, "learning_rate": 5.9195212276293255e-05, "loss": 1.1218, "step": 5512 }, { "epoch": 1.4919496685157625, "grad_norm": 0.1953144520521164, "learning_rate": 5.916426536931915e-05, "loss": 1.1164, "step": 5514 }, { "epoch": 1.4924908672710053, "grad_norm": 0.20359961688518524, "learning_rate": 5.9133314829195006e-05, "loss": 1.1134, "step": 5516 }, { "epoch": 1.4930320660262482, "grad_norm": 0.20576973259449005, "learning_rate": 5.9102360668191084e-05, "loss": 1.1195, "step": 5518 }, { "epoch": 1.493573264781491, "grad_norm": 0.2180139273405075, "learning_rate": 5.907140289857907e-05, "loss": 1.1179, "step": 5520 }, { "epoch": 1.4941144635367338, "grad_norm": 0.2019367218017578, "learning_rate": 5.9040441532632115e-05, "loss": 1.1057, "step": 5522 }, { "epoch": 1.4946556622919767, "grad_norm": 0.19558557868003845, "learning_rate": 5.900947658262477e-05, "loss": 1.1164, "step": 5524 }, { "epoch": 1.4951968610472197, "grad_norm": 0.19859646260738373, "learning_rate": 5.897850806083302e-05, "loss": 1.1267, "step": 5526 }, { "epoch": 1.4957380598024623, "grad_norm": 0.19553080201148987, "learning_rate": 5.8947535979534244e-05, "loss": 1.1277, "step": 5528 }, { "epoch": 1.4962792585577054, "grad_norm": 0.20781400799751282, "learning_rate": 5.891656035100724e-05, "loss": 1.1179, "step": 5530 }, { "epoch": 1.4968204573129482, "grad_norm": 0.20255321264266968, "learning_rate": 5.8885581187532246e-05, "loss": 1.1243, "step": 5532 }, { "epoch": 1.497361656068191, "grad_norm": 0.2056676149368286, "learning_rate": 5.8854598501390845e-05, "loss": 1.1329, "step": 5534 }, { "epoch": 1.4979028548234339, "grad_norm": 0.20784781873226166, "learning_rate": 5.8823612304866046e-05, "loss": 1.1042, "step": 5536 }, { "epoch": 1.4984440535786767, "grad_norm": 0.20506465435028076, "learning_rate": 5.8792622610242275e-05, "loss": 1.1157, "step": 5538 }, { "epoch": 1.4989852523339195, "grad_norm": 0.21897608041763306, "learning_rate": 5.8761629429805296e-05, "loss": 1.1311, "step": 5540 }, { "epoch": 1.4995264510891624, "grad_norm": 0.21256893873214722, "learning_rate": 5.87306327758423e-05, "loss": 1.1106, "step": 5542 }, { "epoch": 1.5000676498444054, "grad_norm": 0.2042350023984909, "learning_rate": 5.86996326606418e-05, "loss": 1.1029, "step": 5544 }, { "epoch": 1.5006088485996483, "grad_norm": 0.20099198818206787, "learning_rate": 5.866862909649373e-05, "loss": 1.1068, "step": 5546 }, { "epoch": 1.501150047354891, "grad_norm": 0.20550420880317688, "learning_rate": 5.863762209568938e-05, "loss": 1.1256, "step": 5548 }, { "epoch": 1.501691246110134, "grad_norm": 0.2070087343454361, "learning_rate": 5.8606611670521404e-05, "loss": 1.1324, "step": 5550 }, { "epoch": 1.5022324448653768, "grad_norm": 0.2023104876279831, "learning_rate": 5.8575597833283794e-05, "loss": 1.1188, "step": 5552 }, { "epoch": 1.5027736436206198, "grad_norm": 0.1989484578371048, "learning_rate": 5.854458059627191e-05, "loss": 1.1313, "step": 5554 }, { "epoch": 1.5033148423758624, "grad_norm": 0.19165557622909546, "learning_rate": 5.851355997178247e-05, "loss": 1.1385, "step": 5556 }, { "epoch": 1.5038560411311055, "grad_norm": 0.20374418795108795, "learning_rate": 5.848253597211349e-05, "loss": 1.1304, "step": 5558 }, { "epoch": 1.504397239886348, "grad_norm": 0.19242756068706512, "learning_rate": 5.845150860956441e-05, "loss": 1.1146, "step": 5560 }, { "epoch": 1.5049384386415912, "grad_norm": 0.19632889330387115, "learning_rate": 5.84204778964359e-05, "loss": 1.1212, "step": 5562 }, { "epoch": 1.505479637396834, "grad_norm": 0.19135157763957977, "learning_rate": 5.838944384503003e-05, "loss": 1.1194, "step": 5564 }, { "epoch": 1.5060208361520768, "grad_norm": 0.20354853570461273, "learning_rate": 5.835840646765019e-05, "loss": 1.1318, "step": 5566 }, { "epoch": 1.5065620349073197, "grad_norm": 0.19074110686779022, "learning_rate": 5.832736577660103e-05, "loss": 1.1257, "step": 5568 }, { "epoch": 1.5071032336625625, "grad_norm": 0.19348789751529694, "learning_rate": 5.829632178418857e-05, "loss": 1.0964, "step": 5570 }, { "epoch": 1.5076444324178055, "grad_norm": 0.19624833762645721, "learning_rate": 5.8265274502720134e-05, "loss": 1.1124, "step": 5572 }, { "epoch": 1.5081856311730482, "grad_norm": 0.1924959421157837, "learning_rate": 5.823422394450434e-05, "loss": 1.1188, "step": 5574 }, { "epoch": 1.5087268299282912, "grad_norm": 0.2001020610332489, "learning_rate": 5.820317012185108e-05, "loss": 1.105, "step": 5576 }, { "epoch": 1.509268028683534, "grad_norm": 0.1978660523891449, "learning_rate": 5.817211304707161e-05, "loss": 1.1073, "step": 5578 }, { "epoch": 1.5098092274387769, "grad_norm": 0.1959565430879593, "learning_rate": 5.8141052732478375e-05, "loss": 1.12, "step": 5580 }, { "epoch": 1.5103504261940197, "grad_norm": 0.1948666125535965, "learning_rate": 5.81099891903852e-05, "loss": 1.118, "step": 5582 }, { "epoch": 1.5108916249492625, "grad_norm": 0.19219863414764404, "learning_rate": 5.807892243310713e-05, "loss": 1.0994, "step": 5584 }, { "epoch": 1.5114328237045056, "grad_norm": 0.20111101865768433, "learning_rate": 5.8047852472960496e-05, "loss": 1.099, "step": 5586 }, { "epoch": 1.5119740224597482, "grad_norm": 0.21085359156131744, "learning_rate": 5.801677932226293e-05, "loss": 1.1227, "step": 5588 }, { "epoch": 1.5125152212149913, "grad_norm": 0.20131339132785797, "learning_rate": 5.798570299333329e-05, "loss": 1.1148, "step": 5590 }, { "epoch": 1.513056419970234, "grad_norm": 0.20731684565544128, "learning_rate": 5.79546234984917e-05, "loss": 1.1049, "step": 5592 }, { "epoch": 1.513597618725477, "grad_norm": 0.19507162272930145, "learning_rate": 5.792354085005956e-05, "loss": 1.1032, "step": 5594 }, { "epoch": 1.5141388174807198, "grad_norm": 0.19368614256381989, "learning_rate": 5.78924550603595e-05, "loss": 1.1197, "step": 5596 }, { "epoch": 1.5146800162359626, "grad_norm": 0.20111224055290222, "learning_rate": 5.7861366141715424e-05, "loss": 1.1129, "step": 5598 }, { "epoch": 1.5152212149912057, "grad_norm": 0.1937616467475891, "learning_rate": 5.783027410645242e-05, "loss": 1.1078, "step": 5600 }, { "epoch": 1.5157624137464483, "grad_norm": 0.19857732951641083, "learning_rate": 5.7799178966896885e-05, "loss": 1.1269, "step": 5602 }, { "epoch": 1.5163036125016913, "grad_norm": 0.19797003269195557, "learning_rate": 5.776808073537637e-05, "loss": 1.115, "step": 5604 }, { "epoch": 1.5168448112569342, "grad_norm": 0.1918354630470276, "learning_rate": 5.773697942421974e-05, "loss": 1.1062, "step": 5606 }, { "epoch": 1.517386010012177, "grad_norm": 0.20058605074882507, "learning_rate": 5.7705875045756995e-05, "loss": 1.1242, "step": 5608 }, { "epoch": 1.5179272087674198, "grad_norm": 0.20740623772144318, "learning_rate": 5.76747676123194e-05, "loss": 1.1234, "step": 5610 }, { "epoch": 1.5184684075226627, "grad_norm": 0.20457324385643005, "learning_rate": 5.7643657136239416e-05, "loss": 1.1229, "step": 5612 }, { "epoch": 1.5190096062779057, "grad_norm": 0.2054700404405594, "learning_rate": 5.76125436298507e-05, "loss": 1.1146, "step": 5614 }, { "epoch": 1.5195508050331483, "grad_norm": 0.1983955353498459, "learning_rate": 5.758142710548816e-05, "loss": 1.1406, "step": 5616 }, { "epoch": 1.5200920037883914, "grad_norm": 0.20349404215812683, "learning_rate": 5.755030757548784e-05, "loss": 1.122, "step": 5618 }, { "epoch": 1.520633202543634, "grad_norm": 0.2126217931509018, "learning_rate": 5.751918505218698e-05, "loss": 1.1192, "step": 5620 }, { "epoch": 1.521174401298877, "grad_norm": 0.20240618288516998, "learning_rate": 5.748805954792407e-05, "loss": 1.128, "step": 5622 }, { "epoch": 1.5217156000541199, "grad_norm": 0.19409391283988953, "learning_rate": 5.74569310750387e-05, "loss": 1.116, "step": 5624 }, { "epoch": 1.5222567988093627, "grad_norm": 0.19456911087036133, "learning_rate": 5.74257996458717e-05, "loss": 1.1168, "step": 5626 }, { "epoch": 1.5227979975646058, "grad_norm": 0.20936742424964905, "learning_rate": 5.7394665272765045e-05, "loss": 1.1283, "step": 5628 }, { "epoch": 1.5233391963198484, "grad_norm": 0.19374299049377441, "learning_rate": 5.736352796806187e-05, "loss": 1.1181, "step": 5630 }, { "epoch": 1.5238803950750914, "grad_norm": 0.2012673169374466, "learning_rate": 5.7332387744106475e-05, "loss": 1.1186, "step": 5632 }, { "epoch": 1.524421593830334, "grad_norm": 0.19683314859867096, "learning_rate": 5.730124461324433e-05, "loss": 1.1133, "step": 5634 }, { "epoch": 1.524962792585577, "grad_norm": 0.20046451687812805, "learning_rate": 5.7270098587822075e-05, "loss": 1.0925, "step": 5636 }, { "epoch": 1.52550399134082, "grad_norm": 0.20211833715438843, "learning_rate": 5.723894968018744e-05, "loss": 1.1286, "step": 5638 }, { "epoch": 1.5260451900960628, "grad_norm": 0.19466207921504974, "learning_rate": 5.7207797902689344e-05, "loss": 1.1178, "step": 5640 }, { "epoch": 1.5265863888513056, "grad_norm": 0.19484566152095795, "learning_rate": 5.717664326767783e-05, "loss": 1.1182, "step": 5642 }, { "epoch": 1.5271275876065484, "grad_norm": 0.20028981566429138, "learning_rate": 5.714548578750407e-05, "loss": 1.1245, "step": 5644 }, { "epoch": 1.5276687863617915, "grad_norm": 0.21267130970954895, "learning_rate": 5.711432547452038e-05, "loss": 1.1072, "step": 5646 }, { "epoch": 1.528209985117034, "grad_norm": 0.20393440127372742, "learning_rate": 5.708316234108019e-05, "loss": 1.1179, "step": 5648 }, { "epoch": 1.5287511838722772, "grad_norm": 0.19241678714752197, "learning_rate": 5.705199639953802e-05, "loss": 1.1438, "step": 5650 }, { "epoch": 1.52929238262752, "grad_norm": 0.1952701359987259, "learning_rate": 5.702082766224957e-05, "loss": 1.0999, "step": 5652 }, { "epoch": 1.5298335813827628, "grad_norm": 0.1929616928100586, "learning_rate": 5.698965614157157e-05, "loss": 1.1093, "step": 5654 }, { "epoch": 1.5303747801380057, "grad_norm": 0.19738009572029114, "learning_rate": 5.6958481849861924e-05, "loss": 1.1225, "step": 5656 }, { "epoch": 1.5309159788932485, "grad_norm": 0.21183903515338898, "learning_rate": 5.6927304799479586e-05, "loss": 1.1238, "step": 5658 }, { "epoch": 1.5314571776484915, "grad_norm": 0.19697804749011993, "learning_rate": 5.6896125002784605e-05, "loss": 1.1164, "step": 5660 }, { "epoch": 1.5319983764037342, "grad_norm": 0.2036931812763214, "learning_rate": 5.6864942472138164e-05, "loss": 1.1144, "step": 5662 }, { "epoch": 1.5325395751589772, "grad_norm": 0.1969900131225586, "learning_rate": 5.683375721990247e-05, "loss": 1.1071, "step": 5664 }, { "epoch": 1.53308077391422, "grad_norm": 0.21038979291915894, "learning_rate": 5.680256925844085e-05, "loss": 1.1431, "step": 5666 }, { "epoch": 1.5336219726694629, "grad_norm": 0.20183879137039185, "learning_rate": 5.6771378600117696e-05, "loss": 1.1036, "step": 5668 }, { "epoch": 1.5341631714247057, "grad_norm": 0.1987054944038391, "learning_rate": 5.674018525729847e-05, "loss": 1.1207, "step": 5670 }, { "epoch": 1.5347043701799485, "grad_norm": 0.19822414219379425, "learning_rate": 5.670898924234968e-05, "loss": 1.1396, "step": 5672 }, { "epoch": 1.5352455689351916, "grad_norm": 0.21729126572608948, "learning_rate": 5.6677790567638913e-05, "loss": 1.1022, "step": 5674 }, { "epoch": 1.5357867676904342, "grad_norm": 0.209171324968338, "learning_rate": 5.664658924553482e-05, "loss": 1.1315, "step": 5676 }, { "epoch": 1.5363279664456773, "grad_norm": 0.19557133316993713, "learning_rate": 5.661538528840706e-05, "loss": 1.1161, "step": 5678 }, { "epoch": 1.5368691652009199, "grad_norm": 0.19774852693080902, "learning_rate": 5.65841787086264e-05, "loss": 1.111, "step": 5680 }, { "epoch": 1.537410363956163, "grad_norm": 0.19714070856571198, "learning_rate": 5.655296951856459e-05, "loss": 1.1402, "step": 5682 }, { "epoch": 1.5379515627114058, "grad_norm": 0.20607705414295197, "learning_rate": 5.6521757730594425e-05, "loss": 1.1228, "step": 5684 }, { "epoch": 1.5384927614666486, "grad_norm": 0.19225092232227325, "learning_rate": 5.649054335708975e-05, "loss": 1.1217, "step": 5686 }, { "epoch": 1.5390339602218917, "grad_norm": 0.19334369897842407, "learning_rate": 5.645932641042544e-05, "loss": 1.107, "step": 5688 }, { "epoch": 1.5395751589771343, "grad_norm": 0.1904376894235611, "learning_rate": 5.642810690297734e-05, "loss": 1.1095, "step": 5690 }, { "epoch": 1.5401163577323773, "grad_norm": 0.19064228236675262, "learning_rate": 5.639688484712238e-05, "loss": 1.109, "step": 5692 }, { "epoch": 1.54065755648762, "grad_norm": 0.19435817003250122, "learning_rate": 5.636566025523844e-05, "loss": 1.1089, "step": 5694 }, { "epoch": 1.541198755242863, "grad_norm": 0.2055196613073349, "learning_rate": 5.6334433139704455e-05, "loss": 1.1357, "step": 5696 }, { "epoch": 1.5417399539981058, "grad_norm": 0.20478758215904236, "learning_rate": 5.630320351290032e-05, "loss": 1.1057, "step": 5698 }, { "epoch": 1.5422811527533486, "grad_norm": 0.1930474191904068, "learning_rate": 5.627197138720694e-05, "loss": 1.1148, "step": 5700 } ], "logging_steps": 2, "max_steps": 11088, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.3692714142476743e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }