{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08117981328642944, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00027059937762143147, "grad_norm": 4.086390018463135, "learning_rate": 0.0, "loss": 3.2754, "step": 1 }, { "epoch": 0.0005411987552428629, "grad_norm": 3.758815288543701, "learning_rate": 9.017132551848513e-08, "loss": 3.2863, "step": 2 }, { "epoch": 0.0010823975104857259, "grad_norm": 3.8250608444213867, "learning_rate": 2.705139765554554e-07, "loss": 3.3425, "step": 4 }, { "epoch": 0.0016235962657285888, "grad_norm": 3.8092095851898193, "learning_rate": 4.5085662759242564e-07, "loss": 3.3165, "step": 6 }, { "epoch": 0.0021647950209714517, "grad_norm": 3.7621052265167236, "learning_rate": 6.311992786293959e-07, "loss": 3.3295, "step": 8 }, { "epoch": 0.002705993776214315, "grad_norm": 3.4136276245117188, "learning_rate": 8.115419296663661e-07, "loss": 3.3073, "step": 10 }, { "epoch": 0.0032471925314571776, "grad_norm": 2.855100393295288, "learning_rate": 9.918845807033363e-07, "loss": 3.3031, "step": 12 }, { "epoch": 0.0037883912867000408, "grad_norm": 2.491767406463623, "learning_rate": 1.1722272317403068e-06, "loss": 3.2943, "step": 14 }, { "epoch": 0.0043295900419429035, "grad_norm": 2.359778642654419, "learning_rate": 1.3525698827772768e-06, "loss": 3.2622, "step": 16 }, { "epoch": 0.004870788797185766, "grad_norm": 2.037504196166992, "learning_rate": 1.5329125338142473e-06, "loss": 3.239, "step": 18 }, { "epoch": 0.00541198755242863, "grad_norm": 2.8542497158050537, "learning_rate": 1.7132551848512173e-06, "loss": 3.2031, "step": 20 }, { "epoch": 0.0059531863076714925, "grad_norm": 2.297046661376953, "learning_rate": 1.8935978358881876e-06, "loss": 3.1721, "step": 22 }, { "epoch": 0.006494385062914355, "grad_norm": 2.2149112224578857, "learning_rate": 2.0739404869251576e-06, "loss": 3.121, "step": 24 }, { "epoch": 0.007035583818157218, "grad_norm": 1.8048591613769531, "learning_rate": 2.254283137962128e-06, "loss": 3.0857, "step": 26 }, { "epoch": 0.0075767825734000815, "grad_norm": 1.7466434240341187, "learning_rate": 2.4346257889990986e-06, "loss": 3.0489, "step": 28 }, { "epoch": 0.008117981328642944, "grad_norm": 2.1722524166107178, "learning_rate": 2.6149684400360686e-06, "loss": 3.0016, "step": 30 }, { "epoch": 0.008659180083885807, "grad_norm": 1.364578366279602, "learning_rate": 2.7953110910730386e-06, "loss": 2.9587, "step": 32 }, { "epoch": 0.00920037883912867, "grad_norm": 1.5823427438735962, "learning_rate": 2.9756537421100095e-06, "loss": 2.931, "step": 34 }, { "epoch": 0.009741577594371532, "grad_norm": 1.2367908954620361, "learning_rate": 3.1559963931469796e-06, "loss": 2.8953, "step": 36 }, { "epoch": 0.010282776349614395, "grad_norm": 1.0437366962432861, "learning_rate": 3.3363390441839496e-06, "loss": 2.8412, "step": 38 }, { "epoch": 0.01082397510485726, "grad_norm": 1.081803798675537, "learning_rate": 3.5166816952209197e-06, "loss": 2.7832, "step": 40 }, { "epoch": 0.011365173860100122, "grad_norm": 0.9715840220451355, "learning_rate": 3.69702434625789e-06, "loss": 2.7729, "step": 42 }, { "epoch": 0.011906372615342985, "grad_norm": 0.8603936433792114, "learning_rate": 3.877366997294861e-06, "loss": 2.6904, "step": 44 }, { "epoch": 0.012447571370585848, "grad_norm": 0.8236231803894043, "learning_rate": 4.057709648331831e-06, "loss": 2.6908, "step": 46 }, { "epoch": 0.01298877012582871, "grad_norm": 0.7681186199188232, "learning_rate": 4.2380522993688015e-06, "loss": 2.6212, "step": 48 }, { "epoch": 0.013529968881071573, "grad_norm": 0.8002827167510986, "learning_rate": 4.4183949504057716e-06, "loss": 2.6035, "step": 50 }, { "epoch": 0.014071167636314436, "grad_norm": 0.6757120490074158, "learning_rate": 4.598737601442742e-06, "loss": 2.595, "step": 52 }, { "epoch": 0.014612366391557299, "grad_norm": 0.6619369387626648, "learning_rate": 4.779080252479712e-06, "loss": 2.5522, "step": 54 }, { "epoch": 0.015153565146800163, "grad_norm": 0.6247105598449707, "learning_rate": 4.959422903516682e-06, "loss": 2.5079, "step": 56 }, { "epoch": 0.015694763902043024, "grad_norm": 0.6559263467788696, "learning_rate": 5.139765554553652e-06, "loss": 2.5009, "step": 58 }, { "epoch": 0.01623596265728589, "grad_norm": 0.6590877175331116, "learning_rate": 5.320108205590623e-06, "loss": 2.4648, "step": 60 }, { "epoch": 0.01677716141252875, "grad_norm": 0.6045516133308411, "learning_rate": 5.500450856627593e-06, "loss": 2.421, "step": 62 }, { "epoch": 0.017318360167771614, "grad_norm": 0.6533932089805603, "learning_rate": 5.680793507664563e-06, "loss": 2.3966, "step": 64 }, { "epoch": 0.01785955892301448, "grad_norm": 0.6478094458580017, "learning_rate": 5.861136158701533e-06, "loss": 2.3903, "step": 66 }, { "epoch": 0.01840075767825734, "grad_norm": 0.7349300980567932, "learning_rate": 6.041478809738504e-06, "loss": 2.3552, "step": 68 }, { "epoch": 0.018941956433500204, "grad_norm": 0.6454821825027466, "learning_rate": 6.221821460775474e-06, "loss": 2.3262, "step": 70 }, { "epoch": 0.019483155188743065, "grad_norm": 0.7321672439575195, "learning_rate": 6.402164111812444e-06, "loss": 2.3197, "step": 72 }, { "epoch": 0.02002435394398593, "grad_norm": 0.7664237022399902, "learning_rate": 6.582506762849414e-06, "loss": 2.2992, "step": 74 }, { "epoch": 0.02056555269922879, "grad_norm": 0.6843811869621277, "learning_rate": 6.762849413886384e-06, "loss": 2.2927, "step": 76 }, { "epoch": 0.021106751454471655, "grad_norm": 0.7199612259864807, "learning_rate": 6.9431920649233556e-06, "loss": 2.2525, "step": 78 }, { "epoch": 0.02164795020971452, "grad_norm": 0.778446614742279, "learning_rate": 7.123534715960326e-06, "loss": 2.2267, "step": 80 }, { "epoch": 0.02218914896495738, "grad_norm": 0.9287930727005005, "learning_rate": 7.303877366997296e-06, "loss": 2.2206, "step": 82 }, { "epoch": 0.022730347720200245, "grad_norm": 1.033782958984375, "learning_rate": 7.484220018034266e-06, "loss": 2.2063, "step": 84 }, { "epoch": 0.023271546475443106, "grad_norm": 1.0132615566253662, "learning_rate": 7.664562669071236e-06, "loss": 2.1677, "step": 86 }, { "epoch": 0.02381274523068597, "grad_norm": 0.9043529033660889, "learning_rate": 7.844905320108207e-06, "loss": 2.1696, "step": 88 }, { "epoch": 0.02435394398592883, "grad_norm": 0.6718290448188782, "learning_rate": 8.025247971145176e-06, "loss": 2.1492, "step": 90 }, { "epoch": 0.024895142741171696, "grad_norm": 0.9615944027900696, "learning_rate": 8.205590622182147e-06, "loss": 2.1452, "step": 92 }, { "epoch": 0.02543634149641456, "grad_norm": 0.9435996413230896, "learning_rate": 8.385933273219116e-06, "loss": 2.1098, "step": 94 }, { "epoch": 0.02597754025165742, "grad_norm": 0.7614261507987976, "learning_rate": 8.566275924256087e-06, "loss": 2.1286, "step": 96 }, { "epoch": 0.026518739006900285, "grad_norm": 0.9416339993476868, "learning_rate": 8.746618575293058e-06, "loss": 2.1092, "step": 98 }, { "epoch": 0.027059937762143146, "grad_norm": 0.9229443073272705, "learning_rate": 8.926961226330027e-06, "loss": 2.0932, "step": 100 }, { "epoch": 0.02760113651738601, "grad_norm": 0.7135593295097351, "learning_rate": 9.107303877366998e-06, "loss": 2.0699, "step": 102 }, { "epoch": 0.028142335272628872, "grad_norm": 1.0263723134994507, "learning_rate": 9.287646528403967e-06, "loss": 2.0445, "step": 104 }, { "epoch": 0.028683534027871736, "grad_norm": 1.0300300121307373, "learning_rate": 9.467989179440938e-06, "loss": 2.0463, "step": 106 }, { "epoch": 0.029224732783114597, "grad_norm": 0.8331286311149597, "learning_rate": 9.648331830477909e-06, "loss": 2.0381, "step": 108 }, { "epoch": 0.02976593153835746, "grad_norm": 0.7501435875892639, "learning_rate": 9.828674481514878e-06, "loss": 2.0411, "step": 110 }, { "epoch": 0.030307130293600326, "grad_norm": 0.6895191073417664, "learning_rate": 1.0009017132551849e-05, "loss": 2.0475, "step": 112 }, { "epoch": 0.030848329048843187, "grad_norm": 0.95854252576828, "learning_rate": 1.018935978358882e-05, "loss": 2.0071, "step": 114 }, { "epoch": 0.03138952780408605, "grad_norm": 1.1303929090499878, "learning_rate": 1.036970243462579e-05, "loss": 2.0008, "step": 116 }, { "epoch": 0.031930726559328916, "grad_norm": 0.7708876729011536, "learning_rate": 1.055004508566276e-05, "loss": 2.0061, "step": 118 }, { "epoch": 0.03247192531457178, "grad_norm": 0.9773860573768616, "learning_rate": 1.073038773669973e-05, "loss": 2.0096, "step": 120 }, { "epoch": 0.03301312406981464, "grad_norm": 1.118385910987854, "learning_rate": 1.09107303877367e-05, "loss": 1.9939, "step": 122 }, { "epoch": 0.0335543228250575, "grad_norm": 0.7215014696121216, "learning_rate": 1.109107303877367e-05, "loss": 1.9515, "step": 124 }, { "epoch": 0.03409552158030037, "grad_norm": 0.9696834683418274, "learning_rate": 1.1271415689810642e-05, "loss": 1.9639, "step": 126 }, { "epoch": 0.03463672033554323, "grad_norm": 0.945482611656189, "learning_rate": 1.1451758340847611e-05, "loss": 1.9397, "step": 128 }, { "epoch": 0.03517791909078609, "grad_norm": 0.7454535365104675, "learning_rate": 1.1632100991884582e-05, "loss": 1.9353, "step": 130 }, { "epoch": 0.03571911784602896, "grad_norm": 0.7824187278747559, "learning_rate": 1.1812443642921551e-05, "loss": 1.9227, "step": 132 }, { "epoch": 0.03626031660127182, "grad_norm": 0.7939879894256592, "learning_rate": 1.1992786293958522e-05, "loss": 1.9126, "step": 134 }, { "epoch": 0.03680151535651468, "grad_norm": 0.7776147723197937, "learning_rate": 1.2173128944995491e-05, "loss": 1.9002, "step": 136 }, { "epoch": 0.03734271411175754, "grad_norm": 0.6580236554145813, "learning_rate": 1.2353471596032462e-05, "loss": 1.9121, "step": 138 }, { "epoch": 0.03788391286700041, "grad_norm": 0.7200301289558411, "learning_rate": 1.2533814247069433e-05, "loss": 1.8885, "step": 140 }, { "epoch": 0.03842511162224327, "grad_norm": 0.7958497405052185, "learning_rate": 1.2714156898106402e-05, "loss": 1.9095, "step": 142 }, { "epoch": 0.03896631037748613, "grad_norm": 0.9120681881904602, "learning_rate": 1.2894499549143375e-05, "loss": 1.884, "step": 144 }, { "epoch": 0.039507509132729, "grad_norm": 0.8108247518539429, "learning_rate": 1.3074842200180342e-05, "loss": 1.8656, "step": 146 }, { "epoch": 0.04004870788797186, "grad_norm": 0.7010449171066284, "learning_rate": 1.3255184851217315e-05, "loss": 1.8635, "step": 148 }, { "epoch": 0.04058990664321472, "grad_norm": 0.8178524374961853, "learning_rate": 1.3435527502254284e-05, "loss": 1.8933, "step": 150 }, { "epoch": 0.04113110539845758, "grad_norm": 1.0447405576705933, "learning_rate": 1.3615870153291255e-05, "loss": 1.8523, "step": 152 }, { "epoch": 0.04167230415370045, "grad_norm": 0.8516271710395813, "learning_rate": 1.3796212804328224e-05, "loss": 1.8528, "step": 154 }, { "epoch": 0.04221350290894331, "grad_norm": 0.8437328934669495, "learning_rate": 1.3976555455365195e-05, "loss": 1.861, "step": 156 }, { "epoch": 0.04275470166418617, "grad_norm": 0.851265549659729, "learning_rate": 1.4156898106402164e-05, "loss": 1.8315, "step": 158 }, { "epoch": 0.04329590041942904, "grad_norm": 0.7337156534194946, "learning_rate": 1.4337240757439135e-05, "loss": 1.8354, "step": 160 }, { "epoch": 0.0438370991746719, "grad_norm": 0.9754143357276917, "learning_rate": 1.4517583408476104e-05, "loss": 1.8252, "step": 162 }, { "epoch": 0.04437829792991476, "grad_norm": 0.6172115802764893, "learning_rate": 1.4697926059513075e-05, "loss": 1.8094, "step": 164 }, { "epoch": 0.04491949668515762, "grad_norm": 0.8304158449172974, "learning_rate": 1.4878268710550044e-05, "loss": 1.8078, "step": 166 }, { "epoch": 0.04546069544040049, "grad_norm": 0.6388853788375854, "learning_rate": 1.5058611361587017e-05, "loss": 1.8106, "step": 168 }, { "epoch": 0.04600189419564335, "grad_norm": 0.743231475353241, "learning_rate": 1.5238954012623984e-05, "loss": 1.8144, "step": 170 }, { "epoch": 0.04654309295088621, "grad_norm": 0.6442289352416992, "learning_rate": 1.5419296663660955e-05, "loss": 1.7831, "step": 172 }, { "epoch": 0.04708429170612908, "grad_norm": 0.6877187490463257, "learning_rate": 1.559963931469793e-05, "loss": 1.8043, "step": 174 }, { "epoch": 0.04762549046137194, "grad_norm": 0.9389640688896179, "learning_rate": 1.5779981965734897e-05, "loss": 1.7869, "step": 176 }, { "epoch": 0.0481666892166148, "grad_norm": 1.0456589460372925, "learning_rate": 1.5960324616771868e-05, "loss": 1.7681, "step": 178 }, { "epoch": 0.04870788797185766, "grad_norm": 0.9617791175842285, "learning_rate": 1.614066726780884e-05, "loss": 1.7668, "step": 180 }, { "epoch": 0.04924908672710053, "grad_norm": 0.9334360361099243, "learning_rate": 1.632100991884581e-05, "loss": 1.7893, "step": 182 }, { "epoch": 0.04979028548234339, "grad_norm": 0.8952531814575195, "learning_rate": 1.6501352569882777e-05, "loss": 1.7758, "step": 184 }, { "epoch": 0.05033148423758625, "grad_norm": 0.8544924855232239, "learning_rate": 1.6681695220919748e-05, "loss": 1.793, "step": 186 }, { "epoch": 0.05087268299282912, "grad_norm": 0.7782765030860901, "learning_rate": 1.686203787195672e-05, "loss": 1.768, "step": 188 }, { "epoch": 0.05141388174807198, "grad_norm": 0.7119695544242859, "learning_rate": 1.704238052299369e-05, "loss": 1.7685, "step": 190 }, { "epoch": 0.05195508050331484, "grad_norm": 0.9119647145271301, "learning_rate": 1.7222723174030657e-05, "loss": 1.7706, "step": 192 }, { "epoch": 0.0524962792585577, "grad_norm": 0.6414957642555237, "learning_rate": 1.7403065825067628e-05, "loss": 1.7626, "step": 194 }, { "epoch": 0.05303747801380057, "grad_norm": 0.8069677352905273, "learning_rate": 1.75834084761046e-05, "loss": 1.7423, "step": 196 }, { "epoch": 0.05357867676904343, "grad_norm": 0.6549937725067139, "learning_rate": 1.776375112714157e-05, "loss": 1.7428, "step": 198 }, { "epoch": 0.05411987552428629, "grad_norm": 0.8064024448394775, "learning_rate": 1.7944093778178538e-05, "loss": 1.7448, "step": 200 }, { "epoch": 0.054661074279529154, "grad_norm": 0.7182701826095581, "learning_rate": 1.8124436429215512e-05, "loss": 1.7248, "step": 202 }, { "epoch": 0.05520227303477202, "grad_norm": 0.6997919678688049, "learning_rate": 1.830477908025248e-05, "loss": 1.7281, "step": 204 }, { "epoch": 0.05574347179001488, "grad_norm": 0.7071277499198914, "learning_rate": 1.848512173128945e-05, "loss": 1.714, "step": 206 }, { "epoch": 0.056284670545257744, "grad_norm": 0.6344273090362549, "learning_rate": 1.866546438232642e-05, "loss": 1.7463, "step": 208 }, { "epoch": 0.05682586930050061, "grad_norm": 0.7192733883857727, "learning_rate": 1.8845807033363392e-05, "loss": 1.737, "step": 210 }, { "epoch": 0.05736706805574347, "grad_norm": 0.7418521642684937, "learning_rate": 1.9026149684400363e-05, "loss": 1.7197, "step": 212 }, { "epoch": 0.057908266810986334, "grad_norm": 0.875845730304718, "learning_rate": 1.920649233543733e-05, "loss": 1.6968, "step": 214 }, { "epoch": 0.058449465566229195, "grad_norm": 0.7394037842750549, "learning_rate": 1.9386834986474305e-05, "loss": 1.7051, "step": 216 }, { "epoch": 0.05899066432147206, "grad_norm": 0.6689572930335999, "learning_rate": 1.9567177637511272e-05, "loss": 1.7152, "step": 218 }, { "epoch": 0.05953186307671492, "grad_norm": 0.7955539226531982, "learning_rate": 1.9747520288548243e-05, "loss": 1.7136, "step": 220 }, { "epoch": 0.060073061831957784, "grad_norm": 0.7005388140678406, "learning_rate": 1.9927862939585214e-05, "loss": 1.7152, "step": 222 }, { "epoch": 0.06061426058720065, "grad_norm": 0.6205731630325317, "learning_rate": 2.0108205590622185e-05, "loss": 1.6901, "step": 224 }, { "epoch": 0.06115545934244351, "grad_norm": 0.7079929709434509, "learning_rate": 2.0288548241659152e-05, "loss": 1.6905, "step": 226 }, { "epoch": 0.061696658097686374, "grad_norm": 0.6871302723884583, "learning_rate": 2.0468890892696123e-05, "loss": 1.6867, "step": 228 }, { "epoch": 0.062237856852929235, "grad_norm": 0.7172162532806396, "learning_rate": 2.0649233543733094e-05, "loss": 1.685, "step": 230 }, { "epoch": 0.0627790556081721, "grad_norm": 0.6729004979133606, "learning_rate": 2.0829576194770065e-05, "loss": 1.6961, "step": 232 }, { "epoch": 0.06332025436341496, "grad_norm": 0.7335099577903748, "learning_rate": 2.1009918845807033e-05, "loss": 1.6797, "step": 234 }, { "epoch": 0.06386145311865783, "grad_norm": 0.6398060321807861, "learning_rate": 2.1190261496844003e-05, "loss": 1.7037, "step": 236 }, { "epoch": 0.0644026518739007, "grad_norm": 0.7026365399360657, "learning_rate": 2.1370604147880974e-05, "loss": 1.6698, "step": 238 }, { "epoch": 0.06494385062914355, "grad_norm": 0.7972332239151001, "learning_rate": 2.1550946798917945e-05, "loss": 1.6866, "step": 240 }, { "epoch": 0.06548504938438642, "grad_norm": 0.7363021969795227, "learning_rate": 2.1731289449954913e-05, "loss": 1.6879, "step": 242 }, { "epoch": 0.06602624813962928, "grad_norm": 0.7071017026901245, "learning_rate": 2.1911632100991887e-05, "loss": 1.6898, "step": 244 }, { "epoch": 0.06656744689487214, "grad_norm": 0.8030880093574524, "learning_rate": 2.2091974752028858e-05, "loss": 1.6734, "step": 246 }, { "epoch": 0.067108645650115, "grad_norm": 0.7429569363594055, "learning_rate": 2.2272317403065825e-05, "loss": 1.6722, "step": 248 }, { "epoch": 0.06764984440535787, "grad_norm": 0.6807804107666016, "learning_rate": 2.2452660054102796e-05, "loss": 1.6697, "step": 250 }, { "epoch": 0.06819104316060073, "grad_norm": 0.6632562875747681, "learning_rate": 2.2633002705139767e-05, "loss": 1.6453, "step": 252 }, { "epoch": 0.0687322419158436, "grad_norm": 0.6661680340766907, "learning_rate": 2.2813345356176738e-05, "loss": 1.6701, "step": 254 }, { "epoch": 0.06927344067108646, "grad_norm": 0.6747105121612549, "learning_rate": 2.2993688007213706e-05, "loss": 1.6729, "step": 256 }, { "epoch": 0.06981463942632932, "grad_norm": 0.7698473334312439, "learning_rate": 2.317403065825068e-05, "loss": 1.6528, "step": 258 }, { "epoch": 0.07035583818157218, "grad_norm": 0.6111325621604919, "learning_rate": 2.3354373309287647e-05, "loss": 1.6412, "step": 260 }, { "epoch": 0.07089703693681504, "grad_norm": 0.7405019998550415, "learning_rate": 2.3534715960324618e-05, "loss": 1.6564, "step": 262 }, { "epoch": 0.07143823569205791, "grad_norm": 0.6702501773834229, "learning_rate": 2.371505861136159e-05, "loss": 1.654, "step": 264 }, { "epoch": 0.07197943444730077, "grad_norm": 0.7076373100280762, "learning_rate": 2.389540126239856e-05, "loss": 1.6301, "step": 266 }, { "epoch": 0.07252063320254364, "grad_norm": 0.7239627242088318, "learning_rate": 2.4075743913435528e-05, "loss": 1.6575, "step": 268 }, { "epoch": 0.0730618319577865, "grad_norm": 0.753480076789856, "learning_rate": 2.42560865644725e-05, "loss": 1.6603, "step": 270 }, { "epoch": 0.07360303071302936, "grad_norm": 0.7261641025543213, "learning_rate": 2.443642921550947e-05, "loss": 1.6449, "step": 272 }, { "epoch": 0.07414422946827222, "grad_norm": 0.6315119862556458, "learning_rate": 2.461677186654644e-05, "loss": 1.6538, "step": 274 }, { "epoch": 0.07468542822351508, "grad_norm": 0.5698412656784058, "learning_rate": 2.4797114517583408e-05, "loss": 1.6663, "step": 276 }, { "epoch": 0.07522662697875795, "grad_norm": 0.5968983173370361, "learning_rate": 2.497745716862038e-05, "loss": 1.643, "step": 278 }, { "epoch": 0.07576782573400082, "grad_norm": 0.561126172542572, "learning_rate": 2.5157799819657353e-05, "loss": 1.6301, "step": 280 }, { "epoch": 0.07630902448924368, "grad_norm": 0.7290865778923035, "learning_rate": 2.533814247069432e-05, "loss": 1.6412, "step": 282 }, { "epoch": 0.07685022324448654, "grad_norm": 0.7629122138023376, "learning_rate": 2.5518485121731288e-05, "loss": 1.6335, "step": 284 }, { "epoch": 0.0773914219997294, "grad_norm": 0.5383496284484863, "learning_rate": 2.5698827772768262e-05, "loss": 1.6226, "step": 286 }, { "epoch": 0.07793262075497226, "grad_norm": 0.7778373956680298, "learning_rate": 2.5879170423805233e-05, "loss": 1.6333, "step": 288 }, { "epoch": 0.07847381951021512, "grad_norm": 0.6851366758346558, "learning_rate": 2.60595130748422e-05, "loss": 1.6251, "step": 290 }, { "epoch": 0.079015018265458, "grad_norm": 0.5947225689888, "learning_rate": 2.623985572587917e-05, "loss": 1.6298, "step": 292 }, { "epoch": 0.07955621702070086, "grad_norm": 0.9742544889450073, "learning_rate": 2.6420198376916146e-05, "loss": 1.6252, "step": 294 }, { "epoch": 0.08009741577594372, "grad_norm": 1.2064323425292969, "learning_rate": 2.6600541027953113e-05, "loss": 1.6152, "step": 296 }, { "epoch": 0.08063861453118658, "grad_norm": 1.0506716966629028, "learning_rate": 2.678088367899008e-05, "loss": 1.6351, "step": 298 }, { "epoch": 0.08117981328642944, "grad_norm": 1.2992738485336304, "learning_rate": 2.696122633002705e-05, "loss": 1.6193, "step": 300 } ], "logging_steps": 2, "max_steps": 11088, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2470943491083469e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }