| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.08117981328642944, | |
| "eval_steps": 500, | |
| "global_step": 300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00027059937762143147, | |
| "grad_norm": 4.086390018463135, | |
| "learning_rate": 0.0, | |
| "loss": 3.2754, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0005411987552428629, | |
| "grad_norm": 3.758815288543701, | |
| "learning_rate": 9.017132551848513e-08, | |
| "loss": 3.2863, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0010823975104857259, | |
| "grad_norm": 3.8250608444213867, | |
| "learning_rate": 2.705139765554554e-07, | |
| "loss": 3.3425, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0016235962657285888, | |
| "grad_norm": 3.8092095851898193, | |
| "learning_rate": 4.5085662759242564e-07, | |
| "loss": 3.3165, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0021647950209714517, | |
| "grad_norm": 3.7621052265167236, | |
| "learning_rate": 6.311992786293959e-07, | |
| "loss": 3.3295, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.002705993776214315, | |
| "grad_norm": 3.4136276245117188, | |
| "learning_rate": 8.115419296663661e-07, | |
| "loss": 3.3073, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0032471925314571776, | |
| "grad_norm": 2.855100393295288, | |
| "learning_rate": 9.918845807033363e-07, | |
| "loss": 3.3031, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0037883912867000408, | |
| "grad_norm": 2.491767406463623, | |
| "learning_rate": 1.1722272317403068e-06, | |
| "loss": 3.2943, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0043295900419429035, | |
| "grad_norm": 2.359778642654419, | |
| "learning_rate": 1.3525698827772768e-06, | |
| "loss": 3.2622, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.004870788797185766, | |
| "grad_norm": 2.037504196166992, | |
| "learning_rate": 1.5329125338142473e-06, | |
| "loss": 3.239, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.00541198755242863, | |
| "grad_norm": 2.8542497158050537, | |
| "learning_rate": 1.7132551848512173e-06, | |
| "loss": 3.2031, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0059531863076714925, | |
| "grad_norm": 2.297046661376953, | |
| "learning_rate": 1.8935978358881876e-06, | |
| "loss": 3.1721, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.006494385062914355, | |
| "grad_norm": 2.2149112224578857, | |
| "learning_rate": 2.0739404869251576e-06, | |
| "loss": 3.121, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.007035583818157218, | |
| "grad_norm": 1.8048591613769531, | |
| "learning_rate": 2.254283137962128e-06, | |
| "loss": 3.0857, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0075767825734000815, | |
| "grad_norm": 1.7466434240341187, | |
| "learning_rate": 2.4346257889990986e-06, | |
| "loss": 3.0489, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.008117981328642944, | |
| "grad_norm": 2.1722524166107178, | |
| "learning_rate": 2.6149684400360686e-06, | |
| "loss": 3.0016, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.008659180083885807, | |
| "grad_norm": 1.364578366279602, | |
| "learning_rate": 2.7953110910730386e-06, | |
| "loss": 2.9587, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.00920037883912867, | |
| "grad_norm": 1.5823427438735962, | |
| "learning_rate": 2.9756537421100095e-06, | |
| "loss": 2.931, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.009741577594371532, | |
| "grad_norm": 1.2367908954620361, | |
| "learning_rate": 3.1559963931469796e-06, | |
| "loss": 2.8953, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.010282776349614395, | |
| "grad_norm": 1.0437366962432861, | |
| "learning_rate": 3.3363390441839496e-06, | |
| "loss": 2.8412, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.01082397510485726, | |
| "grad_norm": 1.081803798675537, | |
| "learning_rate": 3.5166816952209197e-06, | |
| "loss": 2.7832, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.011365173860100122, | |
| "grad_norm": 0.9715840220451355, | |
| "learning_rate": 3.69702434625789e-06, | |
| "loss": 2.7729, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.011906372615342985, | |
| "grad_norm": 0.8603936433792114, | |
| "learning_rate": 3.877366997294861e-06, | |
| "loss": 2.6904, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.012447571370585848, | |
| "grad_norm": 0.8236231803894043, | |
| "learning_rate": 4.057709648331831e-06, | |
| "loss": 2.6908, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.01298877012582871, | |
| "grad_norm": 0.7681186199188232, | |
| "learning_rate": 4.2380522993688015e-06, | |
| "loss": 2.6212, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.013529968881071573, | |
| "grad_norm": 0.8002827167510986, | |
| "learning_rate": 4.4183949504057716e-06, | |
| "loss": 2.6035, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.014071167636314436, | |
| "grad_norm": 0.6757120490074158, | |
| "learning_rate": 4.598737601442742e-06, | |
| "loss": 2.595, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.014612366391557299, | |
| "grad_norm": 0.6619369387626648, | |
| "learning_rate": 4.779080252479712e-06, | |
| "loss": 2.5522, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.015153565146800163, | |
| "grad_norm": 0.6247105598449707, | |
| "learning_rate": 4.959422903516682e-06, | |
| "loss": 2.5079, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.015694763902043024, | |
| "grad_norm": 0.6559263467788696, | |
| "learning_rate": 5.139765554553652e-06, | |
| "loss": 2.5009, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.01623596265728589, | |
| "grad_norm": 0.6590877175331116, | |
| "learning_rate": 5.320108205590623e-06, | |
| "loss": 2.4648, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.01677716141252875, | |
| "grad_norm": 0.6045516133308411, | |
| "learning_rate": 5.500450856627593e-06, | |
| "loss": 2.421, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.017318360167771614, | |
| "grad_norm": 0.6533932089805603, | |
| "learning_rate": 5.680793507664563e-06, | |
| "loss": 2.3966, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.01785955892301448, | |
| "grad_norm": 0.6478094458580017, | |
| "learning_rate": 5.861136158701533e-06, | |
| "loss": 2.3903, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.01840075767825734, | |
| "grad_norm": 0.7349300980567932, | |
| "learning_rate": 6.041478809738504e-06, | |
| "loss": 2.3552, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.018941956433500204, | |
| "grad_norm": 0.6454821825027466, | |
| "learning_rate": 6.221821460775474e-06, | |
| "loss": 2.3262, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.019483155188743065, | |
| "grad_norm": 0.7321672439575195, | |
| "learning_rate": 6.402164111812444e-06, | |
| "loss": 2.3197, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.02002435394398593, | |
| "grad_norm": 0.7664237022399902, | |
| "learning_rate": 6.582506762849414e-06, | |
| "loss": 2.2992, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.02056555269922879, | |
| "grad_norm": 0.6843811869621277, | |
| "learning_rate": 6.762849413886384e-06, | |
| "loss": 2.2927, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.021106751454471655, | |
| "grad_norm": 0.7199612259864807, | |
| "learning_rate": 6.9431920649233556e-06, | |
| "loss": 2.2525, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.02164795020971452, | |
| "grad_norm": 0.778446614742279, | |
| "learning_rate": 7.123534715960326e-06, | |
| "loss": 2.2267, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.02218914896495738, | |
| "grad_norm": 0.9287930727005005, | |
| "learning_rate": 7.303877366997296e-06, | |
| "loss": 2.2206, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.022730347720200245, | |
| "grad_norm": 1.033782958984375, | |
| "learning_rate": 7.484220018034266e-06, | |
| "loss": 2.2063, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.023271546475443106, | |
| "grad_norm": 1.0132615566253662, | |
| "learning_rate": 7.664562669071236e-06, | |
| "loss": 2.1677, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.02381274523068597, | |
| "grad_norm": 0.9043529033660889, | |
| "learning_rate": 7.844905320108207e-06, | |
| "loss": 2.1696, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.02435394398592883, | |
| "grad_norm": 0.6718290448188782, | |
| "learning_rate": 8.025247971145176e-06, | |
| "loss": 2.1492, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.024895142741171696, | |
| "grad_norm": 0.9615944027900696, | |
| "learning_rate": 8.205590622182147e-06, | |
| "loss": 2.1452, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.02543634149641456, | |
| "grad_norm": 0.9435996413230896, | |
| "learning_rate": 8.385933273219116e-06, | |
| "loss": 2.1098, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.02597754025165742, | |
| "grad_norm": 0.7614261507987976, | |
| "learning_rate": 8.566275924256087e-06, | |
| "loss": 2.1286, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.026518739006900285, | |
| "grad_norm": 0.9416339993476868, | |
| "learning_rate": 8.746618575293058e-06, | |
| "loss": 2.1092, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.027059937762143146, | |
| "grad_norm": 0.9229443073272705, | |
| "learning_rate": 8.926961226330027e-06, | |
| "loss": 2.0932, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02760113651738601, | |
| "grad_norm": 0.7135593295097351, | |
| "learning_rate": 9.107303877366998e-06, | |
| "loss": 2.0699, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.028142335272628872, | |
| "grad_norm": 1.0263723134994507, | |
| "learning_rate": 9.287646528403967e-06, | |
| "loss": 2.0445, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.028683534027871736, | |
| "grad_norm": 1.0300300121307373, | |
| "learning_rate": 9.467989179440938e-06, | |
| "loss": 2.0463, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.029224732783114597, | |
| "grad_norm": 0.8331286311149597, | |
| "learning_rate": 9.648331830477909e-06, | |
| "loss": 2.0381, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.02976593153835746, | |
| "grad_norm": 0.7501435875892639, | |
| "learning_rate": 9.828674481514878e-06, | |
| "loss": 2.0411, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.030307130293600326, | |
| "grad_norm": 0.6895191073417664, | |
| "learning_rate": 1.0009017132551849e-05, | |
| "loss": 2.0475, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.030848329048843187, | |
| "grad_norm": 0.95854252576828, | |
| "learning_rate": 1.018935978358882e-05, | |
| "loss": 2.0071, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.03138952780408605, | |
| "grad_norm": 1.1303929090499878, | |
| "learning_rate": 1.036970243462579e-05, | |
| "loss": 2.0008, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.031930726559328916, | |
| "grad_norm": 0.7708876729011536, | |
| "learning_rate": 1.055004508566276e-05, | |
| "loss": 2.0061, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.03247192531457178, | |
| "grad_norm": 0.9773860573768616, | |
| "learning_rate": 1.073038773669973e-05, | |
| "loss": 2.0096, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.03301312406981464, | |
| "grad_norm": 1.118385910987854, | |
| "learning_rate": 1.09107303877367e-05, | |
| "loss": 1.9939, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.0335543228250575, | |
| "grad_norm": 0.7215014696121216, | |
| "learning_rate": 1.109107303877367e-05, | |
| "loss": 1.9515, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.03409552158030037, | |
| "grad_norm": 0.9696834683418274, | |
| "learning_rate": 1.1271415689810642e-05, | |
| "loss": 1.9639, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.03463672033554323, | |
| "grad_norm": 0.945482611656189, | |
| "learning_rate": 1.1451758340847611e-05, | |
| "loss": 1.9397, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.03517791909078609, | |
| "grad_norm": 0.7454535365104675, | |
| "learning_rate": 1.1632100991884582e-05, | |
| "loss": 1.9353, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.03571911784602896, | |
| "grad_norm": 0.7824187278747559, | |
| "learning_rate": 1.1812443642921551e-05, | |
| "loss": 1.9227, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.03626031660127182, | |
| "grad_norm": 0.7939879894256592, | |
| "learning_rate": 1.1992786293958522e-05, | |
| "loss": 1.9126, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.03680151535651468, | |
| "grad_norm": 0.7776147723197937, | |
| "learning_rate": 1.2173128944995491e-05, | |
| "loss": 1.9002, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.03734271411175754, | |
| "grad_norm": 0.6580236554145813, | |
| "learning_rate": 1.2353471596032462e-05, | |
| "loss": 1.9121, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.03788391286700041, | |
| "grad_norm": 0.7200301289558411, | |
| "learning_rate": 1.2533814247069433e-05, | |
| "loss": 1.8885, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03842511162224327, | |
| "grad_norm": 0.7958497405052185, | |
| "learning_rate": 1.2714156898106402e-05, | |
| "loss": 1.9095, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.03896631037748613, | |
| "grad_norm": 0.9120681881904602, | |
| "learning_rate": 1.2894499549143375e-05, | |
| "loss": 1.884, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.039507509132729, | |
| "grad_norm": 0.8108247518539429, | |
| "learning_rate": 1.3074842200180342e-05, | |
| "loss": 1.8656, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.04004870788797186, | |
| "grad_norm": 0.7010449171066284, | |
| "learning_rate": 1.3255184851217315e-05, | |
| "loss": 1.8635, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.04058990664321472, | |
| "grad_norm": 0.8178524374961853, | |
| "learning_rate": 1.3435527502254284e-05, | |
| "loss": 1.8933, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04113110539845758, | |
| "grad_norm": 1.0447405576705933, | |
| "learning_rate": 1.3615870153291255e-05, | |
| "loss": 1.8523, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.04167230415370045, | |
| "grad_norm": 0.8516271710395813, | |
| "learning_rate": 1.3796212804328224e-05, | |
| "loss": 1.8528, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.04221350290894331, | |
| "grad_norm": 0.8437328934669495, | |
| "learning_rate": 1.3976555455365195e-05, | |
| "loss": 1.861, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.04275470166418617, | |
| "grad_norm": 0.851265549659729, | |
| "learning_rate": 1.4156898106402164e-05, | |
| "loss": 1.8315, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.04329590041942904, | |
| "grad_norm": 0.7337156534194946, | |
| "learning_rate": 1.4337240757439135e-05, | |
| "loss": 1.8354, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0438370991746719, | |
| "grad_norm": 0.9754143357276917, | |
| "learning_rate": 1.4517583408476104e-05, | |
| "loss": 1.8252, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.04437829792991476, | |
| "grad_norm": 0.6172115802764893, | |
| "learning_rate": 1.4697926059513075e-05, | |
| "loss": 1.8094, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.04491949668515762, | |
| "grad_norm": 0.8304158449172974, | |
| "learning_rate": 1.4878268710550044e-05, | |
| "loss": 1.8078, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.04546069544040049, | |
| "grad_norm": 0.6388853788375854, | |
| "learning_rate": 1.5058611361587017e-05, | |
| "loss": 1.8106, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.04600189419564335, | |
| "grad_norm": 0.743231475353241, | |
| "learning_rate": 1.5238954012623984e-05, | |
| "loss": 1.8144, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.04654309295088621, | |
| "grad_norm": 0.6442289352416992, | |
| "learning_rate": 1.5419296663660955e-05, | |
| "loss": 1.7831, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.04708429170612908, | |
| "grad_norm": 0.6877187490463257, | |
| "learning_rate": 1.559963931469793e-05, | |
| "loss": 1.8043, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.04762549046137194, | |
| "grad_norm": 0.9389640688896179, | |
| "learning_rate": 1.5779981965734897e-05, | |
| "loss": 1.7869, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.0481666892166148, | |
| "grad_norm": 1.0456589460372925, | |
| "learning_rate": 1.5960324616771868e-05, | |
| "loss": 1.7681, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.04870788797185766, | |
| "grad_norm": 0.9617791175842285, | |
| "learning_rate": 1.614066726780884e-05, | |
| "loss": 1.7668, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.04924908672710053, | |
| "grad_norm": 0.9334360361099243, | |
| "learning_rate": 1.632100991884581e-05, | |
| "loss": 1.7893, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.04979028548234339, | |
| "grad_norm": 0.8952531814575195, | |
| "learning_rate": 1.6501352569882777e-05, | |
| "loss": 1.7758, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.05033148423758625, | |
| "grad_norm": 0.8544924855232239, | |
| "learning_rate": 1.6681695220919748e-05, | |
| "loss": 1.793, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.05087268299282912, | |
| "grad_norm": 0.7782765030860901, | |
| "learning_rate": 1.686203787195672e-05, | |
| "loss": 1.768, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.05141388174807198, | |
| "grad_norm": 0.7119695544242859, | |
| "learning_rate": 1.704238052299369e-05, | |
| "loss": 1.7685, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.05195508050331484, | |
| "grad_norm": 0.9119647145271301, | |
| "learning_rate": 1.7222723174030657e-05, | |
| "loss": 1.7706, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.0524962792585577, | |
| "grad_norm": 0.6414957642555237, | |
| "learning_rate": 1.7403065825067628e-05, | |
| "loss": 1.7626, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.05303747801380057, | |
| "grad_norm": 0.8069677352905273, | |
| "learning_rate": 1.75834084761046e-05, | |
| "loss": 1.7423, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.05357867676904343, | |
| "grad_norm": 0.6549937725067139, | |
| "learning_rate": 1.776375112714157e-05, | |
| "loss": 1.7428, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.05411987552428629, | |
| "grad_norm": 0.8064024448394775, | |
| "learning_rate": 1.7944093778178538e-05, | |
| "loss": 1.7448, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.054661074279529154, | |
| "grad_norm": 0.7182701826095581, | |
| "learning_rate": 1.8124436429215512e-05, | |
| "loss": 1.7248, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.05520227303477202, | |
| "grad_norm": 0.6997919678688049, | |
| "learning_rate": 1.830477908025248e-05, | |
| "loss": 1.7281, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.05574347179001488, | |
| "grad_norm": 0.7071277499198914, | |
| "learning_rate": 1.848512173128945e-05, | |
| "loss": 1.714, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.056284670545257744, | |
| "grad_norm": 0.6344273090362549, | |
| "learning_rate": 1.866546438232642e-05, | |
| "loss": 1.7463, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.05682586930050061, | |
| "grad_norm": 0.7192733883857727, | |
| "learning_rate": 1.8845807033363392e-05, | |
| "loss": 1.737, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.05736706805574347, | |
| "grad_norm": 0.7418521642684937, | |
| "learning_rate": 1.9026149684400363e-05, | |
| "loss": 1.7197, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.057908266810986334, | |
| "grad_norm": 0.875845730304718, | |
| "learning_rate": 1.920649233543733e-05, | |
| "loss": 1.6968, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.058449465566229195, | |
| "grad_norm": 0.7394037842750549, | |
| "learning_rate": 1.9386834986474305e-05, | |
| "loss": 1.7051, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.05899066432147206, | |
| "grad_norm": 0.6689572930335999, | |
| "learning_rate": 1.9567177637511272e-05, | |
| "loss": 1.7152, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.05953186307671492, | |
| "grad_norm": 0.7955539226531982, | |
| "learning_rate": 1.9747520288548243e-05, | |
| "loss": 1.7136, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.060073061831957784, | |
| "grad_norm": 0.7005388140678406, | |
| "learning_rate": 1.9927862939585214e-05, | |
| "loss": 1.7152, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.06061426058720065, | |
| "grad_norm": 0.6205731630325317, | |
| "learning_rate": 2.0108205590622185e-05, | |
| "loss": 1.6901, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.06115545934244351, | |
| "grad_norm": 0.7079929709434509, | |
| "learning_rate": 2.0288548241659152e-05, | |
| "loss": 1.6905, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.061696658097686374, | |
| "grad_norm": 0.6871302723884583, | |
| "learning_rate": 2.0468890892696123e-05, | |
| "loss": 1.6867, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.062237856852929235, | |
| "grad_norm": 0.7172162532806396, | |
| "learning_rate": 2.0649233543733094e-05, | |
| "loss": 1.685, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0627790556081721, | |
| "grad_norm": 0.6729004979133606, | |
| "learning_rate": 2.0829576194770065e-05, | |
| "loss": 1.6961, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.06332025436341496, | |
| "grad_norm": 0.7335099577903748, | |
| "learning_rate": 2.1009918845807033e-05, | |
| "loss": 1.6797, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.06386145311865783, | |
| "grad_norm": 0.6398060321807861, | |
| "learning_rate": 2.1190261496844003e-05, | |
| "loss": 1.7037, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.0644026518739007, | |
| "grad_norm": 0.7026365399360657, | |
| "learning_rate": 2.1370604147880974e-05, | |
| "loss": 1.6698, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.06494385062914355, | |
| "grad_norm": 0.7972332239151001, | |
| "learning_rate": 2.1550946798917945e-05, | |
| "loss": 1.6866, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.06548504938438642, | |
| "grad_norm": 0.7363021969795227, | |
| "learning_rate": 2.1731289449954913e-05, | |
| "loss": 1.6879, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.06602624813962928, | |
| "grad_norm": 0.7071017026901245, | |
| "learning_rate": 2.1911632100991887e-05, | |
| "loss": 1.6898, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.06656744689487214, | |
| "grad_norm": 0.8030880093574524, | |
| "learning_rate": 2.2091974752028858e-05, | |
| "loss": 1.6734, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.067108645650115, | |
| "grad_norm": 0.7429569363594055, | |
| "learning_rate": 2.2272317403065825e-05, | |
| "loss": 1.6722, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.06764984440535787, | |
| "grad_norm": 0.6807804107666016, | |
| "learning_rate": 2.2452660054102796e-05, | |
| "loss": 1.6697, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.06819104316060073, | |
| "grad_norm": 0.6632562875747681, | |
| "learning_rate": 2.2633002705139767e-05, | |
| "loss": 1.6453, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.0687322419158436, | |
| "grad_norm": 0.6661680340766907, | |
| "learning_rate": 2.2813345356176738e-05, | |
| "loss": 1.6701, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.06927344067108646, | |
| "grad_norm": 0.6747105121612549, | |
| "learning_rate": 2.2993688007213706e-05, | |
| "loss": 1.6729, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.06981463942632932, | |
| "grad_norm": 0.7698473334312439, | |
| "learning_rate": 2.317403065825068e-05, | |
| "loss": 1.6528, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.07035583818157218, | |
| "grad_norm": 0.6111325621604919, | |
| "learning_rate": 2.3354373309287647e-05, | |
| "loss": 1.6412, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.07089703693681504, | |
| "grad_norm": 0.7405019998550415, | |
| "learning_rate": 2.3534715960324618e-05, | |
| "loss": 1.6564, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.07143823569205791, | |
| "grad_norm": 0.6702501773834229, | |
| "learning_rate": 2.371505861136159e-05, | |
| "loss": 1.654, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.07197943444730077, | |
| "grad_norm": 0.7076373100280762, | |
| "learning_rate": 2.389540126239856e-05, | |
| "loss": 1.6301, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.07252063320254364, | |
| "grad_norm": 0.7239627242088318, | |
| "learning_rate": 2.4075743913435528e-05, | |
| "loss": 1.6575, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.0730618319577865, | |
| "grad_norm": 0.753480076789856, | |
| "learning_rate": 2.42560865644725e-05, | |
| "loss": 1.6603, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.07360303071302936, | |
| "grad_norm": 0.7261641025543213, | |
| "learning_rate": 2.443642921550947e-05, | |
| "loss": 1.6449, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.07414422946827222, | |
| "grad_norm": 0.6315119862556458, | |
| "learning_rate": 2.461677186654644e-05, | |
| "loss": 1.6538, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.07468542822351508, | |
| "grad_norm": 0.5698412656784058, | |
| "learning_rate": 2.4797114517583408e-05, | |
| "loss": 1.6663, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.07522662697875795, | |
| "grad_norm": 0.5968983173370361, | |
| "learning_rate": 2.497745716862038e-05, | |
| "loss": 1.643, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.07576782573400082, | |
| "grad_norm": 0.561126172542572, | |
| "learning_rate": 2.5157799819657353e-05, | |
| "loss": 1.6301, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.07630902448924368, | |
| "grad_norm": 0.7290865778923035, | |
| "learning_rate": 2.533814247069432e-05, | |
| "loss": 1.6412, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.07685022324448654, | |
| "grad_norm": 0.7629122138023376, | |
| "learning_rate": 2.5518485121731288e-05, | |
| "loss": 1.6335, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.0773914219997294, | |
| "grad_norm": 0.5383496284484863, | |
| "learning_rate": 2.5698827772768262e-05, | |
| "loss": 1.6226, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.07793262075497226, | |
| "grad_norm": 0.7778373956680298, | |
| "learning_rate": 2.5879170423805233e-05, | |
| "loss": 1.6333, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.07847381951021512, | |
| "grad_norm": 0.6851366758346558, | |
| "learning_rate": 2.60595130748422e-05, | |
| "loss": 1.6251, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.079015018265458, | |
| "grad_norm": 0.5947225689888, | |
| "learning_rate": 2.623985572587917e-05, | |
| "loss": 1.6298, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.07955621702070086, | |
| "grad_norm": 0.9742544889450073, | |
| "learning_rate": 2.6420198376916146e-05, | |
| "loss": 1.6252, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.08009741577594372, | |
| "grad_norm": 1.2064323425292969, | |
| "learning_rate": 2.6600541027953113e-05, | |
| "loss": 1.6152, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.08063861453118658, | |
| "grad_norm": 1.0506716966629028, | |
| "learning_rate": 2.678088367899008e-05, | |
| "loss": 1.6351, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.08117981328642944, | |
| "grad_norm": 1.2992738485336304, | |
| "learning_rate": 2.696122633002705e-05, | |
| "loss": 1.6193, | |
| "step": 300 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 11088, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2470943491083469e+19, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |