{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.08117981328642944,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00027059937762143147,
"grad_norm": 4.086390018463135,
"learning_rate": 0.0,
"loss": 3.2754,
"step": 1
},
{
"epoch": 0.0005411987552428629,
"grad_norm": 3.758815288543701,
"learning_rate": 9.017132551848513e-08,
"loss": 3.2863,
"step": 2
},
{
"epoch": 0.0010823975104857259,
"grad_norm": 3.8250608444213867,
"learning_rate": 2.705139765554554e-07,
"loss": 3.3425,
"step": 4
},
{
"epoch": 0.0016235962657285888,
"grad_norm": 3.8092095851898193,
"learning_rate": 4.5085662759242564e-07,
"loss": 3.3165,
"step": 6
},
{
"epoch": 0.0021647950209714517,
"grad_norm": 3.7621052265167236,
"learning_rate": 6.311992786293959e-07,
"loss": 3.3295,
"step": 8
},
{
"epoch": 0.002705993776214315,
"grad_norm": 3.4136276245117188,
"learning_rate": 8.115419296663661e-07,
"loss": 3.3073,
"step": 10
},
{
"epoch": 0.0032471925314571776,
"grad_norm": 2.855100393295288,
"learning_rate": 9.918845807033363e-07,
"loss": 3.3031,
"step": 12
},
{
"epoch": 0.0037883912867000408,
"grad_norm": 2.491767406463623,
"learning_rate": 1.1722272317403068e-06,
"loss": 3.2943,
"step": 14
},
{
"epoch": 0.0043295900419429035,
"grad_norm": 2.359778642654419,
"learning_rate": 1.3525698827772768e-06,
"loss": 3.2622,
"step": 16
},
{
"epoch": 0.004870788797185766,
"grad_norm": 2.037504196166992,
"learning_rate": 1.5329125338142473e-06,
"loss": 3.239,
"step": 18
},
{
"epoch": 0.00541198755242863,
"grad_norm": 2.8542497158050537,
"learning_rate": 1.7132551848512173e-06,
"loss": 3.2031,
"step": 20
},
{
"epoch": 0.0059531863076714925,
"grad_norm": 2.297046661376953,
"learning_rate": 1.8935978358881876e-06,
"loss": 3.1721,
"step": 22
},
{
"epoch": 0.006494385062914355,
"grad_norm": 2.2149112224578857,
"learning_rate": 2.0739404869251576e-06,
"loss": 3.121,
"step": 24
},
{
"epoch": 0.007035583818157218,
"grad_norm": 1.8048591613769531,
"learning_rate": 2.254283137962128e-06,
"loss": 3.0857,
"step": 26
},
{
"epoch": 0.0075767825734000815,
"grad_norm": 1.7466434240341187,
"learning_rate": 2.4346257889990986e-06,
"loss": 3.0489,
"step": 28
},
{
"epoch": 0.008117981328642944,
"grad_norm": 2.1722524166107178,
"learning_rate": 2.6149684400360686e-06,
"loss": 3.0016,
"step": 30
},
{
"epoch": 0.008659180083885807,
"grad_norm": 1.364578366279602,
"learning_rate": 2.7953110910730386e-06,
"loss": 2.9587,
"step": 32
},
{
"epoch": 0.00920037883912867,
"grad_norm": 1.5823427438735962,
"learning_rate": 2.9756537421100095e-06,
"loss": 2.931,
"step": 34
},
{
"epoch": 0.009741577594371532,
"grad_norm": 1.2367908954620361,
"learning_rate": 3.1559963931469796e-06,
"loss": 2.8953,
"step": 36
},
{
"epoch": 0.010282776349614395,
"grad_norm": 1.0437366962432861,
"learning_rate": 3.3363390441839496e-06,
"loss": 2.8412,
"step": 38
},
{
"epoch": 0.01082397510485726,
"grad_norm": 1.081803798675537,
"learning_rate": 3.5166816952209197e-06,
"loss": 2.7832,
"step": 40
},
{
"epoch": 0.011365173860100122,
"grad_norm": 0.9715840220451355,
"learning_rate": 3.69702434625789e-06,
"loss": 2.7729,
"step": 42
},
{
"epoch": 0.011906372615342985,
"grad_norm": 0.8603936433792114,
"learning_rate": 3.877366997294861e-06,
"loss": 2.6904,
"step": 44
},
{
"epoch": 0.012447571370585848,
"grad_norm": 0.8236231803894043,
"learning_rate": 4.057709648331831e-06,
"loss": 2.6908,
"step": 46
},
{
"epoch": 0.01298877012582871,
"grad_norm": 0.7681186199188232,
"learning_rate": 4.2380522993688015e-06,
"loss": 2.6212,
"step": 48
},
{
"epoch": 0.013529968881071573,
"grad_norm": 0.8002827167510986,
"learning_rate": 4.4183949504057716e-06,
"loss": 2.6035,
"step": 50
},
{
"epoch": 0.014071167636314436,
"grad_norm": 0.6757120490074158,
"learning_rate": 4.598737601442742e-06,
"loss": 2.595,
"step": 52
},
{
"epoch": 0.014612366391557299,
"grad_norm": 0.6619369387626648,
"learning_rate": 4.779080252479712e-06,
"loss": 2.5522,
"step": 54
},
{
"epoch": 0.015153565146800163,
"grad_norm": 0.6247105598449707,
"learning_rate": 4.959422903516682e-06,
"loss": 2.5079,
"step": 56
},
{
"epoch": 0.015694763902043024,
"grad_norm": 0.6559263467788696,
"learning_rate": 5.139765554553652e-06,
"loss": 2.5009,
"step": 58
},
{
"epoch": 0.01623596265728589,
"grad_norm": 0.6590877175331116,
"learning_rate": 5.320108205590623e-06,
"loss": 2.4648,
"step": 60
},
{
"epoch": 0.01677716141252875,
"grad_norm": 0.6045516133308411,
"learning_rate": 5.500450856627593e-06,
"loss": 2.421,
"step": 62
},
{
"epoch": 0.017318360167771614,
"grad_norm": 0.6533932089805603,
"learning_rate": 5.680793507664563e-06,
"loss": 2.3966,
"step": 64
},
{
"epoch": 0.01785955892301448,
"grad_norm": 0.6478094458580017,
"learning_rate": 5.861136158701533e-06,
"loss": 2.3903,
"step": 66
},
{
"epoch": 0.01840075767825734,
"grad_norm": 0.7349300980567932,
"learning_rate": 6.041478809738504e-06,
"loss": 2.3552,
"step": 68
},
{
"epoch": 0.018941956433500204,
"grad_norm": 0.6454821825027466,
"learning_rate": 6.221821460775474e-06,
"loss": 2.3262,
"step": 70
},
{
"epoch": 0.019483155188743065,
"grad_norm": 0.7321672439575195,
"learning_rate": 6.402164111812444e-06,
"loss": 2.3197,
"step": 72
},
{
"epoch": 0.02002435394398593,
"grad_norm": 0.7664237022399902,
"learning_rate": 6.582506762849414e-06,
"loss": 2.2992,
"step": 74
},
{
"epoch": 0.02056555269922879,
"grad_norm": 0.6843811869621277,
"learning_rate": 6.762849413886384e-06,
"loss": 2.2927,
"step": 76
},
{
"epoch": 0.021106751454471655,
"grad_norm": 0.7199612259864807,
"learning_rate": 6.9431920649233556e-06,
"loss": 2.2525,
"step": 78
},
{
"epoch": 0.02164795020971452,
"grad_norm": 0.778446614742279,
"learning_rate": 7.123534715960326e-06,
"loss": 2.2267,
"step": 80
},
{
"epoch": 0.02218914896495738,
"grad_norm": 0.9287930727005005,
"learning_rate": 7.303877366997296e-06,
"loss": 2.2206,
"step": 82
},
{
"epoch": 0.022730347720200245,
"grad_norm": 1.033782958984375,
"learning_rate": 7.484220018034266e-06,
"loss": 2.2063,
"step": 84
},
{
"epoch": 0.023271546475443106,
"grad_norm": 1.0132615566253662,
"learning_rate": 7.664562669071236e-06,
"loss": 2.1677,
"step": 86
},
{
"epoch": 0.02381274523068597,
"grad_norm": 0.9043529033660889,
"learning_rate": 7.844905320108207e-06,
"loss": 2.1696,
"step": 88
},
{
"epoch": 0.02435394398592883,
"grad_norm": 0.6718290448188782,
"learning_rate": 8.025247971145176e-06,
"loss": 2.1492,
"step": 90
},
{
"epoch": 0.024895142741171696,
"grad_norm": 0.9615944027900696,
"learning_rate": 8.205590622182147e-06,
"loss": 2.1452,
"step": 92
},
{
"epoch": 0.02543634149641456,
"grad_norm": 0.9435996413230896,
"learning_rate": 8.385933273219116e-06,
"loss": 2.1098,
"step": 94
},
{
"epoch": 0.02597754025165742,
"grad_norm": 0.7614261507987976,
"learning_rate": 8.566275924256087e-06,
"loss": 2.1286,
"step": 96
},
{
"epoch": 0.026518739006900285,
"grad_norm": 0.9416339993476868,
"learning_rate": 8.746618575293058e-06,
"loss": 2.1092,
"step": 98
},
{
"epoch": 0.027059937762143146,
"grad_norm": 0.9229443073272705,
"learning_rate": 8.926961226330027e-06,
"loss": 2.0932,
"step": 100
},
{
"epoch": 0.02760113651738601,
"grad_norm": 0.7135593295097351,
"learning_rate": 9.107303877366998e-06,
"loss": 2.0699,
"step": 102
},
{
"epoch": 0.028142335272628872,
"grad_norm": 1.0263723134994507,
"learning_rate": 9.287646528403967e-06,
"loss": 2.0445,
"step": 104
},
{
"epoch": 0.028683534027871736,
"grad_norm": 1.0300300121307373,
"learning_rate": 9.467989179440938e-06,
"loss": 2.0463,
"step": 106
},
{
"epoch": 0.029224732783114597,
"grad_norm": 0.8331286311149597,
"learning_rate": 9.648331830477909e-06,
"loss": 2.0381,
"step": 108
},
{
"epoch": 0.02976593153835746,
"grad_norm": 0.7501435875892639,
"learning_rate": 9.828674481514878e-06,
"loss": 2.0411,
"step": 110
},
{
"epoch": 0.030307130293600326,
"grad_norm": 0.6895191073417664,
"learning_rate": 1.0009017132551849e-05,
"loss": 2.0475,
"step": 112
},
{
"epoch": 0.030848329048843187,
"grad_norm": 0.95854252576828,
"learning_rate": 1.018935978358882e-05,
"loss": 2.0071,
"step": 114
},
{
"epoch": 0.03138952780408605,
"grad_norm": 1.1303929090499878,
"learning_rate": 1.036970243462579e-05,
"loss": 2.0008,
"step": 116
},
{
"epoch": 0.031930726559328916,
"grad_norm": 0.7708876729011536,
"learning_rate": 1.055004508566276e-05,
"loss": 2.0061,
"step": 118
},
{
"epoch": 0.03247192531457178,
"grad_norm": 0.9773860573768616,
"learning_rate": 1.073038773669973e-05,
"loss": 2.0096,
"step": 120
},
{
"epoch": 0.03301312406981464,
"grad_norm": 1.118385910987854,
"learning_rate": 1.09107303877367e-05,
"loss": 1.9939,
"step": 122
},
{
"epoch": 0.0335543228250575,
"grad_norm": 0.7215014696121216,
"learning_rate": 1.109107303877367e-05,
"loss": 1.9515,
"step": 124
},
{
"epoch": 0.03409552158030037,
"grad_norm": 0.9696834683418274,
"learning_rate": 1.1271415689810642e-05,
"loss": 1.9639,
"step": 126
},
{
"epoch": 0.03463672033554323,
"grad_norm": 0.945482611656189,
"learning_rate": 1.1451758340847611e-05,
"loss": 1.9397,
"step": 128
},
{
"epoch": 0.03517791909078609,
"grad_norm": 0.7454535365104675,
"learning_rate": 1.1632100991884582e-05,
"loss": 1.9353,
"step": 130
},
{
"epoch": 0.03571911784602896,
"grad_norm": 0.7824187278747559,
"learning_rate": 1.1812443642921551e-05,
"loss": 1.9227,
"step": 132
},
{
"epoch": 0.03626031660127182,
"grad_norm": 0.7939879894256592,
"learning_rate": 1.1992786293958522e-05,
"loss": 1.9126,
"step": 134
},
{
"epoch": 0.03680151535651468,
"grad_norm": 0.7776147723197937,
"learning_rate": 1.2173128944995491e-05,
"loss": 1.9002,
"step": 136
},
{
"epoch": 0.03734271411175754,
"grad_norm": 0.6580236554145813,
"learning_rate": 1.2353471596032462e-05,
"loss": 1.9121,
"step": 138
},
{
"epoch": 0.03788391286700041,
"grad_norm": 0.7200301289558411,
"learning_rate": 1.2533814247069433e-05,
"loss": 1.8885,
"step": 140
},
{
"epoch": 0.03842511162224327,
"grad_norm": 0.7958497405052185,
"learning_rate": 1.2714156898106402e-05,
"loss": 1.9095,
"step": 142
},
{
"epoch": 0.03896631037748613,
"grad_norm": 0.9120681881904602,
"learning_rate": 1.2894499549143375e-05,
"loss": 1.884,
"step": 144
},
{
"epoch": 0.039507509132729,
"grad_norm": 0.8108247518539429,
"learning_rate": 1.3074842200180342e-05,
"loss": 1.8656,
"step": 146
},
{
"epoch": 0.04004870788797186,
"grad_norm": 0.7010449171066284,
"learning_rate": 1.3255184851217315e-05,
"loss": 1.8635,
"step": 148
},
{
"epoch": 0.04058990664321472,
"grad_norm": 0.8178524374961853,
"learning_rate": 1.3435527502254284e-05,
"loss": 1.8933,
"step": 150
},
{
"epoch": 0.04113110539845758,
"grad_norm": 1.0447405576705933,
"learning_rate": 1.3615870153291255e-05,
"loss": 1.8523,
"step": 152
},
{
"epoch": 0.04167230415370045,
"grad_norm": 0.8516271710395813,
"learning_rate": 1.3796212804328224e-05,
"loss": 1.8528,
"step": 154
},
{
"epoch": 0.04221350290894331,
"grad_norm": 0.8437328934669495,
"learning_rate": 1.3976555455365195e-05,
"loss": 1.861,
"step": 156
},
{
"epoch": 0.04275470166418617,
"grad_norm": 0.851265549659729,
"learning_rate": 1.4156898106402164e-05,
"loss": 1.8315,
"step": 158
},
{
"epoch": 0.04329590041942904,
"grad_norm": 0.7337156534194946,
"learning_rate": 1.4337240757439135e-05,
"loss": 1.8354,
"step": 160
},
{
"epoch": 0.0438370991746719,
"grad_norm": 0.9754143357276917,
"learning_rate": 1.4517583408476104e-05,
"loss": 1.8252,
"step": 162
},
{
"epoch": 0.04437829792991476,
"grad_norm": 0.6172115802764893,
"learning_rate": 1.4697926059513075e-05,
"loss": 1.8094,
"step": 164
},
{
"epoch": 0.04491949668515762,
"grad_norm": 0.8304158449172974,
"learning_rate": 1.4878268710550044e-05,
"loss": 1.8078,
"step": 166
},
{
"epoch": 0.04546069544040049,
"grad_norm": 0.6388853788375854,
"learning_rate": 1.5058611361587017e-05,
"loss": 1.8106,
"step": 168
},
{
"epoch": 0.04600189419564335,
"grad_norm": 0.743231475353241,
"learning_rate": 1.5238954012623984e-05,
"loss": 1.8144,
"step": 170
},
{
"epoch": 0.04654309295088621,
"grad_norm": 0.6442289352416992,
"learning_rate": 1.5419296663660955e-05,
"loss": 1.7831,
"step": 172
},
{
"epoch": 0.04708429170612908,
"grad_norm": 0.6877187490463257,
"learning_rate": 1.559963931469793e-05,
"loss": 1.8043,
"step": 174
},
{
"epoch": 0.04762549046137194,
"grad_norm": 0.9389640688896179,
"learning_rate": 1.5779981965734897e-05,
"loss": 1.7869,
"step": 176
},
{
"epoch": 0.0481666892166148,
"grad_norm": 1.0456589460372925,
"learning_rate": 1.5960324616771868e-05,
"loss": 1.7681,
"step": 178
},
{
"epoch": 0.04870788797185766,
"grad_norm": 0.9617791175842285,
"learning_rate": 1.614066726780884e-05,
"loss": 1.7668,
"step": 180
},
{
"epoch": 0.04924908672710053,
"grad_norm": 0.9334360361099243,
"learning_rate": 1.632100991884581e-05,
"loss": 1.7893,
"step": 182
},
{
"epoch": 0.04979028548234339,
"grad_norm": 0.8952531814575195,
"learning_rate": 1.6501352569882777e-05,
"loss": 1.7758,
"step": 184
},
{
"epoch": 0.05033148423758625,
"grad_norm": 0.8544924855232239,
"learning_rate": 1.6681695220919748e-05,
"loss": 1.793,
"step": 186
},
{
"epoch": 0.05087268299282912,
"grad_norm": 0.7782765030860901,
"learning_rate": 1.686203787195672e-05,
"loss": 1.768,
"step": 188
},
{
"epoch": 0.05141388174807198,
"grad_norm": 0.7119695544242859,
"learning_rate": 1.704238052299369e-05,
"loss": 1.7685,
"step": 190
},
{
"epoch": 0.05195508050331484,
"grad_norm": 0.9119647145271301,
"learning_rate": 1.7222723174030657e-05,
"loss": 1.7706,
"step": 192
},
{
"epoch": 0.0524962792585577,
"grad_norm": 0.6414957642555237,
"learning_rate": 1.7403065825067628e-05,
"loss": 1.7626,
"step": 194
},
{
"epoch": 0.05303747801380057,
"grad_norm": 0.8069677352905273,
"learning_rate": 1.75834084761046e-05,
"loss": 1.7423,
"step": 196
},
{
"epoch": 0.05357867676904343,
"grad_norm": 0.6549937725067139,
"learning_rate": 1.776375112714157e-05,
"loss": 1.7428,
"step": 198
},
{
"epoch": 0.05411987552428629,
"grad_norm": 0.8064024448394775,
"learning_rate": 1.7944093778178538e-05,
"loss": 1.7448,
"step": 200
},
{
"epoch": 0.054661074279529154,
"grad_norm": 0.7182701826095581,
"learning_rate": 1.8124436429215512e-05,
"loss": 1.7248,
"step": 202
},
{
"epoch": 0.05520227303477202,
"grad_norm": 0.6997919678688049,
"learning_rate": 1.830477908025248e-05,
"loss": 1.7281,
"step": 204
},
{
"epoch": 0.05574347179001488,
"grad_norm": 0.7071277499198914,
"learning_rate": 1.848512173128945e-05,
"loss": 1.714,
"step": 206
},
{
"epoch": 0.056284670545257744,
"grad_norm": 0.6344273090362549,
"learning_rate": 1.866546438232642e-05,
"loss": 1.7463,
"step": 208
},
{
"epoch": 0.05682586930050061,
"grad_norm": 0.7192733883857727,
"learning_rate": 1.8845807033363392e-05,
"loss": 1.737,
"step": 210
},
{
"epoch": 0.05736706805574347,
"grad_norm": 0.7418521642684937,
"learning_rate": 1.9026149684400363e-05,
"loss": 1.7197,
"step": 212
},
{
"epoch": 0.057908266810986334,
"grad_norm": 0.875845730304718,
"learning_rate": 1.920649233543733e-05,
"loss": 1.6968,
"step": 214
},
{
"epoch": 0.058449465566229195,
"grad_norm": 0.7394037842750549,
"learning_rate": 1.9386834986474305e-05,
"loss": 1.7051,
"step": 216
},
{
"epoch": 0.05899066432147206,
"grad_norm": 0.6689572930335999,
"learning_rate": 1.9567177637511272e-05,
"loss": 1.7152,
"step": 218
},
{
"epoch": 0.05953186307671492,
"grad_norm": 0.7955539226531982,
"learning_rate": 1.9747520288548243e-05,
"loss": 1.7136,
"step": 220
},
{
"epoch": 0.060073061831957784,
"grad_norm": 0.7005388140678406,
"learning_rate": 1.9927862939585214e-05,
"loss": 1.7152,
"step": 222
},
{
"epoch": 0.06061426058720065,
"grad_norm": 0.6205731630325317,
"learning_rate": 2.0108205590622185e-05,
"loss": 1.6901,
"step": 224
},
{
"epoch": 0.06115545934244351,
"grad_norm": 0.7079929709434509,
"learning_rate": 2.0288548241659152e-05,
"loss": 1.6905,
"step": 226
},
{
"epoch": 0.061696658097686374,
"grad_norm": 0.6871302723884583,
"learning_rate": 2.0468890892696123e-05,
"loss": 1.6867,
"step": 228
},
{
"epoch": 0.062237856852929235,
"grad_norm": 0.7172162532806396,
"learning_rate": 2.0649233543733094e-05,
"loss": 1.685,
"step": 230
},
{
"epoch": 0.0627790556081721,
"grad_norm": 0.6729004979133606,
"learning_rate": 2.0829576194770065e-05,
"loss": 1.6961,
"step": 232
},
{
"epoch": 0.06332025436341496,
"grad_norm": 0.7335099577903748,
"learning_rate": 2.1009918845807033e-05,
"loss": 1.6797,
"step": 234
},
{
"epoch": 0.06386145311865783,
"grad_norm": 0.6398060321807861,
"learning_rate": 2.1190261496844003e-05,
"loss": 1.7037,
"step": 236
},
{
"epoch": 0.0644026518739007,
"grad_norm": 0.7026365399360657,
"learning_rate": 2.1370604147880974e-05,
"loss": 1.6698,
"step": 238
},
{
"epoch": 0.06494385062914355,
"grad_norm": 0.7972332239151001,
"learning_rate": 2.1550946798917945e-05,
"loss": 1.6866,
"step": 240
},
{
"epoch": 0.06548504938438642,
"grad_norm": 0.7363021969795227,
"learning_rate": 2.1731289449954913e-05,
"loss": 1.6879,
"step": 242
},
{
"epoch": 0.06602624813962928,
"grad_norm": 0.7071017026901245,
"learning_rate": 2.1911632100991887e-05,
"loss": 1.6898,
"step": 244
},
{
"epoch": 0.06656744689487214,
"grad_norm": 0.8030880093574524,
"learning_rate": 2.2091974752028858e-05,
"loss": 1.6734,
"step": 246
},
{
"epoch": 0.067108645650115,
"grad_norm": 0.7429569363594055,
"learning_rate": 2.2272317403065825e-05,
"loss": 1.6722,
"step": 248
},
{
"epoch": 0.06764984440535787,
"grad_norm": 0.6807804107666016,
"learning_rate": 2.2452660054102796e-05,
"loss": 1.6697,
"step": 250
},
{
"epoch": 0.06819104316060073,
"grad_norm": 0.6632562875747681,
"learning_rate": 2.2633002705139767e-05,
"loss": 1.6453,
"step": 252
},
{
"epoch": 0.0687322419158436,
"grad_norm": 0.6661680340766907,
"learning_rate": 2.2813345356176738e-05,
"loss": 1.6701,
"step": 254
},
{
"epoch": 0.06927344067108646,
"grad_norm": 0.6747105121612549,
"learning_rate": 2.2993688007213706e-05,
"loss": 1.6729,
"step": 256
},
{
"epoch": 0.06981463942632932,
"grad_norm": 0.7698473334312439,
"learning_rate": 2.317403065825068e-05,
"loss": 1.6528,
"step": 258
},
{
"epoch": 0.07035583818157218,
"grad_norm": 0.6111325621604919,
"learning_rate": 2.3354373309287647e-05,
"loss": 1.6412,
"step": 260
},
{
"epoch": 0.07089703693681504,
"grad_norm": 0.7405019998550415,
"learning_rate": 2.3534715960324618e-05,
"loss": 1.6564,
"step": 262
},
{
"epoch": 0.07143823569205791,
"grad_norm": 0.6702501773834229,
"learning_rate": 2.371505861136159e-05,
"loss": 1.654,
"step": 264
},
{
"epoch": 0.07197943444730077,
"grad_norm": 0.7076373100280762,
"learning_rate": 2.389540126239856e-05,
"loss": 1.6301,
"step": 266
},
{
"epoch": 0.07252063320254364,
"grad_norm": 0.7239627242088318,
"learning_rate": 2.4075743913435528e-05,
"loss": 1.6575,
"step": 268
},
{
"epoch": 0.0730618319577865,
"grad_norm": 0.753480076789856,
"learning_rate": 2.42560865644725e-05,
"loss": 1.6603,
"step": 270
},
{
"epoch": 0.07360303071302936,
"grad_norm": 0.7261641025543213,
"learning_rate": 2.443642921550947e-05,
"loss": 1.6449,
"step": 272
},
{
"epoch": 0.07414422946827222,
"grad_norm": 0.6315119862556458,
"learning_rate": 2.461677186654644e-05,
"loss": 1.6538,
"step": 274
},
{
"epoch": 0.07468542822351508,
"grad_norm": 0.5698412656784058,
"learning_rate": 2.4797114517583408e-05,
"loss": 1.6663,
"step": 276
},
{
"epoch": 0.07522662697875795,
"grad_norm": 0.5968983173370361,
"learning_rate": 2.497745716862038e-05,
"loss": 1.643,
"step": 278
},
{
"epoch": 0.07576782573400082,
"grad_norm": 0.561126172542572,
"learning_rate": 2.5157799819657353e-05,
"loss": 1.6301,
"step": 280
},
{
"epoch": 0.07630902448924368,
"grad_norm": 0.7290865778923035,
"learning_rate": 2.533814247069432e-05,
"loss": 1.6412,
"step": 282
},
{
"epoch": 0.07685022324448654,
"grad_norm": 0.7629122138023376,
"learning_rate": 2.5518485121731288e-05,
"loss": 1.6335,
"step": 284
},
{
"epoch": 0.0773914219997294,
"grad_norm": 0.5383496284484863,
"learning_rate": 2.5698827772768262e-05,
"loss": 1.6226,
"step": 286
},
{
"epoch": 0.07793262075497226,
"grad_norm": 0.7778373956680298,
"learning_rate": 2.5879170423805233e-05,
"loss": 1.6333,
"step": 288
},
{
"epoch": 0.07847381951021512,
"grad_norm": 0.6851366758346558,
"learning_rate": 2.60595130748422e-05,
"loss": 1.6251,
"step": 290
},
{
"epoch": 0.079015018265458,
"grad_norm": 0.5947225689888,
"learning_rate": 2.623985572587917e-05,
"loss": 1.6298,
"step": 292
},
{
"epoch": 0.07955621702070086,
"grad_norm": 0.9742544889450073,
"learning_rate": 2.6420198376916146e-05,
"loss": 1.6252,
"step": 294
},
{
"epoch": 0.08009741577594372,
"grad_norm": 1.2064323425292969,
"learning_rate": 2.6600541027953113e-05,
"loss": 1.6152,
"step": 296
},
{
"epoch": 0.08063861453118658,
"grad_norm": 1.0506716966629028,
"learning_rate": 2.678088367899008e-05,
"loss": 1.6351,
"step": 298
},
{
"epoch": 0.08117981328642944,
"grad_norm": 1.2992738485336304,
"learning_rate": 2.696122633002705e-05,
"loss": 1.6193,
"step": 300
}
],
"logging_steps": 2,
"max_steps": 11088,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2470943491083469e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}