sla-cpt-base / q2.5-ga /checkpoint-900 /trainer_state.json
tvkain's picture
Add files using upload-large-folder tool
19d6272 verified
raw
history blame
79 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3153606937935263,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00035040077088169594,
"grad_norm": 6.5142412185668945,
"learning_rate": 0.0,
"loss": 5.324,
"step": 1
},
{
"epoch": 0.0007008015417633919,
"grad_norm": 6.758334159851074,
"learning_rate": 6.993006993006994e-07,
"loss": 5.3405,
"step": 2
},
{
"epoch": 0.0014016030835267838,
"grad_norm": 6.22674036026001,
"learning_rate": 2.0979020979020983e-06,
"loss": 5.3286,
"step": 4
},
{
"epoch": 0.0021024046252901755,
"grad_norm": 5.438386917114258,
"learning_rate": 3.496503496503497e-06,
"loss": 5.25,
"step": 6
},
{
"epoch": 0.0028032061670535675,
"grad_norm": 3.365504741668701,
"learning_rate": 4.895104895104895e-06,
"loss": 5.2821,
"step": 8
},
{
"epoch": 0.0035040077088169595,
"grad_norm": 7.186147212982178,
"learning_rate": 6.2937062937062944e-06,
"loss": 5.21,
"step": 10
},
{
"epoch": 0.004204809250580351,
"grad_norm": 4.960826396942139,
"learning_rate": 7.692307692307694e-06,
"loss": 5.0759,
"step": 12
},
{
"epoch": 0.004905610792343743,
"grad_norm": 4.001464366912842,
"learning_rate": 9.090909090909091e-06,
"loss": 5.1092,
"step": 14
},
{
"epoch": 0.005606412334107135,
"grad_norm": 3.2986342906951904,
"learning_rate": 1.048951048951049e-05,
"loss": 4.93,
"step": 16
},
{
"epoch": 0.006307213875870527,
"grad_norm": 2.5407276153564453,
"learning_rate": 1.188811188811189e-05,
"loss": 4.8535,
"step": 18
},
{
"epoch": 0.007008015417633919,
"grad_norm": 2.211754083633423,
"learning_rate": 1.3286713286713287e-05,
"loss": 4.74,
"step": 20
},
{
"epoch": 0.007708816959397311,
"grad_norm": 1.6710195541381836,
"learning_rate": 1.4685314685314686e-05,
"loss": 4.609,
"step": 22
},
{
"epoch": 0.008409618501160702,
"grad_norm": 1.280752182006836,
"learning_rate": 1.6083916083916083e-05,
"loss": 4.4879,
"step": 24
},
{
"epoch": 0.009110420042924094,
"grad_norm": 1.312186598777771,
"learning_rate": 1.7482517482517483e-05,
"loss": 4.3995,
"step": 26
},
{
"epoch": 0.009811221584687486,
"grad_norm": 1.3315190076828003,
"learning_rate": 1.888111888111888e-05,
"loss": 4.3005,
"step": 28
},
{
"epoch": 0.010512023126450878,
"grad_norm": 1.3252590894699097,
"learning_rate": 2.027972027972028e-05,
"loss": 4.1952,
"step": 30
},
{
"epoch": 0.01121282466821427,
"grad_norm": 1.3794758319854736,
"learning_rate": 2.1678321678321677e-05,
"loss": 4.1459,
"step": 32
},
{
"epoch": 0.011913626209977662,
"grad_norm": 1.1808068752288818,
"learning_rate": 2.307692307692308e-05,
"loss": 4.034,
"step": 34
},
{
"epoch": 0.012614427751741054,
"grad_norm": 1.31660795211792,
"learning_rate": 2.4475524475524478e-05,
"loss": 3.926,
"step": 36
},
{
"epoch": 0.013315229293504446,
"grad_norm": 1.0347495079040527,
"learning_rate": 2.5874125874125877e-05,
"loss": 3.8812,
"step": 38
},
{
"epoch": 0.014016030835267838,
"grad_norm": 1.050775408744812,
"learning_rate": 2.7272727272727273e-05,
"loss": 3.7787,
"step": 40
},
{
"epoch": 0.01471683237703123,
"grad_norm": 0.9461761713027954,
"learning_rate": 2.8671328671328672e-05,
"loss": 3.6738,
"step": 42
},
{
"epoch": 0.015417633918794622,
"grad_norm": 1.0460454225540161,
"learning_rate": 3.0069930069930068e-05,
"loss": 3.6385,
"step": 44
},
{
"epoch": 0.016118435460558012,
"grad_norm": 1.0687191486358643,
"learning_rate": 3.146853146853147e-05,
"loss": 3.5701,
"step": 46
},
{
"epoch": 0.016819237002321404,
"grad_norm": 1.4722611904144287,
"learning_rate": 3.2867132867132866e-05,
"loss": 3.5438,
"step": 48
},
{
"epoch": 0.017520038544084796,
"grad_norm": 1.1305724382400513,
"learning_rate": 3.4265734265734265e-05,
"loss": 3.4694,
"step": 50
},
{
"epoch": 0.018220840085848188,
"grad_norm": 0.9322625994682312,
"learning_rate": 3.566433566433567e-05,
"loss": 3.4488,
"step": 52
},
{
"epoch": 0.01892164162761158,
"grad_norm": 1.2441555261611938,
"learning_rate": 3.7062937062937064e-05,
"loss": 3.4289,
"step": 54
},
{
"epoch": 0.019622443169374972,
"grad_norm": 0.9397731423377991,
"learning_rate": 3.846153846153846e-05,
"loss": 3.4021,
"step": 56
},
{
"epoch": 0.020323244711138364,
"grad_norm": 1.3261164426803589,
"learning_rate": 3.986013986013986e-05,
"loss": 3.3575,
"step": 58
},
{
"epoch": 0.021024046252901756,
"grad_norm": 1.08541738986969,
"learning_rate": 4.125874125874126e-05,
"loss": 3.3403,
"step": 60
},
{
"epoch": 0.021724847794665148,
"grad_norm": 0.8626166582107544,
"learning_rate": 4.265734265734266e-05,
"loss": 3.3306,
"step": 62
},
{
"epoch": 0.02242564933642854,
"grad_norm": 1.0596344470977783,
"learning_rate": 4.405594405594406e-05,
"loss": 3.2779,
"step": 64
},
{
"epoch": 0.023126450878191932,
"grad_norm": 1.511917233467102,
"learning_rate": 4.545454545454546e-05,
"loss": 3.2759,
"step": 66
},
{
"epoch": 0.023827252419955324,
"grad_norm": 1.2062046527862549,
"learning_rate": 4.685314685314686e-05,
"loss": 3.2545,
"step": 68
},
{
"epoch": 0.024528053961718716,
"grad_norm": 1.1399930715560913,
"learning_rate": 4.825174825174825e-05,
"loss": 3.2235,
"step": 70
},
{
"epoch": 0.025228855503482108,
"grad_norm": 0.8960133790969849,
"learning_rate": 4.9650349650349656e-05,
"loss": 3.2025,
"step": 72
},
{
"epoch": 0.0259296570452455,
"grad_norm": 1.3042056560516357,
"learning_rate": 5.1048951048951055e-05,
"loss": 3.1475,
"step": 74
},
{
"epoch": 0.026630458587008892,
"grad_norm": 1.186320424079895,
"learning_rate": 5.244755244755245e-05,
"loss": 3.1759,
"step": 76
},
{
"epoch": 0.027331260128772284,
"grad_norm": 1.2691158056259155,
"learning_rate": 5.384615384615385e-05,
"loss": 3.1296,
"step": 78
},
{
"epoch": 0.028032061670535676,
"grad_norm": 0.7816159129142761,
"learning_rate": 5.524475524475524e-05,
"loss": 3.1017,
"step": 80
},
{
"epoch": 0.028732863212299068,
"grad_norm": 1.1489295959472656,
"learning_rate": 5.664335664335665e-05,
"loss": 3.1151,
"step": 82
},
{
"epoch": 0.02943366475406246,
"grad_norm": 1.5686062574386597,
"learning_rate": 5.8041958041958044e-05,
"loss": 3.114,
"step": 84
},
{
"epoch": 0.030134466295825852,
"grad_norm": 1.4421433210372925,
"learning_rate": 5.944055944055944e-05,
"loss": 3.0946,
"step": 86
},
{
"epoch": 0.030835267837589244,
"grad_norm": 1.335250973701477,
"learning_rate": 6.083916083916085e-05,
"loss": 3.084,
"step": 88
},
{
"epoch": 0.03153606937935263,
"grad_norm": 0.970507800579071,
"learning_rate": 6.223776223776224e-05,
"loss": 3.1163,
"step": 90
},
{
"epoch": 0.032236870921116025,
"grad_norm": 1.2849407196044922,
"learning_rate": 6.363636363636364e-05,
"loss": 3.063,
"step": 92
},
{
"epoch": 0.032937672462879417,
"grad_norm": 1.0378247499465942,
"learning_rate": 6.503496503496504e-05,
"loss": 3.0223,
"step": 94
},
{
"epoch": 0.03363847400464281,
"grad_norm": 1.3139392137527466,
"learning_rate": 6.643356643356644e-05,
"loss": 3.0572,
"step": 96
},
{
"epoch": 0.0343392755464062,
"grad_norm": 1.254752278327942,
"learning_rate": 6.783216783216784e-05,
"loss": 3.0408,
"step": 98
},
{
"epoch": 0.03504007708816959,
"grad_norm": 1.3333168029785156,
"learning_rate": 6.923076923076924e-05,
"loss": 3.0185,
"step": 100
},
{
"epoch": 0.035740878629932984,
"grad_norm": 1.2795464992523193,
"learning_rate": 7.062937062937062e-05,
"loss": 3.0328,
"step": 102
},
{
"epoch": 0.036441680171696376,
"grad_norm": 1.2025645971298218,
"learning_rate": 7.202797202797204e-05,
"loss": 3.0303,
"step": 104
},
{
"epoch": 0.03714248171345977,
"grad_norm": 1.1741266250610352,
"learning_rate": 7.342657342657343e-05,
"loss": 3.0252,
"step": 106
},
{
"epoch": 0.03784328325522316,
"grad_norm": 1.2022653818130493,
"learning_rate": 7.482517482517482e-05,
"loss": 3.0183,
"step": 108
},
{
"epoch": 0.03854408479698655,
"grad_norm": 1.1950666904449463,
"learning_rate": 7.622377622377622e-05,
"loss": 2.9804,
"step": 110
},
{
"epoch": 0.039244886338749944,
"grad_norm": 1.5780822038650513,
"learning_rate": 7.762237762237763e-05,
"loss": 2.9804,
"step": 112
},
{
"epoch": 0.039945687880513336,
"grad_norm": 1.0478655099868774,
"learning_rate": 7.902097902097903e-05,
"loss": 2.9894,
"step": 114
},
{
"epoch": 0.04064648942227673,
"grad_norm": 1.1782268285751343,
"learning_rate": 8.041958041958042e-05,
"loss": 2.9717,
"step": 116
},
{
"epoch": 0.04134729096404012,
"grad_norm": 1.0321820974349976,
"learning_rate": 8.181818181818183e-05,
"loss": 2.9776,
"step": 118
},
{
"epoch": 0.04204809250580351,
"grad_norm": 0.9697206020355225,
"learning_rate": 8.321678321678323e-05,
"loss": 2.9804,
"step": 120
},
{
"epoch": 0.042748894047566904,
"grad_norm": 1.1984606981277466,
"learning_rate": 8.461538461538461e-05,
"loss": 2.9495,
"step": 122
},
{
"epoch": 0.043449695589330296,
"grad_norm": 0.9830178618431091,
"learning_rate": 8.601398601398601e-05,
"loss": 2.9656,
"step": 124
},
{
"epoch": 0.04415049713109369,
"grad_norm": 1.3105114698410034,
"learning_rate": 8.741258741258743e-05,
"loss": 2.9306,
"step": 126
},
{
"epoch": 0.04485129867285708,
"grad_norm": 1.3499157428741455,
"learning_rate": 8.881118881118881e-05,
"loss": 2.9381,
"step": 128
},
{
"epoch": 0.04555210021462047,
"grad_norm": 0.9977575540542603,
"learning_rate": 9.020979020979021e-05,
"loss": 2.907,
"step": 130
},
{
"epoch": 0.046252901756383864,
"grad_norm": 1.2331498861312866,
"learning_rate": 9.160839160839161e-05,
"loss": 2.9224,
"step": 132
},
{
"epoch": 0.046953703298147256,
"grad_norm": 1.451253890991211,
"learning_rate": 9.300699300699301e-05,
"loss": 2.9202,
"step": 134
},
{
"epoch": 0.04765450483991065,
"grad_norm": 1.2146471738815308,
"learning_rate": 9.440559440559441e-05,
"loss": 2.9098,
"step": 136
},
{
"epoch": 0.04835530638167404,
"grad_norm": 1.0873245000839233,
"learning_rate": 9.580419580419581e-05,
"loss": 2.9218,
"step": 138
},
{
"epoch": 0.04905610792343743,
"grad_norm": 1.276413083076477,
"learning_rate": 9.72027972027972e-05,
"loss": 2.8947,
"step": 140
},
{
"epoch": 0.049756909465200824,
"grad_norm": 1.126065731048584,
"learning_rate": 9.86013986013986e-05,
"loss": 2.8788,
"step": 142
},
{
"epoch": 0.050457711006964216,
"grad_norm": 1.5177017450332642,
"learning_rate": 0.0001,
"loss": 2.9043,
"step": 144
},
{
"epoch": 0.05115851254872761,
"grad_norm": 1.3744112253189087,
"learning_rate": 9.99998657109765e-05,
"loss": 2.888,
"step": 146
},
{
"epoch": 0.051859314090491,
"grad_norm": 1.7921055555343628,
"learning_rate": 9.999946284462733e-05,
"loss": 2.8631,
"step": 148
},
{
"epoch": 0.05256011563225439,
"grad_norm": 1.1755317449569702,
"learning_rate": 9.999879140311652e-05,
"loss": 2.8735,
"step": 150
},
{
"epoch": 0.053260917174017784,
"grad_norm": 0.846362292766571,
"learning_rate": 9.999785139005073e-05,
"loss": 2.8768,
"step": 152
},
{
"epoch": 0.053961718715781176,
"grad_norm": 0.9867280721664429,
"learning_rate": 9.999664281047933e-05,
"loss": 2.8859,
"step": 154
},
{
"epoch": 0.05466252025754457,
"grad_norm": 0.9751666188240051,
"learning_rate": 9.999516567089429e-05,
"loss": 2.8497,
"step": 156
},
{
"epoch": 0.05536332179930796,
"grad_norm": 1.0603703260421753,
"learning_rate": 9.999341997923011e-05,
"loss": 2.8404,
"step": 158
},
{
"epoch": 0.05606412334107135,
"grad_norm": 1.0447975397109985,
"learning_rate": 9.999140574486392e-05,
"loss": 2.9092,
"step": 160
},
{
"epoch": 0.056764924882834744,
"grad_norm": 1.3046443462371826,
"learning_rate": 9.998912297861527e-05,
"loss": 2.8971,
"step": 162
},
{
"epoch": 0.057465726424598136,
"grad_norm": 1.1029243469238281,
"learning_rate": 9.998657169274622e-05,
"loss": 2.8834,
"step": 164
},
{
"epoch": 0.05816652796636153,
"grad_norm": 0.8594210743904114,
"learning_rate": 9.99837519009611e-05,
"loss": 2.8361,
"step": 166
},
{
"epoch": 0.05886732950812492,
"grad_norm": 0.8585363030433655,
"learning_rate": 9.998066361840665e-05,
"loss": 2.8782,
"step": 168
},
{
"epoch": 0.05956813104988831,
"grad_norm": 0.693467378616333,
"learning_rate": 9.997730686167173e-05,
"loss": 2.8537,
"step": 170
},
{
"epoch": 0.060268932591651704,
"grad_norm": 0.8418940305709839,
"learning_rate": 9.997368164878738e-05,
"loss": 2.8294,
"step": 172
},
{
"epoch": 0.060969734133415096,
"grad_norm": 0.9938271045684814,
"learning_rate": 9.996978799922665e-05,
"loss": 2.8458,
"step": 174
},
{
"epoch": 0.06167053567517849,
"grad_norm": 1.0347217321395874,
"learning_rate": 9.99656259339045e-05,
"loss": 2.8081,
"step": 176
},
{
"epoch": 0.06237133721694188,
"grad_norm": 0.9216743111610413,
"learning_rate": 9.996119547517775e-05,
"loss": 2.8655,
"step": 178
},
{
"epoch": 0.06307213875870527,
"grad_norm": 1.0579859018325806,
"learning_rate": 9.995649664684486e-05,
"loss": 2.823,
"step": 180
},
{
"epoch": 0.06377294030046866,
"grad_norm": 0.9864194393157959,
"learning_rate": 9.995152947414586e-05,
"loss": 2.8081,
"step": 182
},
{
"epoch": 0.06447374184223205,
"grad_norm": 0.8999143838882446,
"learning_rate": 9.994629398376226e-05,
"loss": 2.7947,
"step": 184
},
{
"epoch": 0.06517454338399545,
"grad_norm": 0.9121315479278564,
"learning_rate": 9.994079020381676e-05,
"loss": 2.8253,
"step": 186
},
{
"epoch": 0.06587534492575883,
"grad_norm": 0.8578842282295227,
"learning_rate": 9.993501816387329e-05,
"loss": 2.7548,
"step": 188
},
{
"epoch": 0.06657614646752223,
"grad_norm": 0.8564820289611816,
"learning_rate": 9.992897789493672e-05,
"loss": 2.8361,
"step": 190
},
{
"epoch": 0.06727694800928562,
"grad_norm": 0.8013344407081604,
"learning_rate": 9.992266942945269e-05,
"loss": 2.8606,
"step": 192
},
{
"epoch": 0.06797774955104902,
"grad_norm": 0.7343975901603699,
"learning_rate": 9.991609280130752e-05,
"loss": 2.7947,
"step": 194
},
{
"epoch": 0.0686785510928124,
"grad_norm": 0.7338536381721497,
"learning_rate": 9.990924804582797e-05,
"loss": 2.7492,
"step": 196
},
{
"epoch": 0.0693793526345758,
"grad_norm": 0.828781008720398,
"learning_rate": 9.990213519978109e-05,
"loss": 2.8013,
"step": 198
},
{
"epoch": 0.07008015417633918,
"grad_norm": 0.7156624794006348,
"learning_rate": 9.989475430137391e-05,
"loss": 2.7943,
"step": 200
},
{
"epoch": 0.07078095571810258,
"grad_norm": 0.6014353632926941,
"learning_rate": 9.988710539025341e-05,
"loss": 2.8099,
"step": 202
},
{
"epoch": 0.07148175725986597,
"grad_norm": 0.6569661498069763,
"learning_rate": 9.987918850750619e-05,
"loss": 2.8125,
"step": 204
},
{
"epoch": 0.07218255880162937,
"grad_norm": 0.6558775305747986,
"learning_rate": 9.987100369565825e-05,
"loss": 2.7487,
"step": 206
},
{
"epoch": 0.07288336034339275,
"grad_norm": 0.6454245448112488,
"learning_rate": 9.986255099867481e-05,
"loss": 2.7648,
"step": 208
},
{
"epoch": 0.07358416188515615,
"grad_norm": 0.5741921067237854,
"learning_rate": 9.985383046196004e-05,
"loss": 2.7743,
"step": 210
},
{
"epoch": 0.07428496342691954,
"grad_norm": 0.5875937938690186,
"learning_rate": 9.984484213235685e-05,
"loss": 2.7728,
"step": 212
},
{
"epoch": 0.07498576496868294,
"grad_norm": 0.6638422012329102,
"learning_rate": 9.98355860581466e-05,
"loss": 2.7504,
"step": 214
},
{
"epoch": 0.07568656651044632,
"grad_norm": 1.1614341735839844,
"learning_rate": 9.982606228904884e-05,
"loss": 2.7923,
"step": 216
},
{
"epoch": 0.07638736805220972,
"grad_norm": 1.005254864692688,
"learning_rate": 9.981627087622108e-05,
"loss": 2.76,
"step": 218
},
{
"epoch": 0.0770881695939731,
"grad_norm": 0.7738555669784546,
"learning_rate": 9.980621187225852e-05,
"loss": 2.7866,
"step": 220
},
{
"epoch": 0.0777889711357365,
"grad_norm": 0.9469527006149292,
"learning_rate": 9.979588533119367e-05,
"loss": 2.8012,
"step": 222
},
{
"epoch": 0.07848977267749989,
"grad_norm": 0.9031473398208618,
"learning_rate": 9.978529130849619e-05,
"loss": 2.7522,
"step": 224
},
{
"epoch": 0.07919057421926329,
"grad_norm": 0.9450514912605286,
"learning_rate": 9.977442986107252e-05,
"loss": 2.7791,
"step": 226
},
{
"epoch": 0.07989137576102667,
"grad_norm": 0.7259206771850586,
"learning_rate": 9.97633010472656e-05,
"loss": 2.7237,
"step": 228
},
{
"epoch": 0.08059217730279007,
"grad_norm": 0.6595309972763062,
"learning_rate": 9.975190492685451e-05,
"loss": 2.7284,
"step": 230
},
{
"epoch": 0.08129297884455346,
"grad_norm": 0.7696382999420166,
"learning_rate": 9.974024156105422e-05,
"loss": 2.7631,
"step": 232
},
{
"epoch": 0.08199378038631686,
"grad_norm": 0.7305110096931458,
"learning_rate": 9.972831101251521e-05,
"loss": 2.7793,
"step": 234
},
{
"epoch": 0.08269458192808024,
"grad_norm": 0.6039514541625977,
"learning_rate": 9.971611334532314e-05,
"loss": 2.7669,
"step": 236
},
{
"epoch": 0.08339538346984364,
"grad_norm": 0.5824711918830872,
"learning_rate": 9.970364862499852e-05,
"loss": 2.7476,
"step": 238
},
{
"epoch": 0.08409618501160702,
"grad_norm": 0.6831758618354797,
"learning_rate": 9.969091691849637e-05,
"loss": 2.7098,
"step": 240
},
{
"epoch": 0.08479698655337042,
"grad_norm": 0.6469074487686157,
"learning_rate": 9.967791829420581e-05,
"loss": 2.7609,
"step": 242
},
{
"epoch": 0.08549778809513381,
"grad_norm": 0.5876832604408264,
"learning_rate": 9.966465282194976e-05,
"loss": 2.7306,
"step": 244
},
{
"epoch": 0.08619858963689721,
"grad_norm": 0.6310129761695862,
"learning_rate": 9.965112057298451e-05,
"loss": 2.7283,
"step": 246
},
{
"epoch": 0.08689939117866059,
"grad_norm": 0.6113069653511047,
"learning_rate": 9.963732161999935e-05,
"loss": 2.7274,
"step": 248
},
{
"epoch": 0.08760019272042399,
"grad_norm": 1.0655111074447632,
"learning_rate": 9.96232560371162e-05,
"loss": 2.7022,
"step": 250
},
{
"epoch": 0.08830099426218738,
"grad_norm": 0.8412613272666931,
"learning_rate": 9.960892389988918e-05,
"loss": 2.7213,
"step": 252
},
{
"epoch": 0.08900179580395078,
"grad_norm": 0.7329776883125305,
"learning_rate": 9.959432528530428e-05,
"loss": 2.7343,
"step": 254
},
{
"epoch": 0.08970259734571416,
"grad_norm": 0.702498197555542,
"learning_rate": 9.95794602717788e-05,
"loss": 2.7642,
"step": 256
},
{
"epoch": 0.09040339888747755,
"grad_norm": 0.6936408281326294,
"learning_rate": 9.95643289391611e-05,
"loss": 2.7081,
"step": 258
},
{
"epoch": 0.09110420042924094,
"grad_norm": 0.664743959903717,
"learning_rate": 9.954893136873005e-05,
"loss": 2.7054,
"step": 260
},
{
"epoch": 0.09180500197100433,
"grad_norm": 0.5716791152954102,
"learning_rate": 9.953326764319463e-05,
"loss": 2.6751,
"step": 262
},
{
"epoch": 0.09250580351276773,
"grad_norm": 0.6207195520401001,
"learning_rate": 9.95173378466935e-05,
"loss": 2.6945,
"step": 264
},
{
"epoch": 0.09320660505453111,
"grad_norm": 0.6572092771530151,
"learning_rate": 9.950114206479453e-05,
"loss": 2.6989,
"step": 266
},
{
"epoch": 0.09390740659629451,
"grad_norm": 0.7676830887794495,
"learning_rate": 9.948468038449435e-05,
"loss": 2.7613,
"step": 268
},
{
"epoch": 0.0946082081380579,
"grad_norm": 0.5810503959655762,
"learning_rate": 9.946795289421787e-05,
"loss": 2.7234,
"step": 270
},
{
"epoch": 0.0953090096798213,
"grad_norm": 0.6459682583808899,
"learning_rate": 9.945095968381784e-05,
"loss": 2.717,
"step": 272
},
{
"epoch": 0.09600981122158468,
"grad_norm": 0.6498464345932007,
"learning_rate": 9.94337008445743e-05,
"loss": 2.7389,
"step": 274
},
{
"epoch": 0.09671061276334808,
"grad_norm": 0.6287350654602051,
"learning_rate": 9.941617646919421e-05,
"loss": 2.681,
"step": 276
},
{
"epoch": 0.09741141430511147,
"grad_norm": 0.7516258955001831,
"learning_rate": 9.939838665181076e-05,
"loss": 2.6696,
"step": 278
},
{
"epoch": 0.09811221584687486,
"grad_norm": 0.6962350606918335,
"learning_rate": 9.938033148798307e-05,
"loss": 2.6971,
"step": 280
},
{
"epoch": 0.09881301738863825,
"grad_norm": 0.6605144739151001,
"learning_rate": 9.936201107469555e-05,
"loss": 2.6999,
"step": 282
},
{
"epoch": 0.09951381893040165,
"grad_norm": 0.5991240739822388,
"learning_rate": 9.93434255103574e-05,
"loss": 2.6936,
"step": 284
},
{
"epoch": 0.10021462047216503,
"grad_norm": 0.5660961866378784,
"learning_rate": 9.932457489480213e-05,
"loss": 2.686,
"step": 286
},
{
"epoch": 0.10091542201392843,
"grad_norm": 0.690290093421936,
"learning_rate": 9.930545932928698e-05,
"loss": 2.6809,
"step": 288
},
{
"epoch": 0.10161622355569182,
"grad_norm": 0.7119167447090149,
"learning_rate": 9.928607891649234e-05,
"loss": 2.7221,
"step": 290
},
{
"epoch": 0.10231702509745522,
"grad_norm": 0.7049365639686584,
"learning_rate": 9.926643376052131e-05,
"loss": 2.6569,
"step": 292
},
{
"epoch": 0.1030178266392186,
"grad_norm": 0.6691743731498718,
"learning_rate": 9.924652396689902e-05,
"loss": 2.6751,
"step": 294
},
{
"epoch": 0.103718628180982,
"grad_norm": 0.5533433556556702,
"learning_rate": 9.922634964257215e-05,
"loss": 2.7064,
"step": 296
},
{
"epoch": 0.10441942972274539,
"grad_norm": 0.6669672727584839,
"learning_rate": 9.920591089590831e-05,
"loss": 2.687,
"step": 298
},
{
"epoch": 0.10512023126450878,
"grad_norm": 0.8539720773696899,
"learning_rate": 9.918520783669549e-05,
"loss": 2.6968,
"step": 300
},
{
"epoch": 0.10582103280627217,
"grad_norm": 0.827905535697937,
"learning_rate": 9.916424057614142e-05,
"loss": 2.7339,
"step": 302
},
{
"epoch": 0.10652183434803557,
"grad_norm": 0.7071542143821716,
"learning_rate": 9.9143009226873e-05,
"loss": 2.67,
"step": 304
},
{
"epoch": 0.10722263588979895,
"grad_norm": 0.6667853593826294,
"learning_rate": 9.912151390293575e-05,
"loss": 2.7113,
"step": 306
},
{
"epoch": 0.10792343743156235,
"grad_norm": 0.49210044741630554,
"learning_rate": 9.90997547197931e-05,
"loss": 2.7034,
"step": 308
},
{
"epoch": 0.10862423897332574,
"grad_norm": 0.5823047757148743,
"learning_rate": 9.907773179432581e-05,
"loss": 2.6815,
"step": 310
},
{
"epoch": 0.10932504051508914,
"grad_norm": 0.5159279704093933,
"learning_rate": 9.905544524483138e-05,
"loss": 2.7055,
"step": 312
},
{
"epoch": 0.11002584205685252,
"grad_norm": 0.5294278264045715,
"learning_rate": 9.903289519102338e-05,
"loss": 2.6821,
"step": 314
},
{
"epoch": 0.11072664359861592,
"grad_norm": 0.5865507125854492,
"learning_rate": 9.901008175403078e-05,
"loss": 2.698,
"step": 316
},
{
"epoch": 0.1114274451403793,
"grad_norm": 0.7102755904197693,
"learning_rate": 9.898700505639735e-05,
"loss": 2.693,
"step": 318
},
{
"epoch": 0.1121282466821427,
"grad_norm": 0.8151699900627136,
"learning_rate": 9.8963665222081e-05,
"loss": 2.6482,
"step": 320
},
{
"epoch": 0.11282904822390609,
"grad_norm": 0.5769193172454834,
"learning_rate": 9.894006237645304e-05,
"loss": 2.6893,
"step": 322
},
{
"epoch": 0.11352984976566949,
"grad_norm": 0.6606284976005554,
"learning_rate": 9.891619664629762e-05,
"loss": 2.6859,
"step": 324
},
{
"epoch": 0.11423065130743287,
"grad_norm": 0.5883016586303711,
"learning_rate": 9.889206815981094e-05,
"loss": 2.6622,
"step": 326
},
{
"epoch": 0.11493145284919627,
"grad_norm": 0.5413339734077454,
"learning_rate": 9.886767704660067e-05,
"loss": 2.6718,
"step": 328
},
{
"epoch": 0.11563225439095966,
"grad_norm": 0.7391770482063293,
"learning_rate": 9.884302343768512e-05,
"loss": 2.6695,
"step": 330
},
{
"epoch": 0.11633305593272306,
"grad_norm": 0.7529366612434387,
"learning_rate": 9.881810746549267e-05,
"loss": 2.7341,
"step": 332
},
{
"epoch": 0.11703385747448644,
"grad_norm": 0.6971571445465088,
"learning_rate": 9.8792929263861e-05,
"loss": 2.6444,
"step": 334
},
{
"epoch": 0.11773465901624984,
"grad_norm": 0.544129490852356,
"learning_rate": 9.876748896803633e-05,
"loss": 2.7351,
"step": 336
},
{
"epoch": 0.11843546055801323,
"grad_norm": 0.6561135649681091,
"learning_rate": 9.874178671467277e-05,
"loss": 2.6896,
"step": 338
},
{
"epoch": 0.11913626209977662,
"grad_norm": 0.6607089042663574,
"learning_rate": 9.871582264183155e-05,
"loss": 2.6664,
"step": 340
},
{
"epoch": 0.11983706364154001,
"grad_norm": 0.6727411150932312,
"learning_rate": 9.868959688898023e-05,
"loss": 2.68,
"step": 342
},
{
"epoch": 0.12053786518330341,
"grad_norm": 0.5672718286514282,
"learning_rate": 9.86631095969921e-05,
"loss": 2.6639,
"step": 344
},
{
"epoch": 0.1212386667250668,
"grad_norm": 0.7188961505889893,
"learning_rate": 9.86363609081452e-05,
"loss": 2.6604,
"step": 346
},
{
"epoch": 0.12193946826683019,
"grad_norm": 0.9785953760147095,
"learning_rate": 9.86093509661218e-05,
"loss": 2.6557,
"step": 348
},
{
"epoch": 0.12264026980859358,
"grad_norm": 0.7856999635696411,
"learning_rate": 9.85820799160074e-05,
"loss": 2.6418,
"step": 350
},
{
"epoch": 0.12334107135035698,
"grad_norm": 0.5956946015357971,
"learning_rate": 9.855454790429015e-05,
"loss": 2.658,
"step": 352
},
{
"epoch": 0.12404187289212036,
"grad_norm": 0.6523074507713318,
"learning_rate": 9.852675507885991e-05,
"loss": 2.6743,
"step": 354
},
{
"epoch": 0.12474267443388376,
"grad_norm": 0.71266108751297,
"learning_rate": 9.849870158900753e-05,
"loss": 2.6805,
"step": 356
},
{
"epoch": 0.12544347597564715,
"grad_norm": 0.5674154162406921,
"learning_rate": 9.847038758542404e-05,
"loss": 2.6678,
"step": 358
},
{
"epoch": 0.12614427751741053,
"grad_norm": 0.5430511236190796,
"learning_rate": 9.844181322019983e-05,
"loss": 2.643,
"step": 360
},
{
"epoch": 0.12684507905917394,
"grad_norm": 0.508791983127594,
"learning_rate": 9.841297864682388e-05,
"loss": 2.6524,
"step": 362
},
{
"epoch": 0.12754588060093733,
"grad_norm": 0.6082713603973389,
"learning_rate": 9.838388402018282e-05,
"loss": 2.6892,
"step": 364
},
{
"epoch": 0.1282466821427007,
"grad_norm": 0.6065689325332642,
"learning_rate": 9.835452949656022e-05,
"loss": 2.6083,
"step": 366
},
{
"epoch": 0.1289474836844641,
"grad_norm": 0.5220572352409363,
"learning_rate": 9.83249152336357e-05,
"loss": 2.6573,
"step": 368
},
{
"epoch": 0.1296482852262275,
"grad_norm": 0.568534791469574,
"learning_rate": 9.829504139048406e-05,
"loss": 2.6266,
"step": 370
},
{
"epoch": 0.1303490867679909,
"grad_norm": 0.6165401339530945,
"learning_rate": 9.826490812757452e-05,
"loss": 2.6928,
"step": 372
},
{
"epoch": 0.13104988830975428,
"grad_norm": 0.5951835513114929,
"learning_rate": 9.823451560676966e-05,
"loss": 2.6468,
"step": 374
},
{
"epoch": 0.13175068985151767,
"grad_norm": 0.4942519962787628,
"learning_rate": 9.820386399132482e-05,
"loss": 2.6493,
"step": 376
},
{
"epoch": 0.13245149139328108,
"grad_norm": 0.6185161471366882,
"learning_rate": 9.8172953445887e-05,
"loss": 2.6741,
"step": 378
},
{
"epoch": 0.13315229293504446,
"grad_norm": 0.5588895678520203,
"learning_rate": 9.814178413649407e-05,
"loss": 2.6393,
"step": 380
},
{
"epoch": 0.13385309447680785,
"grad_norm": 0.6289598941802979,
"learning_rate": 9.811035623057387e-05,
"loss": 2.6022,
"step": 382
},
{
"epoch": 0.13455389601857123,
"grad_norm": 0.6258370280265808,
"learning_rate": 9.807866989694334e-05,
"loss": 2.6033,
"step": 384
},
{
"epoch": 0.13525469756033462,
"grad_norm": 0.6390899419784546,
"learning_rate": 9.804672530580754e-05,
"loss": 2.6413,
"step": 386
},
{
"epoch": 0.13595549910209803,
"grad_norm": 0.6844115257263184,
"learning_rate": 9.801452262875877e-05,
"loss": 2.6339,
"step": 388
},
{
"epoch": 0.13665630064386142,
"grad_norm": 0.70540452003479,
"learning_rate": 9.798206203877569e-05,
"loss": 2.6471,
"step": 390
},
{
"epoch": 0.1373571021856248,
"grad_norm": 0.7336652278900146,
"learning_rate": 9.794934371022233e-05,
"loss": 2.6348,
"step": 392
},
{
"epoch": 0.1380579037273882,
"grad_norm": 0.7155029773712158,
"learning_rate": 9.79163678188472e-05,
"loss": 2.6128,
"step": 394
},
{
"epoch": 0.1387587052691516,
"grad_norm": 0.6354189515113831,
"learning_rate": 9.788313454178228e-05,
"loss": 2.6281,
"step": 396
},
{
"epoch": 0.13945950681091498,
"grad_norm": 0.596047043800354,
"learning_rate": 9.78496440575422e-05,
"loss": 2.6719,
"step": 398
},
{
"epoch": 0.14016030835267837,
"grad_norm": 0.6149719953536987,
"learning_rate": 9.781589654602306e-05,
"loss": 2.625,
"step": 400
},
{
"epoch": 0.14086110989444176,
"grad_norm": 0.6066911816596985,
"learning_rate": 9.778189218850174e-05,
"loss": 2.6193,
"step": 402
},
{
"epoch": 0.14156191143620517,
"grad_norm": 0.5690994262695312,
"learning_rate": 9.774763116763466e-05,
"loss": 2.6239,
"step": 404
},
{
"epoch": 0.14226271297796855,
"grad_norm": 0.532486081123352,
"learning_rate": 9.771311366745703e-05,
"loss": 2.6264,
"step": 406
},
{
"epoch": 0.14296351451973194,
"grad_norm": 0.5434598326683044,
"learning_rate": 9.767833987338171e-05,
"loss": 2.6534,
"step": 408
},
{
"epoch": 0.14366431606149532,
"grad_norm": 0.522413432598114,
"learning_rate": 9.764330997219822e-05,
"loss": 2.6468,
"step": 410
},
{
"epoch": 0.14436511760325874,
"grad_norm": 0.5612457990646362,
"learning_rate": 9.760802415207181e-05,
"loss": 2.6307,
"step": 412
},
{
"epoch": 0.14506591914502212,
"grad_norm": 0.5850318670272827,
"learning_rate": 9.757248260254244e-05,
"loss": 2.6324,
"step": 414
},
{
"epoch": 0.1457667206867855,
"grad_norm": 0.688555121421814,
"learning_rate": 9.753668551452368e-05,
"loss": 2.6066,
"step": 416
},
{
"epoch": 0.1464675222285489,
"grad_norm": 0.6506465077400208,
"learning_rate": 9.750063308030179e-05,
"loss": 2.5964,
"step": 418
},
{
"epoch": 0.1471683237703123,
"grad_norm": 0.6529019474983215,
"learning_rate": 9.746432549353462e-05,
"loss": 2.651,
"step": 420
},
{
"epoch": 0.1478691253120757,
"grad_norm": 0.5469995141029358,
"learning_rate": 9.742776294925058e-05,
"loss": 2.6129,
"step": 422
},
{
"epoch": 0.14856992685383907,
"grad_norm": 0.4992043673992157,
"learning_rate": 9.739094564384758e-05,
"loss": 2.6074,
"step": 424
},
{
"epoch": 0.14927072839560246,
"grad_norm": 0.5064156651496887,
"learning_rate": 9.735387377509206e-05,
"loss": 2.6408,
"step": 426
},
{
"epoch": 0.14997152993736587,
"grad_norm": 0.5961376428604126,
"learning_rate": 9.731654754211781e-05,
"loss": 2.615,
"step": 428
},
{
"epoch": 0.15067233147912926,
"grad_norm": 0.5533669590950012,
"learning_rate": 9.727896714542494e-05,
"loss": 2.6225,
"step": 430
},
{
"epoch": 0.15137313302089264,
"grad_norm": 0.5527905821800232,
"learning_rate": 9.724113278687888e-05,
"loss": 2.5836,
"step": 432
},
{
"epoch": 0.15207393456265603,
"grad_norm": 0.4616098701953888,
"learning_rate": 9.720304466970916e-05,
"loss": 2.6236,
"step": 434
},
{
"epoch": 0.15277473610441944,
"grad_norm": 0.5189539790153503,
"learning_rate": 9.716470299850844e-05,
"loss": 2.6364,
"step": 436
},
{
"epoch": 0.15347553764618282,
"grad_norm": 0.5303817987442017,
"learning_rate": 9.712610797923133e-05,
"loss": 2.6097,
"step": 438
},
{
"epoch": 0.1541763391879462,
"grad_norm": 0.5957894921302795,
"learning_rate": 9.708725981919333e-05,
"loss": 2.5749,
"step": 440
},
{
"epoch": 0.1548771407297096,
"grad_norm": 0.5686895251274109,
"learning_rate": 9.704815872706972e-05,
"loss": 2.6319,
"step": 442
},
{
"epoch": 0.155577942271473,
"grad_norm": 0.5570897459983826,
"learning_rate": 9.700880491289438e-05,
"loss": 2.6287,
"step": 444
},
{
"epoch": 0.1562787438132364,
"grad_norm": 0.5330969095230103,
"learning_rate": 9.696919858805873e-05,
"loss": 2.6014,
"step": 446
},
{
"epoch": 0.15697954535499978,
"grad_norm": 0.4891030192375183,
"learning_rate": 9.692933996531053e-05,
"loss": 2.6097,
"step": 448
},
{
"epoch": 0.15768034689676316,
"grad_norm": 0.5465073585510254,
"learning_rate": 9.688922925875285e-05,
"loss": 2.6162,
"step": 450
},
{
"epoch": 0.15838114843852658,
"grad_norm": 0.5483290553092957,
"learning_rate": 9.684886668384277e-05,
"loss": 2.5999,
"step": 452
},
{
"epoch": 0.15908194998028996,
"grad_norm": 0.6061928868293762,
"learning_rate": 9.68082524573903e-05,
"loss": 2.6614,
"step": 454
},
{
"epoch": 0.15978275152205335,
"grad_norm": 0.5806353688240051,
"learning_rate": 9.676738679755726e-05,
"loss": 2.6039,
"step": 456
},
{
"epoch": 0.16048355306381673,
"grad_norm": 0.5722226500511169,
"learning_rate": 9.672626992385602e-05,
"loss": 2.6529,
"step": 458
},
{
"epoch": 0.16118435460558014,
"grad_norm": 0.5939204096794128,
"learning_rate": 9.668490205714839e-05,
"loss": 2.6314,
"step": 460
},
{
"epoch": 0.16188515614734353,
"grad_norm": 0.7260386943817139,
"learning_rate": 9.664328341964436e-05,
"loss": 2.6211,
"step": 462
},
{
"epoch": 0.1625859576891069,
"grad_norm": 0.8503554463386536,
"learning_rate": 9.6601414234901e-05,
"loss": 2.6134,
"step": 464
},
{
"epoch": 0.1632867592308703,
"grad_norm": 0.5818518996238708,
"learning_rate": 9.655929472782116e-05,
"loss": 2.5667,
"step": 466
},
{
"epoch": 0.1639875607726337,
"grad_norm": 0.5678598284721375,
"learning_rate": 9.651692512465239e-05,
"loss": 2.6153,
"step": 468
},
{
"epoch": 0.1646883623143971,
"grad_norm": 0.5939005613327026,
"learning_rate": 9.647430565298555e-05,
"loss": 2.6098,
"step": 470
},
{
"epoch": 0.16538916385616048,
"grad_norm": 0.5300047993659973,
"learning_rate": 9.643143654175373e-05,
"loss": 2.6167,
"step": 472
},
{
"epoch": 0.16608996539792387,
"grad_norm": 0.4946250319480896,
"learning_rate": 9.638831802123101e-05,
"loss": 2.581,
"step": 474
},
{
"epoch": 0.16679076693968728,
"grad_norm": 0.4555206000804901,
"learning_rate": 9.634495032303111e-05,
"loss": 2.588,
"step": 476
},
{
"epoch": 0.16749156848145066,
"grad_norm": 0.5159677267074585,
"learning_rate": 9.630133368010628e-05,
"loss": 2.5868,
"step": 478
},
{
"epoch": 0.16819237002321405,
"grad_norm": 0.5565433502197266,
"learning_rate": 9.625746832674597e-05,
"loss": 2.6185,
"step": 480
},
{
"epoch": 0.16889317156497743,
"grad_norm": 0.4775915741920471,
"learning_rate": 9.621335449857562e-05,
"loss": 2.5897,
"step": 482
},
{
"epoch": 0.16959397310674085,
"grad_norm": 0.5150102376937866,
"learning_rate": 9.616899243255532e-05,
"loss": 2.5478,
"step": 484
},
{
"epoch": 0.17029477464850423,
"grad_norm": 0.48455357551574707,
"learning_rate": 9.612438236697863e-05,
"loss": 2.5639,
"step": 486
},
{
"epoch": 0.17099557619026762,
"grad_norm": 0.5149878859519958,
"learning_rate": 9.607952454147121e-05,
"loss": 2.599,
"step": 488
},
{
"epoch": 0.171696377732031,
"grad_norm": 0.6969982385635376,
"learning_rate": 9.603441919698963e-05,
"loss": 2.5733,
"step": 490
},
{
"epoch": 0.17239717927379442,
"grad_norm": 0.57285475730896,
"learning_rate": 9.598906657582e-05,
"loss": 2.5791,
"step": 492
},
{
"epoch": 0.1730979808155578,
"grad_norm": 0.5704159140586853,
"learning_rate": 9.594346692157667e-05,
"loss": 2.5692,
"step": 494
},
{
"epoch": 0.17379878235732119,
"grad_norm": 0.681797444820404,
"learning_rate": 9.589762047920096e-05,
"loss": 2.5759,
"step": 496
},
{
"epoch": 0.17449958389908457,
"grad_norm": 0.49717003107070923,
"learning_rate": 9.585152749495984e-05,
"loss": 2.5848,
"step": 498
},
{
"epoch": 0.17520038544084798,
"grad_norm": 0.48680582642555237,
"learning_rate": 9.580518821644457e-05,
"loss": 2.5682,
"step": 500
},
{
"epoch": 0.17590118698261137,
"grad_norm": 0.5525830388069153,
"learning_rate": 9.575860289256943e-05,
"loss": 2.5894,
"step": 502
},
{
"epoch": 0.17660198852437475,
"grad_norm": 0.5562606453895569,
"learning_rate": 9.571177177357032e-05,
"loss": 2.5675,
"step": 504
},
{
"epoch": 0.17730279006613814,
"grad_norm": 0.5515877604484558,
"learning_rate": 9.566469511100345e-05,
"loss": 2.5877,
"step": 506
},
{
"epoch": 0.17800359160790155,
"grad_norm": 0.6816357970237732,
"learning_rate": 9.561737315774398e-05,
"loss": 2.596,
"step": 508
},
{
"epoch": 0.17870439314966494,
"grad_norm": 0.507437527179718,
"learning_rate": 9.556980616798463e-05,
"loss": 2.5721,
"step": 510
},
{
"epoch": 0.17940519469142832,
"grad_norm": 0.5275202989578247,
"learning_rate": 9.552199439723443e-05,
"loss": 2.568,
"step": 512
},
{
"epoch": 0.1801059962331917,
"grad_norm": 0.5467104911804199,
"learning_rate": 9.547393810231722e-05,
"loss": 2.5842,
"step": 514
},
{
"epoch": 0.1808067977749551,
"grad_norm": 0.5407027006149292,
"learning_rate": 9.542563754137031e-05,
"loss": 2.5891,
"step": 516
},
{
"epoch": 0.1815075993167185,
"grad_norm": 0.5731847882270813,
"learning_rate": 9.537709297384308e-05,
"loss": 2.6143,
"step": 518
},
{
"epoch": 0.1822084008584819,
"grad_norm": 0.566457986831665,
"learning_rate": 9.532830466049565e-05,
"loss": 2.5522,
"step": 520
},
{
"epoch": 0.18290920240024527,
"grad_norm": 0.4899183213710785,
"learning_rate": 9.527927286339744e-05,
"loss": 2.5961,
"step": 522
},
{
"epoch": 0.18361000394200866,
"grad_norm": 0.4883110523223877,
"learning_rate": 9.52299978459257e-05,
"loss": 2.5557,
"step": 524
},
{
"epoch": 0.18431080548377207,
"grad_norm": 0.5534235239028931,
"learning_rate": 9.518047987276421e-05,
"loss": 2.6452,
"step": 526
},
{
"epoch": 0.18501160702553546,
"grad_norm": 0.47292667627334595,
"learning_rate": 9.513071920990179e-05,
"loss": 2.5848,
"step": 528
},
{
"epoch": 0.18571240856729884,
"grad_norm": 0.5438964366912842,
"learning_rate": 9.508071612463086e-05,
"loss": 2.5332,
"step": 530
},
{
"epoch": 0.18641321010906223,
"grad_norm": 0.5318060517311096,
"learning_rate": 9.503047088554601e-05,
"loss": 2.585,
"step": 532
},
{
"epoch": 0.18711401165082564,
"grad_norm": 0.49279502034187317,
"learning_rate": 9.497998376254267e-05,
"loss": 2.5948,
"step": 534
},
{
"epoch": 0.18781481319258903,
"grad_norm": 0.5161717534065247,
"learning_rate": 9.492925502681545e-05,
"loss": 2.5644,
"step": 536
},
{
"epoch": 0.1885156147343524,
"grad_norm": 0.4586479663848877,
"learning_rate": 9.487828495085684e-05,
"loss": 2.5568,
"step": 538
},
{
"epoch": 0.1892164162761158,
"grad_norm": 0.4390322268009186,
"learning_rate": 9.482707380845573e-05,
"loss": 2.5938,
"step": 540
},
{
"epoch": 0.1899172178178792,
"grad_norm": 0.5253728628158569,
"learning_rate": 9.47756218746959e-05,
"loss": 2.5996,
"step": 542
},
{
"epoch": 0.1906180193596426,
"grad_norm": 0.4567623436450958,
"learning_rate": 9.472392942595454e-05,
"loss": 2.5576,
"step": 544
},
{
"epoch": 0.19131882090140598,
"grad_norm": 0.5091727375984192,
"learning_rate": 9.467199673990077e-05,
"loss": 2.5873,
"step": 546
},
{
"epoch": 0.19201962244316936,
"grad_norm": 0.4959392845630646,
"learning_rate": 9.46198240954942e-05,
"loss": 2.5291,
"step": 548
},
{
"epoch": 0.19272042398493278,
"grad_norm": 0.5150632262229919,
"learning_rate": 9.456741177298336e-05,
"loss": 2.5503,
"step": 550
},
{
"epoch": 0.19342122552669616,
"grad_norm": 0.4603368639945984,
"learning_rate": 9.451476005390422e-05,
"loss": 2.5785,
"step": 552
},
{
"epoch": 0.19412202706845955,
"grad_norm": 0.4441729784011841,
"learning_rate": 9.446186922107873e-05,
"loss": 2.5512,
"step": 554
},
{
"epoch": 0.19482282861022293,
"grad_norm": 0.5432455539703369,
"learning_rate": 9.44087395586132e-05,
"loss": 2.5741,
"step": 556
},
{
"epoch": 0.19552363015198634,
"grad_norm": 0.42969366908073425,
"learning_rate": 9.435537135189687e-05,
"loss": 2.5677,
"step": 558
},
{
"epoch": 0.19622443169374973,
"grad_norm": 0.5706619620323181,
"learning_rate": 9.430176488760027e-05,
"loss": 2.556,
"step": 560
},
{
"epoch": 0.19692523323551311,
"grad_norm": 0.7202513217926025,
"learning_rate": 9.424792045367383e-05,
"loss": 2.5435,
"step": 562
},
{
"epoch": 0.1976260347772765,
"grad_norm": 0.5471363663673401,
"learning_rate": 9.419383833934621e-05,
"loss": 2.572,
"step": 564
},
{
"epoch": 0.1983268363190399,
"grad_norm": 0.654058575630188,
"learning_rate": 9.413951883512275e-05,
"loss": 2.5432,
"step": 566
},
{
"epoch": 0.1990276378608033,
"grad_norm": 0.6124361157417297,
"learning_rate": 9.408496223278403e-05,
"loss": 2.5803,
"step": 568
},
{
"epoch": 0.19972843940256668,
"grad_norm": 0.5291132926940918,
"learning_rate": 9.403016882538408e-05,
"loss": 2.576,
"step": 570
},
{
"epoch": 0.20042924094433007,
"grad_norm": 0.6087374687194824,
"learning_rate": 9.397513890724911e-05,
"loss": 2.5171,
"step": 572
},
{
"epoch": 0.20113004248609348,
"grad_norm": 0.5776922106742859,
"learning_rate": 9.391987277397566e-05,
"loss": 2.6054,
"step": 574
},
{
"epoch": 0.20183084402785686,
"grad_norm": 0.544319748878479,
"learning_rate": 9.38643707224291e-05,
"loss": 2.548,
"step": 576
},
{
"epoch": 0.20253164556962025,
"grad_norm": 0.5210007429122925,
"learning_rate": 9.38086330507421e-05,
"loss": 2.6019,
"step": 578
},
{
"epoch": 0.20323244711138364,
"grad_norm": 0.5160629153251648,
"learning_rate": 9.375266005831297e-05,
"loss": 2.6046,
"step": 580
},
{
"epoch": 0.20393324865314705,
"grad_norm": 0.6452796459197998,
"learning_rate": 9.369645204580403e-05,
"loss": 2.566,
"step": 582
},
{
"epoch": 0.20463405019491043,
"grad_norm": 0.5813329815864563,
"learning_rate": 9.364000931514008e-05,
"loss": 2.5661,
"step": 584
},
{
"epoch": 0.20533485173667382,
"grad_norm": 0.5450593829154968,
"learning_rate": 9.358333216950664e-05,
"loss": 2.5769,
"step": 586
},
{
"epoch": 0.2060356532784372,
"grad_norm": 0.5340794324874878,
"learning_rate": 9.352642091334849e-05,
"loss": 2.5549,
"step": 588
},
{
"epoch": 0.20673645482020062,
"grad_norm": 0.5767348408699036,
"learning_rate": 9.34692758523679e-05,
"loss": 2.5604,
"step": 590
},
{
"epoch": 0.207437256361964,
"grad_norm": 0.6048093438148499,
"learning_rate": 9.341189729352302e-05,
"loss": 2.5929,
"step": 592
},
{
"epoch": 0.20813805790372739,
"grad_norm": 0.4430505335330963,
"learning_rate": 9.33542855450263e-05,
"loss": 2.5563,
"step": 594
},
{
"epoch": 0.20883885944549077,
"grad_norm": 0.49373888969421387,
"learning_rate": 9.329644091634278e-05,
"loss": 2.5517,
"step": 596
},
{
"epoch": 0.20953966098725418,
"grad_norm": 0.5227393507957458,
"learning_rate": 9.323836371818837e-05,
"loss": 2.5286,
"step": 598
},
{
"epoch": 0.21024046252901757,
"grad_norm": 0.497405081987381,
"learning_rate": 9.318005426252832e-05,
"loss": 2.5638,
"step": 600
},
{
"epoch": 0.21094126407078095,
"grad_norm": 0.48721396923065186,
"learning_rate": 9.312151286257537e-05,
"loss": 2.5751,
"step": 602
},
{
"epoch": 0.21164206561254434,
"grad_norm": 0.4621741771697998,
"learning_rate": 9.306273983278825e-05,
"loss": 2.5654,
"step": 604
},
{
"epoch": 0.21234286715430775,
"grad_norm": 0.4756307005882263,
"learning_rate": 9.300373548886987e-05,
"loss": 2.5989,
"step": 606
},
{
"epoch": 0.21304366869607114,
"grad_norm": 0.42497771978378296,
"learning_rate": 9.294450014776566e-05,
"loss": 2.564,
"step": 608
},
{
"epoch": 0.21374447023783452,
"grad_norm": 0.5173219442367554,
"learning_rate": 9.288503412766185e-05,
"loss": 2.5296,
"step": 610
},
{
"epoch": 0.2144452717795979,
"grad_norm": 0.4622451066970825,
"learning_rate": 9.28253377479838e-05,
"loss": 2.5829,
"step": 612
},
{
"epoch": 0.21514607332136132,
"grad_norm": 0.5879294276237488,
"learning_rate": 9.276541132939428e-05,
"loss": 2.5462,
"step": 614
},
{
"epoch": 0.2158468748631247,
"grad_norm": 0.6237635612487793,
"learning_rate": 9.270525519379165e-05,
"loss": 2.6143,
"step": 616
},
{
"epoch": 0.2165476764048881,
"grad_norm": 0.5845280289649963,
"learning_rate": 9.264486966430829e-05,
"loss": 2.5272,
"step": 618
},
{
"epoch": 0.21724847794665147,
"grad_norm": 0.5140432715415955,
"learning_rate": 9.258425506530872e-05,
"loss": 2.5716,
"step": 620
},
{
"epoch": 0.2179492794884149,
"grad_norm": 0.5868300199508667,
"learning_rate": 9.2523411722388e-05,
"loss": 2.5699,
"step": 622
},
{
"epoch": 0.21865008103017827,
"grad_norm": 0.587374210357666,
"learning_rate": 9.246233996236983e-05,
"loss": 2.5335,
"step": 624
},
{
"epoch": 0.21935088257194166,
"grad_norm": 0.5000743865966797,
"learning_rate": 9.240104011330489e-05,
"loss": 2.5367,
"step": 626
},
{
"epoch": 0.22005168411370504,
"grad_norm": 0.5124289393424988,
"learning_rate": 9.233951250446902e-05,
"loss": 2.5598,
"step": 628
},
{
"epoch": 0.22075248565546846,
"grad_norm": 0.4815032482147217,
"learning_rate": 9.227775746636158e-05,
"loss": 2.5468,
"step": 630
},
{
"epoch": 0.22145328719723184,
"grad_norm": 0.5089353919029236,
"learning_rate": 9.22157753307035e-05,
"loss": 2.5482,
"step": 632
},
{
"epoch": 0.22215408873899523,
"grad_norm": 0.468841552734375,
"learning_rate": 9.215356643043559e-05,
"loss": 2.5138,
"step": 634
},
{
"epoch": 0.2228548902807586,
"grad_norm": 0.511968731880188,
"learning_rate": 9.209113109971676e-05,
"loss": 2.5481,
"step": 636
},
{
"epoch": 0.223555691822522,
"grad_norm": 0.6082082390785217,
"learning_rate": 9.202846967392217e-05,
"loss": 2.5459,
"step": 638
},
{
"epoch": 0.2242564933642854,
"grad_norm": 0.4931623637676239,
"learning_rate": 9.196558248964151e-05,
"loss": 2.5785,
"step": 640
},
{
"epoch": 0.2249572949060488,
"grad_norm": 0.5754916071891785,
"learning_rate": 9.190246988467712e-05,
"loss": 2.5166,
"step": 642
},
{
"epoch": 0.22565809644781218,
"grad_norm": 0.5335285067558289,
"learning_rate": 9.183913219804221e-05,
"loss": 2.4976,
"step": 644
},
{
"epoch": 0.22635889798957556,
"grad_norm": 0.4676333963871002,
"learning_rate": 9.1775569769959e-05,
"loss": 2.5361,
"step": 646
},
{
"epoch": 0.22705969953133898,
"grad_norm": 0.48826783895492554,
"learning_rate": 9.171178294185697e-05,
"loss": 2.5347,
"step": 648
},
{
"epoch": 0.22776050107310236,
"grad_norm": 0.509066104888916,
"learning_rate": 9.164777205637094e-05,
"loss": 2.5326,
"step": 650
},
{
"epoch": 0.22846130261486575,
"grad_norm": 0.5001896619796753,
"learning_rate": 9.158353745733927e-05,
"loss": 2.5605,
"step": 652
},
{
"epoch": 0.22916210415662913,
"grad_norm": 0.5497420430183411,
"learning_rate": 9.151907948980206e-05,
"loss": 2.5295,
"step": 654
},
{
"epoch": 0.22986290569839254,
"grad_norm": 0.43462875485420227,
"learning_rate": 9.145439849999919e-05,
"loss": 2.5358,
"step": 656
},
{
"epoch": 0.23056370724015593,
"grad_norm": 0.5398270487785339,
"learning_rate": 9.138949483536852e-05,
"loss": 2.5464,
"step": 658
},
{
"epoch": 0.23126450878191931,
"grad_norm": 0.5165109038352966,
"learning_rate": 9.132436884454408e-05,
"loss": 2.5043,
"step": 660
},
{
"epoch": 0.2319653103236827,
"grad_norm": 0.6717212200164795,
"learning_rate": 9.125902087735407e-05,
"loss": 2.547,
"step": 662
},
{
"epoch": 0.2326661118654461,
"grad_norm": 0.4584912061691284,
"learning_rate": 9.119345128481909e-05,
"loss": 2.5106,
"step": 664
},
{
"epoch": 0.2333669134072095,
"grad_norm": 0.5452204942703247,
"learning_rate": 9.112766041915019e-05,
"loss": 2.5189,
"step": 666
},
{
"epoch": 0.23406771494897288,
"grad_norm": 0.5055968761444092,
"learning_rate": 9.106164863374702e-05,
"loss": 2.4957,
"step": 668
},
{
"epoch": 0.23476851649073627,
"grad_norm": 0.4905461072921753,
"learning_rate": 9.099541628319592e-05,
"loss": 2.5523,
"step": 670
},
{
"epoch": 0.23546931803249968,
"grad_norm": 0.44840848445892334,
"learning_rate": 9.092896372326798e-05,
"loss": 2.4713,
"step": 672
},
{
"epoch": 0.23617011957426307,
"grad_norm": 0.46489134430885315,
"learning_rate": 9.086229131091717e-05,
"loss": 2.5071,
"step": 674
},
{
"epoch": 0.23687092111602645,
"grad_norm": 0.4460737705230713,
"learning_rate": 9.079539940427845e-05,
"loss": 2.5799,
"step": 676
},
{
"epoch": 0.23757172265778984,
"grad_norm": 0.5268511176109314,
"learning_rate": 9.072828836266574e-05,
"loss": 2.5574,
"step": 678
},
{
"epoch": 0.23827252419955325,
"grad_norm": 0.5001477003097534,
"learning_rate": 9.066095854657011e-05,
"loss": 2.5117,
"step": 680
},
{
"epoch": 0.23897332574131663,
"grad_norm": 0.5136899352073669,
"learning_rate": 9.059341031765773e-05,
"loss": 2.4855,
"step": 682
},
{
"epoch": 0.23967412728308002,
"grad_norm": 0.5532418489456177,
"learning_rate": 9.052564403876808e-05,
"loss": 2.5623,
"step": 684
},
{
"epoch": 0.2403749288248434,
"grad_norm": 0.4908037483692169,
"learning_rate": 9.045766007391185e-05,
"loss": 2.5248,
"step": 686
},
{
"epoch": 0.24107573036660682,
"grad_norm": 0.45994317531585693,
"learning_rate": 9.038945878826903e-05,
"loss": 2.5007,
"step": 688
},
{
"epoch": 0.2417765319083702,
"grad_norm": 0.5593565702438354,
"learning_rate": 9.032104054818698e-05,
"loss": 2.5759,
"step": 690
},
{
"epoch": 0.2424773334501336,
"grad_norm": 0.5076695084571838,
"learning_rate": 9.025240572117846e-05,
"loss": 2.5272,
"step": 692
},
{
"epoch": 0.24317813499189697,
"grad_norm": 0.3996141850948334,
"learning_rate": 9.018355467591962e-05,
"loss": 2.5317,
"step": 694
},
{
"epoch": 0.24387893653366038,
"grad_norm": 0.49347859621047974,
"learning_rate": 9.011448778224802e-05,
"loss": 2.5186,
"step": 696
},
{
"epoch": 0.24457973807542377,
"grad_norm": 0.5040503144264221,
"learning_rate": 9.004520541116075e-05,
"loss": 2.5015,
"step": 698
},
{
"epoch": 0.24528053961718715,
"grad_norm": 0.4658913016319275,
"learning_rate": 8.997570793481223e-05,
"loss": 2.5481,
"step": 700
},
{
"epoch": 0.24598134115895054,
"grad_norm": 0.47850051522254944,
"learning_rate": 8.990599572651242e-05,
"loss": 2.5505,
"step": 702
},
{
"epoch": 0.24668214270071395,
"grad_norm": 0.48090964555740356,
"learning_rate": 8.983606916072469e-05,
"loss": 2.5669,
"step": 704
},
{
"epoch": 0.24738294424247734,
"grad_norm": 0.5716775059700012,
"learning_rate": 8.976592861306384e-05,
"loss": 2.523,
"step": 706
},
{
"epoch": 0.24808374578424072,
"grad_norm": 0.49985334277153015,
"learning_rate": 8.969557446029409e-05,
"loss": 2.5439,
"step": 708
},
{
"epoch": 0.2487845473260041,
"grad_norm": 0.6331408023834229,
"learning_rate": 8.962500708032708e-05,
"loss": 2.5601,
"step": 710
},
{
"epoch": 0.24948534886776752,
"grad_norm": 0.5418590307235718,
"learning_rate": 8.955422685221979e-05,
"loss": 2.5495,
"step": 712
},
{
"epoch": 0.2501861504095309,
"grad_norm": 0.5396260619163513,
"learning_rate": 8.948323415617253e-05,
"loss": 2.5151,
"step": 714
},
{
"epoch": 0.2508869519512943,
"grad_norm": 0.5641499161720276,
"learning_rate": 8.941202937352686e-05,
"loss": 2.4895,
"step": 716
},
{
"epoch": 0.2515877534930577,
"grad_norm": 0.47651517391204834,
"learning_rate": 8.934061288676365e-05,
"loss": 2.5634,
"step": 718
},
{
"epoch": 0.25228855503482106,
"grad_norm": 0.5351449251174927,
"learning_rate": 8.92689850795009e-05,
"loss": 2.4804,
"step": 720
},
{
"epoch": 0.25298935657658445,
"grad_norm": 0.5856335759162903,
"learning_rate": 8.919714633649172e-05,
"loss": 2.5304,
"step": 722
},
{
"epoch": 0.2536901581183479,
"grad_norm": 0.4513723850250244,
"learning_rate": 8.912509704362232e-05,
"loss": 2.5369,
"step": 724
},
{
"epoch": 0.25439095966011127,
"grad_norm": 0.4676707983016968,
"learning_rate": 8.905283758790985e-05,
"loss": 2.5589,
"step": 726
},
{
"epoch": 0.25509176120187466,
"grad_norm": 0.5069173574447632,
"learning_rate": 8.89803683575004e-05,
"loss": 2.4958,
"step": 728
},
{
"epoch": 0.25579256274363804,
"grad_norm": 0.4774676263332367,
"learning_rate": 8.890768974166685e-05,
"loss": 2.5229,
"step": 730
},
{
"epoch": 0.2564933642854014,
"grad_norm": 0.548409104347229,
"learning_rate": 8.883480213080681e-05,
"loss": 2.4815,
"step": 732
},
{
"epoch": 0.2571941658271648,
"grad_norm": 0.4854792356491089,
"learning_rate": 8.876170591644054e-05,
"loss": 2.5118,
"step": 734
},
{
"epoch": 0.2578949673689282,
"grad_norm": 0.4988788664340973,
"learning_rate": 8.868840149120876e-05,
"loss": 2.5073,
"step": 736
},
{
"epoch": 0.2585957689106916,
"grad_norm": 0.4614211618900299,
"learning_rate": 8.861488924887071e-05,
"loss": 2.4866,
"step": 738
},
{
"epoch": 0.259296570452455,
"grad_norm": 0.4878149926662445,
"learning_rate": 8.854116958430185e-05,
"loss": 2.5315,
"step": 740
},
{
"epoch": 0.2599973719942184,
"grad_norm": 0.47185149788856506,
"learning_rate": 8.846724289349189e-05,
"loss": 2.4766,
"step": 742
},
{
"epoch": 0.2606981735359818,
"grad_norm": 0.446411669254303,
"learning_rate": 8.839310957354249e-05,
"loss": 2.5278,
"step": 744
},
{
"epoch": 0.2613989750777452,
"grad_norm": 0.45869573950767517,
"learning_rate": 8.831877002266536e-05,
"loss": 2.5051,
"step": 746
},
{
"epoch": 0.26209977661950856,
"grad_norm": 0.4578917920589447,
"learning_rate": 8.82442246401799e-05,
"loss": 2.4903,
"step": 748
},
{
"epoch": 0.26280057816127195,
"grad_norm": 0.4389136731624603,
"learning_rate": 8.816947382651116e-05,
"loss": 2.519,
"step": 750
},
{
"epoch": 0.26350137970303533,
"grad_norm": 0.4686265289783478,
"learning_rate": 8.80945179831877e-05,
"loss": 2.5537,
"step": 752
},
{
"epoch": 0.2642021812447987,
"grad_norm": 0.49357905983924866,
"learning_rate": 8.801935751283944e-05,
"loss": 2.4971,
"step": 754
},
{
"epoch": 0.26490298278656216,
"grad_norm": 0.5659007430076599,
"learning_rate": 8.794399281919537e-05,
"loss": 2.5291,
"step": 756
},
{
"epoch": 0.26560378432832554,
"grad_norm": 0.5637578964233398,
"learning_rate": 8.786842430708157e-05,
"loss": 2.5335,
"step": 758
},
{
"epoch": 0.26630458587008893,
"grad_norm": 0.47859886288642883,
"learning_rate": 8.779265238241888e-05,
"loss": 2.5104,
"step": 760
},
{
"epoch": 0.2670053874118523,
"grad_norm": 0.5444939732551575,
"learning_rate": 8.771667745222082e-05,
"loss": 2.4823,
"step": 762
},
{
"epoch": 0.2677061889536157,
"grad_norm": 0.5456621050834656,
"learning_rate": 8.76404999245914e-05,
"loss": 2.5027,
"step": 764
},
{
"epoch": 0.2684069904953791,
"grad_norm": 0.5168180465698242,
"learning_rate": 8.75641202087228e-05,
"loss": 2.5562,
"step": 766
},
{
"epoch": 0.26910779203714247,
"grad_norm": 0.5675712823867798,
"learning_rate": 8.748753871489333e-05,
"loss": 2.5195,
"step": 768
},
{
"epoch": 0.26980859357890585,
"grad_norm": 0.4084811806678772,
"learning_rate": 8.741075585446514e-05,
"loss": 2.4853,
"step": 770
},
{
"epoch": 0.27050939512066924,
"grad_norm": 0.4109669327735901,
"learning_rate": 8.733377203988208e-05,
"loss": 2.5186,
"step": 772
},
{
"epoch": 0.2712101966624327,
"grad_norm": 0.5689636468887329,
"learning_rate": 8.725658768466738e-05,
"loss": 2.5106,
"step": 774
},
{
"epoch": 0.27191099820419606,
"grad_norm": 0.4750414192676544,
"learning_rate": 8.71792032034215e-05,
"loss": 2.4927,
"step": 776
},
{
"epoch": 0.27261179974595945,
"grad_norm": 0.4577466547489166,
"learning_rate": 8.710161901181993e-05,
"loss": 2.5005,
"step": 778
},
{
"epoch": 0.27331260128772283,
"grad_norm": 0.4786745011806488,
"learning_rate": 8.702383552661081e-05,
"loss": 2.5099,
"step": 780
},
{
"epoch": 0.2740134028294862,
"grad_norm": 0.508456289768219,
"learning_rate": 8.694585316561296e-05,
"loss": 2.5377,
"step": 782
},
{
"epoch": 0.2747142043712496,
"grad_norm": 0.49584171175956726,
"learning_rate": 8.686767234771333e-05,
"loss": 2.5208,
"step": 784
},
{
"epoch": 0.275415005913013,
"grad_norm": 0.4523308575153351,
"learning_rate": 8.678929349286498e-05,
"loss": 2.5663,
"step": 786
},
{
"epoch": 0.2761158074547764,
"grad_norm": 0.411276638507843,
"learning_rate": 8.671071702208467e-05,
"loss": 2.5076,
"step": 788
},
{
"epoch": 0.2768166089965398,
"grad_norm": 0.47366130352020264,
"learning_rate": 8.663194335745071e-05,
"loss": 2.4725,
"step": 790
},
{
"epoch": 0.2775174105383032,
"grad_norm": 0.44845113158226013,
"learning_rate": 8.655297292210067e-05,
"loss": 2.5204,
"step": 792
},
{
"epoch": 0.2782182120800666,
"grad_norm": 0.4630947709083557,
"learning_rate": 8.647380614022902e-05,
"loss": 2.4848,
"step": 794
},
{
"epoch": 0.27891901362182997,
"grad_norm": 0.4739050567150116,
"learning_rate": 8.639444343708496e-05,
"loss": 2.4975,
"step": 796
},
{
"epoch": 0.27961981516359335,
"grad_norm": 0.41872844099998474,
"learning_rate": 8.631488523897011e-05,
"loss": 2.5105,
"step": 798
},
{
"epoch": 0.28032061670535674,
"grad_norm": 0.5174891948699951,
"learning_rate": 8.623513197323615e-05,
"loss": 2.4428,
"step": 800
},
{
"epoch": 0.2810214182471201,
"grad_norm": 0.4543634057044983,
"learning_rate": 8.615518406828262e-05,
"loss": 2.5248,
"step": 802
},
{
"epoch": 0.2817222197888835,
"grad_norm": 0.433250367641449,
"learning_rate": 8.607504195355458e-05,
"loss": 2.4887,
"step": 804
},
{
"epoch": 0.28242302133064695,
"grad_norm": 0.47642698884010315,
"learning_rate": 8.599470605954025e-05,
"loss": 2.5391,
"step": 806
},
{
"epoch": 0.28312382287241034,
"grad_norm": 0.45496654510498047,
"learning_rate": 8.59141768177688e-05,
"loss": 2.5444,
"step": 808
},
{
"epoch": 0.2838246244141737,
"grad_norm": 0.4619695544242859,
"learning_rate": 8.583345466080796e-05,
"loss": 2.504,
"step": 810
},
{
"epoch": 0.2845254259559371,
"grad_norm": 0.4610481262207031,
"learning_rate": 8.575254002226173e-05,
"loss": 2.4904,
"step": 812
},
{
"epoch": 0.2852262274977005,
"grad_norm": 0.4597660005092621,
"learning_rate": 8.5671433336768e-05,
"loss": 2.4923,
"step": 814
},
{
"epoch": 0.2859270290394639,
"grad_norm": 0.5440905094146729,
"learning_rate": 8.559013503999626e-05,
"loss": 2.4806,
"step": 816
},
{
"epoch": 0.28662783058122726,
"grad_norm": 0.4667718708515167,
"learning_rate": 8.550864556864529e-05,
"loss": 2.5595,
"step": 818
},
{
"epoch": 0.28732863212299065,
"grad_norm": 0.47145599126815796,
"learning_rate": 8.542696536044075e-05,
"loss": 2.4813,
"step": 820
},
{
"epoch": 0.2880294336647541,
"grad_norm": 0.4581964313983917,
"learning_rate": 8.534509485413284e-05,
"loss": 2.5467,
"step": 822
},
{
"epoch": 0.28873023520651747,
"grad_norm": 0.5127134919166565,
"learning_rate": 8.5263034489494e-05,
"loss": 2.5067,
"step": 824
},
{
"epoch": 0.28943103674828086,
"grad_norm": 0.5416949391365051,
"learning_rate": 8.518078470731644e-05,
"loss": 2.4669,
"step": 826
},
{
"epoch": 0.29013183829004424,
"grad_norm": 0.442828506231308,
"learning_rate": 8.509834594940991e-05,
"loss": 2.4708,
"step": 828
},
{
"epoch": 0.2908326398318076,
"grad_norm": 0.4708557426929474,
"learning_rate": 8.501571865859924e-05,
"loss": 2.5192,
"step": 830
},
{
"epoch": 0.291533441373571,
"grad_norm": 0.4371870458126068,
"learning_rate": 8.49329032787219e-05,
"loss": 2.4778,
"step": 832
},
{
"epoch": 0.2922342429153344,
"grad_norm": 0.48408806324005127,
"learning_rate": 8.48499002546258e-05,
"loss": 2.4868,
"step": 834
},
{
"epoch": 0.2929350444570978,
"grad_norm": 0.45126622915267944,
"learning_rate": 8.47667100321667e-05,
"loss": 2.4999,
"step": 836
},
{
"epoch": 0.2936358459988612,
"grad_norm": 0.4448654353618622,
"learning_rate": 8.468333305820599e-05,
"loss": 2.4848,
"step": 838
},
{
"epoch": 0.2943366475406246,
"grad_norm": 0.47776126861572266,
"learning_rate": 8.459976978060815e-05,
"loss": 2.5515,
"step": 840
},
{
"epoch": 0.295037449082388,
"grad_norm": 0.4572128653526306,
"learning_rate": 8.45160206482384e-05,
"loss": 2.5172,
"step": 842
},
{
"epoch": 0.2957382506241514,
"grad_norm": 0.4419424831867218,
"learning_rate": 8.443208611096036e-05,
"loss": 2.5035,
"step": 844
},
{
"epoch": 0.29643905216591476,
"grad_norm": 0.42213693261146545,
"learning_rate": 8.434796661963344e-05,
"loss": 2.542,
"step": 846
},
{
"epoch": 0.29713985370767815,
"grad_norm": 0.446344792842865,
"learning_rate": 8.426366262611067e-05,
"loss": 2.5119,
"step": 848
},
{
"epoch": 0.29784065524944153,
"grad_norm": 0.44233253598213196,
"learning_rate": 8.417917458323607e-05,
"loss": 2.4985,
"step": 850
},
{
"epoch": 0.2985414567912049,
"grad_norm": 0.492471843957901,
"learning_rate": 8.40945029448423e-05,
"loss": 2.4553,
"step": 852
},
{
"epoch": 0.29924225833296836,
"grad_norm": 0.4490063488483429,
"learning_rate": 8.400964816574826e-05,
"loss": 2.5389,
"step": 854
},
{
"epoch": 0.29994305987473174,
"grad_norm": 0.5494585633277893,
"learning_rate": 8.392461070175652e-05,
"loss": 2.5163,
"step": 856
},
{
"epoch": 0.30064386141649513,
"grad_norm": 0.4822872281074524,
"learning_rate": 8.383939100965103e-05,
"loss": 2.504,
"step": 858
},
{
"epoch": 0.3013446629582585,
"grad_norm": 0.5434439778327942,
"learning_rate": 8.375398954719456e-05,
"loss": 2.4841,
"step": 860
},
{
"epoch": 0.3020454645000219,
"grad_norm": 0.5055859088897705,
"learning_rate": 8.366840677312626e-05,
"loss": 2.4985,
"step": 862
},
{
"epoch": 0.3027462660417853,
"grad_norm": 0.44319674372673035,
"learning_rate": 8.358264314715923e-05,
"loss": 2.4661,
"step": 864
},
{
"epoch": 0.30344706758354867,
"grad_norm": 0.5121539235115051,
"learning_rate": 8.349669912997799e-05,
"loss": 2.4797,
"step": 866
},
{
"epoch": 0.30414786912531205,
"grad_norm": 0.4748767912387848,
"learning_rate": 8.341057518323607e-05,
"loss": 2.5009,
"step": 868
},
{
"epoch": 0.3048486706670755,
"grad_norm": 0.4823194742202759,
"learning_rate": 8.332427176955353e-05,
"loss": 2.4798,
"step": 870
},
{
"epoch": 0.3055494722088389,
"grad_norm": 0.4242302477359772,
"learning_rate": 8.323778935251437e-05,
"loss": 2.4764,
"step": 872
},
{
"epoch": 0.30625027375060226,
"grad_norm": 0.46324998140335083,
"learning_rate": 8.31511283966642e-05,
"loss": 2.509,
"step": 874
},
{
"epoch": 0.30695107529236565,
"grad_norm": 0.4894976317882538,
"learning_rate": 8.30642893675076e-05,
"loss": 2.498,
"step": 876
},
{
"epoch": 0.30765187683412903,
"grad_norm": 0.4574197232723236,
"learning_rate": 8.297727273150573e-05,
"loss": 2.48,
"step": 878
},
{
"epoch": 0.3083526783758924,
"grad_norm": 0.44225645065307617,
"learning_rate": 8.289007895607375e-05,
"loss": 2.502,
"step": 880
},
{
"epoch": 0.3090534799176558,
"grad_norm": 0.47749781608581543,
"learning_rate": 8.28027085095783e-05,
"loss": 2.5043,
"step": 882
},
{
"epoch": 0.3097542814594192,
"grad_norm": 0.4569682478904724,
"learning_rate": 8.271516186133511e-05,
"loss": 2.4454,
"step": 884
},
{
"epoch": 0.31045508300118263,
"grad_norm": 0.4561903178691864,
"learning_rate": 8.262743948160632e-05,
"loss": 2.4826,
"step": 886
},
{
"epoch": 0.311155884542946,
"grad_norm": 0.4749627411365509,
"learning_rate": 8.253954184159803e-05,
"loss": 2.4707,
"step": 888
},
{
"epoch": 0.3118566860847094,
"grad_norm": 0.4455653131008148,
"learning_rate": 8.245146941345774e-05,
"loss": 2.4647,
"step": 890
},
{
"epoch": 0.3125574876264728,
"grad_norm": 0.4758734405040741,
"learning_rate": 8.236322267027193e-05,
"loss": 2.4885,
"step": 892
},
{
"epoch": 0.31325828916823617,
"grad_norm": 0.45016252994537354,
"learning_rate": 8.227480208606333e-05,
"loss": 2.4993,
"step": 894
},
{
"epoch": 0.31395909070999956,
"grad_norm": 0.48177486658096313,
"learning_rate": 8.218620813578847e-05,
"loss": 2.4838,
"step": 896
},
{
"epoch": 0.31465989225176294,
"grad_norm": 0.4863053858280182,
"learning_rate": 8.209744129533519e-05,
"loss": 2.5381,
"step": 898
},
{
"epoch": 0.3153606937935263,
"grad_norm": 0.49010857939720154,
"learning_rate": 8.200850204151995e-05,
"loss": 2.5721,
"step": 900
}
],
"logging_steps": 2,
"max_steps": 2854,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0132651008589824e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}