tvkain's picture
Add files using upload-large-folder tool
9fd5c4b verified
raw
history blame
79 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.28760885196133257,
"eval_steps": 500,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00031956539106814733,
"grad_norm": 5.807275295257568,
"learning_rate": 0.0,
"loss": 5.0454,
"step": 1
},
{
"epoch": 0.0006391307821362947,
"grad_norm": 6.20149564743042,
"learning_rate": 6.369426751592357e-07,
"loss": 5.1424,
"step": 2
},
{
"epoch": 0.0012782615642725893,
"grad_norm": 5.7567291259765625,
"learning_rate": 1.910828025477707e-06,
"loss": 5.0835,
"step": 4
},
{
"epoch": 0.001917392346408884,
"grad_norm": 5.017141819000244,
"learning_rate": 3.1847133757961785e-06,
"loss": 5.0733,
"step": 6
},
{
"epoch": 0.0025565231285451786,
"grad_norm": 3.2059810161590576,
"learning_rate": 4.45859872611465e-06,
"loss": 5.0714,
"step": 8
},
{
"epoch": 0.003195653910681473,
"grad_norm": 6.303244113922119,
"learning_rate": 5.732484076433121e-06,
"loss": 4.9915,
"step": 10
},
{
"epoch": 0.003834784692817768,
"grad_norm": 4.852840423583984,
"learning_rate": 7.006369426751593e-06,
"loss": 4.9307,
"step": 12
},
{
"epoch": 0.004473915474954062,
"grad_norm": 3.78067946434021,
"learning_rate": 8.280254777070064e-06,
"loss": 4.8924,
"step": 14
},
{
"epoch": 0.005113046257090357,
"grad_norm": 3.5641331672668457,
"learning_rate": 9.554140127388536e-06,
"loss": 4.8244,
"step": 16
},
{
"epoch": 0.005752177039226652,
"grad_norm": 2.191957712173462,
"learning_rate": 1.0828025477707008e-05,
"loss": 4.6179,
"step": 18
},
{
"epoch": 0.006391307821362946,
"grad_norm": 2.0675458908081055,
"learning_rate": 1.2101910828025478e-05,
"loss": 4.5827,
"step": 20
},
{
"epoch": 0.007030438603499241,
"grad_norm": 1.6559146642684937,
"learning_rate": 1.337579617834395e-05,
"loss": 4.4544,
"step": 22
},
{
"epoch": 0.007669569385635536,
"grad_norm": 1.3731284141540527,
"learning_rate": 1.464968152866242e-05,
"loss": 4.3748,
"step": 24
},
{
"epoch": 0.00830870016777183,
"grad_norm": 1.3962030410766602,
"learning_rate": 1.592356687898089e-05,
"loss": 4.3269,
"step": 26
},
{
"epoch": 0.008947830949908125,
"grad_norm": 1.2659896612167358,
"learning_rate": 1.7197452229299362e-05,
"loss": 4.2133,
"step": 28
},
{
"epoch": 0.00958696173204442,
"grad_norm": 0.9881806373596191,
"learning_rate": 1.8471337579617834e-05,
"loss": 4.0961,
"step": 30
},
{
"epoch": 0.010226092514180714,
"grad_norm": 0.9945515394210815,
"learning_rate": 1.974522292993631e-05,
"loss": 4.0158,
"step": 32
},
{
"epoch": 0.01086522329631701,
"grad_norm": 0.9396588802337646,
"learning_rate": 2.1019108280254778e-05,
"loss": 3.8763,
"step": 34
},
{
"epoch": 0.011504354078453304,
"grad_norm": 1.0665779113769531,
"learning_rate": 2.229299363057325e-05,
"loss": 3.8635,
"step": 36
},
{
"epoch": 0.012143484860589597,
"grad_norm": 1.077245831489563,
"learning_rate": 2.356687898089172e-05,
"loss": 3.802,
"step": 38
},
{
"epoch": 0.012782615642725892,
"grad_norm": 0.8040191531181335,
"learning_rate": 2.4840764331210193e-05,
"loss": 3.7284,
"step": 40
},
{
"epoch": 0.013421746424862187,
"grad_norm": 1.4325759410858154,
"learning_rate": 2.6114649681528662e-05,
"loss": 3.665,
"step": 42
},
{
"epoch": 0.014060877206998482,
"grad_norm": 1.3450332880020142,
"learning_rate": 2.7388535031847134e-05,
"loss": 3.6242,
"step": 44
},
{
"epoch": 0.014700007989134777,
"grad_norm": 0.8203895688056946,
"learning_rate": 2.8662420382165606e-05,
"loss": 3.5576,
"step": 46
},
{
"epoch": 0.015339138771271072,
"grad_norm": 1.1661335229873657,
"learning_rate": 2.9936305732484078e-05,
"loss": 3.522,
"step": 48
},
{
"epoch": 0.015978269553407365,
"grad_norm": 1.0148671865463257,
"learning_rate": 3.121019108280255e-05,
"loss": 3.4594,
"step": 50
},
{
"epoch": 0.01661740033554366,
"grad_norm": 0.6624857187271118,
"learning_rate": 3.248407643312102e-05,
"loss": 3.465,
"step": 52
},
{
"epoch": 0.017256531117679955,
"grad_norm": 0.943125307559967,
"learning_rate": 3.375796178343949e-05,
"loss": 3.4021,
"step": 54
},
{
"epoch": 0.01789566189981625,
"grad_norm": 0.9854550957679749,
"learning_rate": 3.503184713375796e-05,
"loss": 3.3361,
"step": 56
},
{
"epoch": 0.018534792681952544,
"grad_norm": 1.2242411375045776,
"learning_rate": 3.630573248407643e-05,
"loss": 3.3283,
"step": 58
},
{
"epoch": 0.01917392346408884,
"grad_norm": 0.9556372761726379,
"learning_rate": 3.7579617834394906e-05,
"loss": 3.2914,
"step": 60
},
{
"epoch": 0.019813054246225134,
"grad_norm": 1.3133809566497803,
"learning_rate": 3.885350318471338e-05,
"loss": 3.3126,
"step": 62
},
{
"epoch": 0.02045218502836143,
"grad_norm": 0.9322234392166138,
"learning_rate": 4.012738853503185e-05,
"loss": 3.2443,
"step": 64
},
{
"epoch": 0.021091315810497724,
"grad_norm": 1.4383481740951538,
"learning_rate": 4.1401273885350325e-05,
"loss": 3.2428,
"step": 66
},
{
"epoch": 0.02173044659263402,
"grad_norm": 1.0156841278076172,
"learning_rate": 4.267515923566879e-05,
"loss": 3.1735,
"step": 68
},
{
"epoch": 0.022369577374770314,
"grad_norm": 1.1754450798034668,
"learning_rate": 4.394904458598726e-05,
"loss": 3.1788,
"step": 70
},
{
"epoch": 0.02300870815690661,
"grad_norm": 1.0960084199905396,
"learning_rate": 4.522292993630574e-05,
"loss": 3.1963,
"step": 72
},
{
"epoch": 0.023647838939042903,
"grad_norm": 1.054401159286499,
"learning_rate": 4.6496815286624206e-05,
"loss": 3.1604,
"step": 74
},
{
"epoch": 0.024286969721179195,
"grad_norm": 1.1957581043243408,
"learning_rate": 4.777070063694268e-05,
"loss": 3.1648,
"step": 76
},
{
"epoch": 0.02492610050331549,
"grad_norm": 0.7756203413009644,
"learning_rate": 4.904458598726115e-05,
"loss": 3.1066,
"step": 78
},
{
"epoch": 0.025565231285451784,
"grad_norm": 1.0459190607070923,
"learning_rate": 5.031847133757962e-05,
"loss": 3.1571,
"step": 80
},
{
"epoch": 0.02620436206758808,
"grad_norm": 0.9746761322021484,
"learning_rate": 5.159235668789809e-05,
"loss": 3.1026,
"step": 82
},
{
"epoch": 0.026843492849724374,
"grad_norm": 1.0770882368087769,
"learning_rate": 5.286624203821656e-05,
"loss": 3.1125,
"step": 84
},
{
"epoch": 0.02748262363186067,
"grad_norm": 0.9542138576507568,
"learning_rate": 5.414012738853504e-05,
"loss": 3.059,
"step": 86
},
{
"epoch": 0.028121754413996964,
"grad_norm": 1.3454134464263916,
"learning_rate": 5.5414012738853505e-05,
"loss": 3.0645,
"step": 88
},
{
"epoch": 0.02876088519613326,
"grad_norm": 1.0354089736938477,
"learning_rate": 5.6687898089171974e-05,
"loss": 3.04,
"step": 90
},
{
"epoch": 0.029400015978269554,
"grad_norm": 1.1339548826217651,
"learning_rate": 5.796178343949045e-05,
"loss": 3.0625,
"step": 92
},
{
"epoch": 0.03003914676040585,
"grad_norm": 1.200062870979309,
"learning_rate": 5.923566878980892e-05,
"loss": 3.057,
"step": 94
},
{
"epoch": 0.030678277542542143,
"grad_norm": 1.395698070526123,
"learning_rate": 6.0509554140127386e-05,
"loss": 3.0341,
"step": 96
},
{
"epoch": 0.031317408324678435,
"grad_norm": 0.9392653703689575,
"learning_rate": 6.178343949044585e-05,
"loss": 3.0087,
"step": 98
},
{
"epoch": 0.03195653910681473,
"grad_norm": 1.1301568746566772,
"learning_rate": 6.305732484076433e-05,
"loss": 3.0294,
"step": 100
},
{
"epoch": 0.032595669888951025,
"grad_norm": 0.9571443796157837,
"learning_rate": 6.43312101910828e-05,
"loss": 3.0522,
"step": 102
},
{
"epoch": 0.03323480067108732,
"grad_norm": 0.9494081735610962,
"learning_rate": 6.560509554140127e-05,
"loss": 3.0012,
"step": 104
},
{
"epoch": 0.033873931453223614,
"grad_norm": 1.3672889471054077,
"learning_rate": 6.687898089171974e-05,
"loss": 3.0188,
"step": 106
},
{
"epoch": 0.03451306223535991,
"grad_norm": 1.2122056484222412,
"learning_rate": 6.815286624203822e-05,
"loss": 2.9497,
"step": 108
},
{
"epoch": 0.035152193017496204,
"grad_norm": 1.2184698581695557,
"learning_rate": 6.942675159235669e-05,
"loss": 2.9739,
"step": 110
},
{
"epoch": 0.0357913237996325,
"grad_norm": 1.09404456615448,
"learning_rate": 7.070063694267515e-05,
"loss": 3.0241,
"step": 112
},
{
"epoch": 0.036430454581768794,
"grad_norm": 1.1653715372085571,
"learning_rate": 7.197452229299363e-05,
"loss": 2.9606,
"step": 114
},
{
"epoch": 0.03706958536390509,
"grad_norm": 1.050194501876831,
"learning_rate": 7.32484076433121e-05,
"loss": 2.9582,
"step": 116
},
{
"epoch": 0.037708716146041384,
"grad_norm": 1.1262322664260864,
"learning_rate": 7.452229299363057e-05,
"loss": 2.9462,
"step": 118
},
{
"epoch": 0.03834784692817768,
"grad_norm": 1.1232227087020874,
"learning_rate": 7.579617834394906e-05,
"loss": 2.9784,
"step": 120
},
{
"epoch": 0.03898697771031397,
"grad_norm": 0.9088072776794434,
"learning_rate": 7.707006369426753e-05,
"loss": 2.944,
"step": 122
},
{
"epoch": 0.03962610849245027,
"grad_norm": 0.8985419869422913,
"learning_rate": 7.834394904458599e-05,
"loss": 2.9003,
"step": 124
},
{
"epoch": 0.04026523927458656,
"grad_norm": 1.2419854402542114,
"learning_rate": 7.961783439490447e-05,
"loss": 2.9753,
"step": 126
},
{
"epoch": 0.04090437005672286,
"grad_norm": 1.4533154964447021,
"learning_rate": 8.089171974522294e-05,
"loss": 2.9069,
"step": 128
},
{
"epoch": 0.04154350083885915,
"grad_norm": 1.475258231163025,
"learning_rate": 8.21656050955414e-05,
"loss": 2.9402,
"step": 130
},
{
"epoch": 0.04218263162099545,
"grad_norm": 1.0348827838897705,
"learning_rate": 8.343949044585988e-05,
"loss": 2.9295,
"step": 132
},
{
"epoch": 0.04282176240313174,
"grad_norm": 0.9143719673156738,
"learning_rate": 8.471337579617836e-05,
"loss": 2.9408,
"step": 134
},
{
"epoch": 0.04346089318526804,
"grad_norm": 1.1310492753982544,
"learning_rate": 8.598726114649682e-05,
"loss": 2.875,
"step": 136
},
{
"epoch": 0.04410002396740433,
"grad_norm": 1.0483386516571045,
"learning_rate": 8.726114649681529e-05,
"loss": 2.9142,
"step": 138
},
{
"epoch": 0.04473915474954063,
"grad_norm": 0.921519935131073,
"learning_rate": 8.853503184713377e-05,
"loss": 2.9188,
"step": 140
},
{
"epoch": 0.04537828553167692,
"grad_norm": 1.3271907567977905,
"learning_rate": 8.980891719745223e-05,
"loss": 2.9075,
"step": 142
},
{
"epoch": 0.04601741631381322,
"grad_norm": 1.7488983869552612,
"learning_rate": 9.10828025477707e-05,
"loss": 2.9201,
"step": 144
},
{
"epoch": 0.04665654709594951,
"grad_norm": 1.4263213872909546,
"learning_rate": 9.235668789808918e-05,
"loss": 2.9045,
"step": 146
},
{
"epoch": 0.04729567787808581,
"grad_norm": 0.8777288794517517,
"learning_rate": 9.363057324840766e-05,
"loss": 2.8959,
"step": 148
},
{
"epoch": 0.047934808660222095,
"grad_norm": 1.3402196168899536,
"learning_rate": 9.490445859872612e-05,
"loss": 2.8893,
"step": 150
},
{
"epoch": 0.04857393944235839,
"grad_norm": 1.0943351984024048,
"learning_rate": 9.617834394904459e-05,
"loss": 2.9137,
"step": 152
},
{
"epoch": 0.049213070224494684,
"grad_norm": 1.0603907108306885,
"learning_rate": 9.745222929936307e-05,
"loss": 2.8677,
"step": 154
},
{
"epoch": 0.04985220100663098,
"grad_norm": 1.010772705078125,
"learning_rate": 9.872611464968153e-05,
"loss": 2.8374,
"step": 156
},
{
"epoch": 0.050491331788767274,
"grad_norm": 1.2628934383392334,
"learning_rate": 0.0001,
"loss": 2.9009,
"step": 158
},
{
"epoch": 0.05113046257090357,
"grad_norm": 1.146183729171753,
"learning_rate": 9.999988833687822e-05,
"loss": 2.8633,
"step": 160
},
{
"epoch": 0.051769593353039864,
"grad_norm": 0.8704808354377747,
"learning_rate": 9.99995533480116e-05,
"loss": 2.8464,
"step": 162
},
{
"epoch": 0.05240872413517616,
"grad_norm": 1.044418454170227,
"learning_rate": 9.999899503489641e-05,
"loss": 2.8695,
"step": 164
},
{
"epoch": 0.053047854917312454,
"grad_norm": 0.833791196346283,
"learning_rate": 9.999821340002636e-05,
"loss": 2.8605,
"step": 166
},
{
"epoch": 0.05368698569944875,
"grad_norm": 0.922815203666687,
"learning_rate": 9.99972084468926e-05,
"loss": 2.8737,
"step": 168
},
{
"epoch": 0.05432611648158504,
"grad_norm": 0.9120809435844421,
"learning_rate": 9.999598017998384e-05,
"loss": 2.8753,
"step": 170
},
{
"epoch": 0.05496524726372134,
"grad_norm": 1.0272431373596191,
"learning_rate": 9.999452860478611e-05,
"loss": 2.8907,
"step": 172
},
{
"epoch": 0.05560437804585763,
"grad_norm": 0.7777165174484253,
"learning_rate": 9.999285372778295e-05,
"loss": 2.8517,
"step": 174
},
{
"epoch": 0.05624350882799393,
"grad_norm": 0.7110999822616577,
"learning_rate": 9.999095555645523e-05,
"loss": 2.8211,
"step": 176
},
{
"epoch": 0.05688263961013022,
"grad_norm": 0.7857067584991455,
"learning_rate": 9.998883409928117e-05,
"loss": 2.8463,
"step": 178
},
{
"epoch": 0.05752177039226652,
"grad_norm": 0.8582798838615417,
"learning_rate": 9.998648936573629e-05,
"loss": 2.8197,
"step": 180
},
{
"epoch": 0.05816090117440281,
"grad_norm": 0.9790541529655457,
"learning_rate": 9.998392136629345e-05,
"loss": 2.8193,
"step": 182
},
{
"epoch": 0.05880003195653911,
"grad_norm": 1.1599719524383545,
"learning_rate": 9.998113011242264e-05,
"loss": 2.8206,
"step": 184
},
{
"epoch": 0.0594391627386754,
"grad_norm": 0.8326631188392639,
"learning_rate": 9.99781156165911e-05,
"loss": 2.8349,
"step": 186
},
{
"epoch": 0.0600782935208117,
"grad_norm": 0.8876377940177917,
"learning_rate": 9.997487789226312e-05,
"loss": 2.8225,
"step": 188
},
{
"epoch": 0.06071742430294799,
"grad_norm": 0.9899202585220337,
"learning_rate": 9.997141695390009e-05,
"loss": 2.7875,
"step": 190
},
{
"epoch": 0.06135655508508429,
"grad_norm": 1.0686557292938232,
"learning_rate": 9.996773281696037e-05,
"loss": 2.8024,
"step": 192
},
{
"epoch": 0.06199568586722058,
"grad_norm": 0.8899752497673035,
"learning_rate": 9.996382549789926e-05,
"loss": 2.8225,
"step": 194
},
{
"epoch": 0.06263481664935687,
"grad_norm": 0.7781797647476196,
"learning_rate": 9.995969501416891e-05,
"loss": 2.8046,
"step": 196
},
{
"epoch": 0.06327394743149317,
"grad_norm": 0.6428512930870056,
"learning_rate": 9.995534138421818e-05,
"loss": 2.7693,
"step": 198
},
{
"epoch": 0.06391307821362946,
"grad_norm": 0.7047809958457947,
"learning_rate": 9.995076462749273e-05,
"loss": 2.766,
"step": 200
},
{
"epoch": 0.06455220899576576,
"grad_norm": 0.6256312131881714,
"learning_rate": 9.99459647644347e-05,
"loss": 2.8071,
"step": 202
},
{
"epoch": 0.06519133977790205,
"grad_norm": 0.699400007724762,
"learning_rate": 9.994094181648283e-05,
"loss": 2.8347,
"step": 204
},
{
"epoch": 0.06583047056003835,
"grad_norm": 0.7256817817687988,
"learning_rate": 9.993569580607225e-05,
"loss": 2.8074,
"step": 206
},
{
"epoch": 0.06646960134217464,
"grad_norm": 0.573846161365509,
"learning_rate": 9.993022675663437e-05,
"loss": 2.7413,
"step": 208
},
{
"epoch": 0.06710873212431094,
"grad_norm": 0.7314406037330627,
"learning_rate": 9.992453469259685e-05,
"loss": 2.7983,
"step": 210
},
{
"epoch": 0.06774786290644723,
"grad_norm": 0.7307546734809875,
"learning_rate": 9.991861963938342e-05,
"loss": 2.8026,
"step": 212
},
{
"epoch": 0.06838699368858353,
"grad_norm": 0.6367102861404419,
"learning_rate": 9.991248162341384e-05,
"loss": 2.7424,
"step": 214
},
{
"epoch": 0.06902612447071982,
"grad_norm": 0.8630378246307373,
"learning_rate": 9.99061206721037e-05,
"loss": 2.7395,
"step": 216
},
{
"epoch": 0.06966525525285612,
"grad_norm": 0.7586290240287781,
"learning_rate": 9.989953681386433e-05,
"loss": 2.7624,
"step": 218
},
{
"epoch": 0.07030438603499241,
"grad_norm": 0.7091168761253357,
"learning_rate": 9.989273007810271e-05,
"loss": 2.7719,
"step": 220
},
{
"epoch": 0.07094351681712871,
"grad_norm": 0.684183657169342,
"learning_rate": 9.98857004952213e-05,
"loss": 2.7806,
"step": 222
},
{
"epoch": 0.071582647599265,
"grad_norm": 0.920498788356781,
"learning_rate": 9.987844809661791e-05,
"loss": 2.7626,
"step": 224
},
{
"epoch": 0.0722217783814013,
"grad_norm": 0.730060875415802,
"learning_rate": 9.987097291468552e-05,
"loss": 2.8107,
"step": 226
},
{
"epoch": 0.07286090916353759,
"grad_norm": 0.8606828451156616,
"learning_rate": 9.986327498281227e-05,
"loss": 2.7814,
"step": 228
},
{
"epoch": 0.07350003994567389,
"grad_norm": 0.8068298101425171,
"learning_rate": 9.985535433538113e-05,
"loss": 2.7775,
"step": 230
},
{
"epoch": 0.07413917072781018,
"grad_norm": 0.6887542009353638,
"learning_rate": 9.984721100776989e-05,
"loss": 2.784,
"step": 232
},
{
"epoch": 0.07477830150994648,
"grad_norm": 0.84773850440979,
"learning_rate": 9.98388450363509e-05,
"loss": 2.7333,
"step": 234
},
{
"epoch": 0.07541743229208277,
"grad_norm": 0.7914923429489136,
"learning_rate": 9.9830256458491e-05,
"loss": 2.7363,
"step": 236
},
{
"epoch": 0.07605656307421906,
"grad_norm": 0.8284217715263367,
"learning_rate": 9.982144531255127e-05,
"loss": 2.7389,
"step": 238
},
{
"epoch": 0.07669569385635536,
"grad_norm": 0.7706480622291565,
"learning_rate": 9.981241163788694e-05,
"loss": 2.7377,
"step": 240
},
{
"epoch": 0.07733482463849164,
"grad_norm": 0.6147120594978333,
"learning_rate": 9.980315547484711e-05,
"loss": 2.7862,
"step": 242
},
{
"epoch": 0.07797395542062795,
"grad_norm": 0.6364494562149048,
"learning_rate": 9.979367686477469e-05,
"loss": 2.762,
"step": 244
},
{
"epoch": 0.07861308620276423,
"grad_norm": 0.6944818496704102,
"learning_rate": 9.978397585000611e-05,
"loss": 2.7624,
"step": 246
},
{
"epoch": 0.07925221698490054,
"grad_norm": 1.2648204565048218,
"learning_rate": 9.977405247387119e-05,
"loss": 2.7544,
"step": 248
},
{
"epoch": 0.07989134776703682,
"grad_norm": 1.0054659843444824,
"learning_rate": 9.976390678069295e-05,
"loss": 2.7523,
"step": 250
},
{
"epoch": 0.08053047854917313,
"grad_norm": 0.715492308139801,
"learning_rate": 9.975353881578738e-05,
"loss": 2.7341,
"step": 252
},
{
"epoch": 0.08116960933130941,
"grad_norm": 0.7963582277297974,
"learning_rate": 9.974294862546325e-05,
"loss": 2.7484,
"step": 254
},
{
"epoch": 0.08180874011344572,
"grad_norm": 0.7069251537322998,
"learning_rate": 9.97321362570219e-05,
"loss": 2.7719,
"step": 256
},
{
"epoch": 0.082447870895582,
"grad_norm": 0.5716209411621094,
"learning_rate": 9.972110175875706e-05,
"loss": 2.8079,
"step": 258
},
{
"epoch": 0.0830870016777183,
"grad_norm": 0.65562903881073,
"learning_rate": 9.970984517995456e-05,
"loss": 2.7642,
"step": 260
},
{
"epoch": 0.0837261324598546,
"grad_norm": 0.647085964679718,
"learning_rate": 9.969836657089225e-05,
"loss": 2.7139,
"step": 262
},
{
"epoch": 0.0843652632419909,
"grad_norm": 0.6401609778404236,
"learning_rate": 9.968666598283955e-05,
"loss": 2.7278,
"step": 264
},
{
"epoch": 0.08500439402412718,
"grad_norm": 0.5514021515846252,
"learning_rate": 9.967474346805746e-05,
"loss": 2.7332,
"step": 266
},
{
"epoch": 0.08564352480626349,
"grad_norm": 0.5908826589584351,
"learning_rate": 9.96625990797982e-05,
"loss": 2.741,
"step": 268
},
{
"epoch": 0.08628265558839977,
"grad_norm": 0.5510653853416443,
"learning_rate": 9.965023287230497e-05,
"loss": 2.7025,
"step": 270
},
{
"epoch": 0.08692178637053607,
"grad_norm": 0.5656317472457886,
"learning_rate": 9.963764490081176e-05,
"loss": 2.7184,
"step": 272
},
{
"epoch": 0.08756091715267236,
"grad_norm": 0.5132441520690918,
"learning_rate": 9.962483522154302e-05,
"loss": 2.7632,
"step": 274
},
{
"epoch": 0.08820004793480866,
"grad_norm": 0.6730588674545288,
"learning_rate": 9.961180389171352e-05,
"loss": 2.7705,
"step": 276
},
{
"epoch": 0.08883917871694495,
"grad_norm": 0.5657472610473633,
"learning_rate": 9.959855096952804e-05,
"loss": 2.7191,
"step": 278
},
{
"epoch": 0.08947830949908125,
"grad_norm": 0.8265955448150635,
"learning_rate": 9.958507651418106e-05,
"loss": 2.7718,
"step": 280
},
{
"epoch": 0.09011744028121754,
"grad_norm": 0.8996996879577637,
"learning_rate": 9.957138058585658e-05,
"loss": 2.7124,
"step": 282
},
{
"epoch": 0.09075657106335384,
"grad_norm": 0.6458889842033386,
"learning_rate": 9.955746324572781e-05,
"loss": 2.7403,
"step": 284
},
{
"epoch": 0.09139570184549013,
"grad_norm": 0.7175470590591431,
"learning_rate": 9.954332455595689e-05,
"loss": 2.7188,
"step": 286
},
{
"epoch": 0.09203483262762643,
"grad_norm": 0.6640183329582214,
"learning_rate": 9.952896457969463e-05,
"loss": 2.7223,
"step": 288
},
{
"epoch": 0.09267396340976272,
"grad_norm": 0.6551202535629272,
"learning_rate": 9.951438338108022e-05,
"loss": 2.7189,
"step": 290
},
{
"epoch": 0.09331309419189902,
"grad_norm": 0.6980673670768738,
"learning_rate": 9.949958102524093e-05,
"loss": 2.7183,
"step": 292
},
{
"epoch": 0.09395222497403531,
"grad_norm": 0.5926324129104614,
"learning_rate": 9.948455757829187e-05,
"loss": 2.7476,
"step": 294
},
{
"epoch": 0.09459135575617161,
"grad_norm": 0.5434746742248535,
"learning_rate": 9.946931310733565e-05,
"loss": 2.7368,
"step": 296
},
{
"epoch": 0.0952304865383079,
"grad_norm": 0.6466372609138489,
"learning_rate": 9.945384768046206e-05,
"loss": 2.7307,
"step": 298
},
{
"epoch": 0.09586961732044419,
"grad_norm": 0.6376985311508179,
"learning_rate": 9.943816136674782e-05,
"loss": 2.7239,
"step": 300
},
{
"epoch": 0.09650874810258049,
"grad_norm": 0.6092653274536133,
"learning_rate": 9.942225423625624e-05,
"loss": 2.7678,
"step": 302
},
{
"epoch": 0.09714787888471678,
"grad_norm": 0.7219493389129639,
"learning_rate": 9.94061263600369e-05,
"loss": 2.723,
"step": 304
},
{
"epoch": 0.09778700966685308,
"grad_norm": 0.5244786143302917,
"learning_rate": 9.93897778101254e-05,
"loss": 2.7329,
"step": 306
},
{
"epoch": 0.09842614044898937,
"grad_norm": 0.5384829044342041,
"learning_rate": 9.937320865954289e-05,
"loss": 2.661,
"step": 308
},
{
"epoch": 0.09906527123112567,
"grad_norm": 0.624033510684967,
"learning_rate": 9.935641898229594e-05,
"loss": 2.7177,
"step": 310
},
{
"epoch": 0.09970440201326196,
"grad_norm": 0.6381804347038269,
"learning_rate": 9.933940885337602e-05,
"loss": 2.7616,
"step": 312
},
{
"epoch": 0.10034353279539826,
"grad_norm": 0.7671799659729004,
"learning_rate": 9.932217834875934e-05,
"loss": 2.7256,
"step": 314
},
{
"epoch": 0.10098266357753455,
"grad_norm": 0.5695899128913879,
"learning_rate": 9.930472754540634e-05,
"loss": 2.6975,
"step": 316
},
{
"epoch": 0.10162179435967085,
"grad_norm": 0.6461712121963501,
"learning_rate": 9.92870565212615e-05,
"loss": 2.7121,
"step": 318
},
{
"epoch": 0.10226092514180714,
"grad_norm": 0.6111094355583191,
"learning_rate": 9.926916535525283e-05,
"loss": 2.6964,
"step": 320
},
{
"epoch": 0.10290005592394344,
"grad_norm": 0.6368963718414307,
"learning_rate": 9.925105412729175e-05,
"loss": 2.6793,
"step": 322
},
{
"epoch": 0.10353918670607973,
"grad_norm": 0.6973994374275208,
"learning_rate": 9.923272291827245e-05,
"loss": 2.6862,
"step": 324
},
{
"epoch": 0.10417831748821603,
"grad_norm": 0.6717987656593323,
"learning_rate": 9.921417181007175e-05,
"loss": 2.686,
"step": 326
},
{
"epoch": 0.10481744827035232,
"grad_norm": 0.6282898783683777,
"learning_rate": 9.919540088554862e-05,
"loss": 2.6807,
"step": 328
},
{
"epoch": 0.10545657905248862,
"grad_norm": 0.6404539942741394,
"learning_rate": 9.91764102285439e-05,
"loss": 2.659,
"step": 330
},
{
"epoch": 0.10609570983462491,
"grad_norm": 0.679418683052063,
"learning_rate": 9.915719992387979e-05,
"loss": 2.662,
"step": 332
},
{
"epoch": 0.10673484061676121,
"grad_norm": 0.7185142040252686,
"learning_rate": 9.913777005735963e-05,
"loss": 2.7208,
"step": 334
},
{
"epoch": 0.1073739713988975,
"grad_norm": 0.5328919887542725,
"learning_rate": 9.911812071576736e-05,
"loss": 2.6428,
"step": 336
},
{
"epoch": 0.1080131021810338,
"grad_norm": 0.6135143637657166,
"learning_rate": 9.909825198686729e-05,
"loss": 2.6543,
"step": 338
},
{
"epoch": 0.10865223296317009,
"grad_norm": 0.6830089092254639,
"learning_rate": 9.907816395940359e-05,
"loss": 2.677,
"step": 340
},
{
"epoch": 0.10929136374530639,
"grad_norm": 0.6469766497612,
"learning_rate": 9.90578567230999e-05,
"loss": 2.726,
"step": 342
},
{
"epoch": 0.10993049452744268,
"grad_norm": 0.5899373888969421,
"learning_rate": 9.903733036865903e-05,
"loss": 2.7208,
"step": 344
},
{
"epoch": 0.11056962530957898,
"grad_norm": 0.82301926612854,
"learning_rate": 9.901658498776246e-05,
"loss": 2.6925,
"step": 346
},
{
"epoch": 0.11120875609171527,
"grad_norm": 0.8507819771766663,
"learning_rate": 9.899562067306989e-05,
"loss": 2.6905,
"step": 348
},
{
"epoch": 0.11184788687385157,
"grad_norm": 0.6785141229629517,
"learning_rate": 9.897443751821902e-05,
"loss": 2.6643,
"step": 350
},
{
"epoch": 0.11248701765598786,
"grad_norm": 0.6389050483703613,
"learning_rate": 9.89530356178249e-05,
"loss": 2.6769,
"step": 352
},
{
"epoch": 0.11312614843812416,
"grad_norm": 0.5903960466384888,
"learning_rate": 9.893141506747967e-05,
"loss": 2.6793,
"step": 354
},
{
"epoch": 0.11376527922026045,
"grad_norm": 0.583307147026062,
"learning_rate": 9.890957596375206e-05,
"loss": 2.676,
"step": 356
},
{
"epoch": 0.11440441000239673,
"grad_norm": 0.6372009515762329,
"learning_rate": 9.888751840418695e-05,
"loss": 2.6567,
"step": 358
},
{
"epoch": 0.11504354078453304,
"grad_norm": 0.7056903839111328,
"learning_rate": 9.886524248730497e-05,
"loss": 2.6973,
"step": 360
},
{
"epoch": 0.11568267156666932,
"grad_norm": 0.5459578633308411,
"learning_rate": 9.88427483126021e-05,
"loss": 2.6522,
"step": 362
},
{
"epoch": 0.11632180234880563,
"grad_norm": 0.5186561346054077,
"learning_rate": 9.882003598054907e-05,
"loss": 2.6567,
"step": 364
},
{
"epoch": 0.11696093313094191,
"grad_norm": 0.5469943881034851,
"learning_rate": 9.879710559259114e-05,
"loss": 2.6586,
"step": 366
},
{
"epoch": 0.11760006391307821,
"grad_norm": 0.6790450215339661,
"learning_rate": 9.877395725114742e-05,
"loss": 2.6874,
"step": 368
},
{
"epoch": 0.1182391946952145,
"grad_norm": 0.624920129776001,
"learning_rate": 9.875059105961056e-05,
"loss": 2.6777,
"step": 370
},
{
"epoch": 0.1188783254773508,
"grad_norm": 0.6039037704467773,
"learning_rate": 9.872700712234624e-05,
"loss": 2.6881,
"step": 372
},
{
"epoch": 0.11951745625948709,
"grad_norm": 0.6653264760971069,
"learning_rate": 9.87032055446927e-05,
"loss": 2.6388,
"step": 374
},
{
"epoch": 0.1201565870416234,
"grad_norm": 0.7718141078948975,
"learning_rate": 9.867918643296025e-05,
"loss": 2.6686,
"step": 376
},
{
"epoch": 0.12079571782375968,
"grad_norm": 0.6357402801513672,
"learning_rate": 9.865494989443092e-05,
"loss": 2.6611,
"step": 378
},
{
"epoch": 0.12143484860589598,
"grad_norm": 0.560418963432312,
"learning_rate": 9.863049603735775e-05,
"loss": 2.6944,
"step": 380
},
{
"epoch": 0.12207397938803227,
"grad_norm": 0.5758490562438965,
"learning_rate": 9.860582497096452e-05,
"loss": 2.6589,
"step": 382
},
{
"epoch": 0.12271311017016857,
"grad_norm": 0.6144497990608215,
"learning_rate": 9.858093680544516e-05,
"loss": 2.6839,
"step": 384
},
{
"epoch": 0.12335224095230486,
"grad_norm": 0.5986223816871643,
"learning_rate": 9.855583165196329e-05,
"loss": 2.6778,
"step": 386
},
{
"epoch": 0.12399137173444116,
"grad_norm": 0.5350797176361084,
"learning_rate": 9.853050962265169e-05,
"loss": 2.6539,
"step": 388
},
{
"epoch": 0.12463050251657745,
"grad_norm": 0.5589949488639832,
"learning_rate": 9.850497083061183e-05,
"loss": 2.6536,
"step": 390
},
{
"epoch": 0.12526963329871374,
"grad_norm": 0.5695136189460754,
"learning_rate": 9.847921538991339e-05,
"loss": 2.6615,
"step": 392
},
{
"epoch": 0.12590876408085006,
"grad_norm": 0.5739374756813049,
"learning_rate": 9.845324341559366e-05,
"loss": 2.6883,
"step": 394
},
{
"epoch": 0.12654789486298634,
"grad_norm": 0.528075098991394,
"learning_rate": 9.84270550236571e-05,
"loss": 2.6944,
"step": 396
},
{
"epoch": 0.12718702564512263,
"grad_norm": 0.6400613188743591,
"learning_rate": 9.840065033107483e-05,
"loss": 2.6596,
"step": 398
},
{
"epoch": 0.12782615642725892,
"grad_norm": 0.6734158992767334,
"learning_rate": 9.837402945578406e-05,
"loss": 2.6562,
"step": 400
},
{
"epoch": 0.12846528720939523,
"grad_norm": 0.6197201013565063,
"learning_rate": 9.834719251668761e-05,
"loss": 2.6971,
"step": 402
},
{
"epoch": 0.12910441799153152,
"grad_norm": 0.5766332745552063,
"learning_rate": 9.832013963365332e-05,
"loss": 2.6355,
"step": 404
},
{
"epoch": 0.1297435487736678,
"grad_norm": 0.7926291823387146,
"learning_rate": 9.829287092751357e-05,
"loss": 2.6438,
"step": 406
},
{
"epoch": 0.1303826795558041,
"grad_norm": 0.7527420520782471,
"learning_rate": 9.826538652006469e-05,
"loss": 2.6695,
"step": 408
},
{
"epoch": 0.13102181033794041,
"grad_norm": 0.7154802083969116,
"learning_rate": 9.823768653406652e-05,
"loss": 2.6158,
"step": 410
},
{
"epoch": 0.1316609411200767,
"grad_norm": 0.5435774326324463,
"learning_rate": 9.820977109324169e-05,
"loss": 2.6843,
"step": 412
},
{
"epoch": 0.132300071902213,
"grad_norm": 0.5893809199333191,
"learning_rate": 9.818164032227522e-05,
"loss": 2.6607,
"step": 414
},
{
"epoch": 0.13293920268434928,
"grad_norm": 0.5635148882865906,
"learning_rate": 9.815329434681392e-05,
"loss": 2.658,
"step": 416
},
{
"epoch": 0.13357833346648557,
"grad_norm": 0.4904562830924988,
"learning_rate": 9.812473329346578e-05,
"loss": 2.6616,
"step": 418
},
{
"epoch": 0.13421746424862188,
"grad_norm": 0.5800766944885254,
"learning_rate": 9.809595728979945e-05,
"loss": 2.6657,
"step": 420
},
{
"epoch": 0.13485659503075817,
"grad_norm": 0.5110253691673279,
"learning_rate": 9.806696646434367e-05,
"loss": 2.6192,
"step": 422
},
{
"epoch": 0.13549572581289446,
"grad_norm": 0.5567732453346252,
"learning_rate": 9.803776094658668e-05,
"loss": 2.6475,
"step": 424
},
{
"epoch": 0.13613485659503075,
"grad_norm": 0.5255835056304932,
"learning_rate": 9.800834086697566e-05,
"loss": 2.6644,
"step": 426
},
{
"epoch": 0.13677398737716706,
"grad_norm": 0.4851606786251068,
"learning_rate": 9.797870635691613e-05,
"loss": 2.6628,
"step": 428
},
{
"epoch": 0.13741311815930335,
"grad_norm": 0.4904446005821228,
"learning_rate": 9.794885754877135e-05,
"loss": 2.6222,
"step": 430
},
{
"epoch": 0.13805224894143964,
"grad_norm": 0.47077298164367676,
"learning_rate": 9.791879457586178e-05,
"loss": 2.5875,
"step": 432
},
{
"epoch": 0.13869137972357592,
"grad_norm": 0.4484720528125763,
"learning_rate": 9.788851757246443e-05,
"loss": 2.6279,
"step": 434
},
{
"epoch": 0.13933051050571224,
"grad_norm": 0.5684689283370972,
"learning_rate": 9.785802667381227e-05,
"loss": 2.6507,
"step": 436
},
{
"epoch": 0.13996964128784853,
"grad_norm": 0.5868870615959167,
"learning_rate": 9.78273220160937e-05,
"loss": 2.6476,
"step": 438
},
{
"epoch": 0.14060877206998482,
"grad_norm": 0.5244540572166443,
"learning_rate": 9.77964037364518e-05,
"loss": 2.6353,
"step": 440
},
{
"epoch": 0.1412479028521211,
"grad_norm": 0.5107213258743286,
"learning_rate": 9.776527197298386e-05,
"loss": 2.6335,
"step": 442
},
{
"epoch": 0.14188703363425742,
"grad_norm": 0.5410230159759521,
"learning_rate": 9.773392686474065e-05,
"loss": 2.6248,
"step": 444
},
{
"epoch": 0.1425261644163937,
"grad_norm": 0.5540198683738708,
"learning_rate": 9.770236855172587e-05,
"loss": 2.6304,
"step": 446
},
{
"epoch": 0.14316529519853,
"grad_norm": 0.6982893347740173,
"learning_rate": 9.767059717489557e-05,
"loss": 2.6285,
"step": 448
},
{
"epoch": 0.14380442598066628,
"grad_norm": 0.7649112939834595,
"learning_rate": 9.763861287615732e-05,
"loss": 2.6863,
"step": 450
},
{
"epoch": 0.1444435567628026,
"grad_norm": 0.5209079384803772,
"learning_rate": 9.760641579836984e-05,
"loss": 2.6262,
"step": 452
},
{
"epoch": 0.1450826875449389,
"grad_norm": 0.5985437631607056,
"learning_rate": 9.757400608534215e-05,
"loss": 2.5451,
"step": 454
},
{
"epoch": 0.14572181832707518,
"grad_norm": 0.6232045888900757,
"learning_rate": 9.754138388183305e-05,
"loss": 2.6142,
"step": 456
},
{
"epoch": 0.14636094910921146,
"grad_norm": 0.7111669778823853,
"learning_rate": 9.750854933355042e-05,
"loss": 2.5868,
"step": 458
},
{
"epoch": 0.14700007989134778,
"grad_norm": 0.6749933362007141,
"learning_rate": 9.747550258715059e-05,
"loss": 2.6233,
"step": 460
},
{
"epoch": 0.14763921067348407,
"grad_norm": 0.5915788412094116,
"learning_rate": 9.744224379023768e-05,
"loss": 2.6233,
"step": 462
},
{
"epoch": 0.14827834145562035,
"grad_norm": 0.6704515814781189,
"learning_rate": 9.740877309136291e-05,
"loss": 2.6432,
"step": 464
},
{
"epoch": 0.14891747223775664,
"grad_norm": 0.6156161427497864,
"learning_rate": 9.737509064002402e-05,
"loss": 2.6436,
"step": 466
},
{
"epoch": 0.14955660301989296,
"grad_norm": 0.49440738558769226,
"learning_rate": 9.734119658666448e-05,
"loss": 2.6488,
"step": 468
},
{
"epoch": 0.15019573380202925,
"grad_norm": 0.6561670899391174,
"learning_rate": 9.730709108267296e-05,
"loss": 2.6191,
"step": 470
},
{
"epoch": 0.15083486458416553,
"grad_norm": 0.6310847997665405,
"learning_rate": 9.727277428038253e-05,
"loss": 2.6055,
"step": 472
},
{
"epoch": 0.15147399536630182,
"grad_norm": 0.5141007304191589,
"learning_rate": 9.723824633307001e-05,
"loss": 2.626,
"step": 474
},
{
"epoch": 0.1521131261484381,
"grad_norm": 0.5299694538116455,
"learning_rate": 9.720350739495538e-05,
"loss": 2.6401,
"step": 476
},
{
"epoch": 0.15275225693057443,
"grad_norm": 0.5702034831047058,
"learning_rate": 9.716855762120097e-05,
"loss": 2.6392,
"step": 478
},
{
"epoch": 0.15339138771271071,
"grad_norm": 0.5058117508888245,
"learning_rate": 9.713339716791076e-05,
"loss": 2.5778,
"step": 480
},
{
"epoch": 0.154030518494847,
"grad_norm": 0.6530377864837646,
"learning_rate": 9.709802619212987e-05,
"loss": 2.6359,
"step": 482
},
{
"epoch": 0.1546696492769833,
"grad_norm": 0.6136478781700134,
"learning_rate": 9.706244485184357e-05,
"loss": 2.6117,
"step": 484
},
{
"epoch": 0.1553087800591196,
"grad_norm": 0.5947436094284058,
"learning_rate": 9.702665330597684e-05,
"loss": 2.6148,
"step": 486
},
{
"epoch": 0.1559479108412559,
"grad_norm": 0.6332894563674927,
"learning_rate": 9.699065171439349e-05,
"loss": 2.6251,
"step": 488
},
{
"epoch": 0.15658704162339218,
"grad_norm": 0.5429502129554749,
"learning_rate": 9.695444023789554e-05,
"loss": 2.577,
"step": 490
},
{
"epoch": 0.15722617240552847,
"grad_norm": 0.6252620220184326,
"learning_rate": 9.691801903822244e-05,
"loss": 2.6114,
"step": 492
},
{
"epoch": 0.15786530318766478,
"grad_norm": 0.5587325692176819,
"learning_rate": 9.68813882780504e-05,
"loss": 2.632,
"step": 494
},
{
"epoch": 0.15850443396980107,
"grad_norm": 0.5149174332618713,
"learning_rate": 9.68445481209916e-05,
"loss": 2.6394,
"step": 496
},
{
"epoch": 0.15914356475193736,
"grad_norm": 0.5343561172485352,
"learning_rate": 9.680749873159354e-05,
"loss": 2.572,
"step": 498
},
{
"epoch": 0.15978269553407365,
"grad_norm": 0.5082888603210449,
"learning_rate": 9.677024027533821e-05,
"loss": 2.5786,
"step": 500
},
{
"epoch": 0.16042182631620996,
"grad_norm": 0.46739038825035095,
"learning_rate": 9.673277291864145e-05,
"loss": 2.5933,
"step": 502
},
{
"epoch": 0.16106095709834625,
"grad_norm": 0.5262092351913452,
"learning_rate": 9.669509682885216e-05,
"loss": 2.6295,
"step": 504
},
{
"epoch": 0.16170008788048254,
"grad_norm": 0.5002930760383606,
"learning_rate": 9.66572121742515e-05,
"loss": 2.6306,
"step": 506
},
{
"epoch": 0.16233921866261883,
"grad_norm": 0.4859941601753235,
"learning_rate": 9.661911912405222e-05,
"loss": 2.5742,
"step": 508
},
{
"epoch": 0.16297834944475514,
"grad_norm": 0.6142066717147827,
"learning_rate": 9.65808178483979e-05,
"loss": 2.61,
"step": 510
},
{
"epoch": 0.16361748022689143,
"grad_norm": 0.6018419861793518,
"learning_rate": 9.654230851836214e-05,
"loss": 2.6158,
"step": 512
},
{
"epoch": 0.16425661100902772,
"grad_norm": 0.5785476565361023,
"learning_rate": 9.650359130594779e-05,
"loss": 2.629,
"step": 514
},
{
"epoch": 0.164895741791164,
"grad_norm": 0.5036047697067261,
"learning_rate": 9.646466638408629e-05,
"loss": 2.6087,
"step": 516
},
{
"epoch": 0.16553487257330032,
"grad_norm": 0.5089232325553894,
"learning_rate": 9.642553392663672e-05,
"loss": 2.6299,
"step": 518
},
{
"epoch": 0.1661740033554366,
"grad_norm": 0.5314218997955322,
"learning_rate": 9.63861941083852e-05,
"loss": 2.6152,
"step": 520
},
{
"epoch": 0.1668131341375729,
"grad_norm": 0.6545165181159973,
"learning_rate": 9.634664710504402e-05,
"loss": 2.5711,
"step": 522
},
{
"epoch": 0.1674522649197092,
"grad_norm": 0.7461646199226379,
"learning_rate": 9.630689309325082e-05,
"loss": 2.627,
"step": 524
},
{
"epoch": 0.1680913957018455,
"grad_norm": 0.6585918068885803,
"learning_rate": 9.626693225056794e-05,
"loss": 2.6231,
"step": 526
},
{
"epoch": 0.1687305264839818,
"grad_norm": 0.5888398289680481,
"learning_rate": 9.62267647554814e-05,
"loss": 2.6175,
"step": 528
},
{
"epoch": 0.16936965726611808,
"grad_norm": 0.49957162141799927,
"learning_rate": 9.618639078740037e-05,
"loss": 2.5771,
"step": 530
},
{
"epoch": 0.17000878804825437,
"grad_norm": 0.4573955535888672,
"learning_rate": 9.614581052665616e-05,
"loss": 2.5855,
"step": 532
},
{
"epoch": 0.17064791883039068,
"grad_norm": 0.5360051393508911,
"learning_rate": 9.610502415450153e-05,
"loss": 2.6107,
"step": 534
},
{
"epoch": 0.17128704961252697,
"grad_norm": 0.5413601994514465,
"learning_rate": 9.606403185310981e-05,
"loss": 2.5971,
"step": 536
},
{
"epoch": 0.17192618039466326,
"grad_norm": 0.5360136032104492,
"learning_rate": 9.602283380557416e-05,
"loss": 2.5878,
"step": 538
},
{
"epoch": 0.17256531117679955,
"grad_norm": 0.653225839138031,
"learning_rate": 9.598143019590664e-05,
"loss": 2.6,
"step": 540
},
{
"epoch": 0.17320444195893583,
"grad_norm": 0.5268750786781311,
"learning_rate": 9.593982120903754e-05,
"loss": 2.5992,
"step": 542
},
{
"epoch": 0.17384357274107215,
"grad_norm": 0.5311806797981262,
"learning_rate": 9.589800703081442e-05,
"loss": 2.5939,
"step": 544
},
{
"epoch": 0.17448270352320844,
"grad_norm": 0.47583094239234924,
"learning_rate": 9.585598784800135e-05,
"loss": 2.5863,
"step": 546
},
{
"epoch": 0.17512183430534473,
"grad_norm": 0.44130444526672363,
"learning_rate": 9.581376384827804e-05,
"loss": 2.5568,
"step": 548
},
{
"epoch": 0.175760965087481,
"grad_norm": 0.45064234733581543,
"learning_rate": 9.577133522023906e-05,
"loss": 2.5888,
"step": 550
},
{
"epoch": 0.17640009586961733,
"grad_norm": 0.4643968343734741,
"learning_rate": 9.572870215339294e-05,
"loss": 2.6121,
"step": 552
},
{
"epoch": 0.17703922665175362,
"grad_norm": 0.446347713470459,
"learning_rate": 9.568586483816129e-05,
"loss": 2.614,
"step": 554
},
{
"epoch": 0.1776783574338899,
"grad_norm": 0.48379895091056824,
"learning_rate": 9.564282346587809e-05,
"loss": 2.6353,
"step": 556
},
{
"epoch": 0.1783174882160262,
"grad_norm": 0.45891985297203064,
"learning_rate": 9.559957822878867e-05,
"loss": 2.6111,
"step": 558
},
{
"epoch": 0.1789566189981625,
"grad_norm": 0.49106699228286743,
"learning_rate": 9.555612932004896e-05,
"loss": 2.5876,
"step": 560
},
{
"epoch": 0.1795957497802988,
"grad_norm": 0.5220739245414734,
"learning_rate": 9.55124769337246e-05,
"loss": 2.5988,
"step": 562
},
{
"epoch": 0.18023488056243508,
"grad_norm": 0.6365030407905579,
"learning_rate": 9.546862126479006e-05,
"loss": 2.5763,
"step": 564
},
{
"epoch": 0.18087401134457137,
"grad_norm": 0.706681489944458,
"learning_rate": 9.542456250912776e-05,
"loss": 2.5965,
"step": 566
},
{
"epoch": 0.1815131421267077,
"grad_norm": 0.4519253373146057,
"learning_rate": 9.538030086352725e-05,
"loss": 2.568,
"step": 568
},
{
"epoch": 0.18215227290884398,
"grad_norm": 0.6023289561271667,
"learning_rate": 9.533583652568426e-05,
"loss": 2.6034,
"step": 570
},
{
"epoch": 0.18279140369098026,
"grad_norm": 0.581615686416626,
"learning_rate": 9.529116969419986e-05,
"loss": 2.5858,
"step": 572
},
{
"epoch": 0.18343053447311655,
"grad_norm": 0.49777430295944214,
"learning_rate": 9.524630056857958e-05,
"loss": 2.6062,
"step": 574
},
{
"epoch": 0.18406966525525287,
"grad_norm": 0.5936197638511658,
"learning_rate": 9.520122934923246e-05,
"loss": 2.5976,
"step": 576
},
{
"epoch": 0.18470879603738916,
"grad_norm": 0.5317326784133911,
"learning_rate": 9.515595623747022e-05,
"loss": 2.6004,
"step": 578
},
{
"epoch": 0.18534792681952544,
"grad_norm": 0.524297297000885,
"learning_rate": 9.511048143550637e-05,
"loss": 2.583,
"step": 580
},
{
"epoch": 0.18598705760166173,
"grad_norm": 0.5107091665267944,
"learning_rate": 9.506480514645523e-05,
"loss": 2.5704,
"step": 582
},
{
"epoch": 0.18662618838379805,
"grad_norm": 0.4521612226963043,
"learning_rate": 9.501892757433107e-05,
"loss": 2.5903,
"step": 584
},
{
"epoch": 0.18726531916593434,
"grad_norm": 0.48701736330986023,
"learning_rate": 9.497284892404721e-05,
"loss": 2.5758,
"step": 586
},
{
"epoch": 0.18790444994807062,
"grad_norm": 0.613917887210846,
"learning_rate": 9.492656940141512e-05,
"loss": 2.5749,
"step": 588
},
{
"epoch": 0.1885435807302069,
"grad_norm": 0.5269163846969604,
"learning_rate": 9.488008921314338e-05,
"loss": 2.6126,
"step": 590
},
{
"epoch": 0.18918271151234323,
"grad_norm": 0.6326431632041931,
"learning_rate": 9.483340856683696e-05,
"loss": 2.5863,
"step": 592
},
{
"epoch": 0.18982184229447951,
"grad_norm": 0.47863009572029114,
"learning_rate": 9.47865276709961e-05,
"loss": 2.6201,
"step": 594
},
{
"epoch": 0.1904609730766158,
"grad_norm": 0.5771295428276062,
"learning_rate": 9.473944673501549e-05,
"loss": 2.5914,
"step": 596
},
{
"epoch": 0.1911001038587521,
"grad_norm": 0.4584767818450928,
"learning_rate": 9.469216596918331e-05,
"loss": 2.5497,
"step": 598
},
{
"epoch": 0.19173923464088838,
"grad_norm": 0.4598289728164673,
"learning_rate": 9.464468558468026e-05,
"loss": 2.5841,
"step": 600
},
{
"epoch": 0.1923783654230247,
"grad_norm": 0.516592800617218,
"learning_rate": 9.459700579357869e-05,
"loss": 2.6013,
"step": 602
},
{
"epoch": 0.19301749620516098,
"grad_norm": 0.5296542048454285,
"learning_rate": 9.454912680884154e-05,
"loss": 2.6085,
"step": 604
},
{
"epoch": 0.19365662698729727,
"grad_norm": 0.5447851419448853,
"learning_rate": 9.45010488443215e-05,
"loss": 2.5507,
"step": 606
},
{
"epoch": 0.19429575776943356,
"grad_norm": 0.49331796169281006,
"learning_rate": 9.445277211476e-05,
"loss": 2.5476,
"step": 608
},
{
"epoch": 0.19493488855156987,
"grad_norm": 0.4537939429283142,
"learning_rate": 9.440429683578624e-05,
"loss": 2.5977,
"step": 610
},
{
"epoch": 0.19557401933370616,
"grad_norm": 0.5129672884941101,
"learning_rate": 9.435562322391627e-05,
"loss": 2.5689,
"step": 612
},
{
"epoch": 0.19621315011584245,
"grad_norm": 0.5162326693534851,
"learning_rate": 9.430675149655199e-05,
"loss": 2.5981,
"step": 614
},
{
"epoch": 0.19685228089797874,
"grad_norm": 0.5716260075569153,
"learning_rate": 9.425768187198016e-05,
"loss": 2.547,
"step": 616
},
{
"epoch": 0.19749141168011505,
"grad_norm": 0.5598787069320679,
"learning_rate": 9.420841456937151e-05,
"loss": 2.5743,
"step": 618
},
{
"epoch": 0.19813054246225134,
"grad_norm": 0.5771391987800598,
"learning_rate": 9.415894980877966e-05,
"loss": 2.589,
"step": 620
},
{
"epoch": 0.19876967324438763,
"grad_norm": 0.5378340482711792,
"learning_rate": 9.410928781114019e-05,
"loss": 2.5916,
"step": 622
},
{
"epoch": 0.19940880402652392,
"grad_norm": 0.5003606081008911,
"learning_rate": 9.405942879826967e-05,
"loss": 2.5535,
"step": 624
},
{
"epoch": 0.20004793480866023,
"grad_norm": 0.5581315755844116,
"learning_rate": 9.400937299286458e-05,
"loss": 2.6016,
"step": 626
},
{
"epoch": 0.20068706559079652,
"grad_norm": 0.5600181818008423,
"learning_rate": 9.395912061850046e-05,
"loss": 2.5622,
"step": 628
},
{
"epoch": 0.2013261963729328,
"grad_norm": 0.5221248269081116,
"learning_rate": 9.390867189963075e-05,
"loss": 2.5584,
"step": 630
},
{
"epoch": 0.2019653271550691,
"grad_norm": 0.4963245391845703,
"learning_rate": 9.385802706158594e-05,
"loss": 2.54,
"step": 632
},
{
"epoch": 0.2026044579372054,
"grad_norm": 0.4757302403450012,
"learning_rate": 9.380718633057246e-05,
"loss": 2.5856,
"step": 634
},
{
"epoch": 0.2032435887193417,
"grad_norm": 0.4876170754432678,
"learning_rate": 9.37561499336717e-05,
"loss": 2.5912,
"step": 636
},
{
"epoch": 0.203882719501478,
"grad_norm": 0.4831182360649109,
"learning_rate": 9.370491809883895e-05,
"loss": 2.5395,
"step": 638
},
{
"epoch": 0.20452185028361428,
"grad_norm": 0.5880109071731567,
"learning_rate": 9.365349105490253e-05,
"loss": 2.5579,
"step": 640
},
{
"epoch": 0.2051609810657506,
"grad_norm": 0.497311532497406,
"learning_rate": 9.360186903156259e-05,
"loss": 2.5629,
"step": 642
},
{
"epoch": 0.20580011184788688,
"grad_norm": 0.5942720174789429,
"learning_rate": 9.355005225939017e-05,
"loss": 2.5816,
"step": 644
},
{
"epoch": 0.20643924263002317,
"grad_norm": 0.5332151651382446,
"learning_rate": 9.34980409698262e-05,
"loss": 2.5603,
"step": 646
},
{
"epoch": 0.20707837341215946,
"grad_norm": 0.4901409149169922,
"learning_rate": 9.344583539518036e-05,
"loss": 2.569,
"step": 648
},
{
"epoch": 0.20771750419429577,
"grad_norm": 0.521522581577301,
"learning_rate": 9.339343576863018e-05,
"loss": 2.6077,
"step": 650
},
{
"epoch": 0.20835663497643206,
"grad_norm": 0.49068787693977356,
"learning_rate": 9.334084232421988e-05,
"loss": 2.5729,
"step": 652
},
{
"epoch": 0.20899576575856835,
"grad_norm": 0.48800089955329895,
"learning_rate": 9.32880552968594e-05,
"loss": 2.5814,
"step": 654
},
{
"epoch": 0.20963489654070463,
"grad_norm": 0.5036289691925049,
"learning_rate": 9.323507492232328e-05,
"loss": 2.5795,
"step": 656
},
{
"epoch": 0.21027402732284092,
"grad_norm": 0.4648139476776123,
"learning_rate": 9.318190143724972e-05,
"loss": 2.572,
"step": 658
},
{
"epoch": 0.21091315810497724,
"grad_norm": 0.42503541707992554,
"learning_rate": 9.312853507913938e-05,
"loss": 2.5765,
"step": 660
},
{
"epoch": 0.21155228888711353,
"grad_norm": 0.483327180147171,
"learning_rate": 9.307497608635447e-05,
"loss": 2.5965,
"step": 662
},
{
"epoch": 0.21219141966924981,
"grad_norm": 0.49550801515579224,
"learning_rate": 9.302122469811752e-05,
"loss": 2.5412,
"step": 664
},
{
"epoch": 0.2128305504513861,
"grad_norm": 0.457082599401474,
"learning_rate": 9.296728115451046e-05,
"loss": 2.5945,
"step": 666
},
{
"epoch": 0.21346968123352242,
"grad_norm": 0.5289996862411499,
"learning_rate": 9.291314569647346e-05,
"loss": 2.5364,
"step": 668
},
{
"epoch": 0.2141088120156587,
"grad_norm": 0.5246165990829468,
"learning_rate": 9.285881856580392e-05,
"loss": 2.5313,
"step": 670
},
{
"epoch": 0.214747942797795,
"grad_norm": 0.5950086712837219,
"learning_rate": 9.280430000515528e-05,
"loss": 2.5621,
"step": 672
},
{
"epoch": 0.21538707357993128,
"grad_norm": 0.49669399857521057,
"learning_rate": 9.274959025803604e-05,
"loss": 2.5515,
"step": 674
},
{
"epoch": 0.2160262043620676,
"grad_norm": 0.5234604477882385,
"learning_rate": 9.269468956880871e-05,
"loss": 2.5432,
"step": 676
},
{
"epoch": 0.21666533514420389,
"grad_norm": 0.5024713277816772,
"learning_rate": 9.263959818268853e-05,
"loss": 2.5893,
"step": 678
},
{
"epoch": 0.21730446592634017,
"grad_norm": 0.4908897876739502,
"learning_rate": 9.258431634574256e-05,
"loss": 2.6035,
"step": 680
},
{
"epoch": 0.21794359670847646,
"grad_norm": 0.47038817405700684,
"learning_rate": 9.252884430488849e-05,
"loss": 2.5652,
"step": 682
},
{
"epoch": 0.21858272749061278,
"grad_norm": 0.47875434160232544,
"learning_rate": 9.247318230789359e-05,
"loss": 2.5902,
"step": 684
},
{
"epoch": 0.21922185827274906,
"grad_norm": 0.4665825366973877,
"learning_rate": 9.241733060337354e-05,
"loss": 2.5292,
"step": 686
},
{
"epoch": 0.21986098905488535,
"grad_norm": 0.4810079336166382,
"learning_rate": 9.236128944079138e-05,
"loss": 2.5792,
"step": 688
},
{
"epoch": 0.22050011983702164,
"grad_norm": 0.45069095492362976,
"learning_rate": 9.230505907045635e-05,
"loss": 2.5316,
"step": 690
},
{
"epoch": 0.22113925061915796,
"grad_norm": 0.40244781970977783,
"learning_rate": 9.224863974352278e-05,
"loss": 2.5563,
"step": 692
},
{
"epoch": 0.22177838140129424,
"grad_norm": 0.5229255557060242,
"learning_rate": 9.219203171198902e-05,
"loss": 2.5402,
"step": 694
},
{
"epoch": 0.22241751218343053,
"grad_norm": 0.5138113498687744,
"learning_rate": 9.213523522869625e-05,
"loss": 2.5914,
"step": 696
},
{
"epoch": 0.22305664296556682,
"grad_norm": 0.47990405559539795,
"learning_rate": 9.207825054732736e-05,
"loss": 2.5525,
"step": 698
},
{
"epoch": 0.22369577374770314,
"grad_norm": 0.48454561829566956,
"learning_rate": 9.202107792240587e-05,
"loss": 2.5379,
"step": 700
},
{
"epoch": 0.22433490452983942,
"grad_norm": 0.49185454845428467,
"learning_rate": 9.19637176092947e-05,
"loss": 2.5462,
"step": 702
},
{
"epoch": 0.2249740353119757,
"grad_norm": 0.4852677583694458,
"learning_rate": 9.190616986419512e-05,
"loss": 2.5222,
"step": 704
},
{
"epoch": 0.225613166094112,
"grad_norm": 0.503039538860321,
"learning_rate": 9.18484349441456e-05,
"loss": 2.5714,
"step": 706
},
{
"epoch": 0.22625229687624832,
"grad_norm": 0.4584214389324188,
"learning_rate": 9.179051310702056e-05,
"loss": 2.5694,
"step": 708
},
{
"epoch": 0.2268914276583846,
"grad_norm": 0.46065405011177063,
"learning_rate": 9.173240461152935e-05,
"loss": 2.5804,
"step": 710
},
{
"epoch": 0.2275305584405209,
"grad_norm": 0.48372742533683777,
"learning_rate": 9.1674109717215e-05,
"loss": 2.5489,
"step": 712
},
{
"epoch": 0.22816968922265718,
"grad_norm": 0.43927186727523804,
"learning_rate": 9.16156286844531e-05,
"loss": 2.541,
"step": 714
},
{
"epoch": 0.22880882000479347,
"grad_norm": 0.46032947301864624,
"learning_rate": 9.155696177445064e-05,
"loss": 2.5597,
"step": 716
},
{
"epoch": 0.22944795078692978,
"grad_norm": 0.4477051794528961,
"learning_rate": 9.149810924924482e-05,
"loss": 2.551,
"step": 718
},
{
"epoch": 0.23008708156906607,
"grad_norm": 0.4732860326766968,
"learning_rate": 9.143907137170194e-05,
"loss": 2.5688,
"step": 720
},
{
"epoch": 0.23072621235120236,
"grad_norm": 0.520808219909668,
"learning_rate": 9.137984840551612e-05,
"loss": 2.5429,
"step": 722
},
{
"epoch": 0.23136534313333865,
"grad_norm": 0.448128879070282,
"learning_rate": 9.132044061520823e-05,
"loss": 2.5146,
"step": 724
},
{
"epoch": 0.23200447391547496,
"grad_norm": 0.520537257194519,
"learning_rate": 9.126084826612464e-05,
"loss": 2.5718,
"step": 726
},
{
"epoch": 0.23264360469761125,
"grad_norm": 0.5061787962913513,
"learning_rate": 9.120107162443605e-05,
"loss": 2.5341,
"step": 728
},
{
"epoch": 0.23328273547974754,
"grad_norm": 0.4683222770690918,
"learning_rate": 9.114111095713633e-05,
"loss": 2.5351,
"step": 730
},
{
"epoch": 0.23392186626188383,
"grad_norm": 0.4754564166069031,
"learning_rate": 9.108096653204125e-05,
"loss": 2.5798,
"step": 732
},
{
"epoch": 0.23456099704402014,
"grad_norm": 0.5304054021835327,
"learning_rate": 9.102063861778744e-05,
"loss": 2.5812,
"step": 734
},
{
"epoch": 0.23520012782615643,
"grad_norm": 0.4747471809387207,
"learning_rate": 9.0960127483831e-05,
"loss": 2.5847,
"step": 736
},
{
"epoch": 0.23583925860829272,
"grad_norm": 0.4957279562950134,
"learning_rate": 9.089943340044642e-05,
"loss": 2.5689,
"step": 738
},
{
"epoch": 0.236478389390429,
"grad_norm": 0.5040017366409302,
"learning_rate": 9.083855663872533e-05,
"loss": 2.5345,
"step": 740
},
{
"epoch": 0.23711752017256532,
"grad_norm": 0.5398538708686829,
"learning_rate": 9.07774974705753e-05,
"loss": 2.5517,
"step": 742
},
{
"epoch": 0.2377566509547016,
"grad_norm": 0.5123056173324585,
"learning_rate": 9.071625616871862e-05,
"loss": 2.5746,
"step": 744
},
{
"epoch": 0.2383957817368379,
"grad_norm": 0.4740076959133148,
"learning_rate": 9.06548330066911e-05,
"loss": 2.5449,
"step": 746
},
{
"epoch": 0.23903491251897419,
"grad_norm": 0.4199361801147461,
"learning_rate": 9.05932282588408e-05,
"loss": 2.5857,
"step": 748
},
{
"epoch": 0.2396740433011105,
"grad_norm": 0.4691718816757202,
"learning_rate": 9.053144220032688e-05,
"loss": 2.5408,
"step": 750
},
{
"epoch": 0.2403131740832468,
"grad_norm": 0.4801616668701172,
"learning_rate": 9.04694751071183e-05,
"loss": 2.6167,
"step": 752
},
{
"epoch": 0.24095230486538308,
"grad_norm": 0.5200051069259644,
"learning_rate": 9.040732725599261e-05,
"loss": 2.5032,
"step": 754
},
{
"epoch": 0.24159143564751936,
"grad_norm": 0.5068468451499939,
"learning_rate": 9.034499892453477e-05,
"loss": 2.5041,
"step": 756
},
{
"epoch": 0.24223056642965568,
"grad_norm": 0.5166811347007751,
"learning_rate": 9.028249039113577e-05,
"loss": 2.6254,
"step": 758
},
{
"epoch": 0.24286969721179197,
"grad_norm": 0.5714825987815857,
"learning_rate": 9.021980193499157e-05,
"loss": 2.5375,
"step": 760
},
{
"epoch": 0.24350882799392826,
"grad_norm": 0.4392567574977875,
"learning_rate": 9.015693383610169e-05,
"loss": 2.5482,
"step": 762
},
{
"epoch": 0.24414795877606454,
"grad_norm": 0.44030579924583435,
"learning_rate": 9.009388637526808e-05,
"loss": 2.5577,
"step": 764
},
{
"epoch": 0.24478708955820086,
"grad_norm": 0.49010273814201355,
"learning_rate": 9.00306598340938e-05,
"loss": 2.5707,
"step": 766
},
{
"epoch": 0.24542622034033715,
"grad_norm": 0.560543417930603,
"learning_rate": 8.996725449498173e-05,
"loss": 2.5574,
"step": 768
},
{
"epoch": 0.24606535112247344,
"grad_norm": 0.5686501264572144,
"learning_rate": 8.990367064113343e-05,
"loss": 2.5459,
"step": 770
},
{
"epoch": 0.24670448190460972,
"grad_norm": 0.5197829008102417,
"learning_rate": 8.983990855654774e-05,
"loss": 2.5316,
"step": 772
},
{
"epoch": 0.247343612686746,
"grad_norm": 0.48393699526786804,
"learning_rate": 8.977596852601961e-05,
"loss": 2.5376,
"step": 774
},
{
"epoch": 0.24798274346888233,
"grad_norm": 0.4604134261608124,
"learning_rate": 8.971185083513878e-05,
"loss": 2.5373,
"step": 776
},
{
"epoch": 0.24862187425101862,
"grad_norm": 0.5080364346504211,
"learning_rate": 8.964755577028852e-05,
"loss": 2.516,
"step": 778
},
{
"epoch": 0.2492610050331549,
"grad_norm": 0.5315148830413818,
"learning_rate": 8.958308361864429e-05,
"loss": 2.5182,
"step": 780
},
{
"epoch": 0.2499001358152912,
"grad_norm": 0.4669964015483856,
"learning_rate": 8.951843466817261e-05,
"loss": 2.506,
"step": 782
},
{
"epoch": 0.2505392665974275,
"grad_norm": 0.5169178247451782,
"learning_rate": 8.94536092076296e-05,
"loss": 2.5524,
"step": 784
},
{
"epoch": 0.25117839737956377,
"grad_norm": 0.530693769454956,
"learning_rate": 8.93886075265598e-05,
"loss": 2.5515,
"step": 786
},
{
"epoch": 0.2518175281617001,
"grad_norm": 0.5086248517036438,
"learning_rate": 8.932342991529484e-05,
"loss": 2.5235,
"step": 788
},
{
"epoch": 0.2524566589438364,
"grad_norm": 0.5186027884483337,
"learning_rate": 8.925807666495212e-05,
"loss": 2.5616,
"step": 790
},
{
"epoch": 0.2530957897259727,
"grad_norm": 0.5286267995834351,
"learning_rate": 8.919254806743358e-05,
"loss": 2.558,
"step": 792
},
{
"epoch": 0.253734920508109,
"grad_norm": 0.56434166431427,
"learning_rate": 8.912684441542432e-05,
"loss": 2.5315,
"step": 794
},
{
"epoch": 0.25437405129024526,
"grad_norm": 0.5112208127975464,
"learning_rate": 8.906096600239135e-05,
"loss": 2.5842,
"step": 796
},
{
"epoch": 0.25501318207238155,
"grad_norm": 0.5397393703460693,
"learning_rate": 8.899491312258221e-05,
"loss": 2.5405,
"step": 798
},
{
"epoch": 0.25565231285451784,
"grad_norm": 0.4671647250652313,
"learning_rate": 8.892868607102376e-05,
"loss": 2.4999,
"step": 800
},
{
"epoch": 0.2562914436366541,
"grad_norm": 0.41425585746765137,
"learning_rate": 8.886228514352076e-05,
"loss": 2.5312,
"step": 802
},
{
"epoch": 0.25693057441879047,
"grad_norm": 0.43078532814979553,
"learning_rate": 8.879571063665462e-05,
"loss": 2.5218,
"step": 804
},
{
"epoch": 0.25756970520092676,
"grad_norm": 0.432005912065506,
"learning_rate": 8.872896284778201e-05,
"loss": 2.523,
"step": 806
},
{
"epoch": 0.25820883598306305,
"grad_norm": 0.40941286087036133,
"learning_rate": 8.866204207503359e-05,
"loss": 2.575,
"step": 808
},
{
"epoch": 0.25884796676519933,
"grad_norm": 0.431316077709198,
"learning_rate": 8.859494861731267e-05,
"loss": 2.5837,
"step": 810
},
{
"epoch": 0.2594870975473356,
"grad_norm": 0.4376726448535919,
"learning_rate": 8.852768277429384e-05,
"loss": 2.5137,
"step": 812
},
{
"epoch": 0.2601262283294719,
"grad_norm": 0.5029991865158081,
"learning_rate": 8.846024484642166e-05,
"loss": 2.5526,
"step": 814
},
{
"epoch": 0.2607653591116082,
"grad_norm": 0.5601023435592651,
"learning_rate": 8.839263513490931e-05,
"loss": 2.5788,
"step": 816
},
{
"epoch": 0.2614044898937445,
"grad_norm": 0.5238969922065735,
"learning_rate": 8.832485394173726e-05,
"loss": 2.5589,
"step": 818
},
{
"epoch": 0.26204362067588083,
"grad_norm": 0.4996497929096222,
"learning_rate": 8.825690156965188e-05,
"loss": 2.57,
"step": 820
},
{
"epoch": 0.2626827514580171,
"grad_norm": 0.47495660185813904,
"learning_rate": 8.818877832216413e-05,
"loss": 2.5341,
"step": 822
},
{
"epoch": 0.2633218822401534,
"grad_norm": 0.503065288066864,
"learning_rate": 8.812048450354819e-05,
"loss": 2.5416,
"step": 824
},
{
"epoch": 0.2639610130222897,
"grad_norm": 0.49365946650505066,
"learning_rate": 8.805202041884012e-05,
"loss": 2.516,
"step": 826
},
{
"epoch": 0.264600143804426,
"grad_norm": 0.4978492558002472,
"learning_rate": 8.798338637383645e-05,
"loss": 2.52,
"step": 828
},
{
"epoch": 0.26523927458656227,
"grad_norm": 0.4817914664745331,
"learning_rate": 8.791458267509283e-05,
"loss": 2.5118,
"step": 830
},
{
"epoch": 0.26587840536869856,
"grad_norm": 0.5463185906410217,
"learning_rate": 8.78456096299227e-05,
"loss": 2.5527,
"step": 832
},
{
"epoch": 0.26651753615083484,
"grad_norm": 0.4575677812099457,
"learning_rate": 8.77764675463959e-05,
"loss": 2.5076,
"step": 834
},
{
"epoch": 0.26715666693297113,
"grad_norm": 0.47226086258888245,
"learning_rate": 8.770715673333722e-05,
"loss": 2.5357,
"step": 836
},
{
"epoch": 0.2677957977151075,
"grad_norm": 0.44436129927635193,
"learning_rate": 8.763767750032518e-05,
"loss": 2.5354,
"step": 838
},
{
"epoch": 0.26843492849724376,
"grad_norm": 0.48564496636390686,
"learning_rate": 8.756803015769049e-05,
"loss": 2.5479,
"step": 840
},
{
"epoch": 0.26907405927938005,
"grad_norm": 0.47404852509498596,
"learning_rate": 8.749821501651472e-05,
"loss": 2.5175,
"step": 842
},
{
"epoch": 0.26971319006151634,
"grad_norm": 0.4444579482078552,
"learning_rate": 8.742823238862895e-05,
"loss": 2.5066,
"step": 844
},
{
"epoch": 0.2703523208436526,
"grad_norm": 0.48305433988571167,
"learning_rate": 8.735808258661233e-05,
"loss": 2.5314,
"step": 846
},
{
"epoch": 0.2709914516257889,
"grad_norm": 0.5266690254211426,
"learning_rate": 8.728776592379068e-05,
"loss": 2.5734,
"step": 848
},
{
"epoch": 0.2716305824079252,
"grad_norm": 0.4550389051437378,
"learning_rate": 8.721728271423512e-05,
"loss": 2.556,
"step": 850
},
{
"epoch": 0.2722697131900615,
"grad_norm": 0.47347402572631836,
"learning_rate": 8.71466332727607e-05,
"loss": 2.5649,
"step": 852
},
{
"epoch": 0.27290884397219783,
"grad_norm": 0.5236901640892029,
"learning_rate": 8.707581791492485e-05,
"loss": 2.564,
"step": 854
},
{
"epoch": 0.2735479747543341,
"grad_norm": 0.5352922677993774,
"learning_rate": 8.700483695702617e-05,
"loss": 2.4933,
"step": 856
},
{
"epoch": 0.2741871055364704,
"grad_norm": 0.5335637331008911,
"learning_rate": 8.693369071610287e-05,
"loss": 2.4958,
"step": 858
},
{
"epoch": 0.2748262363186067,
"grad_norm": 0.4980125427246094,
"learning_rate": 8.686237950993137e-05,
"loss": 2.5519,
"step": 860
},
{
"epoch": 0.275465367100743,
"grad_norm": 0.4874439239501953,
"learning_rate": 8.679090365702498e-05,
"loss": 2.5326,
"step": 862
},
{
"epoch": 0.2761044978828793,
"grad_norm": 0.5071477293968201,
"learning_rate": 8.671926347663238e-05,
"loss": 2.5092,
"step": 864
},
{
"epoch": 0.27674362866501556,
"grad_norm": 0.4566083252429962,
"learning_rate": 8.664745928873619e-05,
"loss": 2.5108,
"step": 866
},
{
"epoch": 0.27738275944715185,
"grad_norm": 0.4723774492740631,
"learning_rate": 8.657549141405161e-05,
"loss": 2.4921,
"step": 868
},
{
"epoch": 0.2780218902292882,
"grad_norm": 0.44547411799430847,
"learning_rate": 8.650336017402494e-05,
"loss": 2.5481,
"step": 870
},
{
"epoch": 0.2786610210114245,
"grad_norm": 0.4709297716617584,
"learning_rate": 8.643106589083216e-05,
"loss": 2.501,
"step": 872
},
{
"epoch": 0.27930015179356077,
"grad_norm": 0.4446027874946594,
"learning_rate": 8.63586088873775e-05,
"loss": 2.5133,
"step": 874
},
{
"epoch": 0.27993928257569706,
"grad_norm": 0.4283333420753479,
"learning_rate": 8.628598948729197e-05,
"loss": 2.5338,
"step": 876
},
{
"epoch": 0.28057841335783335,
"grad_norm": 0.4817812442779541,
"learning_rate": 8.621320801493188e-05,
"loss": 2.5519,
"step": 878
},
{
"epoch": 0.28121754413996963,
"grad_norm": 0.49330389499664307,
"learning_rate": 8.614026479537753e-05,
"loss": 2.5047,
"step": 880
},
{
"epoch": 0.2818566749221059,
"grad_norm": 0.43356356024742126,
"learning_rate": 8.606716015443161e-05,
"loss": 2.4994,
"step": 882
},
{
"epoch": 0.2824958057042422,
"grad_norm": 0.4848228991031647,
"learning_rate": 8.599389441861782e-05,
"loss": 2.5186,
"step": 884
},
{
"epoch": 0.28313493648637855,
"grad_norm": 0.5413882732391357,
"learning_rate": 8.59204679151794e-05,
"loss": 2.5508,
"step": 886
},
{
"epoch": 0.28377406726851484,
"grad_norm": 0.46791547536849976,
"learning_rate": 8.584688097207764e-05,
"loss": 2.5728,
"step": 888
},
{
"epoch": 0.28441319805065113,
"grad_norm": 0.4600776731967926,
"learning_rate": 8.577313391799046e-05,
"loss": 2.5341,
"step": 890
},
{
"epoch": 0.2850523288327874,
"grad_norm": 0.3964655101299286,
"learning_rate": 8.569922708231089e-05,
"loss": 2.553,
"step": 892
},
{
"epoch": 0.2856914596149237,
"grad_norm": 0.41062116622924805,
"learning_rate": 8.562516079514569e-05,
"loss": 2.5726,
"step": 894
},
{
"epoch": 0.28633059039706,
"grad_norm": 0.43652409315109253,
"learning_rate": 8.555093538731374e-05,
"loss": 2.5313,
"step": 896
},
{
"epoch": 0.2869697211791963,
"grad_norm": 0.4312250316143036,
"learning_rate": 8.547655119034467e-05,
"loss": 2.4911,
"step": 898
},
{
"epoch": 0.28760885196133257,
"grad_norm": 0.4835914373397827,
"learning_rate": 8.540200853647737e-05,
"loss": 2.5262,
"step": 900
}
],
"logging_steps": 2,
"max_steps": 3130,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0132651008589824e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}